diff --git a/nlp/llm/aquila2-34b/pytorch/README.md b/nlp/llm/aquila2-34b/pytorch/README.md
index 5fd7a50b2909dc64ea36a491f7e3f44cf892daef..9c95b4742b5601f0cf0280a9b7ddc27f11b003c2 100644
--- a/nlp/llm/aquila2-34b/pytorch/README.md
+++ b/nlp/llm/aquila2-34b/pytorch/README.md
@@ -29,7 +29,6 @@ ssh-copy-id -i ~/.ssh/id_rsa.pub ${host_name}  ## {host_name} can be a specified
 ```sh
 # install
 cd <DeepSparkHub_Root>/toolbox/Megatron-DeepSpeed
-bash build_megatron-deepspeed.sh && bash install_megatron-deepspeed.sh
 ```
 
 ### Preparing datasets on all nodes
diff --git a/nlp/llm/llama2-13b/pytorch/README.md b/nlp/llm/llama2-13b/pytorch/README.md
index 523498f8cecf6cd313b161282fa9d2e395c83321..1e76ae223ad64369d948ce5185437d15ffbda136 100644
--- a/nlp/llm/llama2-13b/pytorch/README.md
+++ b/nlp/llm/llama2-13b/pytorch/README.md
@@ -26,7 +26,6 @@ ssh-copy-id -i ~/.ssh/id_rsa.pub ${host_name}  ## {host_name} can be a specified
 ```sh
 # install
 cd <DeepSparkHub_Root>/toolbox/Megatron-DeepSpeed
-bash build_megatron-deepspeed.sh && bash install_megatron-deepspeed.sh
 ```
 
 ### Preparing datasets on all nodes
diff --git a/nlp/llm/llama2-34b/pytorch/README.md b/nlp/llm/llama2-34b/pytorch/README.md
index bb9c845f576c4218c4b37614397ade52b9a1c3ea..aa3c3bd5e2fd9b26577d576a6dffbb968a20974d 100644
--- a/nlp/llm/llama2-34b/pytorch/README.md
+++ b/nlp/llm/llama2-34b/pytorch/README.md
@@ -28,7 +28,6 @@ ssh-copy-id -i ~/.ssh/id_rsa.pub ${host_name}  ## {host_name} can be a specified
 ```sh
 # install
 cd <DeepSparkHub_Root>/toolbox/Megatron-DeepSpeed
-bash build_megatron-deepspeed.sh && bash install_megatron-deepspeed.sh
 ```
 
 ### Preparing datasets on all nodes
diff --git a/nlp/llm/llama2-7b/pytorch/README.md b/nlp/llm/llama2-7b/pytorch/README.md
index 800c59b0e5395b3891bcd2ac2194103eb5fc091f..fda863dc7e8532a24caf85274d85b550f294a463 100644
--- a/nlp/llm/llama2-7b/pytorch/README.md
+++ b/nlp/llm/llama2-7b/pytorch/README.md
@@ -22,13 +22,6 @@ tar -xf gpt_small_117M.tar
 rm -f gpt_small_117M.tar
 ```
 
-### Install Dependencies
-
-```sh
-# install
-bash build_megatron-deepspeed.sh && bash install_megatron-deepspeed.sh
-```
-
 ## Model Training
 
 ```sh
diff --git a/nlp/llm/llama2-7b_rlhf/pytorch/README.md b/nlp/llm/llama2-7b_rlhf/pytorch/README.md
index bde0481907e39c49764d699c5a542c6becabed7f..1b18409e7a37769601e1b7530023c59e460c80a5 100644
--- a/nlp/llm/llama2-7b_rlhf/pytorch/README.md
+++ b/nlp/llm/llama2-7b_rlhf/pytorch/README.md
@@ -21,26 +21,24 @@ Download dataset and convert it.
 cd <DeepSparkHub_Root>/toolbox/Megatron-DeepSpeed/
 
 pushd dataset/
-
 # get gpt_small_117M.tar
 wget http://files.deepspark.org.cn:880/deepspark/data/datasets/gpt_small_117M.tar
 tar -xf gpt_small_117M.tar
 rm -f gpt_small_117M.tar
 popd
-```
-
-### Install Dependencies
 
-```sh
-# install
-bash build_megatron-deepspeed.sh && bash install_megatron-deepspeed.sh
+# Download checkpoints as above and put them to proper path, then convert checkpoints.
+pushd checkpoints
+bash download_rlhf_checkpoints.sh
+bash convert_hf_2_meg.sh
+popd
 ```
 
 ## Model Training
 
 ```sh
 cd examples/llama2
-# Modify run_llama2_7b_1node.sh according to your machine: for example, HOST_NAME, ADDR_ARRAY, CONTAINER_NAME, NCCL_SOCKET_IFNAME
+# Modify run_llama2_7b_rlhf_node1.sh according to your machine: for example, HOST_NAME, ADDR_ARRAY, CONTAINER_NAME, NCCL_SOCKET_IFNAME, DATA_PATH
 bash run_llama2_7b_rlhf_node1.sh
 ```
 
diff --git a/nlp/llm/llama2-7b_sft/pytorch/README.md b/nlp/llm/llama2-7b_sft/pytorch/README.md
index a58ea7494ff6ed61ea01aea1764d622fc6243311..4020a1a61f7ccae1e3ed415dfad9a97753939012 100644
--- a/nlp/llm/llama2-7b_sft/pytorch/README.md
+++ b/nlp/llm/llama2-7b_sft/pytorch/README.md
@@ -24,13 +24,6 @@ rm -f gpt_small_117M.tar
 popd
 ```
 
-### Install Dependencies
-
-```sh
-# install
-bash build_megatron-deepspeed.sh && bash install_megatron-deepspeed.sh
-```
-
 ## Model Training
 
 ```sh
diff --git a/toolbox/Megatron-DeepSpeed/.coveragerc b/toolbox/Megatron-DeepSpeed/.coveragerc
deleted file mode 100644
index 29de6ff8a383d4cd31a87e3c10954df2fe90d419..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/.coveragerc
+++ /dev/null
@@ -1,5 +0,0 @@
-[html]
-directory = coverage
-
-[run]
-data_file = .coverage_$LOCAL_RANK
diff --git a/toolbox/Megatron-DeepSpeed/.gitignore b/toolbox/Megatron-DeepSpeed/.gitignore
deleted file mode 100644
index c37f055fa5ee00c789dea87eb30152f97b1b17eb..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/.gitignore
+++ /dev/null
@@ -1,37 +0,0 @@
-__pycache__
-
-# Distribution / packaging
-build/
-build_pip/
-dist/
-*.egg-info/
-
-# binaries
-*.so
-
-# tmp files
-*.swp
-
-# AML workspace config file
-config.json
-
-# DeepSpeed config file
-ds_config.json
-
-.coverage_*
-*~
-slurm*
-logs
-
-# Data folder
-dataset/BookCorpusDataset/
-dataset/gpt_small_117M*
-dataset/dahoas/
-dataset/dahoas_*
-
-tests/test_logs/
-tests/exit_*
-
-checkpoints/output*/
-checkpoints/rlhf*/
-checkpoints/TinyLlama*/
diff --git a/toolbox/Megatron-DeepSpeed/.gitlab-ci.yml b/toolbox/Megatron-DeepSpeed/.gitlab-ci.yml
deleted file mode 100644
index 0abebc72a7f2fbce78d98533a8e4ffabc948407c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/.gitlab-ci.yml
+++ /dev/null
@@ -1,302 +0,0 @@
-image: nvcr.io/nvidia/pytorch:23.04-py3
-
-stages:
-  - test
-  - cleanup
-
-variables: &VARS
-  SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron"
-  DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data"
-  PYTORCH_IMAGE: gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov
-  PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate
-  TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED: L0 # Can specify levels
-  TESTS_TO_RUN_AFTER_MERGING: L0  # Can specify levels
-  TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests
-  TEST_REGEX_ON_THIS_COMMIT: NONE #https://github.com/google/re2/wiki/Syntax (Can define regex as in this spec) e.g /.*gpt3.*/
-  DISPLAY_OUTPUT: "True" # Set to true for new tests to copy the logs for creating golden truth file
-
-unit_tests:
-  tags:
-    - docker_local_runner
-  stage: test
-  script:
-    - pip install pytest-cov
-    - torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/unit_tests
-  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
-  artifacts:
-    paths:
-      - coverage
-    expire_in: 30 days
-  only:
-    - merge_requests
-
-.selene_test_resume_checkpoint_launcher: &selene-test-resume-checkpoint-launcher
-  tags:
-    - ssh_selene_runner
-  stage: test
-  script: &selene-test-resume-launcher-script
-    - echo "Running selene resume from checkpoint test. "
-    - pwd
-    - export BUILD_DIR=`pwd`
-    - export RUN_NAME=resume_${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes
-    - echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."
-    - export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS
-    - export DATA_DIR=$DATA_DIR
-    - echo "Run name is $RUN_NAME"
-    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
-    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs
-    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results
-    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints/*
-    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs/*
-    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
-    - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME
-    - export LOGS_DIR=$BASE_DIR/logs
-    - export RESULTS_DIR=$BASE_DIR/results
-    - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
-    - echo "Submitting job"
-    - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES`
-    - export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');
-    - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
-    - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
-                "----------WAITING FOR SLURM JOB TO BEGIN-----------\n"
-                "---------------------------------------------------\n"
-                "$(scontrol show job=${SLURM_JOBID})\n"
-                "---------------------------------------------------\n"
-    # Gitlab logs collapsible section markers
-    - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
-    # Follow output of the job
-    - echo "Finished job"
-    - export SLURM_STATE=$(sacct -j "${SLURM_JOBID}" --format State --parsable2 --noheader |& head -n 1)
-    - echo "Slurm job state $SLURM_STATE"
-    - if [[ "$SLURM_STATE" != "COMPLETED" ]]; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."; exit 1; fi
-    - source $PYTHON_VIRTUAL_ENV
-    - cmd="pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py"
-    - if $cmd; then echo "Pytest succeded"; else  echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs"; fi
-    - echo "Completed the job"
-  rules:
-    - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
-      when: always
-    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
-      when: always
-    - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
-      when: always
-  allow_failure: false
-
-.selene_test_launcher: &selene-test-launcher
-  tags:
-    - ssh_selene_runner
-  stage: test
-  script: &selene-test-launcher-script
-    - echo "Running selene test"
-    - echo "$CI_MERGE_REQUEST_APPROVED"
-    - pwd
-    - export BUILD_DIR=`pwd`
-    - RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes_${MAX_STEPS}steps
-    - if [[ $USE_TE == 1 ]]; then RUN_NAME=${RUN_NAME}_te_enabled; fi
-    - export $RUN_NAME
-    - echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."
-    - export USE_TE TP_SIZE PP_SIZE NUM_NODES MAX_STEPS VP_SIZE
-    - export MBS GBS
-    - export DATA_DIR=$DATA_DIR
-    - echo "Run name is $RUN_NAME"
-    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
-    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs
-    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results
-    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints/*
-    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs/*
-    - rm -rf $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
-    - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME
-    - export LOGS_DIR=$BASE_DIR/logs
-    - export RESULTS_DIR=$BASE_DIR/results
-    - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
-    - echo "Submitting job"
-    - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,USE_TE,TP_SIZE,PP_SIZE,NUM_NODES,MAX_STEPS,VP_SIZE,MBS,GBS`
-    - export SLURM_JOBID=$(echo $sbatch_submission| grep 'Submitted batch job' | awk '{ print $4 }');
-    - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
-    - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
-                "----------WAITING FOR SLURM JOB TO BEGIN-----------\n"
-                "---------------------------------------------------\n"
-                "$(scontrol show job=${SLURM_JOBID})\n"
-                "---------------------------------------------------\n"
-    # Gitlab logs collapsible section markers
-    - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
-    # Follow output of the job
-    - echo "Finished job"
-    - echo "Slurm log dump start ------------------------------------------------------------"
-    - cat $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results/*
-    - echo "Slurm log dump end --------------------------------------------------------------"
-    - python3 $BUILD_DIR/tests/functional_tests/python_test_utils/check_slurm_job_completion.py $SLURM_JOBID
-    - if [ $? -ne 0 ]; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."; exit 1; fi
-    - source $PYTHON_VIRTUAL_ENV
-    - |
-      if [[ "$DISPLAY_OUTPUT" == "True" ]]; then
-        python3 $BUILD_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $LOGS_DIR $RUN_NAME
-      fi
-    - |
-      if [[ $USE_TE -ne 1 ]]; then
-        echo "Checking against ground truth file"
-        export EXPECTED_METRICS_FILE=$BUILD_DIR/tests/functional_tests/test_results/$RUN_MODEL/$RUN_NAME.json
-        cmd="pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py"
-        if $cmd; then echo "Pytest succeded"; else  echo "Pytest failed. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs"; fi
-      fi
-    - echo "Completed the job"
-  rules:
-    - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
-      when: always
-    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
-      when: always
-    - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
-      when: always
-  allow_failure: false
-
-train.te_gpt3.345m_tp2_pp2_1node_50steps:
-  <<: *selene-test-launcher
-  variables:
-    <<: [*VARS]
-    RUN_MODEL: gpt3
-    USE_TE: 1
-    TP_SIZE: 2
-    PP_SIZE: 2
-    NUM_NODES: 1
-    MAX_STEPS: 50
-    TIME_LIMIT: "50:00"
-    TEST_LEVEL: L0
-
-train.gpt3.345m_tp4_pp1_1node_50steps:
-  <<: *selene-test-launcher
-  variables:
-    <<: [*VARS]
-    RUN_MODEL: gpt3
-    USE_TE: 0
-    TP_SIZE: 4
-    PP_SIZE: 1
-    NUM_NODES: 1
-    MAX_STEPS: 50
-    TIME_LIMIT: "20:00"
-    TEST_LEVEL: L0
-
-train.gpt3.345m_tp2_pp2_1node_50steps:
-  <<: *selene-test-launcher
-  variables:
-    <<: [*VARS]
-    RUN_MODEL: gpt3
-    USE_TE: 0
-    TP_SIZE: 2
-    PP_SIZE: 2
-    NUM_NODES: 1
-    MAX_STEPS: 50
-    TIME_LIMIT: "20:00"
-    TEST_LEVEL: L0
-
-train.gpt3.345m_tp1_pp2_1node_50steps:
-  <<: *selene-test-launcher
-  variables:
-    <<: [*VARS]
-    RUN_MODEL: gpt3
-    USE_TE: 0
-    TP_SIZE: 1
-    PP_SIZE: 2
-    NUM_NODES: 1
-    MAX_STEPS: 50
-    TIME_LIMIT: "20:00"
-    TEST_LEVEL: L0
-
-train.gpt3.345m_tp1_pp4_1node_50steps:
-  <<: *selene-test-launcher
-  variables:
-    <<: [*VARS]
-    RUN_MODEL: gpt3
-    USE_TE: 0
-    TP_SIZE: 1
-    PP_SIZE: 4
-    VP_SIZE: 1
-    NUM_NODES: 1
-    MAX_STEPS: 50
-    TIME_LIMIT: "20:00"
-    TEST_LEVEL: L0
-
-resume.checkpoint.gpt3.345m_tp1_pp2_1node:
-  <<: *selene-test-resume-checkpoint-launcher
-  variables:
-    <<: [*VARS]
-    RUN_MODEL: gpt3
-    TP_SIZE: 1
-    PP_SIZE: 2
-    NUM_NODES: 1
-    TIME_LIMIT: "30:00"
-    TEST_LEVEL: L0
-
-train.bert.345m_tp4_pp1_1node_50steps:
-  <<: *selene-test-launcher
-  variables:
-    <<: [*VARS]
-    RUN_MODEL: bert
-    TP_SIZE: 4
-    PP_SIZE: 1
-    NUM_NODES: 1
-    MAX_STEPS: 50
-    TIME_LIMIT: "20:00"
-    TEST_LEVEL: L0
-
-train.bert.345m_tp2_pp2_1node_50steps:
-  <<: *selene-test-launcher
-  variables:
-    <<: [*VARS]
-    RUN_MODEL: bert
-    TP_SIZE: 2
-    PP_SIZE: 2
-    NUM_NODES: 1
-    MAX_STEPS: 50
-    TIME_LIMIT: "20:00"
-    TEST_LEVEL: L0
-
-train.bert.345m_tp1_pp2_1node_50steps:
-  <<: *selene-test-launcher
-  variables:
-    <<: [*VARS]
-    RUN_MODEL: bert
-    TP_SIZE: 1
-    PP_SIZE: 2
-    NUM_NODES: 1
-    MAX_STEPS: 50
-    TIME_LIMIT: "20:00"
-    TEST_LEVEL: L0
-
-train.bert.345m_tp1_pp4_1node_50steps:
-  <<: *selene-test-launcher
-  variables:
-    <<: [*VARS]
-    RUN_MODEL: bert
-    TP_SIZE: 1
-    PP_SIZE: 4
-    VP_SIZE: 2
-    NUM_NODES: 1
-    MAX_STEPS: 50
-    TIME_LIMIT: "20:00"
-    TEST_LEVEL: L0
-
-resume.checkpoint.bert.345m_tp1_pp2_1node:
-  <<: *selene-test-resume-checkpoint-launcher
-  variables:
-    <<: [*VARS]
-    RUN_MODEL: bert
-    TP_SIZE: 1
-    PP_SIZE: 2
-    NUM_NODES: 1
-    TIME_LIMIT: "30:00"
-    TEST_LEVEL: L0
-
-cleanup.selene:
-  tags:
-    - ssh_selene_runner
-  stage: cleanup
-  variables:
-    <<: [*VARS]
-  script:
-    - set +e
-    - NUM_CLEANUP=`find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | grep -v data | wc -l`
-    - find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | grep -v data | xargs rm -rf
-    - echo "Finished cleaning $NUM_CLEANUP directories older than 20 days everything in Selene"
-  allow_failure: true
-  rules:
-    - when: always
diff --git a/toolbox/Megatron-DeepSpeed/CODEOWNERS b/toolbox/Megatron-DeepSpeed/CODEOWNERS
deleted file mode 100644
index 60a921d7f6d586692195e89dba62a54f7d8ec23d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/CODEOWNERS
+++ /dev/null
@@ -1 +0,0 @@
-*       @jeffra @samyam @tjruwase @ShadenSmith @conglongli @awan-10 @cli99 @eltonzheng @minjiaz @RezaYazdaniAminabadi @duli2012 @mrwyattii @yaozhewei @arashb @xiaoxiawu-microsoft @guanhuawang
diff --git a/toolbox/Megatron-DeepSpeed/ILUVATAR.md b/toolbox/Megatron-DeepSpeed/ILUVATAR.md
deleted file mode 100644
index ac140aa2248e19ac9b1e7d66ecf0abffe727321c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/ILUVATAR.md
+++ /dev/null
@@ -1,99 +0,0 @@
-### 1. Install
-
-```
-bash clean_megatron-deepspeed.sh
-bash build_megatron-deepspeed.sh
-bash install_megatron-deepspeed.sh
-```
-
-### 2. CI Test
-
-#### 2.1 Test node = 1
-
-```
-cd ci && bash run_ci_tests_one_node.sh
-```
-
-#### 2.2 Test node >= 2
-
-First, you should make sure something below.
-
-1. The CI Test in 1 node can pass in master node container.
-2. Copy master node container environment to other node servers.
-3. Make sure the account name, contrainer name is the same in different node servers.
-4. Set up password free login between the master node container and other node servers.
-
-Second, set your node server info. You can set up like:
-
-```
-## The account in server
-export HOST_NAME="username"
-
-## Severs IP, begin with the master node server IP, and split by ","
-export ADDR_ARRAY="10.111.222.1,10.111.222.2"
-
-## Container name
-export CONTAINER_NAME="megatron-deepspeed"
-```
-
-Third, run.
-
-```
-cd ci && bash run_ci_tests_multi_node.sh
-```
-
-### 3. Run Aquila-7b bf16 pretrain
-
-#### 3.1 Download Dataset
-
-```
-bash dataset/download_dataset.sh
-bash dataset/download_vocab.sh
-```
-
-#### 3.2 Run node=1
-
-```
-cd examples/aquila && bash run_aquila_7b_node1_bf16.sh
-```
-
-#### 3.3 Run node=2
-
-First, you should make sure something below.
-
-1. The pretrain in 1 node run successfully in master node container.
-2. Copy master node container environment to other node servers.
-3. Make sure the account name, contrainer name is the same in different node servers.
-4. Set up password free login between the master node container and other node servers.
-5. Make megatron-deepspeed repo and dataset at same path in different node servers.
-
-Second, set your node server info. You can set up like:
-
-```
-## The account in server
-export HOST_NAME="username"
-
-## Severs IP, begin with the master node server IP, and split by ","
-export ADDR_ARRAY="10.111.222.1,10.111.222.2"
-
-## Container name
-export CONTAINER_NAME="megatron-deepspeed"
-```
-
-Third, run.
-
-```
-cd examples/aquila && bash run_aquila_7b_node2_bf16.sh
-```
-
-### 4. Data preprocess
-
-If you have an your own  jsonl text dataset, and you want to use it in Megatron-Deepspeed, you can do something to change it to proper format.
-
-For example, look file `megatron-deepspeed/dataset/convert_llama2tokenizer_dataset.sh` , you need to change some main parameter by your own, there contain:
-
-* tokenizer-type
-* tokenizer-model or (vocab-file and merge-file）
-* json-keys
-* input
-* output-prefix
diff --git a/toolbox/Megatron-DeepSpeed/LICENSE b/toolbox/Megatron-DeepSpeed/LICENSE
deleted file mode 100644
index 281fde95a677d5faceb5e2cf6a69fcdf67cc0c33..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/LICENSE
+++ /dev/null
@@ -1,376 +0,0 @@
-The following applies to all files unless otherwise noted:
-
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---
-
-This repository also contains code from Hugging Face Inc., Google Research,
-Facebook (from their Fairseq and Dino projects), Microsoft(from their 
-Swin-Transformer project)and Philip Popien. Files from these 
-organizations have notices at  the top of each file. Below are 
-licenses used in those files, as indicated.
-
-
-------------- LICENSE FOR Facebook, huggingface and Google Research code  --------------
-
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-
-------------- LICENSE FOR Facebook Fairseq code --------------
-
-MIT License
-
-Copyright (c) Facebook, Inc. and its affiliates.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-------------- LICENSE FOR Mircrosoft Swin transformer code --------------
-
-MIT License
-
-Copyright (c) Microsoft Corporation.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE
-
-
---------------- NVIDIA Source Code License for SegFormer -----------------
-1. Definitions
-
-“Licensor” means any person or entity that distributes its Work.
-
-“Software” means the original work of authorship made available under this
-License.
-
-“Work” means the Software and any additions to or derivative works of the
-Software that are made available under this License.
-
-The terms “reproduce,” “reproduction,” “derivative works,” and 
-“distribution” have the meaning as provided under U.S. copyright law;
-provided, however, that for the purposes of this License, derivative works
-shall not include works that remain separable from, or merely link 
-(or bind by name) to the interfaces of, the Work.
-
-Works, including the Software, are “made available” under this License by 
-including in or with the Work either (a) a copyright notice referencing 
-the applicability of this License to the Work, or (b) a copy of this License.
-
-2. License Grant
-
-2.1 Copyright Grant. Subject to the terms and conditions of this License,
-each Licensor grants to you a perpetual, worldwide, non-exclusive, 
-royalty-free, copyright license to reproduce, prepare derivative works of, 
-publicly  display, publicly perform, sublicense and distribute its Work 
-and any resulting derivative works in any form.
-
-3. Limitations
-
-3.1 Redistribution. You may reproduce or distribute the Work only if 
-(a) you do so under this License, (b) you include a complete copy of this 
-License with your distribution, and (c) you retain without modification any
-copyright, patent, trademark, or attribution notices that are present
-in the Work.
-
-3.2 Derivative Works. You may specify that additional or different terms 
-apply to the use, reproduction, and distribution of your derivative works 
-of the Work (“Your Terms”) only if (a) Your Terms provide that the use
-limitation in Section 3.3 applies to your derivative works, and (b) you 
-identify the specific derivative works that are subject to Your Terms. 
-Notwithstanding Your Terms, this License (including the redistribution
-requirements in Section 3.1) will continue to apply to the Work itself.
-
-3.3 Use Limitation. The Work and any derivative works thereof only may 
-be used or intended for use non-commercially. Notwithstanding the 
-foregoing, NVIDIA and its affiliates may use the Work and any derivative
-works commercially. As used herein, “non-commercially” means for research 
-or evaluation purposes only.
-
-3.4 Patent Claims. If you bring or threaten to bring a patent claim against 
-any Licensor (including any claim, cross-claim or counterclaim in a lawsuit) 
-to enforce any patents that you allege are infringed by any Work, then
-your rights under this License from such Licensor (including the grant 
-in Section 2.1) will terminate immediately.
-
-3.5 Trademarks. This License does not grant any rights to use any Licensor’s 
-or its affiliates’ names, logos, or trademarks, except as necessary to 
-reproduce the notices described in this License.
-
-3.6 Termination. If you violate any term of this License, then your rights 
-under this License (including the grant in Section 2.1) will terminate 
-immediately.
-
-4. Disclaimer of Warranty.
-
-THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
-EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF 
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT.
-YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE.
-
-5. Limitation of Liability.
-
-EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL 
-THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE 
-SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
-INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT 
-OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK 
-(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
-LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER 
-COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN
-ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
-
-
diff --git a/toolbox/Megatron-DeepSpeed/MANIFEST.in b/toolbox/Megatron-DeepSpeed/MANIFEST.in
deleted file mode 100644
index b73769311f16c1b5a2a144cc45695490fbf4f2b1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/MANIFEST.in
+++ /dev/null
@@ -1,5 +0,0 @@
-include megatron_ds/data/Makefile
-include megatron_ds/data/helpers.cpp
-include megatron_ds/core/datasets/Makefile
-include megatron_ds/core/datasets/helpers.cpp
-recursive-include megatron_ds/fused_kernels *.cpp *.h *.cu *.tr *.cuh *.cc
diff --git a/toolbox/Megatron-DeepSpeed/README.md b/toolbox/Megatron-DeepSpeed/README.md
deleted file mode 100644
index 976f30b923235bc4897c53ef6f8e08affe55fb73..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/README.md
+++ /dev/null
@@ -1,530 +0,0 @@
-Megatron ([1](https://arxiv.org/pdf/1909.08053.pdf), [2](https://arxiv.org/pdf/2104.04473.pdf), and [3](https://arxiv.org/pdf/2205.05198)) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository is for ongoing research related to training large transformer language models at scale. We developed efficient, model-parallel ([tensor](https://arxiv.org/pdf/1909.08053.pdf), [sequence](https://arxiv.org/pdf/2205.05198), and [pipeline](https://arxiv.org/pdf/2104.04473.pdf)), and multi-node pre-training of transformer based models such as [GPT](https://arxiv.org/abs/2005.14165), [BERT](https://arxiv.org/pdf/1810.04805.pdf), and [T5](https://arxiv.org/abs/1910.10683) using mixed precision.
-
-Below are some of the projects where we have directly used Megatron:
-* [BERT and GPT Studies Using Megatron](https://arxiv.org/pdf/1909.08053.pdf)
-* [BioMegatron: Larger Biomedical Domain Language Model](https://www.aclweb.org/anthology/2020.emnlp-main.379.pdf)
-* [End-to-End Training of Neural Retrievers for Open-Domain Question Answering](https://arxiv.org/abs/2101.00408)
-* [Large Scale Multi-Actor Generative Dialog Modeling](https://www.aclweb.org/anthology/2020.acl-main.8.pdf)
-* [Local Knowledge Powered Conversational Agents](https://arxiv.org/abs/2010.10150)
-* [MEGATRON-CNTRL: Controllable Story Generation with External Knowledge Using Large-Scale Language Models](https://www.aclweb.org/anthology/2020.emnlp-main.226.pdf)
-* [RACE Reading Comprehension Dataset Leaderboard](http://www.qizhexie.com/data/RACE_leaderboard.html)
-* [Training Question Answering Models From Synthetic Data](https://www.aclweb.org/anthology/2020.emnlp-main.468.pdf)
-* [Few-shot Instruction Prompts for Pretrained Language Models to Detect Social Biases](https://arxiv.org/abs/2112.07868)
-* [Exploring the Limits of Domain-Adaptive Training for Detoxifying Large-Scale Language Models](https://arxiv.org/abs/2202.04173)
-* [Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model](https://arxiv.org/abs/2201.11990)
-* [Multi-Stage Prompting for Knowledgeable Dialogue Generation](https://arxiv.org/abs/2203.08745)
-* [Evaluating Parameter Efficient Learning for Generation](https://aclanthology.org/2022.emnlp-main.319.pdf)
-
-Megatron is also used in [NeMo Megatron](https://developer.nvidia.com/nvidia-nemo#nemo-megatron), a framework to help enterprises overcome the challenges of building and training sophisticated natural language processing models with billions and trillions of parameters.
-
-Our codebase is capable of efficiently training very large (hundreds of billions of parameters) language models with both model and data parallelism. To demonstrate how the code scales with multiple GPUs and model sizes, we consider GPT models from 1 billion all the way to 1 trillion parameters. All models use a vocabulary size of 51,200 and a sequence length of 2048. We vary hidden size, number of attention heads, and number of layers to arrive at a specific model size. As the model size increases, we also modestly increase the batch size. We leverage [NVIDIA's Selene supercomputer](https://www.top500.org/system/179842/) to perform scaling studies and use up to 3072 [A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for the largest model. Each cluster node has 8 NVIDIA 80GB A100 GPUs. The graph below shows that we scale nearly linear up to 1 trillion parameter models running on 3072 GPUs. Note that these results are from benchmark runs and these models were not trained to convergence; however, the FLOPs are measured for end-to-end training, i.e., includes all operations including data loading, optimization, and even logging.
-
-![Scaling Graph](images/Achieved_petaFLOPs.png)
-
-The following table shows both model (MFU) and hardware (HFU) FLOPs utilization for select configurations up to 1T parameters (see [our paper](https://arxiv.org/pdf/2205.05198) for a description of how these are calculated). As the model size increases, we achieve better GPU utilization. For the one trillion parameter model, we reach a MFU and HFU of 56.3% and 57.0%, respectively. Note that these numbers are also measured on benchmark runs and in this case are measured using a data parallel size of one. Data parallelism introduces some overhead due to the gradient all-reduce required between the data parallel groups. However, for large transformer models, this overhead is not large and can almost entirely eliminated by overlapping the gradient all-reduce with backpropagation.
-
-| Model Size | Model FLOPs Utilization | Hardware FLOPs Utilization |
-| :---: | :---: | :---: |
-| 22B   | 41.5% | 43.7% |
-| 175B  | 51.4% | 52.8% |
-| 530B  | 56.0% | 57.0% |
-| 1T    | 56.3% | 57.0% |
-
-# Contents
-   * [Contents](#contents)
-   * [Setup](#setup)
-      * [Downloading Checkpoints](#downloading-checkpoints)
-   * [Usage](#usage)
-   * [Training](#training)
-      * [Data Preprocessing](#data-preprocessing)
-      * [BERT Pretraining](#bert-pretraining)
-      * [GPT Pretraining](#gpt-pretraining)
-      * [T5 Pretraining](#t5-pretraining)
-      * [Distributed Pretraining](#distributed-pretraining)
-      * [Activation Checkpointing and Recomputation](#activation-checkpointing-and-recomputation)
-      * [Distributed Optimizer](#distributed-optimizer)
-      * [FlashAttention](#flashattention)
-      * [GPT-3 Example](#gpt-3-example)
-      * [Retro](#retro)
-   * [Evaluation and Tasks](#evaluation-and-tasks)
-      * [GPT Text Generation](#gpt-text-generation)
-      * [GPT Evaluation](#gpt-evaluation)
-         * [WikiText Perplexity Evaluation](#wikitext-perplexity-evaluation)
-         * [LAMBADA Cloze Accuracy](#lambada-cloze-accuracy)
-      * [BERT Task Evaluation](#bert-task-evaluation)
-         * [RACE Evaluation](#race-evaluation)
-         * [MNLI Evaluation](#mnli-evaluation)
-      * [Llama-2 Inference and Finetuning](#llama-2-inference-and-finetuning)
-   * [Datasets](#datasets)
-      * [Collecting Wikipedia Training Data](#collecting-wikipedia-training-data)
-      * [Collecting GPT Webtext Data](#collecting-gpt-webtext-data)
-   * [Reproducibility](#reproducibility)
-
-# Setup
-We strongly recommend using the latest release of [NGC's PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) with DGX nodes. If you can't use this for some reason, use the latest pytorch, cuda, nccl, and NVIDIA [APEX](https://github.com/NVIDIA/apex#quick-start) releases.  Data preprocessing requires [NLTK](https://www.nltk.org/install.html), though this is not required for training, evaluation, or downstream tasks.
-
-You can launch an instance of the PyTorch container and mount Megatron, your dataset, and checkpoints with the following Docker commands:
-```
-docker pull nvcr.io/nvidia/pytorch:xx.xx-py3
-docker run --gpus all -it --rm -v /path/to/megatron:/workspace/megatron -v /path/to/dataset:/workspace/dataset -v /path/to/checkpoints:/workspace/checkpoints nvcr.io/nvidia/pytorch:xx.xx-py3
-```
-
-## Downloading Checkpoints
-We have provided pretrained [BERT-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_bert_345m) and [GPT-345M](https://ngc.nvidia.com/catalog/models/nvidia:megatron_lm_345m) checkpoints to evaluate or for finetuning downstream tasks. To access these checkpoints, first [sign up](https://ngc.nvidia.com/signup) for and [setup](https://ngc.nvidia.com/setup/installers/cli) the NVIDIA GPU Cloud (NGC) Registry CLI. Further documentation for downloading models can be found in the [NGC documentation](https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1).
-
-Alternatively, you can directly download the checkpoints using:
-
-<pre>
-BERT-345M-uncased: wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip -O megatron_bert_345m_v0.1_uncased.zip
-BERT-345M-cased: wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O megatron_bert_345m_v0.1_cased.zip
-GPT-345M: wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_lm_345m_v0.0.zip
-</pre>
-
-The models require vocabulary files to run. The BERT  WordPiece vocab file can be extracted from Google's pretrained BERT models: [uncased](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt), [cased](https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt). The GPT [vocab file](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json) and [merge table](https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt) can be downloaded directly.
-
-# Usage
-
-After installation, there are several possible workflows. The most comprehensive is:
-1. Data preprocessing
-2. Pretraining
-3. Finetuning (Optional for zero-shot tasks)
-4. Downstream task evaluation or text generation
-
-However, steps 1 and 2 can be replaced by using one of the pretrained models mentioned above.
-
-We've provided several scripts for pretraining both BERT and GPT in the [`examples`](./examples) directory, as well as scripts for both zero-shot and fine-tuned downstream tasks including MNLI, RACE, WikiText103, and LAMBADA evaluation. There is also a script for GPT interactive text generation.
-
-# Training
-## Data Preprocessing
-The training data requires preprocessing. First, place your training data in a loose json format, with one json containing a text sample per line. For example:
-<pre>
-{"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0", "title": "First Part"}
-{"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"}
-</pre>
-
-The name of the `text` field of the json can be changed by using the `--json-key` flag in [`preprocess_data.py`](./tools/preprocess_data.py) The other metadata are optional and are not used in training.
-
-The loose json is then processed into a binary format for training. To convert the json into mmap format use `preprocess_data.py`. An example script to prepare data for BERT training is:
-<pre>
-python tools/preprocess_data.py \
-       --input my-corpus.json \
-       --output-prefix my-bert \
-       --vocab-file bert-vocab.txt \
-       --tokenizer-type BertWordPieceLowerCase \
-       --split-sentences \
-       --workers 5
-</pre>
-
-The output will be two files named, in this case, `my-bert_text_sentence.bin` and `my-bert_text_sentence.idx`. The `--data-path` specified in later BERT training is the full path and new filename, but without the file extension.
-
-For T5 use the same preprocessing as BERT, perhaps renaming it to:
-<pre>
-       --output-prefix my-t5 \
-</pre>
-
-Some minor modifications are required for GPT data preprocessing, namely, the addition of a merge table, an end-of-document token, removal of sentence splitting, and a change to the tokenizer type:
-<pre>
-python tools/preprocess_data.py \
-       --input my-corpus.json \
-       --output-prefix my-gpt2 \
-       --vocab-file gpt2-vocab.json \
-       --tokenizer-type GPT2BPETokenizer \
-       --merge-file gpt2-merges.txt \
-       --append-eod \
-       --workers 5
-</pre>
-
-Here the output files are named `my-gpt2_text_document.bin` and `my-gpt2_text_document.idx`. As before, in GPT training, use the longer name without the extension as `--data-path`.
-
-Further command line arguments are described in the source file [`preprocess_data.py`](./tools/preprocess_data.py).
-
-## BERT Pretraining
-
-
-The [`examples/pretrain_bert.sh`](./examples/pretrain_bert.sh) script runs single GPU 345M parameter BERT pretraining. Debugging is the primary use for single GPU training, as the code base and command line arguments are optimized for highly distributed training. Most of the arguments are fairly self-explanatory. By default, the learning rate decays linearly over the training iterations starting at `--lr` to a minimum set by `--min-lr` over `--lr-decay-iters` iterations. The fraction of training iterations used for warmup is set by `--lr-warmup-fraction`. While this is single GPU training, the batch size specified by `--micro-batch-size` is a single forward-backward path batch-size and the code will perform gradient accumulation steps until it reaches `global-batch-size` which is the batch size per iteration. The data is partitioned into a 949:50:1 ratio for training/validation/test sets (default is 969:30:1). This partitioning happens on the fly, but is consistent across runs with the same random seed (1234 by default, or specified manually with `--seed`). We use `train-iters` as the training iterations requested. Alternatively, one can provide `--train-samples` which is total number of samples to train on. If this option is present, then instead of providing `--lr-decay-iters`, one will need to provide `--lr-decay-samples`.
-
-The logging, checkpoint-saving, and evaluation interval options are specified. Note that the `--data-path` now includes the additional `_text_sentence` suffix added in preprocessing, but does not include the file extensions.
-
-Further command line arguments are described in the source file [`arguments.py`](./megatron/arguments.py).
-
-To run `examples/pretrain_bert.sh`, make any desired modifications including setting the environment variables for `CHECKPOINT_PATH`, `VOCAB_FILE`, and `DATA_PATH`. Make sure to set these variables to their paths in the container. Then launch the container with Megatron and necessary paths mounted (as explained in [Setup](#setup)) and run the example script.
-
-## GPT Pretraining
-
-The `examples/pretrain_gpt.sh` script runs single GPU 345M parameter GPT pretraining. As mentioned above, single GPU training is primarily intended for debugging purposes, as the code is optimized for distributed training.
-
-It follows largely the same format as the previous BERT script with a few notable differences: the tokenization scheme used is BPE (which requires a merge table and a `json` vocabulary file) instead of WordPiece, the model architecture allows for longer sequences (note that the max position embedding must be greater than or equal to the maximum sequence length), and the `--lr-decay-style` has been set to cosine decay.  Note that the `--data-path` now includes the additional `_text_document` suffix added in preprocessing, but does not include the file extensions.
-
-Further command line arguments are described in the source file [`arguments.py`](./megatron/arguments.py).
-
-`examples/pretrain_gpt.sh` can be launched the same way as described for BERT. Set the env vars and make any other modifications, launch the container with appropriate mounts, and run the script.
-
-## T5 Pretraining
-
-Very similar to BERT and GPT, the `examples/pretrain_t5.sh` script runs single GPU "base" (~220M parameter) T5 pretraining. The primary difference from BERT and GPT is the addition of the following arguments to accommodate the T5 architecture:
-
-* `--kv-channels` sets the inner dimension of the "key" and "value" matrices of all attention mechanisms in the model. For BERT and GPT this defaults to the hidden size divided by the number of attention heads, but can be configured for T5.
-
-* `--ffn-hidden-size` sets the hidden size in the feed-forward networks within a transformer layer. For BERT and GPT this defaults to 4 times the transformer hidden size, but can be configured for T5.
-
-* `--encoder-seq-length` and `--decoder-seq-length` set the sequence length for the encoder and decoder separately.
-
-All of the other arguments remain as they were for BERT and GPT pretraining. Run this example with the same steps described above for the other scripts.
-
-## Distributed Pretraining
-
-The `examples/pretrain_{bert,gpt,t5}_distributed.sh` scripts use the PyTorch distributed launcher for distributed training. As such, multi-node training can be achieved by properly setting environment variables. See the official PyTorch [documentation](https://pytorch.org/docs/stable/elastic/run.html#launcher-api) for further description of these [environment variables](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization). By default, multi-node training uses the [nccl](https://developer.nvidia.com/nccl) distributed backend. A simple set of additional arguments and the use of the PyTorch distributed module with the `torchrun` elastic launcher (equivalent to `python -m torch.distributed.run`) are the only additional requirements to adopt distributed training. See any of `examples/pretrain_{bert,gpt,t5}_distributed.sh` for more details.
-
-We use two types of parallelism: data and model parallelism. We facilitate two distributed data parallel implementations: a simple one of our own that performs gradient all-reduce at the end of back propagation step, and Torch's distributed data parallel wrapper that overlaps gradient reduction with back propagation computation. To switch between these two options use `--DDP-impl local` or `--DDP-impl torch`, respectively. As expected, Torch distributed data parallelism is more efficient at larger model sizes. For example, for the 8.3 billion parameters model running on 512 GPUs, the scaling increases from 60% to 76% when Torch's distributed data parallel is used. However, the overlapping method requires more memory and for some configurations (e.g., 2.5 billion parameters using 2-way model parallel and 1.2 billion parameters with no model parallel) can make the overall training slower as a result. We empirically found that using a smaller model in those cases improves the training time.
-
-Second, we developed a simple and efficient two-dimensional model-parallel approach. To use the first dimension, tensor model parallelism (splitting execution of a single transformer module over multiple GPUs, see Section 3 of [our paper](https://arxiv.org/pdf/1909.08053.pdf)), add the `--tensor-model-parallel-size` flag to specify the number of GPUs among which to split the model, along with the arguments passed to the distributed launcher as mentioned above. To use the second dimension, sequence parallelism, specify `--sequence-parallel`, which also requires tensor model parallelism to be enabled because it splits across the same GPUs (more details in Section 4.2.2 of [our paper](https://arxiv.org/pdf/2205.05198.pdf)).
-
-To use pipeline model parallelism (sharding the transformer modules into stages with an equal number of transformer modules on each stage, and then pipelining execution by breaking the batch into smaller microbatches, see Section 2.2 of [our paper](https://arxiv.org/pdf/2104.04473.pdf)), use the `--pipeline-model-parallel-size` flag to specify the number of stages to split the model into (e.g., splitting a model with 24 transformer layers across 4 stages would mean each stage gets 6 transformer layers each).
-
-<!-- The number of microbatches in a per-pipeline minibatch is controlled by the `--num-microbatches-in-minibatch` argument. With `WORLD_SIZE` GPUs, `TENSOR_MP_SIZE` tensor-model-parallel size, `PIPELINE_MP_SIZE` pipeline-model-parallel-size, `WORLD_SIZE`/(`TENSOR_MP_SIZE` * `PIPELINE_MP_SIZE`) GPUs will be used for data parallelism. The default values for `--tensor-model-parallel-size` and `--pipeline-model-parallel-size` is 1, which will not implement either form of model parallelism. -->
-
-We have examples of how to use these two different forms of model parallelism the example scripts ending in `distributed_with_mp.sh`:
-
-Other than these minor changes, the distributed training is identical to the training on a single GPU.
-
-The interleaved pipelining schedule (more details in Section 2.2.2 of [our paper](https://arxiv.org/pdf/2104.04473.pdf)) can be enabled using the `--num-layers-per-virtual-pipeline-stage` argument, which controls the number of transformer layers in a virtual stage (by default with the non-interleaved schedule, each GPU will execute a single virtual stage with `NUM_LAYERS / PIPELINE_MP_SIZE` transformer layers). The total number of layers in the transformer model should be divisible by this argument value. Additionally, the number of microbatches in the pipeline (computed as `GLOBAL_BATCH_SIZE / (DATA_PARALLEL_SIZE * MICRO_BATCH_SIZE)`) should be divisible by the `PIPELINE_MP_SIZE` when using this schedule (this condition is checked in an assertion in the code). The interleaved schedule is not supported for pipelines with 2 stages (`PIPELINE_MP_SIZE=2`).
-
-## Activation Checkpointing and Recomputation
-
-To reduce GPU memory usage when training a large model, we support various forms of activation checkpointing and recomputation. Instead of all activations being stored in memory to be used during backprop, as was traditionally the case in deep learning models, only activations at certain "checkpoints" in the model are retained (or stored) in memory, and the other activations are recomputed on-the-fly when needed for backprop. Note that this kind of checkpointing, *activation* checkpointing, is very different from the checkpointing of model parameters and optimizer state, which is mentioned elsewhere.
-
-We support two levels of recompute granularity: `selective` and `full`. Selective recomputation is the default and is recommended in almost all cases. This mode retains in memory the activations that take less memory storage space and are more expensive to recompute and recomputes the activations that take more memory storage space but are relatively inexpensive to recompute. See [our paper](https://arxiv.org/pdf/2205.05198) for details. You should find that this mode maximizes performance while minimizing the memory required to store activations. To enable selective activation recompute simply use `--recompute-activations`.
-
-For cases where memory is very limited, `full` recompute saves just the inputs to a transformer layer, or a group, or block, of transformer layers, and recomputes everything else. To enable full activation recompute use `--recompute-granularity full`. When using `full` activation recompute, there are two methods: `uniform` and `block`, chosen using the `--recompute-method` argument.
-
-* The `uniform` method uniformly divides the transformer layers into groups of layers (each group of size `--recompute-num-layers`) and stores the input activations of each group in memory. The baseline group size is 1 and, in this case, the input activation of each transformer layer is stored. When the GPU memory is insufficient, increasing the number of layers per group reduces the memory usage, enabling a bigger model to be trained. For example, when `--recompute-num-layers` is set to 4, only the input activation of each group of 4 transformer layers is stored.
-
-* The `block` method recomputes the input activations of a specific number (given by `--recompute-num-layers`) of individual transformer layers per pipeline stage and stores the input activations of the remaining layers in the pipeline stage. Reducing `--recompute-num-layers` results in storing the input activations to more transformer layers, which reduces the activation recomputation required in the backprop, thus improving training performance while increasing memory usage. For example, when we specify 5 layers to recompute of 8 layers per pipeline stage, the input activations of only the first 5 transformer layers are recomputed in the backprop step while the input activations for the final 3 layers are stored. `--recompute-num-layers` can be incrementally increased until the amount of memory storage space required is just small enough to fit in the available memory, thereby both maximally utilizing memory and maximizing performance.
-
-
-## Distributed Optimizer
-
-Usage: `--use-distributed-optimizer`. Compatible with all model and data types.
-
-The distributed optimizer is a memory savings technique, whereby the optimizer state is evenly distributed across data parallel ranks (versus the traditional method of replicating the optimizer state across data parallel ranks). As described in [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054), our implementation distributes all optimizer state that does not overlap with the model state. For example, when using fp16 model params, the distributed optimizer maintains its own separate copy of fp32 main params & grads, which are distributed across DP ranks. When using bf16 model params, however, the distributed optimizer's fp32 main grads are the same as the model's fp32 grads, and so the grads in this case are not distributed (although the fp32 main params are still distributed, as they are separate from the bf16 model params).
-
-Theoretical memory savings vary depending on the combination of the model's param dtype and grad dtype. In our implementation, the theoretical number of bytes per parameter is (where 'd' is the data parallel size):
-
-| | Non-distributed optim | Distributed optim |
-|-|-|-|
-| fp16 param, fp16 grads | 20 | 4 + 16/d |
-| bf16 param, fp32 grads | 18 | 6 + 12/d |
-| fp32 param, fp32 grads | 16 | 8 + 8/d |
-
-## FlashAttention
-
-Usage: `--use-flash-attn`. Support attention head dimensions at most 128.
-
-[FlashAttention](https://github.com/HazyResearch/flash-attention) is a fast and
-memory-efficient algorithm to compute exact attention. It speeds up model
-training and reduces memory requirement.
-
-To install FlashAttention:
-```sh
-pip install flash-attn
-```
-
-## GPT-3 Example
-
-In `examples/pretrain_gpt3_175B.sh` we have provided an example of how to configure Megatron to train [GPT-3](https://arxiv.org/abs/2005.14165) with 175 billion parameters on 1024 GPUs. The script is designed for [slurm](https://slurm.schedmd.com/documentation.html) with [pyxis](https://github.com/NVIDIA/pyxis) plugin but can be easily adopted to any other scheduler. It uses 8-way tensor parallelism and 16-way pipeline parallelism. With options `global-batch-size 1536` and `rampup-batch-size 16 16 5859375`, the training will start with global batch size 16 and linearly increase the global batch size to 1536 over 5,859,375 samples with incremental steps 16. The training dataset can be either a single set or a multiple datasets combined with a set of weights.
-
-With full global batch size of 1536 on 1024 A100 GPUs, each iteration takes around 32 seconds resulting in 138 teraFLOPs per GPU which is 44% of the theoretical peak FLOPs.
-
-
-## Retro
-
-See:
-
-- `tools/retro/README.md` for an overview.
-- `tools/retro/examples/get_preprocess_cmd.sh` for an example of common preprocessing arguments.
-- `tools/retro/examples/preprocess_data.sh` for an example of how to preprocess data.
-- `tools/retro/examples/pretrain_model.sh` for an example of how to pretrain a model.
-
-Retro is a retrieval-enhanced model that is based on GPT. As described in [Improving language models by retrieving from trillions of tokens](https://arxiv.org/abs/2112.04426), Retro retrieves from a database of document chunks by performing locality search using a sample's tokens. The retrieval database can be large -- often billions or even trillions of tokens -- and provides a more efficient storage mechanism of factual knowledge, when compared to storing factual knowledge implicitly within the network's parameters.
-
-Using Retro requires two steps: 1) preprocessing the retrieval database and pretraining neighbors, and 2) pretraining a model using this data. Please see `tools/retro/README.md` for a detailed overview.
-
-<!--
-## REALM Pipeline
-We are working on implementing the [REALM](https://arxiv.org/pdf/2002.08909.pdf) system. The following sections (will) reflect the three stages of training it. For now it's just the ICT code.
-Loosely, they are pretraining the retriever modules, then jointly training the language model and the retriever, and then finetuning a question answering head on the language model with fixed retriever.
-
-### Inverse Cloze Task (ICT) Pretraining
-1. Have a corpus in loose JSON format with the intention of creating a collection of fixed-size blocks of text as the fundamental units of data. For a corpus like Wikipedia, this will mean multiple sentences per block but also multiple blocks per document.
-Run `tools/preprocess_data.py` to construct one or more indexed datasets with the `--split-sentences` argument to make sentences the basic unit. For the original REALM system, we construct two datasets, one with the title of every document, and another with the body.
-Refer to the following script
-<pre>
-python preprocess_data.py \
-    --input /path/to/corpus.json \
-    --json-keys text title \
-    --split-sentences \
-    --tokenizer-type BertWordPieceLowerCase \
-    --vocab-file /path/to/vocab.txt \
-    --output-prefix corpus_indexed \
-    --workers 5  # works well for 10 CPU cores. Scale up accordingly.
-</pre>
-
-2. Use a custom samples mapping function in place of `megatron/data/realm_dataset_utils.get_block_samples_mapping` if required. To do this, you will need to implement a new function in C++ inside of `megatron/data/helpers.cpp`. The samples mapping data structure is used to select the data that will constitute every training sample in advance of the training loop.
- The samples mapping is responsible for holding all of the required metadata needed to construct the sample from one or more indexed datasets. In REALM, the samples mapping contains the start and end sentence indices, as well as the document index (to find the correct title for a body) and a unique ID for every block.
-3. Pretrain a BERT language model using `pretrain_bert.py`, with the sequence length equal to the block size in token ids. This model should be trained on the same indexed dataset that is used to supply the blocks for the information retrieval task.
-In REALM, this is an uncased bert base model trained with the standard hyperparameters.
-4. Use `pretrain_ict.py` to train an `ICTBertModel` which uses two BERT-based encoders to encode queries and blocks to perform retrieval with.
-The script below trains the ICT model from REALM. It references a pretrained BERT model (step 3) in the `--bert-load` argument. The batch size used in the paper is 4096, so this would need to be run with data parallel world size 32.
-<pre>
-python pretrain_ict.py \
-    --num-layers 12 \
-    --num-attention-heads 12 \
-    --hidden-size 768 \
-    --batch-size 128 \
-    --seq-length 256 \
-    --max-position-embeddings 256 \
-    --ict-head-size 128 \
-    --train-iters 100000 \
-    --bert-load /path/to/pretrained_bert \
-    --load checkpoints \
-    --save checkpoints \
-    --data-path /path/to/indexed_dataset \
-    --titles-data-path /path/to/titles_indexed_dataset \
-    --vocab-file /path/to/vocab.txt \
-    --lr 0.0001 \
-    --num-workers 2 \
-    --lr-decay-style linear \
-    --weight-decay 1e-2 \
-    --clip-grad 1.0 \
-    --lr-warmup-fraction .01 \
-    --save-interval 3000 \
-    --query-in-block-prob 0.1 \
-    --fp16
-
-</pre>
-
-### Building an Index of Block Embeddings
-After having trained an ICT model, you can now embed an entire dataset of blocks by creating a `BlockData` structure. After that has been saved, you can load it
-and wrap it with a `FaissMIPSIndex` to do fast similarity search which is key in the learned information retrieval pipeline. The initial index can be built with the following script, meant to be run in an interactive session. It can leverage multiple GPUs on multiple nodes to index large datasets much more quickly.
-
-<pre>
-python tools/create_doc_index.py \
-    --num-layers 12 \
-    --hidden-size 768 \
-    --ict-head-size 128 \
-    --num-attention-heads 12 \
-    --batch-size 128 \
-    --seq-length 256 \
-    --max-position-embeddings 256 \
-    --ict-load /path/to/pretrained_ict \
-    --data-path /path/to/indexed_dataset \
-    --titles-data-path /path/to/titles_indexed_dataset \
-    --block-data-path embedded_blocks.pkl \
-    --indexer-log-interval 1000 \
-    --indexer-batch-size 128 \
-    --vocab-file /path/to/vocab.txt \
-    --num-workers 2 \
-    --fp16
-</pre>
-
--->
-
-# Evaluation and Tasks
-
-We provide several command line arguments, detailed in the scripts listed below, to handle various zero-shot and fine-tuned downstream tasks. However, you can also finetune your model from a pretrained checkpoint on other corpora as desired. To do so, simply add the `--finetune` flag and adjust the input files and training parameters within the original training script. The iteration count will be reset to zero, and the optimizer and internal state will be reinitialized. If the fine-tuning is interrupted for any reason, be sure to remove the `--finetune` flag before continuing, otherwise the training will start again from the beginning.
-
-Because evaluation requires substantially less memory than training, it may be advantageous to merge a model trained in parallel for use on fewer GPUs in downstream tasks. The following script accomplishes this. This example reads in a GPT model with 4-way tensor and 4-way pipeline model parallelism and writes out a model with 2-way tensor and 2-way pipeline model parallelism.
-
-<pre>
-python tools/checkpoint/util.py \
-        --model-type GPT \
-        --load-dir checkpoints/gpt3_tp4_pp4 \
-        --save-dir checkpoints/gpt3_tp2_pp2 \
-        --target-tensor-parallel-size 2 \
-        --target-pipeline-parallel-size 2
-
-</pre>
-
-Several downstream tasks are described for both GPT and BERT models below. They can be run in distributed and model parallel modes with the same changes used in the training scripts.
-
-## GPT Text Generation
-
-We have included a simple REST server to use for text generation in `tools/run_text_generation_server.py`. You run it much like you would start a pretraining job, specifying an appropriate pretrained checkpoint. There are also few optional parameters: `temperature`, `top-k`and `top-p`. See `--help` or the source file for more information. See [examples/run_text_generation_server_345M.sh](examples/run_text_generation_server_345M.sh) for an example of how to run the server.
-
-Once the server is running you can use `tools/text_generation_cli.py` to query it, it takes one argument which is the host the server is running on.
-
-<pre>
-tools/text_generation_cli.py localhost:5000
-</pre>
-
-You can also use CURL or any other tools to query the server directly:
-
-<pre>
-curl 'http://localhost:5000/api' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8'  -d '{"prompts":["Hello world"], "tokens_to_generate":1}'
-</pre>
-
-See [megatron/text_generation_server.py](megatron/text_generation_server.py) for more API options.
-
-### Detoxify GPT via Self-generation
-We include an example in `examples/detxoify_lm/` to detoxify language models by leveraging the generative power of language models.
-
-See [examples/detxoify_lm/README.md](examples/detxoify_lm/README.md) for step-by-step tutorials on how to perform domain-adaptive training and detoxify LM using self-generated corpus.
-
-
-## GPT Evaluation
-We include example scripts for GPT evaluation on WikiText perplexity evaluation and LAMBADA Cloze accuracy.
-
-### WikiText Perplexity Evaluation
-For even comparison with prior works, we evaluate perplexity on the word-level [WikiText-103 test dataset](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip), and appropriately compute perplexity given the change in tokens when using our subword tokenizer.
-
-We use the following command to run WikiText-103 evaluation on a 345M parameter model.
-<pre>
-TASK="WIKITEXT103"
-
-VALID_DATA=&#60;wikitext path&#62;.txt
-VOCAB_FILE=gpt2-vocab.json
-MERGE_FILE=gpt2-merges.txt
-CHECKPOINT_PATH=checkpoints/gpt2_345m
-
-COMMON_TASK_ARGS="--num-layers 24 \
-                  --hidden-size 1024 \
-                  --num-attention-heads 16 \
-                  --seq-length 1024 \
-                  --max-position-embeddings 1024 \
-                  --fp16 \
-                  --vocab-file $VOCAB_FILE"
-
-python tasks/main.py \
-       --task $TASK \
-       $COMMON_TASK_ARGS \
-       --valid-data $VALID_DATA \
-       --tokenizer-type GPT2BPETokenizer \
-       --merge-file $MERGE_FILE \
-       --load $CHECKPOINT_PATH \
-       --micro-batch-size 8 \
-       --log-interval 10 \
-       --no-load-optim \
-       --no-load-rng
-</pre>
-
-
-### LAMBADA Cloze Accuracy
-To compute LAMBADA cloze accuracy (the accuracy of predicting the last token given the preceding tokens) we utilize a detokenized, processed version of the [LAMBADA dataset](https://github.com/cybertronai/bflm/blob/master/lambada_test.jsonl).
-
-We use the following command to run LAMBADA evaluation on a 345M parameter model. Note that the `--strict-lambada` flag should be used to require whole word matching. Ensure that `lambada` is part of the file path.
-
-<pre>
-TASK="LAMBADA"
-
-VALID_DATA=&#60;lambada path&#62;.json
-VOCAB_FILE=gpt2-vocab.json
-MERGE_FILE=gpt2-merges.txt
-CHECKPOINT_PATH=checkpoints/gpt2_345m
-COMMON_TASK_ARGS=&#60;same as those in <a href="#wikitext-perplexity-evaluation">WikiText Perplexity Evaluation</a> above&#62;
-
-python tasks/main.py \
-       --task $TASK \
-       $COMMON_TASK_ARGS \
-       --valid-data $VALID_DATA \
-       --tokenizer-type GPT2BPETokenizer \
-       --strict-lambada \
-       --merge-file $MERGE_FILE \
-       --load $CHECKPOINT_PATH \
-       --micro-batch-size 8 \
-       --log-interval 10 \
-       --no-load-optim \
-       --no-load-rng
-</pre>
-
-Further command line arguments are described in the source file [`main.py`](./tasks/main.py)
-
-## BERT Task Evaluation
-### RACE Evaluation
-The following script finetunes the BERT model for evaluation on the [RACE dataset](http://www.cs.cmu.edu/~glai1/data/race/). The `TRAIN_DATA` and `VALID_DATA` directory contain the RACE dataset as separate `.txt` files. Note that for RACE, the batch size is the number of RACE query's to evaluate. Since each RACE query has four samples, the effective batch size passed through the model will be four times the batch size specified on the command line.
-
-<pre>
-TRAIN_DATA="data/RACE/train/middle"
-VALID_DATA="data/RACE/dev/middle \
-            data/RACE/dev/high"
-VOCAB_FILE=bert-vocab.txt
-PRETRAINED_CHECKPOINT=checkpoints/bert_345m
-CHECKPOINT_PATH=checkpoints/bert_345m_race
-COMMON_TASK_ARGS="--num-layers 24 \
-                  --hidden-size 1024 \
-                  --num-attention-heads 16 \
-                  --seq-length 512 \
-                  --max-position-embeddings 512 \
-                  --fp16 \
-                  --vocab-file $VOCAB_FILE"
-
-COMMON_TASK_ARGS_EXT="--train-data $TRAIN_DATA \
-                      --valid-data $VALID_DATA \
-                      --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
-                      --save-interval 10000 \
-                      --save $CHECKPOINT_PATH \
-                      --log-interval 100 \
-                      --eval-interval 1000 \
-                      --eval-iters 10 \
-                      --weight-decay 1.0e-1"
-
-python tasks/main.py \
-       --task RACE \
-       $COMMON_TASK_ARGS \
-       $COMMON_TASK_ARGS_EXT \
-       --tokenizer-type BertWordPieceLowerCase \
-       --epochs 3 \
-       --micro-batch-size 4 \
-       --lr 1.0e-5 \
-       --lr-warmup-fraction 0.06
-</pre>
-
-### MNLI Evaluation
-The following script finetunes the BERT model for evaluation with the [MultiNLI sentence pair corpus](https://www.nyu.edu/projects/bowman/multinli/). Because the matching tasks are quite similar, the script can be quickly tweaked to work with the [Quora Question Pairs](https://www.kaggle.com/quora/question-pairs-dataset) (QQP) dataset as well.
-
-<pre>
-
-TRAIN_DATA="data/glue_data/MNLI/train.tsv"
-VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
-            data/glue_data/MNLI/dev_mismatched.tsv"
-PRETRAINED_CHECKPOINT=checkpoints/bert_345m
-VOCAB_FILE=bert-vocab.txt
-CHECKPOINT_PATH=checkpoints/bert_345m_mnli
-COMMON_TASK_ARGS=&#60;same as those in <a href="#race-evaluation">RACE Evaluation</a> above&#62;
-COMMON_TASK_ARGS_EXT=&#60;same as those in <a href="#race-evaluation">RACE Evaluation</a> above&#62;
-
-python tasks/main.py \
-       --task MNLI \
-       $COMMON_TASK_ARGS \
-       $COMMON_TASK_ARGS_EXT \
-       --tokenizer-type BertWordPieceLowerCase \
-       --epochs 5 \
-       --micro-batch-size 8 \
-       --lr 5.0e-5 \
-       --lr-warmup-fraction 0.065
-</pre>
-
-## Llama-2 Inference and Finetuning
-
-The Llama-2 [family of models](https://ai.meta.com/llama/) are an open-source set of pretrained & finetuned (for chat) models that have achieved strong results across a wide set of benchmarks. At the time of release, Llama-2 models achieved among the best results for open-source models, and were competitive with the closed-source GPT-3.5 model (see https://arxiv.org/pdf/2307.09288.pdf).
-
-The Llama-2 checkpoints can be loaded into Megatron for inference and finetuning. See documentation [here](docs/llama2.md).
-
-# Datasets
-We do not host any datasets for GPT or BERT training, however, we detail their collection so that our results may be reproduced.
-
-## Collecting Wikipedia Training Data
-We recommend following the Wikipedia data extraction process specified by Google research: "the recommended pre-processing is to download [the latest dump](https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2), extract the text with [WikiExtractor.py](https://github.com/attardi/wikiextractor), and then apply any necessary cleanup to convert it into plain text."
-
-We recommend using the `--json` argument when using WikiExtractor, which will dump the Wikipedia data into loose json format (one json object per line), making it more manageable on the file system and also readily consumable by our codebase. We recommend further preprocessing this json dataset with nltk punctuation standardization. For BERT training, use the `--split-sentences` flag to `preprocess_data.py` as described [above](#data-preprocessing) to include sentence breaks in the produced index. If you'd like to use Wikipedia data for GPT training you should still clean it with nltk/spacy/ftfy, but do not use the `--split-sentences` flag.
-
-## Collecting GPT Webtext Data
-We utilize the publicly available [OpenWebText](https://github.com/eukaryote31/openwebtext) library from [jcpeterson](https://github.com/jcpeterson/openwebtext) and [eukaryote31's](https://github.com/eukaryote31/openwebtext) work to download urls. We then filter, clean, and deduplicate all downloaded content according to the procedure described in our [openwebtext](./tools/openwebtext) directory. For reddit URLs corresponding to content up to October 2018 we arrived at approximately 37GB of content.
-
-# Reproducibility
-Megatron training is intended to be bitwise reproducible. This means that the same training config run twice in the same HW and SW environment should produce identical model checkpoints, losses and accuracy metric values (iteration time metrics may vary).
-
-There are currently two known Megatron optimizations that break reproducibility whilst still producing almost identical training runs. The following workarounds should be applied in cases where reproducibility is required:
-1. When training using `--bf16`, reproducbility is only obtained when the checkpointing and resume schedule of training is identical. If the checkpointing schedule will change, i.e. checkpointing and resume will occur at different iterations, the option `--no-bias-gelu-fusion` should be used.
-2. Flash attention is nondeterministic. If reproducibility is required do not use `--use-flash-attn`.
-
-These sources of nondeterminism are under active investigation. If you observe nondeterminism in Megatron training under other circumstances please open an issue.
diff --git a/toolbox/Megatron-DeepSpeed/README_RLHF.md b/toolbox/Megatron-DeepSpeed/README_RLHF.md
deleted file mode 100644
index 8acb68e0371375b8e7d17369a74ffe156323f2fd..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/README_RLHF.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# RLHF
-
-## Step 1: Install
-
-```
-bash build_megatron-deepspeed.sh && bash install_megatron-deepspeed.sh
-```
-
-## Step 2: Dataset
-
-Download dataset and convert it.
-
-```
-cd dataset && bash convert_dahoas.sh
-```
-
-## Step 3: Checkpoint
-
-Download and convert checkpoints.
-
-```
-cd checkpoints && bash download_rlhf_checkpoints.sh
-bash convert_hf_2_meg.sh
-```
-
-## Step 4: Train
-
-```
-cd examples/llama2
-bash run_llama2_7b_rlhf_node1.sh
-```
diff --git a/toolbox/Megatron-DeepSpeed/SECURITY.md b/toolbox/Megatron-DeepSpeed/SECURITY.md
deleted file mode 100644
index 869fdfe2b246991a053fab9cfec1bed3ab532ab1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/SECURITY.md
+++ /dev/null
@@ -1,41 +0,0 @@
-<!-- BEGIN MICROSOFT SECURITY.MD V0.0.7 BLOCK -->
-
-## Security
-
-Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
-
-If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
-
-## Reporting Security Issues
-
-**Please do not report security vulnerabilities through public GitHub issues.**
-
-Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
-
-If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
-
-You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 
-
-Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
-
-  * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
-  * Full paths of source file(s) related to the manifestation of the issue
-  * The location of the affected source code (tag/branch/commit or direct URL)
-  * Any special configuration required to reproduce the issue
-  * Step-by-step instructions to reproduce the issue
-  * Proof-of-concept or exploit code (if possible)
-  * Impact of the issue, including how an attacker might exploit the issue
-
-This information will help us triage your report more quickly.
-
-If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
-
-## Preferred Languages
-
-We prefer all communications to be in English.
-
-## Policy
-
-Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
-
-<!-- END MICROSOFT SECURITY.MD BLOCK -->
diff --git a/toolbox/Megatron-DeepSpeed/build_megatron-deepspeed.sh b/toolbox/Megatron-DeepSpeed/build_megatron-deepspeed.sh
deleted file mode 100644
index e566fec93570afafeb33340a31f4294dadff5789..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/build_megatron-deepspeed.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-PYTHON_PATH=$(which python3)
-
-echo "build megatronspeed"
-COREX_VERSION=${COREX_VERSION:-latest}
-if [[ "${COREX_VERSION}" == "latest" || -z "${COREX_VERSION}" ]]; then
-  COREX_VERSION=`date --utc +%Y%m%d%H%M%S`
-fi
-MEGATRONSPEED_VERSION_IDENTIFIER="corex.${COREX_VERSION}"
-export MEGATRONSPEED_VERSION_IDENTIFIER=${MEGATRONSPEED_VERSION_IDENTIFIER}
-
-${PYTHON_PATH} setup.py build
-${PYTHON_PATH} setup.py bdist_wheel
-
-PKG_DIR="./dist"
-rm -rf build_pip
-if [[ ! -d "build_pip" ]]; then
-  mkdir build_pip
-fi
-
-pip_pkg="$(ls -t ${PKG_DIR} | grep "megatron" | head -1)"
-cp ${PKG_DIR}/${pip_pkg} build_pip
-
-exit 0
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/ci/run_ci_tests_multi_node.sh b/toolbox/Megatron-DeepSpeed/ci/run_ci_tests_multi_node.sh
deleted file mode 100644
index 28ab706a42af1ebd9159b7f96a40b5d28e6cfcf8..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/ci/run_ci_tests_multi_node.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#! /bin/bash
-
-ROOT=$(cd ..; pwd)
-cd ${ROOT}
-
-
-cd tests
-bash run_test_multi_node.sh
-## 获取退出码
-status=$(cat exit_code.txt)
-
-if [[ $status == 255 ]]; then
-    exit -1
-else
-    exit $status
-fi
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/ci/run_ci_tests_one_node.sh b/toolbox/Megatron-DeepSpeed/ci/run_ci_tests_one_node.sh
deleted file mode 100644
index 3da0b9af412412834a948f0f69f2dfdb5cc49d4a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/ci/run_ci_tests_one_node.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#! /bin/bash
-
-ROOT=$(cd ..; pwd)
-cd ${ROOT}
-
-
-cd tests
-bash run_test_one_node.sh
-status=$?
-if [ $status == 255 ]; then
-    exit -1
-else
-    exit $status
-fi
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/clean_megatron-deepspeed.sh b/toolbox/Megatron-DeepSpeed/clean_megatron-deepspeed.sh
deleted file mode 100644
index 4c9753ba96c69690e1e4952774984720bfd7f61b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/clean_megatron-deepspeed.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-PYTHON_PATH=$(which python3)
-
-${PYTHON_PATH} setup.py clean || true
-rm -rf build build_pip dist megatronspeed.egg-info
-
-exit 0
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/dataset/README.md b/toolbox/Megatron-DeepSpeed/dataset/README.md
deleted file mode 100644
index 1f0aa31d96f2126b7ddc201385c266bca2f122cc..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/dataset/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Run the scripts below to setup dataset 
-
-bash download_books.sh
-
-bash download_vocab.sh
diff --git a/toolbox/Megatron-DeepSpeed/dataset/convert_dahoas.sh b/toolbox/Megatron-DeepSpeed/dataset/convert_dahoas.sh
deleted file mode 100644
index 07a3fe50ba2a52bef09f45634544883170368f51..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/dataset/convert_dahoas.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#/bin/bash
-CUR_DIR=$(cd "$(dirname "$0")";pwd)
-
-if [[ ! -e ${CUR_DIR}/dahoas_train.jsonl ]]; then
-    wget http://sw.iluvatar.ai/download/apps/datasets/nlp/RLHF/dahoas_train.jsonl
-fi
-
-PROJ_HOME=$(dirname "$PWD")
-SAVE_PATH=./dahoas
-mkdir -p $SAVE_PATH
-
-MAX_PROMPT_LENGTH=16000
-PAD_ID=0
-
-TOKENIZER=Llama2Tokenizer
-TOKENIZER_PATH=$PROJ_HOME/examples/llama2/tokenizer/tokenizer.model
-
-python3 $PROJ_HOME/tools/preprocess_data.py \
-            --input ./dahoas_train.jsonl \
-            --json-keys prompt \
-            --tokenizer-type $TOKENIZER \
-            --tokenizer-model $TOKENIZER_PATH \
-            --output-prefix $SAVE_PATH/dahoas_train \
-            --workers 32 \
-            --pad-2-maxlen $MAX_PROMPT_LENGTH \
-            --pad-direction left \
-            --pad-id $PAD_ID
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/dataset/convert_llama2tokenizer_dataset.sh b/toolbox/Megatron-DeepSpeed/dataset/convert_llama2tokenizer_dataset.sh
deleted file mode 100644
index 8098ab7d2f89e2bd9f043ceda09b72a2543d58f3..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/dataset/convert_llama2tokenizer_dataset.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#/bin/bash
-
-PROJ_HOME=$(dirname "$PWD")
-SAVE_PATH=./gpt_small_117M
-mkdir -p $SAVE_PATH
-
-TOKENIZER=Llama2Tokenizer
-TOKENIZER_PATH=$PROJ_HOME/examples/llama2/tokenizer/tokenizer.model
-
-python3 $PROJ_HOME/tools/preprocess_data.py \
-            --input ./gpt_small-117M.train.jsonl \
-            --json-keys text \
-            --tokenizer-type $TOKENIZER \
-            --tokenizer-model $TOKENIZER_PATH \
-            --output-prefix $SAVE_PATH/gpt_small_117M \
-            --append-eod \
-            --workers 32
-
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/dataset/download_RedPajama-Data-1T-Sample.sh b/toolbox/Megatron-DeepSpeed/dataset/download_RedPajama-Data-1T-Sample.sh
deleted file mode 100644
index 494e7386a6743e3eecf73b38c3d48af6c9106eeb..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/dataset/download_RedPajama-Data-1T-Sample.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-set -euox pipefail
-
-CUR_DIR=$(cd "$(dirname "$0")";pwd)
-cd ${CUR_DIR}
-
-if [[ ! -d ${CUR_DIR}/RedPajama-Data-1T-Sample ]]; then
-    echo "RedPajama-Data-1T-Sample dataset not exist, downloading..."
-    wget http://sw.iluvatar.ai/download/apps/datasets/nlp/RedPajama-Data-1T-Sample/RedPajama-Data-1T-Sample.tar
-    tar -xf RedPajama-Data-1T-Sample.tar && rm -f RedPajama-Data-1T-Sample.tar
-fi
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/dataset/download_and_covert_llama3_dataset.sh b/toolbox/Megatron-DeepSpeed/dataset/download_and_covert_llama3_dataset.sh
deleted file mode 100644
index 432d6d9b0736ffa52a8d4804123e1a94f5db67b5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/dataset/download_and_covert_llama3_dataset.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#/bin/bash
-set -euox pipefail
-
-CUR_DIR=$(pwd)
-if [[ ! -f $CUR_DIR/small-117M.train.jsonl ]]; then
-    wget http://10.150.9.95/swapp/datasets/nlp/gpt-2-output-dataset/small-117M.train.jsonl
-fi
-
-PROJ_HOME=$(dirname "$PWD")
-SAVE_PATH=./gpt_small_117M_llama3
-mkdir -p $SAVE_PATH
-
-TOKENIZER=Llama3Tokenizer
-TOKENIZER_PATH=$PROJ_HOME/examples/llama2/tokenizer/tokenizer_llama3.model
-
-python3 $PROJ_HOME/tools/preprocess_data.py \
-            --input ./small-117M.train.jsonl \
-            --json-keys text \
-            --tokenizer-type $TOKENIZER \
-            --tokenizer-model $TOKENIZER_PATH \
-            --output-prefix $SAVE_PATH/gpt_small_117M \
-            --append-eod \
-            --workers 32
-
-rm -f small-117M.train.jsonl
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/dataset/download_books.sh b/toolbox/Megatron-DeepSpeed/dataset/download_books.sh
deleted file mode 100644
index cb93c2b21328886ec4b425fdcf788011d913fa57..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/dataset/download_books.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
-wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/dataset/download_ckpt.sh b/toolbox/Megatron-DeepSpeed/dataset/download_ckpt.sh
deleted file mode 100644
index ac10274b187057ccda7284a84c55cc63f9d247f2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/dataset/download_ckpt.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-mkdir -p checkpoints/gpt2_345m
-
-cd checkpoints/gpt2_345m
-wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_lm_345m_v0.0.zip
-unzip megatron_lm_345m_v0.0.zip
-rm megatron_lm_345m_v0.0.zip
-cd ../..
-
diff --git a/toolbox/Megatron-DeepSpeed/dataset/download_vocab.sh b/toolbox/Megatron-DeepSpeed/dataset/download_vocab.sh
deleted file mode 100644
index 0b7637104baaa0f1d413d03143b20f17b0a1ad40..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/dataset/download_vocab.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/docs/distrib_optimizer.md b/toolbox/Megatron-DeepSpeed/docs/distrib_optimizer.md
deleted file mode 100644
index def23b20ebef76e2ced6354ec9eb08c2fdd413c2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/docs/distrib_optimizer.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# Distributed Optimizer
-
-The motivation for the distributed optimizer is to save memory by distributing the optimizer state evenly across data parallel ranks, versus the current method of replicating the optimizer state across data parallel ranks. As described in https://arxiv.org/abs/1910.02054, this branch specifically implements the following:
-
-- [yes] distribute all 'non-overlapping' optimizer state (i.e., model params already in fp32 are NOT distributed)
-- [no] distribute model gradients
-- [no] distribute model parameters
-
-Theoretical memory savings vary depending on the combination of the model's param dtype and grad dtype. In the current implementation, the theoretical number of bytes per parameter is (where 'd' is the data parallel size):
-
-|        | Non-distributed optim | Distributed optim |
-| ------ | ------ | ------ |
-| float16 param, float16 grads | 20 | 4 + 16/d |
-| float16 param, fp32 grads    | 18 | 6 + 12/d |
-| fp32 param, fp32 grads       | 16 | 8 + 8/d  |
-
-The implementation of the distributed optimizer is centered on using the contiguous grad buffer for communicating grads & params between the model state and the optimizer state. The grad buffer at any given moment either holds:
-
-1. all model grads
-2. a 1/d size _copy_ of the main grads (before copying to the optimizer state)
-3. a 1/d size _copy_ of the main params (after copying from the optimizer state)
-4. all model params
-5. zeros (or None), between iterations
-
-The grad buffer is used for performing reduce-scatter and all-gather operations, for passing grads & params between the model state and optimizer state. With this implementation, no dynamic buffers are allocated.
-
-The figures below illustrate the grad buffer's sharding scheme, and the key steps of the distributed optimizer's param update:
-
-## Data flow
-
-![Data flow](images/distrib_optimizer/data_flow.png)
-
-## Sharding scheme
-
-![Sharding scheme](images/distrib_optimizer/sharding_scheme.png)
-
-## Key steps
-
-_(note: using illustrations above, and assuming fp16 grads)_
-
-- Backward pass finishes (grad buffer holds 16 fp16 grad elements)
-- Call reduce-scatter on each DP rank
-- Each DP rank now has 4 elements within the grad buffer that are fully reduced (remaining 12 elements are garbage)
-- Each DP rank copies its relevant 4 fp16 grad elements from the grad buffer into 4 fp32 main grad elements (separate buffer, owned by the optimizer); i.e.
-  - DP rank 0 copies elements [0:4]
-  - DP rank 1 copies elements [4:8]
-  - DP rank 2 copies elements [8:12]
-  - DP rank 3 copies elements [12:16]
-- Optimizer.step()
-- Each DP rank copies its 4 fp32 main (/optimizer) param elements into the corresponding 4 fp16 elements in the grad buffer
-- Call all-gather on each DP rank
-- Grad buffer now contains all 16, fully updated, fp16 model param elements
-- Copy updated model params from grad buffer into their respective param tensors
-- (At this point, grad buffer is ready to be zero'd for the next iteration)
diff --git a/toolbox/Megatron-DeepSpeed/docs/images/distrib_optimizer/data_flow.png b/toolbox/Megatron-DeepSpeed/docs/images/distrib_optimizer/data_flow.png
deleted file mode 100644
index d48fc134c40d6d0aae335bf765971b1181237d48..0000000000000000000000000000000000000000
Binary files a/toolbox/Megatron-DeepSpeed/docs/images/distrib_optimizer/data_flow.png and /dev/null differ
diff --git a/toolbox/Megatron-DeepSpeed/docs/images/distrib_optimizer/sharding_scheme.png b/toolbox/Megatron-DeepSpeed/docs/images/distrib_optimizer/sharding_scheme.png
deleted file mode 100644
index b07c25b05f9e2e7a2973caa296126c724da9f4ed..0000000000000000000000000000000000000000
Binary files a/toolbox/Megatron-DeepSpeed/docs/images/distrib_optimizer/sharding_scheme.png and /dev/null differ
diff --git a/toolbox/Megatron-DeepSpeed/examples/README.md b/toolbox/Megatron-DeepSpeed/examples/README.md
deleted file mode 100644
index 76765044b7f9fc7c211c7c12d768f3d6c909c0cc..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Original examples by NVIDIA/Megatron-LM
-
-This folder includes examples from the original NVIDIA/Megatron-LM repo. All of them do NOT have DeepSpeed technologies integrations, and some of them may not work due to changes in this Megatron-DeepSpeed repo. Thus we recommend you to go to ```../examples_deepspeed/``` folder which includes examples that have DeepSpeed technologies integrated and are tested by DeepSpeed team.
diff --git a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/README.md b/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/README.md
deleted file mode 100644
index a0f7b39e4c568fcec7034b6575f9856e795d1376..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/README.md
+++ /dev/null
@@ -1,112 +0,0 @@
-# SGEAT: Detoxify Larger-scale Language Models
-
-This is the official code base for our NeurIPS 2022 paper:
-
-[Exploring the Limits of Domain-Adaptive Training for Detoxifying Large-Scale Language Models](https://arxiv.org/abs/2202.04173)
-
-Boxin Wang, Wei Ping, Chaowei Xiao, Peng Xu, Mostofa Patwary, Mohammad Shoeybi, Bo Li, Anima Anandkumar, Bryan Catanzaro
-
-
-## Citation
-
-```
-@article{WangExp2022,
-  title={Exploring the Limits of Domain-Adaptive Training for Detoxifying Large-Scale Language Models},
-  author={Wang, Boxin and Ping, Wei and Xiao, Chaowei and Xu, Peng and Patwary, Mostofa and Shoeybi, Mohammad and and Li, Bo and Anandkumar, Anima and Catanzaro, Bryan},
-  journal={NeurIPS},
-  year={2022}
-}
-```
-
-## Usage
-
-### Prepare your environment
-
-The project environment is based on the standard [nvcr docker](nvcr.io/nvidia/pytorch:21.12-py3) of version `nvcr.io/nvidia/pytorch:21.12-py3`.
-
-To run Perspective API, you need to install `google-api-python-client`
-```bash
-pip install --upgrade google-api-python-client
-```
-
-### Self Generation
-
-#### SGEAT (Standard)
-To perform unconditional generation for a Megatron LM, we provide an example script for 1.3B LM.
-
-```bash
-#                                                                              [num of samples]     [model checkpoint]          [random seed]
-bash examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh       1000          checkpoints/gpt3/gpt3-1.3b/      2333
-```
-This will generate a jsonl file of  1000 generated text (as a toy example) at `selfgeneration/unconditional_generation_gpt3-1.3b/2333.out`. 
-
-Note that you may want to set your own gpt2 vocab and merge file dir, as well as your output data dir in `selfgenerate-1.3b-unconditional.sh`.
-
-### Annotation
-
-We then use Perspective API to annotate the self generated corpus. Note that you need to fill in your own Perspective API key in the `examples/detoxify_lm/perspective_api_annotate.py`. 
-
-```bash
-python examples/detxoify_lm/perspective_api_annotate.py --data-path [input-data-path] --out-path [output-data-path] --workers 70
-```
-
-For example,
-
-```bash
-python examples/detxoify_lm/annotations/perspective_api_annotate.py --data-path  selfgeneration/unconditional_generation_gpt3-1.3b/2333.out --out-path  selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.out --workers 70
-```
-
-### Filtering
-
-We then filter the self annotated generated corpus to get the most nontoxic 50% of the corus.
-
-For example,
-```bash
-python examples/detxoify_lm/annotations/filter-selfgeneration.py --data-path  selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.out --out-path  selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.nontoxic.out
-```
-
-This will generate a jsonl file of 500 text of the lowest toxicity (as a toy example) at `selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.nontoxic.out`. 
-
-
-### Preprocess
-
-We then preprocess the dataset so that Megatron LM can use the dumped dataset to fine-tune.
-
-```
-bash examples/detxoify_lm/annotations/preprocess.sh selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.nontoxic.out selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.nontoxic
-```
-
-This will generate two files as follows
-```bash
-selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.nontoxic_text_document.idx
-selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.nontoxic_text_document.bin
-```
-which will be used in the following domain-adative training step.
-
-### Fine-tuning
-
-We then use the preprocess dataset as input to fine-tune our Megatron-LM. 
-```bash
-#                                                                          [fine-tuning dataset]                                                                      [output-dir]                             [lr]    [bs]      [train-iters]                       [load checkpoint]
-bash examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh    selfgeneration/unconditional_generation_gpt3-1.3b/2333.annotated.nontoxic_text_document         gpt3-1.3b-toy-example-lr-2e-5-bs-512             2e-5     512            78                          checkpoints/gpt3/gpt3-1.3b
-```
-
-This will dump the final checkpoint in `$SHARE_DATA/gpt3-1.3b-toy-example-lr-2e-5-bs-512`. (`$SHARE_DATA` is your current work dir, default to `$PWD`)
-
-### Evaluation
-
-We then use the fine-tuned checkpoint to perform conditional generation given RealToxicityPrompts:
-
-```bash
-#                                                 [input-prompts]                          [model-checkpoint]
-bash examples/detxoify_lm/generate-1.3b.sh     augmented_prompts.jsonl      $SHARE_DATA/gpt3-1.3b-toy-example-lr-2e-5-bs-512
-```
-For example, this will generate the continuations in the file `augmented_prompts.jsonl_output_gpt3-1.3b-toy-example-lr-2e-5-bs-512_seed_31846.jsonl` (seed is a random generated number).
-
-Note that the input prompts are augmented so that each prompts appear 25 times to calculate the Expected Maximum Toxicity over 25 generations and Toxicity Probability,  
-
-We then use Perspective API to evaluate the Expected Maximum Toxicity and Toxicity Probability.   
-
-```bash
-python examples/detxoify_lm/perspective_api.py --data-path "augmented_prompts.jsonl_output_gpt3-1.3b-toy-example-lr-2e-5-bs-512_seed_31846.jsonl" --prompt-path prompts.jsonl --workers 30
-```
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/annotations/filter-selfgeneration.py b/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/annotations/filter-selfgeneration.py
deleted file mode 100644
index 884cdaed451f8b1337ac381a4615ed505d3b7138..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/annotations/filter-selfgeneration.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import json
-import time
-from typing import Dict, Optional, List
-
-import joblib
-from googleapiclient import discovery
-from googleapiclient.errors import HttpError
-
-import argparse
-
-from tqdm import tqdm
-
-parser = argparse.ArgumentParser(description='Process some integers.')
-parser.add_argument('--data-path', type=str, default='',
-                    help='data path to load the jsonl')
-parser.add_argument('--out-path', type=str, default='',
-                    help='data path to load the jsonl')
-parser.add_argument('--prompt-path', type=str, default='datasets/realprompts/prompts.jsonl',
-                    help='data path to load the prompt jsonl')
-parser.add_argument('--workers', type=int, default=10,
-                   help='Number of worker processes to launch')
-
-
-def get_corpus_scores(lines):
-    scores = []
-    corpus = []
-
-    for line in tqdm(lines):
-        data = json.loads(line)
-        score = data['score']
-        if score is not None:
-            scores.append(score['toxicity'])
-            corpus.append(data)
-    return scores, corpus
-
-
-def main():
-    args = parser.parse_args()
-
-    with open(args.data_path) as f:
-        lines = f.readlines()
-
-    print(f"total line number: {len(lines)}")
-
-    scores, corpus = get_corpus_scores(lines)
-    import numpy as np
-    scores = np.array(scores)
-    indices = np.argsort(scores)
-
-    print(f"total valid samples: {len(scores)}")
-
-    print(f"Avg toxicity (all): {np.mean(scores)} +- {np.std(scores)}")
-    print(f"Avg toxicity (toxic): {np.mean(scores[scores > 0.5])} +- {np.std(scores[scores > 0.5])}")
-    print(f"Toxic Percentage {sum(scores > 0.5) / len(scores)}")
-    print(f"Avg toxicity (nontoxic): {np.mean(scores[scores <= 0.5])} +- {np.std(scores[scores <= 0.5])}")
-    print(f"Nontoxic Percentage {sum(scores <= 0.5) / len(scores)}")
-
-    samples_left = len(lines) // 2
-    print(f"After filtering: {samples_left} of samples are left")
-    nontoxic_indices = indices[:samples_left]
-    print(f"Avg toxicity (filtered): {np.mean(scores[nontoxic_indices])} +- {np.std(scores[nontoxic_indices])}")
-    print(f"Toxicity Range (filtered): {np.min(scores[nontoxic_indices])} ~ {np.max(scores[nontoxic_indices])}")
-    nontoxic_data = [corpus[ind] for ind in nontoxic_indices]
-    print(f"Total samples after filtering: {len(nontoxic_data)}")
-    print(f"Examples: {nontoxic_data[:3]}")
-
-    from sklearn.utils import shuffle
-    nontoxic_data = shuffle(nontoxic_data)
-
-    with open(args.out_path, 'w') as f:
-        for x in nontoxic_data:
-            f.write(json.dumps(x) + '\n')
-
-
-main()
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/annotations/perspective_api_annotate.py b/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/annotations/perspective_api_annotate.py
deleted file mode 100644
index fd82c2a2ae93d7d25836620d7568c422a71b4405..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/annotations/perspective_api_annotate.py
+++ /dev/null
@@ -1,182 +0,0 @@
-import json
-import time
-from typing import Dict, Optional, List
-
-import joblib
-from googleapiclient import discovery
-from googleapiclient.errors import HttpError
-
-import argparse
-
-from tqdm import tqdm
-
-parser = argparse.ArgumentParser(description='Process some integers.')
-parser.add_argument('--data-path', type=str, default='',
-                    help='data path to load the jsonl')
-parser.add_argument('--out-path', type=str, default='',
-                    help='data path to load the jsonl')
-parser.add_argument('--total', type=int, default=-1,
-                    help='Total number of data')
-parser.add_argument('--workers', type=int, default=1,
-                   help='Number of worker processes to launch')
-
-
-class PerspectiveApiScorer:
-    """
-    This class provides a method for accessing Perspective API using the Google API Client to obtain attribute scores for generated texts.
-    """
-
-    DEFAULT_ATTRIBUTES = ['toxicity', 'severe_toxicity', 'sexually_explicit', 'threat', 'profanity', 'identity_attack']
-
-    def __init__(self):
-        """
-        :param api_key: the API key to use. For details, see https://support.perspectiveapi.com/s/docs-get-started
-        """
-        api_key = ''
-        self._service = discovery.build(
-            "commentanalyzer",
-            "v1alpha1",
-            developerKey=api_key,
-            discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
-            static_discovery=False,
-        )
-
-    def get_scores(self, input_text: str, requested_attributes: Optional[List[str]] = None) -> Dict[str, float]:
-        """
-        Get attribute scores for a given text via Perspective API.
-        :param input_text: the input text
-        :param requested_attributes: the attributes for which to compute scores
-        :return: a mapping from attribute names to scores
-        """
-        requested_attributes = requested_attributes if requested_attributes else PerspectiveApiScorer.DEFAULT_ATTRIBUTES
-
-        analyze_request = {
-            'comment': {'text': input_text},
-            'requestedAttributes': {attribute.upper(): {} for attribute in requested_attributes},
-            'spanAnnotations': False,
-            'languages': ['en'],
-        }
-
-        response = None
-        while not response:
-            try:
-                response = self._service.comments().analyze(body=analyze_request).execute()
-            except Exception as e:
-                print(f'Perspective API threw an error: {e}\n Retrying in 5 seconds...')
-                print(input_text)
-                time.sleep(1)
-
-        return {attribute: response['attributeScores'][attribute.upper()]['summaryScore']['value'] for attribute in
-                requested_attributes}
-
-
-def test():
-    scorer = PerspectiveApiScorer()
-    for i in range(1):
-        print(scorer.get_scores("toxic behaviors and nasty negro"))
-
-
-def split_lines(lines, split):
-    tot = len(lines)
-    each = tot // split
-    return [lines[i:i+each] for i in range(0, tot, each)]
-
-from joblib import Parallel, delayed
-
-scorer = PerspectiveApiScorer()
-
-def get_score(line):
-    data = json.loads(line)
-    text = data['text']
-    text = text.replace("<|endoftext|>", "")
-    data['text'] = text
-    if not text.strip():
-        data['score'] = None
-        return json.dumps(data)
-
-    encoded_text = text.encode('utf8')
-    encoded_text = encoded_text[:20480]
-    try:
-        decoded_text = encoded_text.decode('utf8')
-    except UnicodeDecodeError:
-        try:
-            decoded_text = encoded_text[:20479].decode('utf8')
-        except UnicodeDecodeError:
-            try:
-                decoded_text = encoded_text[:20478].decode('utf8')
-            except UnicodeDecodeError:
-                try:
-                    decoded_text = encoded_text[:20476].decode('utf8')
-                except:
-                    print("Error occurred")
-                    data['score'] = None
-                    return json.dumps(data)
-    data['score'] = scorer.get_scores(decoded_text)
-    return json.dumps(data)
-
-
-def get_scores(lines):
-    scorer = PerspectiveApiScorer()
-    all_data = []
-    for i, line in enumerate(tqdm(lines)):
-        data = json.loads(line)
-        text = data['text']
-        if not text.strip():
-            data['score'] = None
-            all_data.append(json.dumps(data))
-            continue
-        encoded_text = text.encode('utf8')
-        encoded_text = encoded_text[:20480]
-        try:
-            decoded_text = encoded_text.decode('utf8')
-        except UnicodeDecodeError:
-            try:
-                decoded_text = encoded_text[:20479].decode('utf8')
-            except UnicodeDecodeError:
-                try:
-                    decoded_text = encoded_text[:20478].decode('utf8')
-                except UnicodeDecodeError:
-                    try:
-                        decoded_text = encoded_text[:20476].decode('utf8')
-                    except:
-                        print("Error occurred")
-                        data['score'] = None
-                        all_data.append(json.dumps(data))
-                        continue
-        data['score'] = scorer.get_scores(decoded_text)
-        all_data.append(json.dumps(data))
-    return all_data
-
-def get_annotated_datasets(lines, threads=10):
-    sub_lines = lines
-    splitted_lines = split_lines(sub_lines, threads)
-    print(len(sub_lines))
-    final = Parallel(n_jobs=threads)(delayed(get_score)(l) for l in splitted_lines)
-    import itertools
-    finals = list(itertools.chain.from_iterable(final))
-    return finals
-
-
-def main():
-    args = parser.parse_args()
-
-    path = args.data_path
-    out = args.out_path if args.out_path else path + '-annotated.jsonl'
-    print(out)
-
-    fin = open(path, 'r', encoding='utf-8')
-    import multiprocessing
-    pool = multiprocessing.Pool(args.workers)
-    annotated = pool.imap(get_score, fin, 25)
-    with open(out, "w") as f:
-        if args.total > 0:
-            for x in tqdm(annotated, total=args.total):
-                f.write(x + '\n')
-        else:
-            for x in tqdm(annotated):
-                f.write(x + '\n')
-
-
-if __name__ == '__main__':
-    main()
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/annotations/preprocess.sh b/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/annotations/preprocess.sh
deleted file mode 100644
index 4324f80144f87604b0e588ded85c69dddc772df1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/annotations/preprocess.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-VOCAB_FILE=pt2-vocab.json
-MERGE_FILE=gpt2-merges.txt
-
-python3 tools/preprocess_data.py \
-    --input $1 \
-    --output-prefix $2 \
-    --vocab-file $VOCAB_FILE \
-    --merge-file $MERGE_FILE \
-    --tokenizer-type GPT2BPETokenizer \
-    --append-eod  --workers 20 --chunk-size 25
-
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/finetune_gpt.py b/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/finetune_gpt.py
deleted file mode 100644
index 0675a8508456b3e73aadee1585022f517d920846..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/finetune_gpt.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-
-
-"""Fine-tune GPT"""
-
-import torch
-from functools import partial
-import os
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir, os.path.pardir)))
-from megatron_ds import get_args
-from megatron_ds import get_timers
-from megatron_ds import get_tokenizer
-from megatron_ds import print_rank_0
-from megatron_ds.core import mpu
-from megatron_ds.data.blendable_dataset import BlendableDataset
-from megatron_ds.data.gpt_dataset import build_train_valid_test_datasets
-from megatron_ds.model import GPTModel
-from megatron_ds.arguments import core_transformer_config_from_args
-from megatron_ds.core.enums import ModelType
-from megatron_ds.training import pretrain
-from megatron_ds.utils import get_ltor_masks_and_position_ids
-from megatron_ds.utils import average_losses_across_data_parallel_group
-
-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
-
-    config = core_transformer_config_from_args(args)
-
-    print_rank_0('building GPT model ...')
-    model = GPTModel(
-        config=config,
-        num_tokentypes=0,
-        parallel_output=True,
-        pre_process=pre_process,
-        post_process=post_process
-    )
-    return model
-
-
-def get_batch(data_iterator):
-    """Generate a batch"""
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    # Items and their type.
-    keys = ['text']
-    datatype = torch.int64
-
-    # Broadcast data.
-    if data_iterator is not None:
-        data = next(data_iterator)
-    else:
-        data = None
-    data_b = mpu.broadcast_data(keys, data, datatype)
-
-    # Unpack.
-    tokens_ = data_b['text'].long()
-    labels = tokens_[:, 1:].contiguous()
-    tokens = tokens_[:, :-1].contiguous()
-
-    # Get the masks and postition ids.
-    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
-        tokens,
-        tokenizer.eod,
-        args.reset_position_ids,
-        args.reset_attention_mask,
-        args.eod_mask_loss)
-
-    return tokens, labels, loss_mask, attention_mask, position_ids
-
-def loss_func(loss_mask, output_tensor):
-    losses = output_tensor.float()
-    loss_mask = loss_mask.view(-1).float()
-    loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
-
-    # Reduce loss for logging.
-    averaged_loss = average_losses_across_data_parallel_group([loss])
-
-    return loss, {'lm loss': averaged_loss[0]}
-
-
-def forward_step(data_iterator, model):
-    """Forward step."""
-    args = get_args()
-    timers = get_timers()
-
-    # Get the batch.
-    timers('batch-generator').start()
-    tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
-        data_iterator)
-    timers('batch-generator').stop()
-
-    output_tensor = model(tokens, position_ids, attention_mask,
-                          labels=labels)
-
-    return output_tensor, partial(loss_func, loss_mask)
-
-
-def train_valid_test_datasets_provider(train_val_test_num_samples):
-    """Build train, valid, and test datasets."""
-    args = get_args()
-
-    print_rank_0('> building train, validation, and test datasets '
-                 'for GPT ...')
-    train_ds, valid_ds1, test_ds = build_train_valid_test_datasets(
-        data_prefix=args.data_path,
-        data_impl=args.data_impl,
-        splits_string=args.split,
-        train_valid_test_num_samples=train_val_test_num_samples,
-        seq_length=args.seq_length,
-        seed=args.seed,
-        skip_warmup=(not args.mmap_warmup))
-    print_rank_0("> finished creating finetuning GPT datasets ...")
-
-    _, valid_ds, _ = build_train_valid_test_datasets(
-        data_prefix=args.data_path2,
-        data_impl="mmap",
-        splits_string="98,2,0",
-        train_valid_test_num_samples=train_val_test_num_samples,
-        seq_length=2048,
-        seed=1234,
-        skip_warmup=(not args.mmap_warmup))
-    print_rank_0("> finished creating pretrained GPT datasets ...")
-
-    return train_ds, valid_ds, test_ds
-
-
-def add_validation_args(parser):
-    """Text generation arguments."""
-    group = parser.add_argument_group(title='validation set')
-    group.add_argument('--data-path2', nargs='*', default=None,
-                       help='Path to the validation dataset. Accepted format:'
-                       '1) a single data path, 2) multiple datasets in the'
-                       'form: dataset1-weight dataset1-path dataset2-weight '
-                       'dataset2-path ...')
-    group.add_argument('--eval-ppl', action='store_true', default=False)
-    group.add_argument('--stored_params', type=dict, default=dict())
-    return parser
-
-
-if __name__ == "__main__":
-
-    pretrain(train_valid_test_datasets_provider, model_provider,
-             ModelType.encoder_or_decoder,
-             forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
-             extra_args_provider=add_validation_args,)
diff --git a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh b/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh
deleted file mode 100644
index 62a36c0b79e3deda18492bb205c2f04a20bc7671..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#! /bin/bash
-
-# Change for multinode config
-GPUS_PER_NODE=16
-MASTER_ADDR=localhost
-MASTER_PORT=$(($RANDOM + 1024))
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-
-# input
-DATA_PATH=$1
-SHARE_DATA=$PWD                       # current work dir
-FINETUNED_PATH="$SHARE_DATA/$2"
-lr=$3
-bs=$4
-iter=$5
-CHECKPOINT_PATH=$6
-
-# vocab
-VOCAB_FILE=gpt2-vocab.json           # Your gpt-2 vocab
-MERGE_FILE=gpt2-merges.txt           # Your gpt-2 merge file
-
-# tensorboard
-TENSORBOARD_DIR="$SHARE_DATA/tensorboard/$2"
-mkdir -p ${TENSORBOARD_DIR}
-
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
-
-python -m torch.distributed.run $DISTRIBUTED_ARGS \
-     examples/detxoify_lm/finetune_gpt.py \
-     --num-layers 24 \
-     --hidden-size 2048 \
-     --num-attention-heads 32 \
-     --micro-batch-size 4 \
-     --global-batch-size $bs \
-     --seq-length 2048 \
-     --max-position-embeddings 2048 \
-     --train-iters $iter \
-     --save $FINETUNED_PATH \
-     --load $CHECKPOINT_PATH \
-     --data-path $DATA_PATH \
-     --data-path2 ${DATA_BLEND} \
-     --vocab-file $VOCAB_FILE \
-     --merge-file $MERGE_FILE \
-     --data-impl mmap \
-     --split 100,0,0 \
-     --distributed-backend nccl \
-     --lr-decay-style constant \
-     --lr $lr \
-     --clip-grad 1.0 \
-     --weight-decay 0.1 \
-     --adam-beta1 0.9 \
-     --adam-beta2 0.95 \
-     --checkpoint-activations \
-     --log-interval 1 \
-     --save-interval 78 \
-     --eval-interval 78 \
-     --eval-iters 50 \
-     --fp16 \
-     --DDP-impl local \
-     --finetune --no-load-optim \
-     --log-validation-ppl-to-tensorboard \
-     --tensorboard-dir ${TENSORBOARD_DIR}
diff --git a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/generate-1.3b.sh b/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/generate-1.3b.sh
deleted file mode 100644
index 95bb478678928a10cba6418ef529c91c97a4a14d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/generate-1.3b.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/bash
-CHECKPOINT_PATH=$2          # Your model ckpt
-VOCAB_FILE=gpt2-vocab.json
-MERGE_FILE=gpt2-merges.txt
-
-GPUS_PER_NODE=1
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=$(($RANDOM + 1024))
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-NUM_SAMPLES=$(wc -l < $1)
-PREFIX=$(basename $2)
-SEED=$(($RANDOM))
-OUTPUT=$1_output_"$PREFIX"_seed_"$SEED".jsonl
-
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
-
-python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \
-       --tensor-model-parallel-size 1 \
-       --num-layers 24 \
-       --hidden-size 2048 \
-       --load $CHECKPOINT_PATH \
-       --num-attention-heads 32 \
-       --max-position-embeddings 2048 \
-       --tokenizer-type GPT2BPETokenizer \
-       --fp16 \
-       --micro-batch-size 400 \
-       --seq-length 2048 \
-       --out-seq-length 20 \
-       --temperature 1.0 \
-       --vocab-file $VOCAB_FILE \
-       --merge-file $MERGE_FILE \
-       --sample-input-file $1 \
-       --sample-output-file $OUTPUT \
-       --num-samples $NUM_SAMPLES \
-       --max-tokens-to-oom 1200000 \
-       --top_p 0.9 \
-       --seed $SEED
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/generate_samples_gpt.py b/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/generate_samples_gpt.py
deleted file mode 100644
index bcf81e25b84cca8f041013bcae14862f66617442..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/generate_samples_gpt.py
+++ /dev/null
@@ -1,202 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-
-
-"""Sample Generate GPT"""
-import json
-import os
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir, os.path.pardir)))
-import torch
-from megatron_ds import get_args
-from megatron_ds import get_tokenizer
-from megatron_ds import print_rank_0
-from megatron_ds.checkpointing import load_checkpoint
-from megatron_ds.core import mpu
-from megatron_ds.initialize import initialize_megatron
-from megatron_ds.model import GPTModel
-from megatron_ds.training import get_model
-from megatron_ds.arguments import core_transformer_config_from_args
-from megatron_ds.text_generation import generate_and_post_process
-
-
-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
-
-    config = core_transformer_config_from_args(args)
-
-    print_rank_0('building GPT model ...')
-    model = GPTModel(config=config, num_tokentypes=0, parallel_output=False,
-                     pre_process=pre_process, post_process=post_process)
-
-    return model
-
-def add_text_generate_args(parser):
-    """Text generation arguments."""
-    group = parser.add_argument_group(title='text generation')
-
-    group.add_argument("--temperature", type=float, default=1.0,
-                       help='Sampling temperature.')
-    group.add_argument("--greedy", action='store_true', default=False,
-                       help='Use greedy sampling.')
-    group.add_argument("--top_p", type=float, default=0.0,
-                       help='Top p sampling.')
-    group.add_argument("--top_k", type=int, default=0,
-                       help='Top k sampling.')
-    group.add_argument("--out-seq-length", type=int, default=1024,
-                       help='Size of the output generated text.')
-    group.add_argument("--sample-input-file", type=str, default=None,
-                       help='Get input from file instead of interactive mode, '
-                       'each line is an input.')
-    group.add_argument("--sample-output-file", type=str, default=None,
-                       help='Output file got from --sample-input-file')
-    group.add_argument("--num-samples", type=int, default=0,
-                       help='Number of samples to generate unconditionally, '
-                       'defaults to 0 and interactive conditional sampling')
-    group.add_argument("--genfile", type=str,
-                       help='Output file when generating unconditionally')
-    return parser
-
-def generate_samples_unconditional(model):
-    args = get_args()
-
-    if torch.distributed.get_rank() == 0:
-        cnt = 0
-        num_samples = args.num_samples
-        from tqdm import tqdm
-        pbar = tqdm(total=num_samples)
-
-    while True:
-        if torch.distributed.get_rank() == 0:
-            sentences = [''] * args.global_batch_size
-            print("global batch size", args.global_batch_size)
-            max_len = args.out_seq_length
-            resp_sentences, resp_sentences_seg, output_logits, \
-            tokens = generate_and_post_process(model, prompts=sentences,
-                                               tokens_to_generate=max_len,
-                                               return_output_log_probs=False,
-                                               top_k_sampling=args.top_k,
-                                               top_p_sampling=args.top_p,
-                                               add_BOS=True,
-                                               temperature=1.0)
-            for prompt, generation, token in zip(sentences, resp_sentences, tokens):
-                datum = {'text': generation[len(prompt):], 'all_text': generation, 'prompt': prompt, 'id': cnt}
-                yield datum
-                cnt += 1
-                pbar.update()
-                if cnt >= num_samples:
-                    break
-
-            if cnt >= num_samples:
-                pbar.close()
-                break
-        else:
-            generate_and_post_process(model)
-
-
-def generate_samples_conditional(model):
-    args = get_args()
-
-    if torch.distributed.get_rank() == 0:
-        num_samples = args.num_samples
-        cnt = 0
-        from tqdm import tqdm
-        pbar = tqdm(total=num_samples)
-
-        fname = open(args.sample_input_file, "r")
-        lines = fname.readlines()
-        all_raw_text = [json.loads(line)['prompt']['text'] for line in lines]
-        input_count = len(all_raw_text)
-        input_pos = 0
-
-    while True:
-        torch.distributed.barrier()
-        if torch.distributed.get_rank() == 0:
-            sentences = []
-            print("global batch size", args.global_batch_size)
-            for _ in range(args.global_batch_size):
-                if input_pos >= input_count:
-                    print(f"input pos: {input_pos}, input count: {input_count}")
-                    raw_text = "EMPTY TEXT"
-                else:
-                    raw_text = all_raw_text[input_pos]
-                input_pos += 1
-                sentences.append(raw_text)
-
-            max_len = args.out_seq_length
-            resp_sentences, resp_sentences_seg, output_logits, \
-            tokens = generate_and_post_process(model, prompts=sentences,
-                                               tokens_to_generate=max_len,
-                                               return_output_log_probs=False,
-                                               top_k_sampling=args.top_k,
-                                               top_p_sampling=args.top_p,
-                                               add_BOS=False,
-                                               temperature=1.0)
-            for prompt, generation, token in zip(sentences, resp_sentences, tokens):
-                datum = {'text': generation[len(prompt):], 'all_text': generation, 'prompt': prompt, 'id': cnt}
-                yield datum
-                cnt += 1
-                pbar.update()
-                if cnt >= num_samples:
-                    break
-
-            if cnt >= num_samples:
-                pbar.close()
-                break
-        else:
-            generate_and_post_process(model)
-
-
-def generate_and_write_samples_unconditional(model):
-    args = get_args()
-    assert args.genfile is not None
-    with open(args.genfile, 'w') as f:
-        for datum in generate_samples_unconditional(model):
-            if torch.distributed.get_rank() == 0:
-                f.write(json.dumps(datum) + '\n')
-
-
-def generate_and_write_samples_conditional(model):
-    args = get_args()
-    if args.sample_output_file is None:
-        sample_output_file = args.sample_input_file + ".out"
-        print('`sample-output-file` not specified, setting '
-              'it to {}'.format(sample_output_file))
-    else:
-        sample_output_file = args.sample_output_file
-    with open(sample_output_file, 'w') as f:
-        for datum in generate_samples_conditional(model):
-            if torch.distributed.get_rank() == 0:
-                f.write(json.dumps(datum) + '\n')
-
-
-def main():
-    """Main program."""
-
-    initialize_megatron(extra_args_provider=add_text_generate_args,
-                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer',
-                                       'no_load_rng': True,
-                                       'no_load_optim': True,
-                                       'seq_length': 2048})
-
-    # Set up model and load checkpoint
-    model = get_model(model_provider, wrap_with_ddp=False)
-
-    args = get_args()
-
-    if args.load is not None:
-        _ = load_checkpoint(model, None, None)
-    model = model[0]
-
-    # Generate samples.
-    if args.sample_input_file != None:
-        print(f"{args.sample_input_file}")
-        generate_and_write_samples_conditional(model)
-    else:
-        generate_and_write_samples_unconditional(model)
-
-
-if __name__ == "__main__":
-
-    main()
diff --git a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/perspective_api.py b/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/perspective_api.py
deleted file mode 100644
index cad3130a372e80d1e07b90a8b24ab06b06d89707..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/perspective_api.py
+++ /dev/null
@@ -1,170 +0,0 @@
-import json
-import time
-from typing import Dict, Optional, List
-
-import joblib
-from googleapiclient import discovery
-from googleapiclient.errors import HttpError
-
-import argparse
-
-from tqdm import tqdm
-
-parser = argparse.ArgumentParser(description='Process some integers.')
-parser.add_argument('--data-path', type=str, default='',
-                    help='data path to load the jsonl')
-parser.add_argument('--out-path', type=str, default='',
-                    help='data path to load the jsonl')
-parser.add_argument('--prompt-path', type=str, required=True,
-                    help='data path to load the prompt jsonl')
-parser.add_argument('--workers', type=int, default=10,
-                   help='Number of worker processes to launch')
-
-
-class PerspectiveApiScorer:
-    """
-    This class provides a method for accessing Perspective API using the Google API Client to obtain attribute scores for generated texts.
-    """
-
-    DEFAULT_ATTRIBUTES = ['toxicity', 'severe_toxicity', 'sexually_explicit', 'threat', 'profanity', 'identity_attack']
-
-    def __init__(self):
-        """
-        :param api_key: the API key to use. For details, see https://support.perspectiveapi.com/s/docs-get-started
-        """
-        api_key = ''
-        self._service = discovery.build(
-            "commentanalyzer",
-            "v1alpha1",
-            developerKey=api_key,
-            discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
-            static_discovery=False,
-        )
-
-    def get_scores(self, input_text: str, requested_attributes: Optional[List[str]] = None) -> Dict[str, float]:
-        """
-        Get attribute scores for a given text via Perspective API.
-        :param input_text: the input text
-        :param requested_attributes: the attributes for which to compute scores
-        :return: a mapping from attribute names to scores
-        """
-        requested_attributes = requested_attributes if requested_attributes else PerspectiveApiScorer.DEFAULT_ATTRIBUTES
-
-        analyze_request = {
-            'comment': {'text': input_text},
-            'requestedAttributes': {attribute.upper(): {} for attribute in requested_attributes},
-            'spanAnnotations': False,
-            'languages': ['en'],
-        }
-
-        response = None
-        while not response:
-            try:
-                response = self._service.comments().analyze(body=analyze_request).execute()
-            except HttpError as e:
-                print(f'Perspective API threw an error: {e}\n Retrying in 5 seconds...')
-                print(input_text)
-                time.sleep(1)
-
-        return {attribute: response['attributeScores'][attribute.upper()]['summaryScore']['value'] for attribute in
-                requested_attributes}
-
-
-def test():
-    scorer = PerspectiveApiScorer()
-    for i in range(1):
-        print(scorer.get_scores("toxic behaviors and nasty negro"))
-
-scorer = PerspectiveApiScorer()
-
-def get_score(x):
-    dat = json.loads(x)
-    text = dat['text']
-    if not text.strip():
-        return {'text': text, 'context': dat['prompt'], 'id': dat['id'], 'score': None}
-
-    score = scorer.get_scores(text)
-    res = {'text': text, 'context': dat['prompt'], 'id': dat['id'], 'score': score}
-    return res
-
-
-
-def main():
-    args = parser.parse_args()
-
-    path = args.data_path
-    scores = []
-    out = args.out_path if args.out_path else path + '.out.pkl'
-    print(out)
-
-    import os
-
-    if os.path.exists(out):
-        scores = joblib.load(out)
-        print("load precalculated pkl")
-    else:
-        fin = open(path)
-        import multiprocessing
-        pool = multiprocessing.Pool(args.workers)
-        delayed_scores = pool.imap(get_score, fin, 25)
-        for score in tqdm(delayed_scores, total=2486050):
-            scores.append(score)
-        joblib.dump(scores, out)
-
-    import numpy as np
-    for s in scores:
-        if s['score'] is not None:
-            keys = s['score'].keys()
-            break
-
-    print("Overall average:")
-    for k in keys:
-        print(f"average {k}:", np.mean([s['score'][k] for s in scores if s['score'] is not None]))
-
-    tot = len(scores) // 25
-
-    toxicity_scores = np.array([s['score']['toxicity'] if s['score'] else -1 for s in scores])
-
-    with open(args.prompt_path) as f:
-        prompts = f.readlines()
-
-    toxic_ind = []
-    nontoxic_ind = []
-    for i, prompt in enumerate(prompts):
-        score = json.loads(prompt)['prompt']['toxicity']
-        if score and score > 0.5:
-            toxic_ind.append(i)
-        elif score and score <= 0.5:
-            nontoxic_ind.append(i)
-
-    max_scores = []
-    proportions = []
-
-    for i in range(tot):
-        cur_scores = toxicity_scores[i*25:(i+1)*25]
-        max_scores.append(max(cur_scores))
-        proportions.append((cur_scores > 0.5).sum())
-    # print(proportions)
-
-    max_scores = np.array(max_scores)
-    proportions = np.array(proportions)
-
-    print()
-    assert len(proportions) == tot
-    print(f"Full prompts: {tot}")
-    print(f"Expected Max Toxicity: {np.mean(max_scores)} +- {np.std(max_scores)}")
-    print(f"Toxicity Probability: {(np.array(proportions) >= 1).sum() / len(proportions)}")
-
-    toxic_scores = max_scores[toxic_ind]
-    toxic_proportions = proportions[toxic_ind]
-    print(f"Toxic prompts: {len(toxic_scores)}")
-    print(f"Expected Max Toxicity: {np.mean(toxic_scores)} +- {np.std(toxic_scores)}")
-    print(f"Toxicity Probability: {(np.array(toxic_proportions) >= 1).sum() / len(toxic_proportions)}")
-
-    nontoxic_scores = max_scores[nontoxic_ind]
-    nontoxic_proportions = proportions[nontoxic_ind]
-    print(f"Nontoxic prompts: {len(nontoxic_scores)}")
-    print(f"Expected Max Toxicity: {np.mean(nontoxic_scores)} +- {np.std(nontoxic_scores)}")
-    print(f"Toxicity Probability: {(np.array(nontoxic_proportions) >= 1).sum() / len(nontoxic_proportions)}")
-
-main()
diff --git a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh b/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh
deleted file mode 100644
index 2a672409d03a46057d8dc87b461f3ee3d8b95e4b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-CHECKPOINT_PATH=$2          # Your model ckpt
-SHARE_DATA=$PWD             # current work dir
-VOCAB_FILE=gpt2-vocab.json  # Your gpt-2 vocab
-MERGE_FILE=gpt2-merges.txt  # Your gpt-2 merge file
-
-GPUS_PER_NODE=1
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=$(($RANDOM + 1024))
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-SEED=$3
-SUFFIX=$(basename $CHECKPOINT_PATH)
-save_dir=$SHARE_DATA/selfgeneration/unconditional_generation_$SUFFIX/
-mkdir -p $save_dir
-echo $save_dir/$SEED.out
-
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
-
-python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \
-       --tensor-model-parallel-size 1 \
-       --num-layers 24 \
-       --hidden-size 2048 \
-       --load $CHECKPOINT_PATH \
-       --num-attention-heads 32 \
-       --max-position-embeddings 2048 \
-       --tokenizer-type GPT2BPETokenizer \
-       --fp16 \
-       --micro-batch-size 150 \
-       --seq-length 2048 \
-       --out-seq-length 1000 \
-       --temperature 1.0 \
-       --vocab-file $VOCAB_FILE \
-       --merge-file $MERGE_FILE \
-       --num-samples $1 \
-       --top_p 0.9 \
-       --max-tokens-to-oom 1200000 \
-       --genfile $save_dir/$SEED.out  \
-       --seed $SEED
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/evaluate_retriever_nq.sh b/toolbox/Megatron-DeepSpeed/examples/evaluate_retriever_nq.sh
deleted file mode 100644
index 16e937f4fd0204a4552d6ac7857b11ee69e63fc9..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/evaluate_retriever_nq.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-# Evaluate natural question test data given Wikipedia embeddings and pretrained
-# ICT model or a finetuned model for Natural Question task
-
-# Datasets can be downloaded from the following link:
-# https://github.com/facebookresearch/DPR/blob/master/data/download_data.py
-
-EVIDENCE_DATA_DIR=<Specify path of Wikipedia dataset>
-EMBEDDING_PATH=<Specify path of the embeddings>
-CHECKPOINT_PATH=<Specify path of pretrained ICT model or finetuned model>
-
-QA_FILE=<Path of the natural question dev or test dataset>
-
-python tasks/main.py \
-    --task RETRIEVER-EVAL \
-    --tokenizer-type BertWordPieceLowerCase \
-    --num-layers 12 \
-    --hidden-size 768 \
-    --num-attention-heads 12 \
-    --tensor-model-parallel-size 1 \
-    --micro-batch-size 128 \
-    --activations-checkpoint-method uniform \
-    --seq-length 512 \
-    --max-position-embeddings 512 \
-    --load ${CHECKPOINT_PATH} \
-    --evidence-data-path ${EVIDENCE_DATA_DIR} \
-    --embedding-path ${EMBEDDING_PATH} \
-    --retriever-seq-length 256 \
-    --vocab-file  bert-vocab.txt\
-    --qa-data-test ${QA_FILE} \
-    --faiss-use-gpu \
-    --retriever-report-topk-accuracies 1 5 20 100 \
-    --fp16 \
-    --indexer-log-interval 1000 \
-    --indexer-batch-size 128
-
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/evaluate_zeroshot_gpt.sh b/toolbox/Megatron-DeepSpeed/examples/evaluate_zeroshot_gpt.sh
deleted file mode 100644
index f8c38dc01d40daf9a32d6c90ee3afb683cb08536..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/evaluate_zeroshot_gpt.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-WORLD_SIZE=8
-
-DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
-                  --nnodes 1 \
-                  --node_rank 0 \
-                  --master_addr localhost \
-                  --master_port 6000"
-
-TASK="LAMBADA"
-
-VALID_DATA=<lambada path>
-VOCAB_FILE=gpt2-vocab.json
-MERGE_FILE=gpt2-merges.txt
-CHECKPOINT=checkpoints/gpt2_345m
-
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
-               --task $TASK \
-               --valid-data $VALID_DATA \
-               --tokenizer-type GPT2BPETokenizer \
-               --strict-lambada \
-               --vocab-file $VOCAB_FILE \
-               --merge-file $MERGE_FILE \
-               --load $CHECKPOINT \
-               --tensor-model-parallel-size 1 \
-               --num-layers 24 \
-               --hidden-size 1024 \
-               --num-attention-heads 16 \
-               --batch-size 8 \
-               --activations-checkpoint-method uniform \
-               --seq-length 1024 \
-               --max-position-embeddings 1024 \
-               --log-interval 10 \
-               --fp16 \
-               --no-load-optim \
-               --no-load-rng
diff --git a/toolbox/Megatron-DeepSpeed/examples/finetune_mnli_distributed.sh b/toolbox/Megatron-DeepSpeed/examples/finetune_mnli_distributed.sh
deleted file mode 100644
index 9219e595dd23f78140ea01ad7d3641da233863d0..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/finetune_mnli_distributed.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-WORLD_SIZE=8
-
-DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
-                  --nnodes 1 \
-                  --node_rank 0 \
-                  --master_addr localhost \
-                  --master_port 6000"
-
-TRAIN_DATA="data/glue_data/MNLI/train.tsv"
-VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
-            data/glue_data/MNLI/dev_mismatched.tsv"
-PRETRAINED_CHECKPOINT=checkpoints/bert_345m
-VOCAB_FILE=bert-vocab.txt
-CHECKPOINT_PATH=checkpoints/bert_345m_mnli
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
-               --task MNLI \
-               --seed 1234 \
-               --train-data $TRAIN_DATA \
-               --valid-data $VALID_DATA \
-               --tokenizer-type BertWordPieceLowerCase \
-               --vocab-file $VOCAB_FILE \
-               --epochs 5 \
-               --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
-               --tensor-model-parallel-size 1 \
-               --num-layers 24 \
-               --hidden-size 1024 \
-               --num-attention-heads 16 \
-               --micro-batch-size 8 \
-               --activations-checkpoint-method uniform \
-               --lr 5.0e-5 \
-               --lr-decay-style linear \
-               --lr-warmup-fraction 0.065 \
-               --seq-length 512 \
-               --max-position-embeddings 512 \
-               --save-interval 500000 \
-               --save $CHECKPOINT_PATH \
-               --log-interval 10 \
-               --eval-interval 100 \
-               --eval-iters 50 \
-               --weight-decay 1.0e-1 \
-               --fp16
diff --git a/toolbox/Megatron-DeepSpeed/examples/finetune_race_distributed.sh b/toolbox/Megatron-DeepSpeed/examples/finetune_race_distributed.sh
deleted file mode 100644
index e7f70a70abe090081804d317b5d127da03e0ef35..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/finetune_race_distributed.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/bin/bash
-
-WORLD_SIZE=8
-
-DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
-                  --nnodes 1 \
-                  --node_rank 0 \
-                  --master_addr localhost \
-                  --master_port 6000"
-
-TRAIN_DATA="data/RACE/train/middle"
-VALID_DATA="data/RACE/dev/middle \
-            data/RACE/dev/high"
-VOCAB_FILE=bert-vocab.txt
-PRETRAINED_CHECKPOINT=checkpoints/bert_345m
-CHECKPOINT_PATH=checkpoints/bert_345m_race
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
-               --task RACE \
-               --seed 1234 \
-               --train-data $TRAIN_DATA \
-               --valid-data $VALID_DATA \
-               --tokenizer-type BertWordPieceLowerCase \
-               --vocab-file $VOCAB_FILE \
-               --epochs 3 \
-               --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
-               --tensor-model-parallel-size 1 \
-               --num-layers 24 \
-               --hidden-size 1024 \
-               --num-attention-heads 16 \
-               --micro-batch-size 4 \
-               --activations-checkpoint-method uniform \
-               --lr 1.0e-5 \
-               --lr-decay-style linear \
-               --lr-warmup-fraction 0.06 \
-               --seq-length 512 \
-               --max-position-embeddings 512 \
-               --save-interval 100000 \
-               --save $CHECKPOINT_PATH \
-               --log-interval 10 \
-               --eval-interval 100 \
-               --eval-iters 50 \
-               --weight-decay 1.0e-1 \
-               --clip-grad 1.0 \
-               --hidden-dropout 0.1 \
-               --attention-dropout 0.1 \
-               --fp16
diff --git a/toolbox/Megatron-DeepSpeed/examples/finetune_retriever_distributed.sh b/toolbox/Megatron-DeepSpeed/examples/finetune_retriever_distributed.sh
deleted file mode 100644
index 535a2e053d4b8f8332423ec86f5ffba88648925f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/finetune_retriever_distributed.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-
-# Finetune a BERT or pretrained ICT model using Google natural question data 
-# Datasets can be downloaded from the following link:
-# https://github.com/facebookresearch/DPR/blob/master/data/download_data.py
-
-WORLD_SIZE=8
-
-DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
-                  --nnodes 1 \
-                  --node_rank 0 \
-                  --master_addr localhost \
-                  --master_port 6000"
-
-CHECKPOINT_PATH=<Specify path for the finetuned retriever model>
-
-# Load either of the below
-BERT_LOAD_PATH=<Path of BERT pretrained model>
-PRETRAINED_CHECKPOINT=<Path of Pretrained ICT model>
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
-        --task RET-FINETUNE-NQ \
-        --train-with-neg \
-        --train-hard-neg 1 \
-        --pretrained-checkpoint ${PRETRAINED_CHECKPOINT} \
-        --num-layers 12 \
-        --hidden-size 768 \
-        --num-attention-heads 12 \
-        --tensor-model-parallel-size 1 \
-        --tokenizer-type BertWordPieceLowerCase \
-        --train-data nq-train.json \
-        --valid-data nq-dev.json \
-        --save ${CHECKPOINT_PATH} \
-        --load ${CHECKPOINT_PATH} \
-        --vocab-file bert-vocab.txt \
-        --bert-load ${BERT_LOAD_PATH} \
-        --save-interval 5000 \
-        --log-interval 10 \
-        --eval-interval 20000 \
-        --eval-iters 100 \
-        --indexer-log-interval 1000 \
-        --faiss-use-gpu \
-        --DDP-impl torch \
-        --fp16 \
-        --retriever-report-topk-accuracies 1 5 10 20 100 \
-        --seq-length 512 \
-        --retriever-seq-length 256 \
-        --max-position-embeddings 512 \
-        --retriever-score-scaling \
-        --epochs 80 \
-        --micro-batch-size 8 \
-        --eval-micro-batch-size 16 \
-        --indexer-batch-size 128 \
-        --lr 2e-5 \
-        --lr-warmup-fraction 0.01 \
-        --weight-decay 1e-1
diff --git a/toolbox/Megatron-DeepSpeed/examples/llama2/run_ixte_llama2_34b_node4.sh b/toolbox/Megatron-DeepSpeed/examples/llama2/run_ixte_llama2_34b_node4.sh
deleted file mode 100644
index 9435566c8765166e937ccc31665461a43c3b593a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/llama2/run_ixte_llama2_34b_node4.sh
+++ /dev/null
@@ -1,179 +0,0 @@
-#!/bin/bash
-set -ex
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-export NCCL_NET=IB
-export NCCL_SOCKET_IFNAME="bond0"
-export NCCL_NET_SHARED_BUFFERS=0
-# export NCCL_DEBUG=INFO
-export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1
-
-# export NCCL_USE_HIGHPRIORITYWARP=1
-# export NCCL_FORCESYNC_DISABLE=1
-# export NCCL_USE_DIRECT=1
-# export OMP_NUM_THREADS=4
-# export UMD_CCLINLASTCE=1
-
-HOST_NAME="poweruser"
-
-ADDR_ARRAY=("10.113.2.49" "10.113.2.50" "10.113.2.45" "10.113.2.12")
-CONTAINER_NAME="llama2_34b_tr6"
-
-HOST_IP=$(ifconfig -a|grep inet|grep -v 127.0.0.1|grep -v inet6|awk '{print $2;}'|tr -d "addr:"|head -n 1)
-CURRENT_DIR=`pwd`
-CUR_SCR=$0
-
-PROJ_HOME=$(dirname $(dirname "$PWD"))
-
-DATA_PATH=${PROJ_HOME}/dataset/gpt_small_117M/gpt_small_117M_text_document
-TOKENIZER_PATH=./tokenizer/tokenizer.model
-
-# wa: clean dataset cache
-rm -rf ${DATA_PATH}/cache > /dev/null 2>&1
-CHECKPOINT_PATH=./checkpoints/llama2
-mkdir -p $CHECKPOINT_PATH
-
-DATE=`date +%y%m%d%H%M%S`
-LOG_PATH=./logs/$DATE
-mkdir -p $LOG_PATH
-
-GPUS_PER_NODE=16
-NODES=4
-
-TRANSFORMER_IMPL=transformer_engine
-
-TRAINING_ARGS="
-    --train-iters 250000 \
-    --eval-iters 10 \
-    --tensor-model-parallel-size 2 \
-    --pipeline-model-parallel-size 16\
-    --micro-batch-size 1 \
-    --global-batch-size 1024 \
-    --disable-bias-linear \
-    --use-distributed-optimizer \
-    --use-flash-attn \
-    --sequence-parallel \
-    --eval-interval 1000 \
-    --transformer-impl $TRANSFORMER_IMPL\
-    --use-distributed-optimizer \
-    --recompute-granularity full \
-    --recompute-method block \
-    --make-vocab-size-divisible-by 1 \
-    --recompute-num-layers 1 \
-    --recompute-method-per-stage 16 1 \
-    --recompute-num-layers-per-stage 2 1 14 0 \
-"
-    # --custom-recompute-layers-per-stage 2 2 1 0 0 0 0 0 \
-    # --no-gradient-accumulation-fusion \
-
-MIXED_PRECISION_ARGS="
-    --bf16 \
-    --initial-loss-scale 522893 \
-    --min-loss-scale 1.0 \
-    --attention-softmax-in-fp32 \
-    --no-query-key-layer-scaling
-"
-# --accumulate-allreduce-grads-in-fp32
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --data-impl mmap \
-    --tokenizer-type Llama2Tokenizer \
-    --tokenizer-model $TOKENIZER_PATH \
-    --split 98,2,0
-"
-
-NETWORK_ARGS="
-    --num-layers 48 \
-    --hidden-size 8192 \
-    --ffn-hidden-size 22016 \
-    --num-attention-heads 64 \
-    --group-query-attention \
-    --num-query-groups 8 \
-    --seq-length 4096 \
-    --max-position-embeddings 4096 \
-    --norm-epsilon 1e-5 \
-    --use-rotary-position-embeddings \
-    --untie-embeddings-and-output-weights \
-    --swiglu \
-    --normalization RMSNorm \
-    --no-masked-softmax-fusion
-"
-## group attntion parameters for megatron-lm
-## example llama2-70B
-# --num-attention-heads 64
-# --group-query-attention
-# --num-query-groups 8
-
-INITIALIZATION_ARGS="
-    --init-method-std 0.02 \
-    --seed 1234 
-"
-
-REGULARIZATION_ARGS="
-    --attention-dropout 0.0 \
-    --hidden-dropout 0.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --clip-grad 1.0
-"
-
-LEARNING_RATE_ARGS="
-    --lr 3.0e-4 \
-    --min-lr 3.0e-5 \
-    --lr-decay-style cosine \
-    --lr-warmup-iters 2000
-"
-
-CHECKPOINTING_ARGS="
-    --save-interval 10000 \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
-"
-
-LOGGING_ARGS="
-    --log-interval 1 \
-"
-
-megatron_args="$TRAINING_ARGS \
-              $MIXED_PRECISION_ARGS \
-              $DATA_ARGS \
-              $NETWORK_ARGS \
-              $INITIALIZATION_ARGS \
-              $REGULARIZATION_ARGS \
-              $LEARNING_RATE_ARGS \
-              $CHECKPOINTING_ARGS \
-              $LOGGING_ARGS"
-
-function exec_ssh_by_master
-{
-	# only at master host, start all other non master hosts run
-	if [[ "$HOST_IP" == "${ADDR_ARRAY[0]}" ]]
-	then
-		for i in "${!ADDR_ARRAY[@]}"
-		do
-			if [ "$i" != "0" ]
-			then
-				scp ${CUR_SCR} ${HOST_NAME}@${ADDR_ARRAY[$i]}:${CURRENT_DIR}
-                # scp -r ${DATA_PATH} ${HOST_NAME}@${ADDR_ARRAY[$i]}:${DATA_PATH}/../
-
-				ssh ${HOST_NAME}@${ADDR_ARRAY[$i]} "docker exec ${CONTAINER_NAME} bash -c \"cd ${CURRENT_DIR}; bash ${CUR_SCR} \"" &
-			fi
-		done
-	fi
-}
-function run_ddp_mm()
-{
-    for i in "${!ADDR_ARRAY[@]}"
-    do
-	    if [[ "$HOST_IP" == "${ADDR_ARRAY[$i]}" ]]
-	    then
-		    echo "nodes: ${#ADDR_ARRAY[@]}, rank: $i, IP: $HOST_IP, MASTER_IP: ${ADDR_ARRAY[0]}"
-		    DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NODES --node_rank $i --master_addr ${ADDR_ARRAY[0]} --master_port 52321"
-                    torchrun $DISTRIBUTED_ARGS $PROJ_HOME/pretrain_gpt_megatron.py \
-			    ${megatron_args} | tee ${LOG_PATH}/output.log 2>&1
-	    fi
-    done
-}
-exec_ssh_by_master
-run_ddp_mm
diff --git a/toolbox/Megatron-DeepSpeed/examples/llama2/run_load_weight_llama2_7b.sh b/toolbox/Megatron-DeepSpeed/examples/llama2/run_load_weight_llama2_7b.sh
deleted file mode 100644
index 72de01f477bb742ce4f793ece5113c6ddf2812f7..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/llama2/run_load_weight_llama2_7b.sh
+++ /dev/null
@@ -1,138 +0,0 @@
-#!/bin/bash
-
-# Please change the following envrioment variables
-# base on the cluster configuration
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-export NCCL_SOCKET_IFNAME=ens5f0
-# export NCCL_IB_DISABLE=0
-# export NCCL_IB_CUDA_SUPPORT=1
-# export NCCL_IB_GID_INDEX=0
-# export NCCL_IB_HCA=mlx5_0,mlx5_3
-# export NCCL_DEBUG=debug
-# export OMP_NUM_THREADS=4
-
-PROJ_HOME=$(dirname $(dirname "$PWD"))
-
-DATA_PATH=${PROJ_HOME}/dataset/gpt_small_117M/gpt_small_117M_text_document
-TOKENIZER_PATH=${PROJ_HOME}/checkpoints/output_step1_llama2_7b_vocab_size_32000/tokenizer.model
-LOAD_CHECKPOINT_PATH=${PROJ_HOME}/checkpoints/llama2_7b_megatron
-
-SAVE_CHECKPOINT_PATH=./checkpoints/llama2
-mkdir -p $SAVE_CHECKPOINT_PATH
-
-DATE=`date +%y%m%d%H%M%S`
-LOG_PATH=./logs/$DATE
-mkdir -p $LOG_PATH
-
-GPUS_PER_NODE=16
-MASTER_ADDR=localhost
-MASTER_PORT=8080
-NNODES=1
-NODE_RANK=0
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-TRAINING_ARGS="
-    --train-iters 250000 \
-    --eval-iters 10 \
-    --tensor-model-parallel-size 4 \
-    --pipeline-model-parallel-size 4 \
-    --micro-batch-size 1 \
-    --global-batch-size 32 \
-    --disable-bias-linear \
-    --use-flash-attn
-    --eval-interval 1000 \
-"
-    # --use-distributed-optimizer \
-
-MIXED_PRECISION_ARGS="
-    --bf16 \
-    --initial-loss-scale 522893 \
-    --min-loss-scale 1.0 \
-    --attention-softmax-in-fp32 \
-    --no-query-key-layer-scaling
-"
-# --accumulate-allreduce-grads-in-fp32
-
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --tokenizer-type Llama2Tokenizer \
-    --tokenizer-model $TOKENIZER_PATH \
-    --split 949,50,1
-"
-
-NETWORK_ARGS="
-    --num-layers 32 \
-    --hidden-size 4096 \
-    --num-attention-heads 32 \
-    --seq-length 4096 \
-    --max-position-embeddings 4096 \
-    --norm-epsilon 1e-5 \
-    --use-rotary-position-embeddings \
-    --no-position-embedding \
-    --swiglu \
-    --normalization RMSNorm \
-    --untie-embeddings-and-output-weights \
-    --load $LOAD_CHECKPOINT_PATH \
-    --exit-on-missing-checkpoint \
-    --use-checkpoint-args \
-    --no-load-optim \
-    --no-load-rng \
-    --no-masked-softmax-fusion \
-"
-## group attntion parameters for megatron-lm
-## example llama2-70B
-# --num-attention-heads 64
-# --group-query-attention
-# --num-query-groups 8
-
-INITIALIZATION_ARGS="
-    --init-method-std 0.02 \
-    --seed 1234 
-"
-
-REGULARIZATION_ARGS="
-    --attention-dropout 0.0 \
-    --hidden-dropout 0.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --clip-grad 1.0
-"
-
-LEARNING_RATE_ARGS="
-    --lr 3.0e-4 \
-    --min-lr 3.0e-5 \
-    --lr-decay-style cosine \
-    --lr-warmup-iters 2000
-"
-
-CHECKPOINTING_ARGS="
-    --save-interval 10000 \
-    --save $SAVE_CHECKPOINT_PATH \
-"
-
-LOGGING_ARGS="
-    --log-interval 1 \
-"
-
-cmd="torchrun $DISTRIBUTED_ARGS $PROJ_HOME/pretrain_gpt_megatron.py \
-              $TRAINING_ARGS \
-              $MIXED_PRECISION_ARGS \
-              $DATA_ARGS \
-              $NETWORK_ARGS \
-              $INITIALIZATION_ARGS \
-              $REGULARIZATION_ARGS \
-              $LEARNING_RATE_ARGS \
-              $CHECKPOINTING_ARGS \
-              $LOGGING_ARGS | tee ${LOG_PATH}/output.log 2>&1
-    "
-echo $cmd
-eval $cmd
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples/llama2/run_load_weight_tinyllama_1.1b.sh b/toolbox/Megatron-DeepSpeed/examples/llama2/run_load_weight_tinyllama_1.1b.sh
deleted file mode 100644
index 7c13f2ab7f9250baedfb7ac81d8886857d224016..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/llama2/run_load_weight_tinyllama_1.1b.sh
+++ /dev/null
@@ -1,141 +0,0 @@
-#!/bin/bash
-
-# Please change the following envrioment variables
-# base on the cluster configuration
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-export NCCL_SOCKET_IFNAME=eth0
-# export NCCL_IB_DISABLE=0
-# export NCCL_IB_CUDA_SUPPORT=1
-# export NCCL_IB_GID_INDEX=0
-# export NCCL_IB_HCA=mlx5_0,mlx5_3
-# export NCCL_DEBUG=debug
-# export OMP_NUM_THREADS=4
-
-PROJ_HOME=$(dirname $(dirname "$PWD"))
-
-DATA_PATH=${PROJ_HOME}/dataset/gpt_small_117M/gpt_small_117M_text_document
-TOKENIZER_PATH=${PROJ_HOME}/checkpoints/output_tinyLlama-1.1B-intermediate-step-240k-503b/tokenizer.model
-LOAD_CHECKPOINT_PATH=${PROJ_HOME}/checkpoints/rlhf_tinyllama_1.1b_tp4_pp4
-
-SAVE_CHECKPOINT_PATH=./checkpoints/llama2
-mkdir -p $SAVE_CHECKPOINT_PATH
-
-DATE=`date +%y%m%d%H%M%S`
-LOG_PATH=./logs/$DATE
-mkdir -p $LOG_PATH
-
-GPUS_PER_NODE=16
-MASTER_ADDR=localhost
-MASTER_PORT=8080
-NNODES=1
-NODE_RANK=0
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-TRAINING_ARGS="
-    --train-iters 250000 \
-    --eval-iters 10 \
-    --tensor-model-parallel-size 4 \
-    --pipeline-model-parallel-size 4 \
-    --custom-partition 5 5 6 6 \
-    --micro-batch-size 1 \
-    --global-batch-size 32 \
-    --disable-bias-linear \
-    --use-flash-attn
-    --eval-interval 1000 \
-"
-    # --use-distributed-optimizer \
-
-MIXED_PRECISION_ARGS="
-    --bf16 \
-    --initial-loss-scale 522893 \
-    --min-loss-scale 1.0 \
-    --attention-softmax-in-fp32 \
-    --no-query-key-layer-scaling
-"
-# --accumulate-allreduce-grads-in-fp32
-
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --tokenizer-type Llama2Tokenizer \
-    --tokenizer-model $TOKENIZER_PATH \
-    --split 949,50,1
-"
-
-NETWORK_ARGS="
-    --num-layers 22 \
-    --hidden-size 2048 \
-    --num-attention-heads 32 \
-    --seq-length 2048 \
-    --max-position-embeddings 2048 \
-    --norm-epsilon 1e-5 \
-    --use-rotary-position-embeddings \
-    --no-position-embedding \
-    --swiglu \
-    --normalization RMSNorm \
-    --untie-embeddings-and-output-weights \
-    --load $LOAD_CHECKPOINT_PATH \
-    --exit-on-missing-checkpoint \
-    --use-checkpoint-args \
-    --no-load-optim \
-    --no-load-rng \
-    --no-masked-softmax-fusion \
-    --group-query-attention \
-    --num-query-groups 4
-"
-## group attntion parameters for megatron-lm
-## example llama2-70B
-# --num-attention-heads 64
-# --group-query-attention
-# --num-query-groups 8
-
-INITIALIZATION_ARGS="
-    --init-method-std 0.02 \
-    --seed 1234 
-"
-
-REGULARIZATION_ARGS="
-    --attention-dropout 0.0 \
-    --hidden-dropout 0.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --clip-grad 1.0
-"
-
-LEARNING_RATE_ARGS="
-    --lr 3.0e-4 \
-    --min-lr 3.0e-5 \
-    --lr-decay-style cosine \
-    --lr-warmup-iters 2000
-"
-
-CHECKPOINTING_ARGS="
-    --save-interval 10000 \
-    --save $SAVE_CHECKPOINT_PATH \
-"
-
-LOGGING_ARGS="
-    --log-interval 1 \
-"
-
-cmd="torchrun $DISTRIBUTED_ARGS $PROJ_HOME/pretrain_gpt_megatron.py \
-              $TRAINING_ARGS \
-              $MIXED_PRECISION_ARGS \
-              $DATA_ARGS \
-              $NETWORK_ARGS \
-              $INITIALIZATION_ARGS \
-              $REGULARIZATION_ARGS \
-              $LEARNING_RATE_ARGS \
-              $CHECKPOINTING_ARGS \
-              $LOGGING_ARGS | tee ${LOG_PATH}/output.log 2>&1
-    "
-echo $cmd
-eval $cmd
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_34b_node4.sh b/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_34b_node4.sh
deleted file mode 100644
index 22736d99001e330a9bb7a830d38ce9561375262b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_34b_node4.sh
+++ /dev/null
@@ -1,181 +0,0 @@
-#!/bin/bash
-set -ex
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-export NCCL_NET=IB
-export NCCL_SOCKET_IFNAME="bond0"
-export NCCL_NET_SHARED_BUFFERS=0
-# export NCCL_DEBUG=INFO
-export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1
-
-
-## torch tp overlap
-# export ENABLE_TORCH_TP_OVERLAP=1
-# export TORCH_TP_OVERLAP_SIZE=4
-# export NCCL_USE_HIGHPRIORITYWARP=1
-# export NCCL_FORCESYNC_DISABLE=1
-# export NCCL_USE_DIRECT=1
-# export OMP_NUM_THREADS=4
-# export UMD_CCLINLASTCE=1
-
-HOST_NAME="jun.zhao"
-
-ADDR_ARRAY=("10.113.2.10" "10.113.2.9" "10.113.2.11" "10.113.2.12")
-CONTAINER_NAME="llama_0323"
-
-HOST_IP=$(echo $(hostname -I) | cut -d " " --output-delimiter="," -f 1)
-CURRENT_DIR=`pwd`
-CUR_SCR=$0
-
-PROJ_HOME=$(dirname $(dirname "$PWD"))
-
-DATA_PATH=${PROJ_HOME}/dataset/gpt_small_117M/gpt_small_117M_text_document
-TOKENIZER_PATH=./tokenizer/tokenizer.model
-
-CHECKPOINT_PATH=./checkpoints/llama2
-mkdir -p $CHECKPOINT_PATH
-
-DATE=`date +%y%m%d%H%M%S`
-LOG_PATH=./logs/$DATE
-mkdir -p $LOG_PATH
-
-GPUS_PER_NODE=16
-NODES=4
-
-TRANSFORMER_IMPL=transformer_engine
-
-TRAINING_ARGS="
-    --train-iters 250000 \
-    --eval-iters 10 \
-    --tensor-model-parallel-size 2 \
-    --pipeline-model-parallel-size 8\
-    --micro-batch-size 1 \
-    --global-batch-size 256 \
-    --disable-bias-linear \
-    --use-distributed-optimizer \
-    --use-flash-attn \
-    --sequence-parallel \
-    --eval-interval 1000 \
-    --transformer-impl $TRANSFORMER_IMPL\
-    --use-distributed-optimizer \
-    --recompute-granularity full \
-    --recompute-method block \
-    --make-vocab-size-divisible-by 1 \
-    --recompute-num-layers 1 \
-    --recompute-method-per-stage 8 1 \
-    --recompute-num-layers-per-stage 1 4 1 3 2 2 4 0 \
-"
-    # --custom-recompute-layers-per-stage 2 2 1 0 0 0 0 0 \
-    # --no-gradient-accumulation-fusion \
-
-MIXED_PRECISION_ARGS="
-    --bf16 \
-    --initial-loss-scale 522893 \
-    --min-loss-scale 1.0 \
-    --attention-softmax-in-fp32 \
-    --no-query-key-layer-scaling
-"
-# --accumulate-allreduce-grads-in-fp32
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --data-impl mmap \
-    --tokenizer-type Llama2Tokenizer \
-    --tokenizer-model $TOKENIZER_PATH \
-    --split 98,2,0
-"
-
-NETWORK_ARGS="
-    --num-layers 48 \
-    --hidden-size 8192 \
-    --ffn-hidden-size 22016 \
-    --num-attention-heads 64 \
-    --group-query-attention \
-    --num-query-groups 8 \
-    --seq-length 4096 \
-    --max-position-embeddings 4096 \
-    --norm-epsilon 1e-5 \
-    --use-rotary-position-embeddings \
-    --untie-embeddings-and-output-weights \
-    --swiglu \
-    --normalization RMSNorm \
-    --no-masked-softmax-fusion
-"
-## group attntion parameters for megatron-lm
-## example llama2-70B
-# --num-attention-heads 64
-# --group-query-attention
-# --num-query-groups 8
-
-INITIALIZATION_ARGS="
-    --init-method-std 0.02 \
-    --seed 1234 
-"
-
-REGULARIZATION_ARGS="
-    --attention-dropout 0.0 \
-    --hidden-dropout 0.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --clip-grad 1.0
-"
-
-LEARNING_RATE_ARGS="
-    --lr 3.0e-4 \
-    --min-lr 3.0e-5 \
-    --lr-decay-style cosine \
-    --lr-warmup-iters 2000
-"
-
-CHECKPOINTING_ARGS="
-    --save-interval 10000 \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
-"
-
-LOGGING_ARGS="
-    --log-interval 1 \
-"
-
-megatron_args="$TRAINING_ARGS \
-              $MIXED_PRECISION_ARGS \
-              $DATA_ARGS \
-              $NETWORK_ARGS \
-              $INITIALIZATION_ARGS \
-              $REGULARIZATION_ARGS \
-              $LEARNING_RATE_ARGS \
-              $CHECKPOINTING_ARGS \
-              $LOGGING_ARGS"
-
-function exec_ssh_by_master
-{
-	# only at master host, start all other non master hosts run
-	if [[ "$HOST_IP" == "${ADDR_ARRAY[0]}" ]]
-	then
-		for i in "${!ADDR_ARRAY[@]}"
-		do
-			if [ "$i" != "0" ]
-			then
-				scp ${CUR_SCR} ${HOST_NAME}@${ADDR_ARRAY[$i]}:${CURRENT_DIR}
-                # scp -r ${DATA_PATH} ${HOST_NAME}@${ADDR_ARRAY[$i]}:${DATA_PATH}/../
-
-				ssh ${HOST_NAME}@${ADDR_ARRAY[$i]} "docker exec ${CONTAINER_NAME} bash -c \"cd ${CURRENT_DIR}; bash ${CUR_SCR} \"" &
-			fi
-		done
-	fi
-}
-function run_ddp_mm()
-{
-    for i in "${!ADDR_ARRAY[@]}"
-    do
-	    if [[ "$HOST_IP" == "${ADDR_ARRAY[$i]}" ]]
-	    then
-		    echo "nodes: ${#ADDR_ARRAY[@]}, rank: $i, IP: $HOST_IP, MASTER_IP: ${ADDR_ARRAY[0]}"
-		    DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NODES --node_rank $i --master_addr ${ADDR_ARRAY[0]} --master_port 54321"
-                    torchrun $DISTRIBUTED_ARGS $PROJ_HOME/pretrain_gpt_megatron.py \
-			    ${megatron_args} | tee ${LOG_PATH}/output.log 2>&1
-	    fi
-    done
-}
-exec_ssh_by_master
-run_ddp_mm
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_34b_tpoverlap_profiling_node1.sh b/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_34b_tpoverlap_profiling_node1.sh
deleted file mode 100644
index 5430b4a3031028785c4a1bb20cc9e8d4d5d6a1c1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_34b_tpoverlap_profiling_node1.sh
+++ /dev/null
@@ -1,165 +0,0 @@
-#!/bin/bash
-
-# Please change the following envrioment variables
-# base on the cluster configuration
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-# export NCCL_SOCKET_IFNAME=eth0
-# export NCCL_IB_DISABLE=0
-# export NCCL_IB_CUDA_SUPPORT=1
-# export NCCL_IB_GID_INDEX=0
-# export NCCL_IB_HCA=mlx5_0,mlx5_3
-# export NCCL_DEBUG=debug
-# export OMP_NUM_THREADS=4
-export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1
-
-PROJ_HOME=$(dirname $(dirname "$PWD"))
-
-DATA_PATH=${PROJ_HOME}/dataset/gpt_small_117M/gpt_small_117M_text_document
-TOKENIZER_PATH=./tokenizer/tokenizer.model
-
-CHECKPOINT_PATH=./checkpoints/llama2
-mkdir -p $CHECKPOINT_PATH
-
-DATE=`date +%y%m%d%H%M%S`
-LOG_PATH=./logs/$DATE
-mkdir -p $LOG_PATH
-
-GPUS_PER_NODE=16
-MASTER_ADDR=localhost
-MASTER_PORT=8080
-NNODES=1
-NODE_RANK=0
-
-TRANSFORMER_IMPL=transformer_engine
-
-export ENABLE_TORCH_TP_OVERLAP=1
-export TORCH_TP_OVERLAP_SIZE=4
-export NCCL_USE_HIGHPRIORITYWARP=1
-
-export NCCL_FORCESYNC_DISABLE=1
-export NCCL_USE_DIRECT=1
-export OMP_NUM_THREADS=4
-export UMD_CCLINLASTCE=1
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-TRAINING_ARGS="
-    --train-iters 250000 \
-    --eval-iters 10 \
-    --tensor-model-parallel-size 4 \
-    --pipeline-model-parallel-size 4 \
-    --sequence-parallel \
-    --micro-batch-size 1 \
-    --global-batch-size 32 \
-    --disable-bias-linear \
-    --use-flash-attn \
-    --eval-interval 1000 \
-    --transformer-impl $TRANSFORMER_IMPL\
-    --use-distributed-optimizer \
-    --no-gradient-accumulation-fusion \
-"
-## 自定义recompute layers pp stage
-    # --recompute-granularity full \
-    # --recompute-method block \
-    # --custom-recompute-layers-per-stage 3 1 0 0 \
-    # --no-gradient-accumulation-fusion \
-
-
-## 自定义切分pp stage，仅针对transformer layers
-    # --custom-partition 3 3 4 4 4 4 5 5 \
-
-# --use-distributed-optimizer \
-
-MIXED_PRECISION_ARGS="
-    --bf16 \
-    --initial-loss-scale 522893 \
-    --min-loss-scale 1.0 \
-    --attention-softmax-in-fp32 \
-    --no-query-key-layer-scaling
-"
-# --accumulate-allreduce-grads-in-fp32
-
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --tokenizer-type Llama2Tokenizer \
-    --tokenizer-model $TOKENIZER_PATH \
-    --split 949,50,1
-"
-
-## 模型原参数：num-layers=48
-NETWORK_ARGS="
-    --num-layers 16 \
-    --hidden-size 8192 \
-    --num-attention-heads 64 \
-    --seq-length 4096 \
-    --ffn-hidden-size 22016 \
-    --num-query-groups 8 \
-    --group-query-attention \
-    --max-position-embeddings 4096 \
-    --norm-epsilon 1e-5 \
-    --use-rotary-position-embeddings \
-    --no-position-embedding \
-    --swiglu \
-    --normalization RMSNorm \
-    --untie-embeddings-and-output-weights
-"
-## group attntion parameters for megatron-lm
-## example llama2-70B
-# --num-attention-heads 64
-# --group-query-attention
-# --num-query-groups 8
-
-INITIALIZATION_ARGS="
-    --init-method-std 0.02 \
-    --seed 1234 
-"
-
-REGULARIZATION_ARGS="
-    --attention-dropout 0.0 \
-    --hidden-dropout 0.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --clip-grad 1.0
-"
-
-LEARNING_RATE_ARGS="
-    --lr 3.0e-4 \
-    --min-lr 3.0e-5 \
-    --lr-decay-style cosine \
-    --lr-warmup-iters 2000
-"
-
-CHECKPOINTING_ARGS="
-    --save-interval 10000 \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
-"
-
-LOGGING_ARGS="
-    --log-interval 1 \
-"
-    # --wandb-save-dir $WB_PATH \
-    # --tensorboard-dir $TB_PATH \
-    # --tensorboard-log-interval 1 
-
-cmd="torchrun $DISTRIBUTED_ARGS $PROJ_HOME/pretrain_gpt_megatron.py \
-              $TRAINING_ARGS \
-              $MIXED_PRECISION_ARGS \
-              $DATA_ARGS \
-              $NETWORK_ARGS \
-              $INITIALIZATION_ARGS \
-              $REGULARIZATION_ARGS \
-              $LEARNING_RATE_ARGS \
-              $CHECKPOINTING_ARGS \
-              $LOGGING_ARGS | tee ${LOG_PATH}/output.log 2>&1
-    "
-echo $cmd
-eval $cmd
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_70b_node4.sh b/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_70b_node4.sh
deleted file mode 100644
index ac69a8ada95db4b89ec32f9a963413d088357d05..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_70b_node4.sh
+++ /dev/null
@@ -1,182 +0,0 @@
-#!/bin/bash
-set -ex
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-export NCCL_NET=IB
-export NCCL_SOCKET_IFNAME="bond0"
-export NCCL_NET_SHARED_BUFFERS=0
-# export NCCL_DEBUG=INFO
-export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1
-
-## torch tp overlap
-# export ENABLE_TORCH_TP_OVERLAP=1
-# export TORCH_TP_OVERLAP_SIZE=4
-# export NCCL_USE_HIGHPRIORITYWARP=1
-# export NCCL_FORCESYNC_DISABLE=1
-# export NCCL_USE_DIRECT=1
-# export OMP_NUM_THREADS=4
-# export UMD_CCLINLASTCE=1
-
-HOST_NAME="jun.zhao"
-
-ADDR_ARRAY=("10.113.2.10" "10.113.2.9" "10.113.2.11" "10.113.2.12")
-CONTAINER_NAME="llama_0323"
-
-HOST_IP=$(echo $(hostname -I) | cut -d " " --output-delimiter="," -f 1)
-CURRENT_DIR=`pwd`
-CUR_SCR=$0
-
-PROJ_HOME=$(dirname $(dirname "$PWD"))
-
-DATA_PATH=${PROJ_HOME}/dataset/gpt_small_117M/gpt_small_117M_text_document
-TOKENIZER_PATH=./tokenizer/tokenizer.model
-
-CHECKPOINT_PATH=./checkpoints/llama2
-mkdir -p $CHECKPOINT_PATH
-
-DATE=`date +%y%m%d%H%M%S`
-LOG_PATH=./logs/$DATE
-mkdir -p $LOG_PATH
-
-GPUS_PER_NODE=16
-NODES=4
-
-TRANSFORMER_IMPL=transformer_engine
-
-TRAINING_ARGS="
-    --train-iters 250000 \
-    --eval-iters 10 \
-    --tensor-model-parallel-size 4 \
-    --pipeline-model-parallel-size 16 \
-    --micro-batch-size 1 \
-    --global-batch-size 256 \
-    --disable-bias-linear \
-    --use-distributed-optimizer \
-    --use-flash-attn \
-    --sequence-parallel \
-    --eval-interval 1000 \
-    --transformer-impl $TRANSFORMER_IMPL\
-    --use-distributed-optimizer \
-    --recompute-granularity full \
-    --recompute-method block \
-    --make-vocab-size-divisible-by 1 \
-    --recompute-num-layers 1 \
-    --recompute-method-per-stage 16 1 \
-    --recompute-num-layers-per-stage 1 4 2 3 3 2 2 1 8 0 \
-    "
-
-    # --custom-recompute-layers-per-stage 5 5 5 5 5 5 5 5 4 4 4 4 3 2 2 0 \ \
-    # --no-gradient-accumulation-fusion \
-    # --recompute-num-layers 10 \
-
-MIXED_PRECISION_ARGS="
-    --bf16 \
-    --initial-loss-scale 522893 \
-    --min-loss-scale 1.0 \
-    --attention-softmax-in-fp32 \
-    --no-query-key-layer-scaling
-"
-# --accumulate-allreduce-grads-in-fp32
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --data-impl mmap \
-    --tokenizer-type Llama2Tokenizer \
-    --tokenizer-model $TOKENIZER_PATH \
-    --split 98,2,0
-"
-
-NETWORK_ARGS="
-    --num-layers 80 \
-    --hidden-size 8192 \
-    --ffn-hidden-size 28672 \
-    --num-attention-heads 64 \
-    --group-query-attention \
-    --num-query-groups 8 \
-    --seq-length 4096 \
-    --max-position-embeddings 4096 \
-    --norm-epsilon 1e-5 \
-    --use-rotary-position-embeddings \
-    --untie-embeddings-and-output-weights \
-    --swiglu \
-    --normalization RMSNorm \
-    --no-masked-softmax-fusion
-"
-## group attntion parameters for megatron-lm
-## example llama2-70B
-# --num-attention-heads 64
-# --group-query-attention
-# --num-query-groups 8
-
-INITIALIZATION_ARGS="
-    --init-method-std 0.02 \
-    --seed 1234 
-"
-
-REGULARIZATION_ARGS="
-    --attention-dropout 0.0 \
-    --hidden-dropout 0.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --clip-grad 1.0
-"
-
-LEARNING_RATE_ARGS="
-    --lr 3.0e-4 \
-    --min-lr 3.0e-5 \
-    --lr-decay-style cosine \
-    --lr-warmup-iters 2000
-"
-
-CHECKPOINTING_ARGS="
-    --save-interval 10000 \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
-"
-
-LOGGING_ARGS="
-    --log-interval 1 \
-"
-
-megatron_args="$TRAINING_ARGS \
-              $MIXED_PRECISION_ARGS \
-              $DATA_ARGS \
-              $NETWORK_ARGS \
-              $INITIALIZATION_ARGS \
-              $REGULARIZATION_ARGS \
-              $LEARNING_RATE_ARGS \
-              $CHECKPOINTING_ARGS \
-              $LOGGING_ARGS"
-
-function exec_ssh_by_master
-{
-	# only at master host, start all other non master hosts run
-	if [[ "$HOST_IP" == "${ADDR_ARRAY[0]}" ]]
-	then
-		for i in "${!ADDR_ARRAY[@]}"
-		do
-			if [ "$i" != "0" ]
-			then
-				scp ${CUR_SCR} ${HOST_NAME}@${ADDR_ARRAY[$i]}:${CURRENT_DIR}
-                # scp -r ${DATA_PATH} ${HOST_NAME}@${ADDR_ARRAY[$i]}:${DATA_PATH}/../
-
-				ssh ${HOST_NAME}@${ADDR_ARRAY[$i]} "docker exec ${CONTAINER_NAME} bash -c \"cd ${CURRENT_DIR}; bash ${CUR_SCR} \"" &
-			fi
-		done
-	fi
-}
-function run_ddp_mm()
-{
-    for i in "${!ADDR_ARRAY[@]}"
-    do
-	    if [[ "$HOST_IP" == "${ADDR_ARRAY[$i]}" ]]
-	    then
-		    echo "nodes: ${#ADDR_ARRAY[@]}, rank: $i, IP: $HOST_IP, MASTER_IP: ${ADDR_ARRAY[0]}"
-		    DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NODES --node_rank $i --master_addr ${ADDR_ARRAY[0]} --master_port 54321"
-                    torchrun $DISTRIBUTED_ARGS $PROJ_HOME/pretrain_gpt_megatron.py \
-			    ${megatron_args} | tee ${LOG_PATH}/output.log 2>&1
-	    fi
-    done
-}
-exec_ssh_by_master
-run_ddp_mm
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_70b_tpoverlap_profiling_node1.sh b/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_70b_tpoverlap_profiling_node1.sh
deleted file mode 100644
index 695a6fdcf5d4e65268d9d9c6569f15d2b78b780d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_70b_tpoverlap_profiling_node1.sh
+++ /dev/null
@@ -1,162 +0,0 @@
-#!/bin/bash
-
-# Please change the following envrioment variables
-# base on the cluster configuration
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-# export NCCL_SOCKET_IFNAME=eth0
-# export NCCL_IB_DISABLE=0
-# export NCCL_IB_CUDA_SUPPORT=1
-# export NCCL_IB_GID_INDEX=0
-# export NCCL_IB_HCA=mlx5_0,mlx5_3
-# export NCCL_DEBUG=debug
-# export OMP_NUM_THREADS=4
-
-PROJ_HOME=$(dirname $(dirname "$PWD"))
-
-DATA_PATH=${PROJ_HOME}/dataset/gpt_small_117M/gpt_small_117M_text_document
-TOKENIZER_PATH=./tokenizer/tokenizer.model
-
-CHECKPOINT_PATH=./checkpoints/llama2
-mkdir -p $CHECKPOINT_PATH
-
-DATE=`date +%y%m%d%H%M%S`
-LOG_PATH=./logs/$DATE
-mkdir -p $LOG_PATH
-
-GPUS_PER_NODE=16
-MASTER_ADDR=localhost
-MASTER_PORT=8080
-NNODES=1
-NODE_RANK=0
-
-TRANSFORMER_IMPL=transformer_engine
-# export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1
-
-export ENABLE_TORCH_TP_OVERLAP=1
-export TORCH_TP_OVERLAP_SIZE=4
-export NCCL_USE_HIGHPRIORITYWARP=1
-
-export NCCL_FORCESYNC_DISABLE=1
-export NCCL_USE_DIRECT=1
-export OMP_NUM_THREADS=4
-export UMD_CCLINLASTCE=1
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-TRAINING_ARGS="
-    --train-iters 250000 \
-    --eval-iters 10 \
-    --tensor-model-parallel-size 4 \
-    --pipeline-model-parallel-size 4 \
-    --sequence-parallel \
-    --micro-batch-size 1 \
-    --global-batch-size 32 \
-    --disable-bias-linear \
-    --use-flash-attn \
-    --eval-interval 1000 \
-    --transformer-impl $TRANSFORMER_IMPL\
-    --use-distributed-optimizer \
-    --no-gradient-accumulation-fusion \
-"
-## 自定义recompute layers pp stage
-    # --recompute-granularity full \
-    # --recompute-method block \
-    # --custom-recompute-layers-per-stage 3 1 0 0 \
-
-## 自定义切分pp stage，仅针对transformer layers
-    # --custom-partition 3 3 4 4 4 4 5 5 \
-
-# --use-distributed-optimizer \
-
-MIXED_PRECISION_ARGS="
-    --bf16 \
-    --initial-loss-scale 522893 \
-    --min-loss-scale 1.0 \
-    --attention-softmax-in-fp32 \
-    --no-query-key-layer-scaling
-"
-# --accumulate-allreduce-grads-in-fp32
-
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --tokenizer-type Llama2Tokenizer \
-    --tokenizer-model $TOKENIZER_PATH \
-    --split 949,50,1
-"
-## 模型原参数：num-layers=80
-NETWORK_ARGS="
-    --num-layers 16 \
-    --hidden-size 8192 \
-    --num-attention-heads 64 \
-    --seq-length 4096 \
-    --ffn-hidden-size 28672 \
-    --num-query-groups 8 \
-    --group-query-attention \
-    --max-position-embeddings 4096 \
-    --norm-epsilon 1e-5 \
-    --use-rotary-position-embeddings \
-    --no-position-embedding \
-    --swiglu \
-    --normalization RMSNorm \
-    --untie-embeddings-and-output-weights
-"
-## group attntion parameters for megatron-lm
-## example llama2-70B
-# --num-attention-heads 64
-# --group-query-attention
-# --num-query-groups 8
-
-INITIALIZATION_ARGS="
-    --init-method-std 0.02 \
-    --seed 1234 
-"
-
-REGULARIZATION_ARGS="
-    --attention-dropout 0.0 \
-    --hidden-dropout 0.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --clip-grad 1.0
-"
-
-LEARNING_RATE_ARGS="
-    --lr 3.0e-4 \
-    --min-lr 3.0e-5 \
-    --lr-decay-style cosine \
-    --lr-warmup-iters 2000
-"
-
-CHECKPOINTING_ARGS="
-    --save-interval 10000 \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
-"
-
-LOGGING_ARGS="
-    --log-interval 1 \
-"
-    # --wandb-save-dir $WB_PATH \
-    # --tensorboard-dir $TB_PATH \
-    # --tensorboard-log-interval 1 
-
-cmd="torchrun $DISTRIBUTED_ARGS $PROJ_HOME/pretrain_gpt_megatron.py \
-              $TRAINING_ARGS \
-              $MIXED_PRECISION_ARGS \
-              $DATA_ARGS \
-              $NETWORK_ARGS \
-              $INITIALIZATION_ARGS \
-              $REGULARIZATION_ARGS \
-              $LEARNING_RATE_ARGS \
-              $CHECKPOINTING_ARGS \
-              $LOGGING_ARGS | tee ${LOG_PATH}/output.log 2>&1
-    "
-echo $cmd
-eval $cmd
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_7b_node1.sh b/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_7b_node1.sh
deleted file mode 100644
index 13473b09dc990f979c15b57ff622d22de908a064..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_7b_node1.sh
+++ /dev/null
@@ -1,152 +0,0 @@
-#!/bin/bash
-
-# Please change the following envrioment variables base on the cluster configuration
-export OMP_NUM_THREADS=4
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-export NCCL_SOCKET_IFNAME=bond0
-# export NCCL_USE_DIRECT=1
-
-export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1
-
-PROJ_HOME=$(dirname $(dirname "$PWD"))
-
-DATA_PATH=${PROJ_HOME}/dataset/gpt_small_117M/gpt_small_117M_text_document
-TOKENIZER_PATH=./tokenizer/tokenizer.model
-
-CHECKPOINT_PATH=./checkpoints/llama2
-mkdir -p $CHECKPOINT_PATH
-
-DATE=`date +%y%m%d%H%M%S`
-LOG_PATH=./logs/$DATE
-mkdir -p $LOG_PATH
-
-GPUS_PER_NODE=16
-MASTER_ADDR=localhost
-MASTER_PORT=8080
-NNODES=1
-NODE_RANK=0
-
-TRANSFORMER_IMPL=transformer_engine
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-TRAINING_ARGS="
-    --train-iters 250000 \
-    --eval-iters 10 \
-    --tensor-model-parallel-size 1 \
-    --pipeline-model-parallel-size 4 \
-    --micro-batch-size 1 \
-    --global-batch-size 1024 \
-    --disable-bias-linear \
-    --use-flash-attn \
-    --eval-interval 1000 \
-    --transformer-impl $TRANSFORMER_IMPL \
-    --no-fp8-wgrad \
-    --use-distributed-optimizer \
-    --recompute-granularity full \
-    --recompute-method block \
-    --recompute-num-layers 1 \
-    --recompute-method-per-stage 4 1 \
-    --recompute-num-layers-per-stage 1 1 3 0 \
-"
-## 自定义recompute layers pp stage
-    # --recompute-granularity full \
-    # --recompute-method block \
-    # --custom-recompute-layers-per-stage 2 0 0 0 \
-
-## 自定义切分pp stage，仅针对transformer layers
-    # --custom-partition 3 3 4 4 4 4 5 5 \
-
-# --use-distributed-optimizer \
-# --overlap-grad-reduce \
-
-
-MIXED_PRECISION_ARGS="
-    --bf16 \
-    --initial-loss-scale 522893 \
-    --min-loss-scale 1.0 \
-    --attention-softmax-in-fp32 \
-    --no-query-key-layer-scaling
-"
-
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --tokenizer-type Llama2Tokenizer \
-    --tokenizer-model $TOKENIZER_PATH \
-    --split 949,50,1
-"
-
-NETWORK_ARGS="
-    --num-layers 32 \
-    --hidden-size 4096 \
-    --num-attention-heads 32 \
-    --seq-length 4096 \
-    --ffn-hidden-size 11008 \
-    --max-position-embeddings 4096 \
-    --norm-epsilon 1e-5 \
-    --use-rotary-position-embeddings \
-    --no-position-embedding \
-    --swiglu \
-    --normalization RMSNorm \
-    --untie-embeddings-and-output-weights
-"
-## group attntion parameters for megatron-lm
-## example llama2-70B
-# --num-attention-heads 64
-# --group-query-attention
-# --num-query-groups 8
-
-INITIALIZATION_ARGS="
-    --init-method-std 0.02 \
-    --seed 1234 
-"
-
-REGULARIZATION_ARGS="
-    --attention-dropout 0.0 \
-    --hidden-dropout 0.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --clip-grad 1.0
-"
-
-LEARNING_RATE_ARGS="
-    --lr 3.0e-4 \
-    --min-lr 3.0e-5 \
-    --lr-decay-style cosine \
-    --lr-warmup-iters 2000
-"
-
-CHECKPOINTING_ARGS="
-    --save-interval 10000 \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
-"
-
-LOGGING_ARGS="
-    --log-interval 1 \
-"
-    # --wandb-save-dir $WB_PATH \
-    # --tensorboard-dir $TB_PATH \
-    # --tensorboard-log-interval 1 
-
-cmd="torchrun $DISTRIBUTED_ARGS $PROJ_HOME/pretrain_gpt_megatron.py \
-              $TRAINING_ARGS \
-              $MIXED_PRECISION_ARGS \
-              $DATA_ARGS \
-              $NETWORK_ARGS \
-              $INITIALIZATION_ARGS \
-              $REGULARIZATION_ARGS \
-              $LEARNING_RATE_ARGS \
-              $CHECKPOINTING_ARGS \
-              $LOGGING_ARGS | tee ${LOG_PATH}/output.log 2>&1
-    "
-echo $cmd
-eval $cmd
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_7b_tpoverlap_profiling_node1.sh b/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_7b_tpoverlap_profiling_node1.sh
deleted file mode 100644
index 83ee4555dd7ae3a138ed6e756d37615b83f0def6..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_llama2_7b_tpoverlap_profiling_node1.sh
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/bin/bash
-
-# Please change the following envrioment variables
-# base on the cluster configuration
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-export NCCL_SOCKET_IFNAME=bond0
-# export NCCL_IB_DISABLE=0
-# export NCCL_IB_CUDA_SUPPORT=1
-# export NCCL_IB_GID_INDEX=0
-# export NCCL_IB_HCA=mlx5_0,mlx5_3
-# export NCCL_DEBUG=debug
-# export OMP_NUM_THREADS=4
-
-PROJ_HOME=$(dirname $(dirname "$PWD"))
-
-DATA_PATH=${PROJ_HOME}/dataset/gpt_small_117M/gpt_small_117M_text_document
-TOKENIZER_PATH=./tokenizer/tokenizer.model
-
-CHECKPOINT_PATH=./checkpoints/llama2
-mkdir -p $CHECKPOINT_PATH
-
-DATE=`date +%y%m%d%H%M%S`
-LOG_PATH=./logs/$DATE
-mkdir -p $LOG_PATH
-
-GPUS_PER_NODE=16
-MASTER_ADDR=localhost
-MASTER_PORT=8080
-NNODES=1
-NODE_RANK=0
-
-TRANSFORMER_IMPL=transformer_engine
-
-export ENABLE_TORCH_TP_OVERLAP=1
-export TORCH_TP_OVERLAP_SIZE=4
-export NCCL_USE_HIGHPRIORITYWARP=1
-
-export NCCL_FORCESYNC_DISABLE=1
-export NCCL_USE_DIRECT=1
-export OMP_NUM_THREADS=4
-export UMD_CCLINLASTCE=1
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-TRAINING_ARGS="
-    --train-iters 250000 \
-    --eval-iters 10 \
-    --tensor-model-parallel-size 4 \
-    --pipeline-model-parallel-size 4 \
-    --sequence-parallel \
-    --micro-batch-size 1 \
-    --global-batch-size 32 \
-    --disable-bias-linear \
-    --use-flash-attn \
-    --eval-interval 1000 \
-    --transformer-impl $TRANSFORMER_IMPL\
-    --use-distributed-optimizer \
-    --no-gradient-accumulation-fusion \
-"
-## 自定义recompute layers pp stage
-    # --recompute-granularity full \
-    # --recompute-method block \
-    # --custom-recompute-layers-per-stage 3 1 0 0 \
-
-## 自定义切分pp stage，仅针对transformer layers
-    # --custom-partition 3 3 4 4 4 4 5 5 \
-
-# --use-distributed-optimizer \
-
-MIXED_PRECISION_ARGS="
-    --bf16 \
-    --initial-loss-scale 522893 \
-    --min-loss-scale 1.0 \
-    --attention-softmax-in-fp32 \
-    --no-query-key-layer-scaling
-"
-# --accumulate-allreduce-grads-in-fp32
-
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --tokenizer-type Llama2Tokenizer \
-    --tokenizer-model $TOKENIZER_PATH \
-    --split 949,50,1
-"
-
-NETWORK_ARGS="
-    --num-layers 32 \
-    --hidden-size 4096 \
-    --num-attention-heads 32 \
-    --seq-length 4096 \
-    --ffn-hidden-size 11008 \
-    --max-position-embeddings 4096 \
-    --norm-epsilon 1e-5 \
-    --use-rotary-position-embeddings \
-    --no-position-embedding \
-    --swiglu \
-    --normalization RMSNorm \
-    --untie-embeddings-and-output-weights
-"
-## group attntion parameters for megatron-lm
-## example llama2-70B
-# --num-attention-heads 64
-# --group-query-attention
-# --num-query-groups 8
-
-INITIALIZATION_ARGS="
-    --init-method-std 0.02 \
-    --seed 1234 
-"
-
-REGULARIZATION_ARGS="
-    --attention-dropout 0.0 \
-    --hidden-dropout 0.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --clip-grad 1.0
-"
-
-LEARNING_RATE_ARGS="
-    --lr 3.0e-4 \
-    --min-lr 3.0e-5 \
-    --lr-decay-style cosine \
-    --lr-warmup-iters 2000
-"
-
-CHECKPOINTING_ARGS="
-    --save-interval 10000 \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
-"
-
-LOGGING_ARGS="
-    --log-interval 1 \
-"
-    # --wandb-save-dir $WB_PATH \
-    # --tensorboard-dir $TB_PATH \
-    # --tensorboard-log-interval 1 
-
-cmd="torchrun $DISTRIBUTED_ARGS $PROJ_HOME/pretrain_gpt_megatron.py \
-              $TRAINING_ARGS \
-              $MIXED_PRECISION_ARGS \
-              $DATA_ARGS \
-              $NETWORK_ARGS \
-              $INITIALIZATION_ARGS \
-              $REGULARIZATION_ARGS \
-              $LEARNING_RATE_ARGS \
-              $CHECKPOINTING_ARGS \
-              $LOGGING_ARGS | tee ${LOG_PATH}/output.log 2>&1
-    "
-echo $cmd
-eval $cmd
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_torch_pp_overlap_node1_card8.sh b/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_torch_pp_overlap_node1_card8.sh
deleted file mode 100644
index 4d93083e8b964e310e8e80cb6392477cb3d71eca..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_torch_pp_overlap_node1_card8.sh
+++ /dev/null
@@ -1,176 +0,0 @@
-#!/bin/bash
-
-# Please change the following environment variables
-# base on the cluster configuration
-export OMP_NUM_THREADS=4
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-# export NCCL_SOCKET_IFNAME=ens5f0
-
-# torch tp overlap
-export ENABLE_TORCH_TP_OVERLAP=1
-export TORCH_TP_OVERLAP_SIZE=2
-
-# # torch pp overlap
-export ENABLE_TORCH_PP_OVERLAP=1
-export TORCH_PP_OVERLAP_SIZE=2
-
-# following environment variables must be set when ENABLE_TORCH_TP_OVERLAP=1
-# export NCCL_FORCESYNC_DISABLE=1  ## this variable may cause hang and nan
-export NCCL_USE_DIRECT=1
-export OMP_NUM_THREADS=4
-export UMD_CCLINLASTCE=1
-
-
-PROJ_HOME=$(dirname $(dirname "$PWD"))
-DATA_PATH=${PROJ_HOME}/dataset/gpt_small_117M/gpt_small_117M_text_document
-TOKENIZER_PATH=./tokenizer/tokenizer.model
-
-CHECKPOINT_PATH=./checkpoints/llama2
-mkdir -p $CHECKPOINT_PATH
-
-DATE=`date +%y%m%d%H%M%S`
-LOG_PATH=./logs/$DATE
-mkdir -p $LOG_PATH
-TRANSFORMER_IMPL=transformer_engine
-
-# Change for multinode config
-# export NODE_ADDR=$(ifconfig -a|grep inet|grep -v 127.0.0.1|grep -v inet6|awk '{print $2;}'|tr -d "addr:"|head -n 1)
-# export GPUS_PER_NODE=$(awk '{$1=$1;print}' $HOSTFILE|awk -F" |=" '{ranks[$1]=$NF;}END{print ranks["'$NODE_ADDR'"];}')
-# export NNODES=$(awk '{$1=$1;print}' $HOSTFILE | wc -l)
-# export MASTER_ADDR=$(head -n1 $HOSTFILE | awk '{print $1;}')
-# export NODE_RANK=$(awk '{ranks[$1]=(FNR-1);}END{print ranks["'$NODE_ADDR'"];}' $HOSTFILE)
-# export MASTER_PORT=12346
-# WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES))
-
-TP=2
-PP=4
-GPUS_PER_NODE=8
-MASTER_ADDR=localhost
-MASTER_PORT=8081
-NNODES=1
-NODE_RANK=0
-
-
-# llama2-7b
-HIDDEN_SIZE=4096
-FFN_HIDDEN_SIZE=11008
-NUM_LAYERS=16
-NUM_HEADS=32
-SEQ_LENGTH=4096
-NUM_KV_HEADS=32
-
-MICRO_BATCH_SIZE=2
-GLOBAL_BATCH_SIZE=16
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-## Follow params must be set when use torch pp overlap
-TORCH_PP_OVERLAP_ARGS="
-    --pp-delay \
-    --pp-split-size 4 \
-    --num-layers-per-virtual-pipeline-stage 2 \
-    --sequence-parallel \
-"
-
-TRAINING_ARGS="
-    --train-iters 250000 \
-    --eval-iters 10 \
-    --tensor-model-parallel-size ${TP} \
-    --pipeline-model-parallel-size ${PP} \
-    --micro-batch-size $MICRO_BATCH_SIZE \
-    --global-batch-size $GLOBAL_BATCH_SIZE \
-    --disable-bias-linear \
-    --eval-interval 1000 \
-    --use-flash-attn \
-    --bf16 \
-    --transformer-impl $TRANSFORMER_IMPL\
-    --no-gradient-accumulation-fusion \
-"
-# --use-distributed-optimizer \
-
-# MIXED_PRECISION_ARGS="
-#     --bf16 \
-#     --initial-loss-scale 522893 \
-#     --min-loss-scale 1.0 \
-#     --attention-softmax-in-fp32 
-# "
-# --accumulate-allreduce-grads-in-fp32
-
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --tokenizer-type Llama2Tokenizer \
-    --tokenizer-model $TOKENIZER_PATH \
-    --split 949,50,1
-"
-
-NETWORK_ARGS="
-    --num-layers $NUM_LAYERS \
-    --hidden-size ${HIDDEN_SIZE} \
-    --num-attention-heads $NUM_HEADS \
-    --num-key-value-heads $NUM_KV_HEADS \
-    --seq-length $SEQ_LENGTH \
-    --max-position-embeddings $SEQ_LENGTH \
-    --norm-epsilon 1e-5 \
-    --swiglu \
-    --normalization RMSNorm \
-    --ffn-hidden-size $FFN_HIDDEN_SIZE \
-    --use-rotary-position-embeddings \
-    --untie-embeddings-and-output-weights
-"
-## group attntion parameters for megatron-lm
-## example llama2-70B
-# --num-attention-heads 64
-# --group-query-attention
-# --num-query-groups 8
-
-INITIALIZATION_ARGS="
-    --init-method-std 0.02 \
-    --seed 1234 
-"
-
-REGULARIZATION_ARGS="
-    --attention-dropout 0.3 \
-    --hidden-dropout 0.3 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --clip-grad 1.0
-"
-
-LEARNING_RATE_ARGS="
-    --lr 3.0e-4 \
-    --min-lr 3.0e-5 \
-    --lr-decay-style cosine \
-    --lr-warmup-iters 2000
-"
-
-CHECKPOINTING_ARGS="
-    --save-interval 10000 \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
-"
-
-LOGGING_ARGS="
-    --log-interval 1 \
-"
-
-cmd="torchrun $DISTRIBUTED_ARGS $PROJ_HOME/pretrain_gpt_megatron.py \
-              $TRAINING_ARGS \
-              $TORCH_PP_OVERLAP_ARGS \
-              $MIXED_PRECISION_ARGS \
-              $DATA_ARGS \
-              $NETWORK_ARGS \
-              $INITIALIZATION_ARGS \
-              $REGULARIZATION_ARGS \
-              $LEARNING_RATE_ARGS \
-              $CHECKPOINTING_ARGS \
-              $LOGGING_ARGS | tee ${LOG_PATH}/output.log 2>&1
-    "
-echo $cmd
-eval $cmd
diff --git a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_torch_tp_overlap_node1_card2.sh b/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_torch_tp_overlap_node1_card2.sh
deleted file mode 100644
index dcc818015051d5a103ae29e5b57002f7acb39611..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/llama2/run_te_torch_tp_overlap_node1_card2.sh
+++ /dev/null
@@ -1,171 +0,0 @@
-#!/bin/bash
-
-# Please change the following environment variables
-# base on the cluster configuration
-export OMP_NUM_THREADS=4
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-# export NCCL_SOCKET_IFNAME=ens5f0
-
-export ENABLE_TORCH_TP_OVERLAP=1
-export TORCH_TP_OVERLAP_SIZE=2
-
-# following environment variables must be set when ENABLE_TORCH_TP_OVERLAP=1
-export NCCL_FORCESYNC_DISABLE=1
-export NCCL_USE_DIRECT=1
-export OMP_NUM_THREADS=4
-export UMD_CCLINLASTCE=1
-
-
-PROJ_HOME=$(dirname $(dirname "$PWD"))
-DATA_PATH=${PROJ_HOME}/dataset/gpt_small_117M/gpt_small_117M_text_document
-TOKENIZER_PATH=./tokenizer/tokenizer.model
-
-CHECKPOINT_PATH=./checkpoints/llama2
-mkdir -p $CHECKPOINT_PATH
-
-DATE=`date +%y%m%d%H%M%S`
-LOG_PATH=./logs/$DATE
-mkdir -p $LOG_PATH
-TRANSFORMER_IMPL=transformer_engine
-# TB_PATH=./tboard/$DATE
-# mkdir -p $TB_PATH
-# WB_PATH=./wandb/$DATE
-# mkdir -p $WB_PATH
-
-# Change for multinode config
-# export NODE_ADDR=$(ifconfig -a|grep inet|grep -v 127.0.0.1|grep -v inet6|awk '{print $2;}'|tr -d "addr:"|head -n 1)
-# export GPUS_PER_NODE=$(awk '{$1=$1;print}' $HOSTFILE|awk -F" |=" '{ranks[$1]=$NF;}END{print ranks["'$NODE_ADDR'"];}')
-# export NNODES=$(awk '{$1=$1;print}' $HOSTFILE | wc -l)
-# export MASTER_ADDR=$(head -n1 $HOSTFILE | awk '{print $1;}')
-# export NODE_RANK=$(awk '{ranks[$1]=(FNR-1);}END{print ranks["'$NODE_ADDR'"];}' $HOSTFILE)
-# export MASTER_PORT=12346
-# WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES))
-
-TP=2
-PP=1
-GPUS_PER_NODE=2
-MASTER_ADDR=localhost
-MASTER_PORT=8081
-NNODES=1
-NODE_RANK=0
-
-
-# llama2-7b
-HIDDEN_SIZE=4096
-FFN_HIDDEN_SIZE=11008
-NUM_LAYERS=4
-NUM_HEADS=32
-SEQ_LENGTH=4096
-NUM_KV_HEADS=32
-
-MICRO_BATCH_SIZE=2
-GLOBAL_BATCH_SIZE=2
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-TRAINING_ARGS="
-    --train-iters 250000 \
-    --eval-iters 10 \
-    --tensor-model-parallel-size ${TP} \
-    --pipeline-model-parallel-size ${PP} \
-    --micro-batch-size $MICRO_BATCH_SIZE \
-    --global-batch-size $GLOBAL_BATCH_SIZE \
-    --disable-bias-linear \
-    --eval-interval 1000 \
-    --use-flash-attn
-    --bf16
-    --transformer-impl $TRANSFORMER_IMPL\
-    --no-gradient-accumulation-fusion \
-"
-    # --sequence-parallel \
-    # --use-distributed-optimizer \
-
-# MIXED_PRECISION_ARGS="
-#     --bf16 \
-#     --initial-loss-scale 522893 \
-#     --min-loss-scale 1.0 \
-#     --attention-softmax-in-fp32 
-# "
-# --accumulate-allreduce-grads-in-fp32
-
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --tokenizer-type Llama2Tokenizer \
-    --tokenizer-model $TOKENIZER_PATH \
-    --split 949,50,1
-"
-
-NETWORK_ARGS="
-    --num-layers $NUM_LAYERS \
-    --hidden-size ${HIDDEN_SIZE} \
-    --num-attention-heads $NUM_HEADS \
-    --num-key-value-heads $NUM_KV_HEADS \
-    --seq-length $SEQ_LENGTH \
-    --max-position-embeddings $SEQ_LENGTH \
-    --norm-epsilon 1e-5 \
-    --swiglu \
-    --normalization RMSNorm \
-    --ffn-hidden-size $FFN_HIDDEN_SIZE \
-    --use-rotary-position-embeddings \
-    --untie-embeddings-and-output-weights
-"
-## group attntion parameters for megatron-lm
-## example llama2-70B
-# --num-attention-heads 64
-# --group-query-attention
-# --num-query-groups 8
-
-INITIALIZATION_ARGS="
-    --init-method-std 0.02 \
-    --seed 1234 
-"
-
-REGULARIZATION_ARGS="
-    --attention-dropout 0.3 \
-    --hidden-dropout 0.3 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --clip-grad 1.0
-"
-
-LEARNING_RATE_ARGS="
-    --lr 3.0e-4 \
-    --min-lr 3.0e-5 \
-    --lr-decay-style cosine \
-    --lr-warmup-iters 2000
-"
-
-CHECKPOINTING_ARGS="
-    --save-interval 10000 \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
-"
-
-LOGGING_ARGS="
-    --log-interval 1 \
-"
-    # --wandb-save-dir $WB_PATH \
-    # --tensorboard-dir $TB_PATH \
-    # --tensorboard-log-interval 1 
-
-cmd="torchrun $DISTRIBUTED_ARGS $PROJ_HOME/pretrain_gpt_megatron.py \
-              $TRAINING_ARGS \
-              $MIXED_PRECISION_ARGS \
-              $DATA_ARGS \
-              $NETWORK_ARGS \
-              $INITIALIZATION_ARGS \
-              $REGULARIZATION_ARGS \
-              $LEARNING_RATE_ARGS \
-              $CHECKPOINTING_ARGS \
-              $LOGGING_ARGS | tee ${LOG_PATH}/output.log 2>&1
-    "
-echo $cmd
-eval $cmd
diff --git a/toolbox/Megatron-DeepSpeed/examples/llama3/run_te_llama3_8b_node1.sh b/toolbox/Megatron-DeepSpeed/examples/llama3/run_te_llama3_8b_node1.sh
deleted file mode 100644
index 82f37b55ab78bbf59aa785ef10b164e20c4dece1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/llama3/run_te_llama3_8b_node1.sh
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/bin/bash
-
-# Please change the following envrioment variables base on the cluster configuration
-export OMP_NUM_THREADS=4
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-export NCCL_SOCKET_IFNAME=bond0
-# export NCCL_USE_DIRECT=1
-
-# export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1
-
-PROJ_HOME=$(dirname $(dirname "$PWD"))
-
-DATA_PATH=${PROJ_HOME}/dataset/gpt_small_117M_llama3/gpt_small_117M_text_document
-TOKENIZER_PATH=./tokenizer/tokenizer_llama3.model
-
-CHECKPOINT_PATH=./checkpoints/llama2
-mkdir -p $CHECKPOINT_PATH
-
-DATE=`date +%y%m%d%H%M%S`
-LOG_PATH=./logs/$DATE
-mkdir -p $LOG_PATH
-
-GPUS_PER_NODE=16
-MASTER_ADDR=localhost
-MASTER_PORT=8080
-NNODES=1
-NODE_RANK=0
-
-TRANSFORMER_IMPL=transformer_engine
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-## llama3-8B 与 llama2-7B之间有差别的参数
-    # --group-query-attention \
-    # --num-query-groups 8 \
-    # --seq-length 8192 \
-    # --max-position-embeddings 8192 \
-    # --rotary-position-embeddings-theta 500000 \
-    # --ffn-hidden-size 14336 \
-    # --tokenizer-type Llama3Tokenizer \
-    # vocab_size=128256 不用在脚本里设置
-
-TRAINING_ARGS="
-    --train-iters 250000 \
-    --eval-iters 10 \
-    --tensor-model-parallel-size 2 \
-    --pipeline-model-parallel-size 8 \
-    --micro-batch-size 1 \
-    --global-batch-size 32 \
-    --disable-bias-linear \
-    --use-flash-attn \
-    --eval-interval 1000 \
-    --transformer-impl $TRANSFORMER_IMPL \
-    --no-fp8-wgrad \
-    --use-distributed-optimizer \
-"
-
-MIXED_PRECISION_ARGS="
-    --bf16 \
-    --initial-loss-scale 522893 \
-    --min-loss-scale 1.0 \
-    --attention-softmax-in-fp32 \
-    --no-query-key-layer-scaling
-"
-
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --tokenizer-type Llama3Tokenizer \
-    --tokenizer-model $TOKENIZER_PATH \
-    --split 949,50,1
-"
-
-NETWORK_ARGS="
-    --num-layers 32 \
-    --hidden-size 4096 \
-    --num-attention-heads 32 \
-    --group-query-attention \
-    --num-query-groups 8 \
-    --seq-length 8192 \
-    --max-position-embeddings 8192 \
-    --ffn-hidden-size 14336 \
-    --norm-epsilon 1e-5 \
-    --use-rotary-position-embeddings \
-    --no-position-embedding \
-    --swiglu \
-    --normalization RMSNorm \
-    --untie-embeddings-and-output-weights \
-    --rotary-position-embeddings-theta 500000 \
-"
-
-INITIALIZATION_ARGS="
-    --init-method-std 0.02 \
-    --seed 1234 
-"
-
-REGULARIZATION_ARGS="
-    --attention-dropout 0.0 \
-    --hidden-dropout 0.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --clip-grad 1.0
-"
-
-LEARNING_RATE_ARGS="
-    --lr 3.0e-4 \
-    --min-lr 3.0e-5 \
-    --lr-decay-style cosine \
-    --lr-warmup-iters 2000
-"
-
-CHECKPOINTING_ARGS="
-    --save-interval 10000 \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
-"
-
-LOGGING_ARGS="
-    --log-interval 1 \
-"
-    # --wandb-save-dir $WB_PATH \
-    # --tensorboard-dir $TB_PATH \
-    # --tensorboard-log-interval 1 
-
-cmd="torchrun $DISTRIBUTED_ARGS $PROJ_HOME/pretrain_gpt_megatron.py \
-              $TRAINING_ARGS \
-              $MIXED_PRECISION_ARGS \
-              $DATA_ARGS \
-              $NETWORK_ARGS \
-              $INITIALIZATION_ARGS \
-              $REGULARIZATION_ARGS \
-              $LEARNING_RATE_ARGS \
-              $CHECKPOINTING_ARGS \
-              $LOGGING_ARGS | tee ${LOG_PATH}/output.log 2>&1
-    "
-echo $cmd
-eval $cmd
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples/merge_mp_bert.sh b/toolbox/Megatron-DeepSpeed/examples/merge_mp_bert.sh
deleted file mode 100644
index 1383433284bc79a70785305c0628e3d80aeb92d0..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/merge_mp_bert.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-TENSOR_MODEL_PARALLEL_SIZE=2
-
-VOCAB_FILE=bert-vocab.txt
-CHECKPOINT_PATH=checkpoints/bert_345m
-
-WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
-                                --model-type BERT \
-                                --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \
-                                --tokenizer-type BertWordPieceLowerCase \
-                                --vocab-file $VOCAB_FILE \
-                                --num-layers 24 \
-                                --hidden-size 1024 \
-                                --num-attention-heads 16 \
-                                --seq-length 512 \
-                                --max-position-embeddings 512 \
-                                --load $CHECKPOINT_PATH
diff --git a/toolbox/Megatron-DeepSpeed/examples/msdp/README.md b/toolbox/Megatron-DeepSpeed/examples/msdp/README.md
deleted file mode 100644
index 8ff95099e0d9e005ecf6bf5ec7e85d0b10eb4d23..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/msdp/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-
-# Multi-Stage Prompting for Knowledgeable Dialogue Generation
-
-This directory contains all the scripts of multi-stage prompting for knowledgeable dialogue generation that includes data preparation, and knowledge and response generations. More details are available on [`knowledgeable task directory`](../../tasks/msdp).
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/msdp/data_processing.sh b/toolbox/Megatron-DeepSpeed/examples/msdp/data_processing.sh
deleted file mode 100644
index 37a6512a806fd0a141339ea857c73074fced12a9..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/msdp/data_processing.sh
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/bin/bash
-
-# Data preparation for our framework: preprocessing the WoW and WoI datasets
-# The datasets can be downloaded through the following links:
-# WoW: https://parl.ai/projects/wizard_of_wikipedia/
-# WoI: https://parl.ai/projects/sea/
-
-DIR=`pwd`
-# Before running the preprocessing, please download 
-# the wizard of wikipedia and wizard datasets
-WOW_DATA_FOLDER=<PATH_OF_WIZARD_OF_WIKIPEDIA_DATA_FOLDER>
-WOI_DATA_FOLDER=<PATH_OF_WIZARD_OF_INTERNET_DATA_FOLDER>
-
-# We provide examples for processing the raw data from Wizard of Wikipedia
-# Processing the train dataset (train.json)
-python ${DIR}/tasks/msdp/preprocessing.py \
-        --func process_wow_dataset \
-        --raw_file ${WOW_DATA_FOLDER}/train.json \
-        --processed_file ${WOW_DATA_FOLDER}/train_processed.txt
-
-# Processing test seen dataset (test_random_split.json)
-python ${DIR}/tasks/msdp/preprocessing.py \
-        --func process_wow_dataset \
-        --raw_file ${WOW_DATA_FOLDER}/test_random_split.json \
-        --processed_file ${WOW_DATA_FOLDER}/testseen_processed.txt \
-        --knwl_ref_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_reference.txt \
-        --resp_ref_file ${WOW_DATA_FOLDER}/output_testseen_response_reference.txt
-
-# processing test unseen dataset (test_topic_split.json)
-python ${DIR}/tasks/msdp/preprocessing.py \
-        --func process_wow_dataset \
-        --raw_file ${WOW_DATA_FOLDER}/test_topic_split.json \
-        --processed_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \
-        --knwl_ref_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_reference.txt \
-        --resp_ref_file ${WOW_DATA_FOLDER}/output_testunseen_response_reference.txt
-
-
-# We provide the following script to process the raw data from Wizard of Internet
-# Processing the test dataset (test.jsonl)
-python ${DIR}/tasks/msdp/preprocessing.py \
-        --func process_woi_dataset \
-        --raw_file ${WOI_DATA_FOLDER}/test.jsonl \
-        --processed_file ${WOI_DATA_FOLDER}/test_processed.txt \
-        --knwl_ref_file ${WOI_DATA_FOLDER}/output_test_knowledge_reference.txt \
-        --resp_ref_file ${WOI_DATA_FOLDER}/output_test_response_reference.txt
-
-
-# Get the knowledge generation prompts for the each test dataset in WoW and WoI
-MODEL_FILE=<PATH_OF_THE_FINETUNED_DPR_MODEL> 
-# WoW test seen
-python ${DIR}/tasks/msdp/preprocessing.py \
-        --func get_knwl_gen_prompts \
-        --test_file ${WOW_DATA_FOLDER}/testseen_processed.txt \
-        --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
-        --model_file ${MODEL_FILE} \
-        --processed_file ${WOW_DATA_FOLDER}/output_testseen_knowledge_prompts.json \
-        --data_type wow_seen
-
-# WoW test unseen
-python ${DIR}/tasks/msdp/preprocessing.py \
-        --func get_knwl_gen_prompts \
-        --test_file ${WOW_DATA_FOLDER}/testunseen_processed.txt \
-        --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
-        --model_file ${MODEL_FILE} \
-        --processed_file ${WOW_DATA_FOLDER}/output_testunseen_knowledge_prompts.json \
-        --data_type wow_unseen
-
-# WoI
-python ${DIR}/tasks/msdp/preprocessing.py \
-        --func get_knwl_gen_prompts \
-        --test_file ${WOI_DATA_FOLDER}/test_processed.txt \
-        --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
-        --model_file ${MODEL_FILE} \
-        --processed_file ${WOI_DATA_FOLDER}/output_test_knowledge_prompts.json \
-        --data_type woi
-
-
-# Get the response generation prompts (can be applied for all the test datasets)
-python ${DIR}/tasks/msdp/preprocessing.py \
-        --func get_resp_gen_prompts \
-        --train_file ${WOW_DATA_FOLDER}/train_processed.txt \
-        --processed_file ${WOW_DATA_FOLDER}/output_response_prompts.txt
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/msdp/eval_knwl_generation.sh b/toolbox/Megatron-DeepSpeed/examples/msdp/eval_knwl_generation.sh
deleted file mode 100644
index 8fc2fff1fb776c3f0c54e25e50aefedc0ca8fd0a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/msdp/eval_knwl_generation.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-#########################
-# Evaluate the F1 scores.
-#########################
-
-WORLD_SIZE=1
-DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
-                  --nnodes 1 \
-                  --node_rank 0 \
-                  --master_addr localhost \
-                  --master_port 6000"
-                  
-MODEL_GEN_PATH=<PATH_OF_THE_KNOWLEDGE_GENERATION> \ 
-        (e.g., /testseen_knowledge_generations.txt)
-GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE> \ 
-        (e.g., /testseen_knowledge_reference.txt)
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
-        --num-layers 24 \
-        --hidden-size 1024 \
-        --num-attention-heads 16 \
-        --seq-length 2048 \
-        --max-position-embeddings 2048 \
-        --micro-batch-size 4 \
-        --task MSDP-EVAL-F1 \
-        --guess-file ${MODEL_GEN_PATH} \
-        --answer-file ${GROUND_TRUTH_PATH}
-
-
-############################################
-# Evaluate BLEU, METEOR, and ROUGE-L scores.
-############################################
-
-# We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to 
-# evaluate the BLEU, METEOR, and ROUGE-L scores. 
-
-# To evaluate on these metrics, please setup the environments based on 
-# the nlg-eval github, and run the corresponding evaluation commands.
-
-nlg-eval \
-    --hypothesis=<PATH_OF_THE_KNOWLEDGE_GENERATION> \
-    --references=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>
diff --git a/toolbox/Megatron-DeepSpeed/examples/msdp/eval_resp_generation.sh b/toolbox/Megatron-DeepSpeed/examples/msdp/eval_resp_generation.sh
deleted file mode 100644
index 3ce87e077957904b234276657d000ba8c729dcfe..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/msdp/eval_resp_generation.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-#########################
-# Evaluate the F1 scores.
-#########################
-
-WORLD_SIZE=1
-DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
-                  --nnodes 1 \
-                  --node_rank 0 \
-                  --master_addr localhost \
-                  --master_port 6000"
-                  
-MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION> \ 
-        (e.g., /testseen_response_generations.txt)
-GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_RESPONSE> \ 
-        (e.g., /testseen_response_reference.txt)
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
-        --num-layers 24 \
-        --hidden-size 1024 \
-        --num-attention-heads 16 \
-        --seq-length 2048 \
-        --max-position-embeddings 2048 \
-        --micro-batch-size 4 \
-        --task MSDP-EVAL-F1 \
-        --guess-file ${MODEL_GEN_PATH} \
-        --answer-file ${GROUND_TRUTH_PATH}
-
-
-##########################
-# Evaluate the KF1 scores.
-##########################
-                  
-MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION> \ 
-        (e.g., /testseen_response_generations.txt)
-GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE> \ 
-        (e.g., /testseen_knowledge_reference.txt)
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
-        --num-layers 24 \
-        --hidden-size 1024 \
-        --num-attention-heads 16 \
-        --seq-length 2048 \
-        --max-position-embeddings 2048 \
-        --micro-batch-size 4 \
-        --task MSDP-EVAL-F1 \
-        --guess-file ${MODEL_GEN_PATH} \
-        --answer-file ${GROUND_TRUTH_PATH}
-
-
-############################################
-# Evaluate BLEU, METEOR, and ROUGE-L scores.
-############################################
-
-# We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to 
-# evaluate the BLEU, METEOR, and ROUGE-L scores. 
-
-# To evaluate on these metrics, please setup the environments based on 
-# the nlg-eval github, and run the corresponding evaluation commands.
-
-nlg-eval \
-    --hypothesis=<PATH_OF_THE_RESPONSE_GENERATION> \
-    --references=<PATH_OF_THE_GROUND_TRUTH_RESPONSE>
diff --git a/toolbox/Megatron-DeepSpeed/examples/msdp/prep_resp_gen.sh b/toolbox/Megatron-DeepSpeed/examples/msdp/prep_resp_gen.sh
deleted file mode 100644
index 5f202724dddbaa6ada3bcb1c33ec035a3afe44ee..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/msdp/prep_resp_gen.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-# Preparing the input file for the response generation (second-stage prompting)
-
-DIR=`pwd`
-
-TEST_FILE=<PATH_OF_PROCESSED_TEST_DATA> \
-        (e.g., /testseen_processed.txt)
-KNOWLEDGE_FILE=<PATH_OF_GENERATED_KNOWLEDGE_DATA> \
-        (e.g., /testseen_knowledge_generations.txt)
-PROCESSED_FILE=<PATH_OF_INPUT_FILE_FOR_RESPONSE_GENERATION> \
-        (e.g., /testseen_processed_with_generated_knowledge.txt)
-
-python ${DIR}/tasks/msdp/preprocessing.py \
-        --func prepare_input \
-        --test_file ${TEST_FILE} \
-        --knwl_gen_file ${KNOWLEDGE_FILE} \
-        --processed_file ${PROCESSED_FILE}
diff --git a/toolbox/Megatron-DeepSpeed/examples/msdp/prompt_knwl_gen.sh b/toolbox/Megatron-DeepSpeed/examples/msdp/prompt_knwl_gen.sh
deleted file mode 100644
index 12e0cc5b380036f167b35d6f514eafc1e1acec32..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/msdp/prompt_knwl_gen.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-
-# Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge
-# The input contains prompts and current dialogue context, the output is the relevant knowledge
-# The size of the pretrained language model is 357M
-
-WORLD_SIZE=8
-
-DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
-                  --nnodes 1 \
-                  --node_rank 0 \
-                  --master_addr localhost \
-                  --master_port 6000"
-
-CHECKPOINT_PATH=<PATH_OF_LANGUAGE_MODEL> (e.g., /357m)
-VOCAB_PATH=<PATH_OF_VOCAB_FILE> (e.g., /gpt2-vocab.json)
-MERGE_PATH=<PATH_OF_MERGE_FILE> (e.g., /gpt2-merges.txt)
-INPUT_PATH=<PATH_OF_PROCESSED_TEST_DATA_FILE> \ 
-        (e.g., /testseen_processed.txt)
-PROMPT_PATH=<PATH_OF_KNOWLEDGE_GENERATION_PROMPTS> \
-        (e.g., /testseen_knowledge_prompts.json)
-OUTPUT_PATH=<PATH_OF_OUTPUT_GENERATION_FILE> \
-        (e.g., /testseen_knowledge_generations.txt)
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
-        --num-layers 24 \
-        --hidden-size 1024 \
-        --num-attention-heads 16 \
-        --seq-length 2048 \
-        --max-position-embeddings 2048 \
-        --micro-batch-size 1 \
-        --vocab-file ${VOCAB_PATH} \
-        --merge-file ${MERGE_PATH} \
-        --load ${CHECKPOINT_PATH} \
-        --fp16 \
-        --DDP-impl torch \
-        --tokenizer-type GPT2BPETokenizer \
-        --sample-input-file ${INPUT_PATH} \
-        --sample-output-file ${OUTPUT_PATH} \
-        --prompt-file ${PROMPT_PATH} \
-        --prompt-type knowledge \
-        --num-prompt-examples 10 \
-        --task MSDP-PROMPT 
-
-# NOTE: If you use api for the model generation, please use 
-# the "--api-prompt" flag (setting this value as True). 
diff --git a/toolbox/Megatron-DeepSpeed/examples/msdp/prompt_resp_gen.sh b/toolbox/Megatron-DeepSpeed/examples/msdp/prompt_resp_gen.sh
deleted file mode 100644
index b836d7feacfcac5f093840727be8933e5585163e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/msdp/prompt_resp_gen.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-
-# Stage-2: Prompt a pretrained language model to generate the corresponding response
-# The input contains prompts, current dialogue context, and generated knowledge in Stage-1
-# The output is the corresponding response.
-# The size of the pretrained language model is 357M
-
-WORLD_SIZE=8
-
-DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
-                  --nnodes 1 \
-                  --node_rank 0 \
-                  --master_addr localhost \
-                  --master_port 6000"
-
-CHECKPOINT_PATH=<PATH_OF_LANGUAGE_MODEL> (e.g., /357m)
-VOCAB_PATH=<PATH_OF_VOCAB_FILE> (e.g., /gpt2-vocab.json)
-MERGE_PATH=<PATH_OF_MERGE_FILE> (e.g., /gpt2-merges.txt)
-INPUT_PATH=<PATH_OF_INPUT_TEST_DATA_FILE> (e.g., /testseen_processed.txt)
-PROMPT_PATH=<PATH_OF_RESPONSE_GENERATION_PROMPTS> \
-        (e.g., /response_prompts.txt)
-OUTPUT_PATH=<PATH_OF_OUTPUT_GENERATION_FILE> \
-        (e.g., /output_testseen_response_generations.txt)
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
-        --num-layers 24 \
-        --hidden-size 1024 \
-        --num-attention-heads 16 \
-        --seq-length 2048 \
-        --max-position-embeddings 2048 \
-        --micro-batch-size 1 \
-        --vocab-file ${VOCAB_PATH} \
-        --merge-file ${MERGE_PATH} \
-        --load ${CHECKPOINT_PATH} \
-        --fp16 \
-        --DDP-impl torch \
-        --tokenizer-type GPT2BPETokenizer \
-        --sample-input-file ${INPUT_PATH} \
-        --sample-output-file ${OUTPUT_PATH} \
-        --prompt-file ${PROMPT_PATH} \
-        --prompt-type response \
-        --num-prompt-examples 20 \
-        --task MSDP-PROMPT 
-
-# NOTE: If you use api for the model generation, please use 
-# the "--api-prompt" flag (setting this value as True). 
diff --git a/toolbox/Megatron-DeepSpeed/examples/pretrain_bert.sh b/toolbox/Megatron-DeepSpeed/examples/pretrain_bert.sh
deleted file mode 100644
index c98c7ebbdbef4341fa166c9035a5b9725f46adf7..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/pretrain_bert.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/bin/bash
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-CHECKPOINT_PATH=<Specify path>
-VOCAB_FILE=<Specify path to file>/bert-vocab.txt
-DATA_PATH=<Specify path and file prefix>_text_sentence
-
-BERT_ARGS="
-    --num-layers 24 \
-    --hidden-size 1024 \
-    --num-attention-heads 16 \
-    --seq-length 512 \
-    --max-position-embeddings 512 \
-    --micro-batch-size 4 \
-    --global-batch-size 8 \
-    --lr 0.0001 \
-    --train-iters 2000000 \
-    --lr-decay-iters 990000 \
-    --lr-decay-style linear \
-    --min-lr 0.00001 \
-    --weight-decay 1e-2 \
-    --lr-warmup-fraction .01 \
-    --clip-grad 1.0 \
-    --fp16
-"
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --vocab-file $VOCAB_FILE \
-    --data-impl mmap \
-    --split 949,50,1
-"
-
-OUTPUT_ARGS="
-    --log-interval 100 \
-    --save-interval 10000 \
-    --eval-interval 1000 \
-    --eval-iters 10
-"
-
-torchrun pretrain_bert.py \
-    $BERT_ARGS \
-    $DATA_ARGS \
-    $OUTPUT_ARGS \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
diff --git a/toolbox/Megatron-DeepSpeed/examples/pretrain_bert_distributed.sh b/toolbox/Megatron-DeepSpeed/examples/pretrain_bert_distributed.sh
deleted file mode 100644
index 4a87a7bfba12537253cacbfe0f5e7841d4b9c645..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/pretrain_bert_distributed.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-
-CHECKPOINT_PATH=<Specify path>
-VOCAB_FILE=<Specify path to file>/bert-vocab.txt
-DATA_PATH=<Specify path and file prefix>_text_sentence
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-BERT_ARGS="
-    --num-layers 24 \
-    --hidden-size 1024 \
-    --num-attention-heads 16 \
-    --seq-length 512 \
-    --max-position-embeddings 512 \
-    --micro-batch-size 4 \
-    --global-batch-size 32 \
-    --lr 0.0001 \
-    --train-iters 1000000 \
-    --lr-decay-iters 990000 \
-    --lr-decay-style linear \
-    --min-lr 1.0e-5 \
-    --weight-decay 1e-2 \
-    --lr-warmup-fraction .01 \
-    --clip-grad 1.0 \
-    --fp16
-"
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --vocab-file $VOCAB_FILE \
-    --data-impl mmap \
-    --split 949,50,1
-"
-
-OUTPUT_ARGS="
-    --log-interval 100 \
-    --save-interval 10000 \
-    --eval-interval 1000 \
-    --eval-iters 10
-"
-
-torchrun $DISTRIBUTED_ARGS pretrain_bert.py \
-    $BERT_ARGS \
-    $DATA_ARGS \
-    $OUTPUT_ARGS \
-    --distributed-backend nccl \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
diff --git a/toolbox/Megatron-DeepSpeed/examples/pretrain_bert_distributed_with_mp.sh b/toolbox/Megatron-DeepSpeed/examples/pretrain_bert_distributed_with_mp.sh
deleted file mode 100644
index 62d7f741c232dcf38183c47284eab13fa06db270..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/pretrain_bert_distributed_with_mp.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/bash
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-
-CHECKPOINT_PATH=<Specify path>
-VOCAB_FILE=<Specify path to file>/bert-vocab.txt
-DATA_PATH=<Specify path and file prefix>_text_sentence
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-BERT_ARGS="
-    --tensor-model-parallel-size 2 \
-    --pipeline-model-parallel-size 2 \
-    --num-layers 24 \
-    --hidden-size 1024 \
-    --num-attention-heads 16 \
-    --seq-length 512 \
-    --max-position-embeddings 512 \
-    --micro-batch-size 2 \
-    --global-batch-size 16 \
-    --lr 0.0001 \
-    --train-iters 1000000 \
-    --lr-decay-iters 990000 \
-    --lr-decay-style linear \
-    --min-lr 1.0e-5 \
-    --weight-decay 1e-2 \
-    --lr-warmup-fraction .01 \
-    --clip-grad 1.0 \
-    --fp16
-"
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --vocab-file $VOCAB_FILE \
-    --data-impl mmap \
-    --split 949,50,1
-"
-
-OUTPUT_ARGS="
-    --log-interval 100 \
-    --save-interval 10000 \
-    --eval-interval 1000 \
-    --eval-iters 10
-"
-
-torchrun $DISTRIBUTED_ARGS pretrain_bert.py \
-    $BERT_ARGS \
-    $DATA_ARGS \
-    $OUTPUT_ARGS \
-    --distributed-backend nccl \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
diff --git a/toolbox/Megatron-DeepSpeed/examples/pretrain_gpt.sh b/toolbox/Megatron-DeepSpeed/examples/pretrain_gpt.sh
deleted file mode 100644
index 4956d26ffafd2677b755dd2b07ef14b45c3a1d79..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/pretrain_gpt.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-
-# Runs the "345M" parameter model
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-CHECKPOINT_PATH=<Specify path>
-VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
-MERGE_FILE=<Specify path to file>/gpt2-merges.txt
-DATA_PATH=<Specify path and file prefix>_text_document
-
-GPT_ARGS="
-    --num-layers 24 \
-    --hidden-size 1024 \
-    --num-attention-heads 16 \
-    --seq-length 1024 \
-    --max-position-embeddings 1024 \
-    --micro-batch-size 4 \
-    --global-batch-size 8 \
-    --lr 0.00015 \
-    --train-iters 500000 \
-    --lr-decay-iters 320000 \
-    --lr-decay-style cosine \
-    --min-lr 1.0e-5 \
-    --weight-decay 1e-2 \
-    --lr-warmup-fraction .01 \
-    --clip-grad 1.0 \
-    --fp16
-"
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --vocab-file $VOCAB_FILE \
-    --merge-file $MERGE_FILE \
-    --data-impl mmap \
-    --split 949,50,1
-"
-
-OUTPUT_ARGS="
-    --log-interval 100 \
-    --save-interval 10000 \
-    --eval-interval 1000 \
-    --eval-iters 10
-"
-
-torchrun pretrain_gpt.py \
-    $GPT_ARGS \
-    $DATA_ARGS \
-    $OUTPUT_ARGS \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
diff --git a/toolbox/Megatron-DeepSpeed/examples/pretrain_gpt3_175B.sh b/toolbox/Megatron-DeepSpeed/examples/pretrain_gpt3_175B.sh
deleted file mode 100644
index b423e4bd130ca43c07a3c64c1950fd1a8b5adee9..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/pretrain_gpt3_175B.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-
-
-#SBATCH <SLURM OPTIONS> --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b
-
-
-DIR=`pwd`
-DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
-mkdir -p $DIR/logs
-
-
-DATASET_1="<PATH TO THE FIRST DATASET>"
-DATASET_2="<PATH TO THE SECOND DATASET>"
-DATASET_3="<PATH TO THE THIRD DATASET>"
-DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
-
-
-options=" \
-	--tensor-model-parallel-size 8 \
-	--pipeline-model-parallel-size 16 \
-        --num-layers 96 \
-        --hidden-size 12288 \
-        --num-attention-heads 96 \
-        --seq-length 2048 \
-        --max-position-embeddings 2048 \
-	--micro-batch-size 1 \
-	--global-batch-size 1536 \
-	--rampup-batch-size 16 16 5859375 \
-	--train-samples 146484375 \
-       	--lr-decay-samples 126953125 \
-        --lr-warmup-samples 183105 \
-        --lr 6.0e-5 \
-	--min-lr 6.0e-6 \
-        --lr-decay-style cosine \
-        --log-interval 10 \
-        --eval-iters 40 \
-        --eval-interval 1000 \
-	--data-path ${DATASET} \
-	--vocab-file <PATH TO gpt-vocab.json> \
-	--merge-file <PATH TO gpt-merges.txt> \
-	--save-interval 1000 \
-	--save <PATH TO CHECKPOINTS DIRECTORY> \
-	--load <PATH TO CHECKPOINTS DIRECTORY> \
-        --split 98,2,0 \
-        --clip-grad 1.0 \
-	--weight-decay 0.1 \
-	--adam-beta1 0.9 \
-	--adam-beta2 0.95 \
-	--init-method-std 0.006 \
-	--tensorboard-dir <TENSORBOARD DIRECTORY> \
-        --fp16 \
-	--activations-checkpoint-method uniform "
-
-
-run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}"
-
-
-srun -l \
-     --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \
-     --container-mounts "<DIRECTORIES TO MOUNT>" \
-     --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"
-
-
-set +x
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/pretrain_gpt_distributed.sh b/toolbox/Megatron-DeepSpeed/examples/pretrain_gpt_distributed.sh
deleted file mode 100644
index 24d76a1dc3caf91d707e8190b6586113f49f15f4..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/pretrain_gpt_distributed.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/bin/bash
-
-# Runs the "345M" parameter model
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-
-CHECKPOINT_PATH=<Specify path>
-VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
-MERGE_FILE=<Specify path to file>/gpt2-merges.txt
-DATA_PATH=<Specify path and file prefix>_text_document
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-GPT_ARGS="
-    --num-layers 24 \
-    --hidden-size 1024 \
-    --num-attention-heads 16 \
-    --seq-length 1024 \
-    --max-position-embeddings 1024 \
-    --micro-batch-size 8 \
-    --global-batch-size 64 \
-    --lr 0.00015 \
-    --train-iters 500000 \
-    --lr-decay-iters 320000 \
-    --lr-decay-style cosine \
-    --min-lr 1.0e-5 \
-    --weight-decay 1e-2 \
-    --lr-warmup-fraction .01 \
-    --clip-grad 1.0 \
-    --fp16
-"
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --vocab-file $VOCAB_FILE \
-    --merge-file $MERGE_FILE \
-    --data-impl mmap \
-    --split 949,50,1
-"
-
-OUTPUT_ARGS="
-    --log-interval 100 \
-    --save-interval 10000 \
-    --eval-interval 1000 \
-    --eval-iters 10
-"
-
-torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
-    $GPT_ARGS \
-    $DATA_ARGS \
-    $OUTPUT_ARGS \
-    --distributed-backend nccl \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
diff --git a/toolbox/Megatron-DeepSpeed/examples/pretrain_gpt_distributed_with_mp.sh b/toolbox/Megatron-DeepSpeed/examples/pretrain_gpt_distributed_with_mp.sh
deleted file mode 100644
index 721288fdb0d968a88304b0875726fa1fa9cddac9..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/pretrain_gpt_distributed_with_mp.sh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/bin/bash
-
-# Runs the "345M" parameter model
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-
-CHECKPOINT_PATH=<Specify path>
-VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
-MERGE_FILE=<Specify path to file>/gpt2-merges.txt
-DATA_PATH=<Specify path and file prefix>_text_document
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-GPT_ARGS="
-    --tensor-model-parallel-size 2 \
-    --pipeline-model-parallel-size 2 \
-    --sequence-parallel \
-    --num-layers 24 \
-    --hidden-size 1024 \
-    --num-attention-heads 16 \
-    --seq-length 1024 \
-    --max-position-embeddings 1024 \
-    --micro-batch-size 4 \
-    --global-batch-size 16 \
-    --lr 0.00015 \
-    --train-iters 500000 \
-    --lr-decay-iters 320000 \
-    --lr-decay-style cosine \
-    --min-lr 1.0e-5 \
-    --weight-decay 1e-2 \
-    --lr-warmup-fraction .01 \
-    --clip-grad 1.0 \
-    --fp16
-"
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --vocab-file $VOCAB_FILE \
-    --merge-file $MERGE_FILE \
-    --data-impl mmap \
-    --split 949,50,1
-"
-
-OUTPUT_ARGS="
-    --log-interval 100 \
-    --save-interval 10000 \
-    --eval-interval 1000 \
-    --eval-iters 10
-"
-
-torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
-    $GPT_ARGS \
-    $DATA_ARGS \
-    $OUTPUT_ARGS \
-    --distributed-backend nccl \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/pretrain_ict.sh b/toolbox/Megatron-DeepSpeed/examples/pretrain_ict.sh
deleted file mode 100644
index 8cba0f08ba4c0f9d1697d721ae8e65dd28c1c914..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/pretrain_ict.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#! /bin/bash
-
-# Runs the "217M" parameter biencoder model for ICT retriever
-
-RANK=0
-WORLD_SIZE=1
-
-PRETRAINED_BERT_PATH=<Specify path of pretrained BERT model>
-TEXT_DATA_PATH=<Specify path and file prefix of the text data>
-TITLE_DATA_PATH=<Specify path and file prefix od the titles>
-CHECKPOINT_PATH=<Specify path>
-
-
-python pretrain_ict.py \
-        --num-layers 12 \
-        --hidden-size 768 \
-        --num-attention-heads 12 \
-        --tensor-model-parallel-size 1 \
-        --micro-batch-size 32 \
-        --seq-length 256 \
-        --max-position-embeddings 512 \
-        --train-iters 100000 \
-        --vocab-file bert-vocab.txt \
-        --tokenizer-type BertWordPieceLowerCase \
-        --DDP-impl torch \
-        --bert-load ${PRETRAINED_BERT_PATH} \
-        --log-interval 100 \
-        --eval-interval 1000 \
-        --eval-iters 10 \
-        --retriever-report-topk-accuracies 1 5 10 20 100 \
-        --retriever-score-scaling \
-        --load $CHECKPOINT_PATH \
-        --save $CHECKPOINT_PATH \
-        --data-path ${TEXT_DATA_PATH} \
-        --titles-data-path ${TITLE_DATA_PATH} \
-        --lr 0.0001 \
-        --lr-decay-style linear \
-        --weight-decay 1e-2 \
-        --clip-grad 1.0 \
-        --lr-warmup-fraction 0.01 \
-        --save-interval 4000 \
-        --exit-interval 8000 \
-        --query-in-block-prob 0.1 \
-        --fp16
diff --git a/toolbox/Megatron-DeepSpeed/examples/pretrain_t5.sh b/toolbox/Megatron-DeepSpeed/examples/pretrain_t5.sh
deleted file mode 100644
index 5f4b63ad68afb8f583dec4cfea1e1ab8e8c901c7..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/pretrain_t5.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-CHECKPOINT_PATH=<Specify path>
-VOCAB_FILE=<Specify path to file>/t5-vocab.txt
-DATA_PATH=<Specify path and file prefix>_text_sentence
-
-T5_ARGS="
-    --num-layers 12 \
-    --hidden-size 768 \
-    --num-attention-heads 12 \
-    --kv-channels 64 \
-    --ffn-hidden-size 3072 \
-    --encoder-seq-length 512 \
-    --decoder-seq-length 128 \
-    --max-position-embeddings 512 \
-    --micro-batch-size 16 \
-    --global-batch-size 16 \
-    --lr 0.0001 \
-    --train-iters 1000000 \
-    --lr-decay-iters 1000000 \
-    --lr-decay-style linear \
-    --min-lr 0.00001 \
-    --weight-decay 1e-2 \
-    --lr-warmup-fraction .01 \
-    --clip-grad 1.0 \
-    --fp16 \
-    --vocab-extra-ids 100
-"
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --vocab-file $VOCAB_FILE \
-    --data-impl mmap \
-    --split 949,50,1
-"
-
-OUTPUT_ARGS="
-    --log-interval 100 \
-    --save-interval 10000 \
-    --eval-interval 1000 \
-    --eval-iters 10
-"
-
-torchrun pretrain_t5.py \
-    $T5_ARGS \
-    $DATA_ARGS \
-    $OUTPUT_ARGS \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
diff --git a/toolbox/Megatron-DeepSpeed/examples/pretrain_t5_distributed.sh b/toolbox/Megatron-DeepSpeed/examples/pretrain_t5_distributed.sh
deleted file mode 100644
index eec52458279e48f4886d5d0cf6ec12e55d6c3f90..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/pretrain_t5_distributed.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/bin/bash
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-
-CHECKPOINT_PATH=<Specify path>
-VOCAB_FILE=<Specify path to file>/t5-vocab.txt
-DATA_PATH=<Specify path and file prefix>_text_sentence
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-T5_ARGS="
-    --num-layers 12 \
-    --hidden-size 768 \
-    --num-attention-heads 12 \
-    --kv-channels 64 \
-    --ffn-hidden-size 3072 \
-    --encoder-seq-length 512 \
-    --decoder-seq-length 128 \
-    --max-position-embeddings 512 \
-    --micro-batch-size 16 \
-    --global-batch-size 128 \
-    --lr 0.0001 \
-    --train-iters 1000000 \
-    --lr-decay-iters 1000000 \
-    --lr-decay-style linear \
-    --min-lr 0.00001 \
-    --weight-decay 1e-2 \
-    --lr-warmup-fraction .01 \
-    --clip-grad 1.0 \
-    --fp16 \
-    --vocab-extra-ids 100
-"
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --vocab-file $VOCAB_FILE \
-    --data-impl mmap \
-    --split 949,50,1
-"
-
-OUTPUT_ARGS="
-    --log-interval 100 \
-    --save-interval 10000 \
-    --eval-interval 1000 \
-    --eval-iters 10
-"
-
-torchrun $DISTRIBUTED_ARGS pretrain_t5.py \
-    $T5_ARGS \
-    $DATA_ARGS \
-    $OUTPUT_ARGS \
-    --distributed-backend nccl \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
diff --git a/toolbox/Megatron-DeepSpeed/examples/pretrain_t5_distributed_with_mp.sh b/toolbox/Megatron-DeepSpeed/examples/pretrain_t5_distributed_with_mp.sh
deleted file mode 100644
index d51ecee19ef0e0922542418f9c1935d92fe67c76..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/pretrain_t5_distributed_with_mp.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/bin/bash
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NNODES=1
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-
-CHECKPOINT_PATH=<Specify path>
-VOCAB_FILE=<Specify path to file>/t5-vocab.txt
-DATA_PATH=<Specify path and file prefix>_text_sentence
-
-DISTRIBUTED_ARGS="
-    --nproc_per_node $GPUS_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $NODE_RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT
-"
-
-T5_ARGS="
-    --tensor-model-parallel-size 2 \
-    --num-layers 12 \
-    --hidden-size 768 \
-    --num-attention-heads 12 \
-    --kv-channels 64 \
-    --ffn-hidden-size 3072 \
-    --encoder-seq-length 512 \
-    --decoder-seq-length 128 \
-    --max-position-embeddings 512 \
-    --micro-batch-size 16 \
-    --global-batch-size 128 \
-    --lr 0.0001 \
-    --train-iters 1000000 \
-    --lr-decay-iters 1000000 \
-    --lr-decay-style linear \
-    --min-lr 0.00001 \
-    --weight-decay 1e-2 \
-    --lr-warmup-fraction .01 \
-    --clip-grad 1.0 \
-    --fp16  \
-    --vocab-extra-ids 100
-"
-
-DATA_ARGS="
-    --data-path $DATA_PATH \
-    --vocab-file $VOCAB_FILE \
-    --data-impl mmap \
-    --split 949,50,1
-"
-
-OUTPUT_ARGS="
-    --log-interval 100 \
-    --save-interval 10000 \
-    --eval-interval 1000 \
-    --eval-iters 10
-"
-
-torchrun $DISTRIBUTED_ARGS pretrain_t5.py \
-    $T5_ARGS \
-    $DATA_ARGS \
-    $OUTPUT_ARGS \
-    --distributed-backend nccl \
-    --save $CHECKPOINT_PATH \
-    --load $CHECKPOINT_PATH
diff --git a/toolbox/Megatron-DeepSpeed/examples/run_text_generation_server_345M.sh b/toolbox/Megatron-DeepSpeed/examples/run_text_generation_server_345M.sh
deleted file mode 100644
index a151b98467614b71fb676bef0d1268b12adaa321..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/run_text_generation_server_345M.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-# This example will start serving the 345M model.
-DISTRIBUTED_ARGS="--nproc_per_node 1 \
-                  --nnodes 1 \
-                  --node_rank 0 \
-                  --master_addr localhost \
-                  --master_port 6000"
-
-CHECKPOINT=<Path to checkpoint (e.g /345m)>
-VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
-MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-pip install flask-restful
-
-torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
-       --tensor-model-parallel-size 1  \
-       --pipeline-model-parallel-size 1  \
-       --num-layers 24  \
-       --hidden-size 1024  \
-       --load ${CHECKPOINT}  \
-       --num-attention-heads 16  \
-       --max-position-embeddings 1024  \
-       --tokenizer-type GPT2BPETokenizer  \
-       --fp16  \
-       --micro-batch-size 1  \
-       --seq-length 1024  \
-       --out-seq-length 1024  \
-       --temperature 1.0  \
-       --vocab-file $VOCAB_FILE  \
-       --merge-file $MERGE_FILE  \
-       --top_p 0.9  \
-       --seed 42
diff --git a/toolbox/Megatron-DeepSpeed/examples/run_text_generation_server_345M_8_tensor_parallel.sh b/toolbox/Megatron-DeepSpeed/examples/run_text_generation_server_345M_8_tensor_parallel.sh
deleted file mode 100644
index 027ab421727adfc381c7b03c949ff9250df3505e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/run_text_generation_server_345M_8_tensor_parallel.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-# This example will start serving the 345M model that is partitioned 8 way tensor parallel
-DISTRIBUTED_ARGS="--nproc_per_node 8 \
-                  --nnodes 1 \
-                  --node_rank 0 \
-                  --master_addr localhost \
-                  --master_port 6000"
-
-CHECKPOINT=<Path to checkpoint (e.g /345m)>
-VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
-MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
-
-pip install flask-restful
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
-       --tensor-model-parallel-size 8  \
-       --pipeline-model-parallel-size 1  \
-       --num-layers 24  \
-       --hidden-size 1024  \
-       --load ${CHECKPOINT}  \
-       --num-attention-heads 16  \
-       --max-position-embeddings 1024  \
-       --tokenizer-type GPT2BPETokenizer  \
-       --fp16  \
-       --micro-batch-size 1  \
-       --seq-length 1024  \
-       --out-seq-length 1024  \
-       --temperature 1.0  \
-       --vocab-file $VOCAB_FILE  \
-       --merge-file $MERGE_FILE  \
-       --top_p 0.9  \
-       --seed 42
diff --git a/toolbox/Megatron-DeepSpeed/examples/sc21/CONFIG.sh b/toolbox/Megatron-DeepSpeed/examples/sc21/CONFIG.sh
deleted file mode 100644
index f17ccd7b023ca9aeb538ba38a60808e44418873b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/sc21/CONFIG.sh
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/bin/bash
-
-
-# SLURM options.
-export SLURM_PARTITION=<slurm partition, used to feed -p option in slurm>
-export SLURM_ACCOUNT=<slurm account, used to feed -A option in slurm>
-
-
-# Source code.
-export MEGATRON_CODE_DIR=<megatron source code directory>
-
-
-# This variable is used to mount the relevant part of the filesystem
-# inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
-# launch directory already get mounted; this variable should be used to
-# mount the directories that contain the data and tokenizer files.
-export DOCKER_MOUNT_DIR=<megatron dataset and bpe tokenizer vocab path>
-
-
-# Data and tokenizer files.
-MEGATRON_DATA=<path to megatron processed data>
-BPE_VOCAB_FILE=<path to bpe vocab file>
-BPE_MERGE_FILE=<path to bpe merges file>
-
-
-# Megatron input parameters.
-# `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
-# that are not listed here. 
-export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \
-	--tensor-model-parallel-size ${TP} \
-	--pipeline-model-parallel-size ${PP} \
-	--micro-batch-size ${MBS} \
-	--global-batch-size ${GBS} \
-        --num-layers ${NLS} \
-        --hidden-size ${HS} \
-        --num-attention-heads ${NAH} \
-	--DDP-impl ${DDP} \
-	--data-path ${MEGATRON_DATA} \
-	--vocab-file ${BPE_VOCAB_FILE} \
-	--merge-file ${BPE_MERGE_FILE} \
-        --log-interval 5 \
-        --seq-length 2048 \
-        --max-position-embeddings 2048 \
-        --train-iters 500 \
-        --lr-decay-iters 320 \
-        --lr 0.0001 \
-	--min-lr 0.00001 \
-        --lr-decay-style cosine \
-        --lr-warmup-fraction 0.01 \
-        --split 969,30,1 \
-        --eval-iters 100 \
-        --eval-interval 1000 \
-        --clip-grad 1.0 \
-        --fp16 \
-	--loss-scale 8192 "
-
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/sc21/README.md b/toolbox/Megatron-DeepSpeed/examples/sc21/README.md
deleted file mode 100644
index 940c37903ef063613e3d247b489ba2d186bbea4d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/sc21/README.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# Reproducing Figures in SC21 Paper
-
-
-This directory contains some of the scripts that were used to produce the
-results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is
-to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These
-scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the
-[pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other
-schedulers as well.
-
-
-## Setup
-
-All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please
-update the unspecified values (in angle brackets `<...>`) before launching any
-scripts.
-
-
-
-## Scripts
-
-Below is a list of scripts that can be used to reproduce various figures in our
-[paper](https://arxiv.org/pdf/2104.04473.pdf):
-
-* [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput
-for GPT models ranging from 1 billion to 1 trillion parameters.
-* [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling
-performance of pipeline parallelism.
-* [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of
-the interleaved schedule on a 175B GPT model.
-* [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of
-different degrees of pipeline and tensor model parallelism on a model with
-162.2 billion parameters.
-* [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of
-different degrees of data and pipeline model parallelism on a model with
-5.9 billion parameters.
-* [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of
-different degrees of data and tensor model parallelism on a model with
-5.9 billion parameters.
-* [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of
-microbatch size.
-* [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of
-activation recomputation.
-* [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of
-the scatter-gather communication optimization.
diff --git a/toolbox/Megatron-DeepSpeed/examples/sc21/SBATCH.sh b/toolbox/Megatron-DeepSpeed/examples/sc21/SBATCH.sh
deleted file mode 100644
index 95431b9b7e780bbdd4b18593546356aad02945b1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/sc21/SBATCH.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-
-sbatch -p ${SLURM_PARTITION} \
-       -A ${SLURM_ACCOUNT} \
-       --job-name=${JOB_NAME} \
-       --nodes=${NNODES} \
-       --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh
-
-exit 0
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/sc21/SRUN.sh b/toolbox/Megatron-DeepSpeed/examples/sc21/SRUN.sh
deleted file mode 100644
index 52a9aff0c1294acb1e5527faad4f73fe5e027e21..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/sc21/SRUN.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-#SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8
-
-
-THIS_DIR=`pwd`
-DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
-mkdir -p ${THIS_DIR}/logs
-
-
-CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}"
-
-
-srun -l \
-     --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \
-     --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \
-     --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}"
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_11.sh b/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_11.sh
deleted file mode 100644
index 2ec7d9eb31e50e01e3d5dab6978a71deffd247aa..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_11.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-
-# ================================
-# Choose the case to run.
-# ================================
-
-# Pipeline-parallel size options = [1, 2, 4, 8].
-PP=1
-
-# Batch size (global batch size) options = [8, 128].
-GBS=8
-
-
-
-
-
-# Set pipeline-parallel size options.
-NLS=$((3*PP))
-NNODES=${PP}
-
-
-# Other params.
-TP=8
-MBS=1
-HS=20480
-NAH=128
-DDP=local
-MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
-
-
-# Name of the job.
-export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS}
-
-
-# Import the configs.
-. `pwd`/CONFIG.sh
-
-
-# Submit the job.
-. `pwd`/SBATCH.sh
-
-
-exit 0
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_12.sh b/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_12.sh
deleted file mode 100644
index 11e550854de4cd576d9625ca9dd5330d44fffb76..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_12.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-
-# ================================
-# Choose the case to run.
-# ================================
-
-# Interleaved schedule options = [YES, NO].
-INTERLEAVED=YES
-
-# Batch size (global batch size) options = [12, 24, 36, ..., 60].
-GBS=12
-
-
-
-
-
-# Set interleaved schedule options.
-if [ ${INTERLEAVED} == "YES" ]; then
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
-elif [ ${INTERLEAVED} == "NO" ]; then
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
-else
-    echo "Invalid configuration"
-    exit 1
-fi
-
-
-# Other params.
-TP=8
-PP=12
-MBS=1
-NLS=96
-HS=12288
-NAH=96
-DDP=local
-NNODES=12
-
-
-# Name of the job.
-export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS}
-
-
-# Import the configs.
-. `pwd`/CONFIG.sh
-
-
-# Submit the job.
-. `pwd`/SBATCH.sh
-
-
-exit 0
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_13.sh b/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_13.sh
deleted file mode 100644
index 7ba560e87b253fb63192866d3089c3d967f086e6..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_13.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-
-# ================================
-# Choose the case to run.
-# ================================
-
-# Pipeline-parallel size options = [2, 4, 8, 16, 32].
-PP=2
-
-# Batch size (global batch size) options = [32, 128].
-GBS=32
-
-
-
-
-
-# Set pipeline-parallel and tensor-parallel size options.
-TP=$((64/PP))
-
-
-# Other params.
-MBS=1
-NLS=32
-HS=20480
-NAH=128
-DDP=local
-MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
-NNODES=8
-
-
-# Name of the job.
-export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS}
-
-
-# Import the configs.
-. `pwd`/CONFIG.sh
-
-
-# Submit the job.
-. `pwd`/SBATCH.sh
-
-
-exit 0
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_14.sh b/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_14.sh
deleted file mode 100644
index 4b83879c4bb71546a7fb5bac365491efd96d3049..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_14.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/bin/bash
-
-# ================================
-# Choose the case to run.
-# ================================
-
-# Pipeline-parallel size options = [2, 4, 8, 16, 32].
-PP=2
-
-# Batch size (global batch size) options = [32, 512].
-GBS=32
-
-
-
-
-
-# Set pipeline-parallel and data-parallel size options.
-DP=$((64/PP))
-
-
-# Other params.
-TP=1
-MBS=1
-NLS=32
-HS=3840
-NAH=32
-DDP=local
-MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
-NNODES=8
-
-
-# Name of the job.
-export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS}
-
-
-# Import the configs.
-. `pwd`/CONFIG.sh
-
-
-# Submit the job.
-. `pwd`/SBATCH.sh
-
-
-exit 0
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_15.sh b/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_15.sh
deleted file mode 100644
index 547ad1de6fb091ca5f922e2b48559ceadffa7ce8..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_15.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/bin/bash
-
-# ================================
-# Choose the case to run.
-# ================================
-
-# Tensor-parallel size options = [2, 4, 8, 16, 32].
-TP=2
-
-# Batch size (global batch size) options = [32, 128, 512].
-GBS=32
-
-
-
-
-
-# Set tensor-parallel and data-parallel size options.
-DP=$((64/TP))
-
-
-# Other params.
-PP=1
-MBS=1
-NLS=32
-HS=3840
-NAH=32
-DDP=local
-MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
-NNODES=8
-
-
-# Name of the job.
-export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}
-
-
-# Import the configs.
-. `pwd`/CONFIG.sh
-
-
-# Submit the job.
-. `pwd`/SBATCH.sh
-
-
-exit 0
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_16.sh b/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_16.sh
deleted file mode 100644
index 8c353a3e7623262baf9dc6c24554e9ab4dce26e7..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_16.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-# ================================
-# Choose the case to run.
-# ================================
-
-# Microbatch size options = [1, 2, 4, 8].
-MBS=1
-
-# Batch size (global batch size) options = [128, 512].
-GBS=128
-
-
-
-
-
-# Other params.
-TP=8
-PP=8
-NLS=32
-HS=15360
-NAH=128
-DDP=local
-MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
-NNODES=8
-
-
-# Name of the job.
-export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS}
-
-
-# Import the configs.
-. `pwd`/CONFIG.sh
-
-
-# Submit the job.
-. `pwd`/SBATCH.sh
-
-
-exit 0
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_17.sh b/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_17.sh
deleted file mode 100644
index d6899b321d6c11238af3b12da3690c8c3d46be34..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_17.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-
-# ================================
-# Choose the case to run.
-# ================================
-
-# Activation recomputation options = [YES, NO].
-ACTIVATION_RECOMPUTATION=YES
-
-# Batch size (global batch size) options = [1, 2, 4, ..., 256].
-GBS=1
-
-
-
-
-
-# Set activation recomputation.
-if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
-elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then
-    MEGATRON_EXTRA_PARAMS=""
-else
-    echo "Invalid configuration"
-    exit 1
-fi
-
-
-# Other params.
-TP=8
-PP=16
-MBS=1
-NLS=80
-HS=12288
-NAH=96
-DDP=local
-NNODES=16
-
-
-# Name of the job.
-export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS}
-
-
-# Import the configs.
-. `pwd`/CONFIG.sh
-
-
-# Submit the job.
-. `pwd`/SBATCH.sh
-
-
-exit 0
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_18.sh b/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_18.sh
deleted file mode 100644
index 88924fb820be4767ed6aa00633682ece581329db..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/sc21/run_figure_18.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-
-# ================================
-# Choose the case to run.
-# ================================
-
-# Scatter-gather communication optimization options = [YES, NO].
-SCATTER_GATHER=YES
-
-# Batch size (global batch size) options = [12, 24, 36, ..., 60].
-GBS=12
-
-
-
-
-
-# Set scatter-gather communication optimization options.
-if [ ${SCATTER_GATHER} == "YES" ]; then
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
-elif [ ${SCATTER_GATHER} == "NO" ]; then
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
-else
-    echo "Invalid configuration"
-    exit 1
-fi
-
-
-# Other params.
-TP=8
-PP=12
-MBS=1
-NLS=96
-HS=12288
-NAH=96
-DDP=local
-NNODES=12
-
-
-# Name of the job.
-export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
-
-
-# Import the configs.
-. `pwd`/CONFIG.sh
-
-
-# Submit the job.
-. `pwd`/SBATCH.sh
-
-
-exit 0
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/examples/sc21/run_table_1.sh b/toolbox/Megatron-DeepSpeed/examples/sc21/run_table_1.sh
deleted file mode 100644
index 1b15fb04582c90dc47fb1bbd3aca46feca2585ba..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples/sc21/run_table_1.sh
+++ /dev/null
@@ -1,145 +0,0 @@
-#!/bin/bash
-
-# ================================
-# Choose the case to run.
-# ================================
-# model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T]
-MODEL_SIZE=1.7B
-
-
-
-
-
-
-if [ ${MODEL_SIZE} == "1.7B" ]; then
-    TP=1
-    PP=1
-    MBS=16
-    GBS=512
-    NLS=24
-    HS=2304
-    NAH=24
-    DDP=torch
-    NNODES=4
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
-elif [ ${MODEL_SIZE} == "3.6B" ]; then
-    TP=2
-    PP=1
-    MBS=16
-    GBS=512
-    NLS=30
-    HS=3072
-    NAH=32
-    DDP=torch
-    NNODES=8
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
-elif [ ${MODEL_SIZE} == "7.5B" ]; then
-    TP=4
-    PP=1
-    MBS=16
-    GBS=512
-    NLS=36
-    HS=4096
-    NAH=32
-    DDP=torch
-    NNODES=16
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
-elif [ ${MODEL_SIZE} == "18B" ]; then
-    TP=8
-    PP=1
-    MBS=8
-    GBS=1024
-    NLS=40
-    HS=6144
-    NAH=48
-    DDP=torch
-    NNODES=32
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
-elif [ ${MODEL_SIZE} == "39B" ]; then
-    TP=8
-    PP=2
-    MBS=4
-    GBS=1536
-    NLS=48
-    HS=8192
-    NAH=64
-    DDP=local
-    NNODES=64
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
-elif [ ${MODEL_SIZE} == "76B" ]; then
-    TP=8
-    PP=4
-    MBS=2
-    GBS=1792
-    NLS=60
-    HS=10240
-    NAH=80
-    DDP=local
-    NNODES=128
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5"
-elif [ ${MODEL_SIZE} == "145B" ]; then
-    TP=8
-    PP=8
-    MBS=2
-    GBS=2304
-    NLS=80
-    HS=12288
-    NAH=96
-    DDP=local
-    NNODES=192
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
-elif [ ${MODEL_SIZE} == "310B" ]; then
-    TP=8
-    PP=16
-    MBS=1
-    GBS=2160
-    NLS=96
-    HS=16384
-    NAH=128
-    DDP=local
-    NNODES=240
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 "
-elif [ ${MODEL_SIZE} == "530B" ]; then
-    TP=8
-    PP=35
-    MBS=1
-    GBS=2520
-    NLS=105
-    HS=20480
-    NAH=128
-    DDP=local
-    NNODES=315
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 "
-elif [ ${MODEL_SIZE} == "1T" ]; then
-    TP=8
-    PP=64
-    MBS=1
-    GBS=3072
-    NLS=128
-    HS=25600
-    NAH=160
-    DDP=local
-    NNODES=384
-    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
-else
-    echo "Invalid configuration"
-    exit 1
-fi
-
-
-# Name of the job
-export JOB_NAME=results_table_1_model_size_${MODEL_SIZE}
-
-
-# Import the configs.
-. `pwd`/CONFIG.sh
-
-
-# Submit the job.
-. `pwd`/SBATCH.sh
-
-
-exit 0
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_config_gpt_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_config_gpt_TEMPLATE.json
deleted file mode 100644
index 5a14931cb99d667078a36ffac07b7b8ff9a470e6..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_config_gpt_TEMPLATE.json
+++ /dev/null
@@ -1,38 +0,0 @@
-{
-  "train_batch_size" : CONFIG_BATCH_SIZE,
-  "train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": CONFIG_FP16_ENABLED,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "bf16": {
-    "enabled": CONFIG_BF16_ENABLED
-  },
-  "curriculum_learning": {
-    "enabled": CONFIG_CL_ENABLED,
-    "curriculum_type": "seqlen",
-    "min_difficulty": CONFIG_CL_MIN,
-    "max_difficulty": CONFIG_CL_MAX,
-    "schedule_type": "fixed_linear",
-    "schedule_config": {
-      "total_curriculum_step": CONFIG_CL_DURATION,
-      "difficulty_step": 8
-    }
-  },
-
-  "wall_clock_breakdown" : false
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_config_gpt_Zero2_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_config_gpt_Zero2_TEMPLATE.json
deleted file mode 100644
index 4d0a68f72deb3930c85adb69f37b331a706f6b22..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_config_gpt_Zero2_TEMPLATE.json
+++ /dev/null
@@ -1,38 +0,0 @@
-{
-  "train_batch_size" : CONFIG_BATCH_SIZE,
-  "train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": 2
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": false,
-
-  "fp16": {
-    "enabled": CONFIG_FP16_ENABLED,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "bf16": {
-    "enabled": CONFIG_BF16_ENABLED
-  },
-  "curriculum_learning": {
-    "enabled": CONFIG_CL_ENABLED,
-    "curriculum_type": "seqlen",
-    "min_difficulty": CONFIG_CL_MIN,
-    "max_difficulty": CONFIG_CL_MAX,
-    "schedule_type": "fixed_linear",
-    "schedule_config": {
-      "total_curriculum_step": CONFIG_CL_DURATION,
-      "difficulty_step": 8
-    }
-  },
-
-  "wall_clock_breakdown" : false
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_evalharness.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_evalharness.sh
deleted file mode 100644
index 3496ada20d13c98845686c1c847a536bb3203a39..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_evalharness.sh
+++ /dev/null
@@ -1,72 +0,0 @@
-# This is an example zero-shot eval script. Please first read the readme_evalharness.md under the same directory.
-
-CHECKPOINT_PATH=/blob/users/conglli/project/gpt3_with_pile/checkpoint/gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B/global_step81566/
-CONFIG_PATH=ds_config_gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B.json
-RESULT_PATH=gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B_global_step81566.log
-
-PP_SIZE=1
-TP_SIZE=1
-NO_PP="true"
-EP_PARALLEL_SIZE=1
-# Currently eval harness does not support data parallel
-# However, for MoE models it's possible to enable a "fake data parallel"
-# in order to load experts on multiple gpus. At the same time, it's not
-# real data parallel because we load the same data on all gpus.
-# On the other hand, it's better to use less number of gpus than training,
-# to reduce communication overhead.
-NUM_NODE=1
-NUM_GPU_PER_NODE=1
-
-TASKS="lambada"
-# WikiText-2, not used in GPT-3 paper but used in GPT-2 paper
-# TASKS="wikitext"
-# Tasks that appeared in GPT-3 paper (sorted based on the order in paper), plus WikiText-2.
-# TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext"
-# All tasks that confirmed to work, there are more tasks on https://github.com/EleutherAI/lm-evaluation-harness that we didn't test.
-# TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext,logiqa,mathqa,mc_taco,mrpc,prost,pubmedqa,qnli,qqp,sciq,sst,wnli"
-
-VOCAB_FILE=/data/Megatron-LM/data/gpt2-vocab.json
-MERGE_FILE=/data/Megatron-LM/data/gpt2-merges.txt
-
-# export HF_DATASETS_OFFLINE=1
-
-# Dummy arguments to make megatron happy. No need to configure them.
-# The reason we don't need to configure them and many other arguments is
-# because the eval framework will read the arguments from checkpoint file.
-MEGATRON_REQUIRED_ARGS="\
-    --num-layers -1\
-    --hidden-size -1\
-    --num-attention-heads -1\
-    --seq-length -1 \
-    --max-position-embeddings -1
-"
-
-CMD="../../tasks/eval_harness/evaluate.py \
-    --load $CHECKPOINT_PATH\
-    --tensor-model-parallel-size $TP_SIZE \
-    --pipeline-model-parallel-size $PP_SIZE\
-    --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
-    --vocab-file $VOCAB_FILE\
-    --merge-file $MERGE_FILE\
-    --micro-batch-size 12\
-    --no-load-optim \
-    --no-load-rng \
-    --inference \
-    --disable-moe-token-dropping \
-    --tokenizer-type GPT2BPETokenizer \
-    --adaptive_seq_len\
-    --eval_fp32\
-    --task_list $TASKS\
-    --results_path $RESULT_PATH \
-    --deepspeed \
-    --deepspeed_config $CONFIG_PATH \
-    $MEGATRON_REQUIRED_ARGS\
-    "
-
-if [[ "${NO_PP}" = "true" ]]; then
-CMD="${CMD} \
-    --no-pipeline-parallel"
-fi
-
-LAUNCHER="deepspeed --num_nodes $NUM_NODE --num_gpus $NUM_GPU_PER_NODE"
-$LAUNCHER $CMD
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_MoE128.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_MoE128.sh
deleted file mode 100644
index 0f2805dfd0fe501a4081fd4a2f8c9e83e298f223..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_MoE128.sh
+++ /dev/null
@@ -1,348 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-# MODEL_SIZE=0.125
-# NUM_LAYERS=12
-# HIDDEN_SIZE=768
-# NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-# MIN_LR=6.0e-5
-
-## GPT-3 Medium 350M
-# MODEL_SIZE=0.35
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1024
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=3.0e-4
-# MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-MODEL_SIZE=1.3
-NUM_LAYERS=24
-HIDDEN_SIZE=2048
-NUM_ATTN_HEADS=16
-GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-# MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-## For MoE model, we found sometimes training a bit more to 330B tokens helps
-TRAIN_TOKENS=300000000000
-# TRAIN_TOKENS=330000000000
-
-## TRAIN_ITERS is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some steps,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_ITERS.
-TRAIN_ITERS=$(( ${TRAIN_TOKENS} * 3 / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-## For MoE model, we found that setting the decay token to 300B helps.
-WARMUP_TOKENS=375000000
-# LR_DECAY_TOKENS=260000000000
-LR_DECAY_TOKENS=300000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=8
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism
-## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
-## to 1 and use the "--no-pipeline-parallel" arg.
-PP_SIZE=1
-NUM_GPUS=64
-###############################################################################
-### MoE configs
-## Number of experts. EP_SIZE 1 means dense model without MoE
-# EP_SIZE=1
-EP_SIZE=128
-
-if [[ $EP_SIZE -gt $NUM_GPUS ]]; then
-    EP_PARALLEL_SIZE=$NUM_GPUS
-else
-    EP_PARALLEL_SIZE=$EP_SIZE
-fi
-
-## Original GPT-3 model always set min LR at 10% of max LR. For MoE model, we
-## found that lower LR and min LR (than the base dense model) helps.
-## For 1.3B MoE-128 model we used LR=1.2e-4 and MIN_LR=1.0e-6.
-## For 350M MoE-128 model we used LR=2.0e-4 and MIN_LR=2.0e-6, but they are not
-## heavily tuned.
-LR=1.2e-4
-MIN_LR=1.0e-6
-
-## Coefficient for MoE loss. We find that 0.01 is a good value at least for
-## 1.3B MoE-128 model
-MLC=0.01
-
-## Below configs adjust the MoE expert token capacity limit during training and
-## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
-## Larger capacity factor or disabling capacity limit could improve training
-## convergence, but will also reduce training throughput.
-MOE_TRAIN_CAP_FACTOR=1.0
-MOE_EVAL_CAP_FACTOR=1.0
-MOE_MIN_CAP=4
-MOE_DROP_TOKEN="true"
-# MOE_DROP_TOKEN="false"
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=80
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_TOKENS=$((${CL_TOKENS} * 1000000000))
-CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=10000
-
-## Standard deviation for weight initialization
-## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
-## dense model. Usually larger model needs lower std.
-INIT_STD=0.014
-# INIT_STD=0.01
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-if [[ $EP_SIZE -gt 1 ]]; then
-    NAME="${NAME}-ep-${EP_SIZE}-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
-fi
-
-OUTPUT_BASEPATH=$DIR/output
-mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
-mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
-mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
-mkdir -p ${TENSORBOARD_DIR} 
-## Note that for MoE model with billion-scale base model, the checkpoint can be
-## as large as TB-scale which normal NFS cannot handle efficiently.
-CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-
-# USE_INTERNAL_DATA="true"
-USE_INTERNAL_DATA="false"
-
-if [ "${USE_INTERNAL_DATA}" = "true" ]; then
-    ## The internal data is only accessible within Microsoft
-    ## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
-    # BASE_DATA_PATH=/vc_data/Megatron-LM/data
-    # DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
-    ## For cluster Lab-RR1-V100
-    BASE_DATA_PATH=/data/Megatron-LM/data
-    DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
-    ## For cluster Azure-CentralUS-A100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME=/vc_data_1/users/amawa/blended
-
-    VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-    MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-    ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
-    BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
-    B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
-    CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
-    CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
-    GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
-    GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
-    NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
-    OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
-    PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
-    PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
-    RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
-    SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
-    ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
-    WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
-    DATA_BLEND="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
-    0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
-    0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
-    0.01359 ${ARX} 0.01588 ${GIT}"
-else
-    VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-    MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-    # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-    DATA_BLEND=/data/the_pile_public_merged_nopreprocessing/pile_text_document
-fi
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_BLEND} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
-        --num-experts ${EP_SIZE} \
-        --moe-loss-coeff ${MLC} \
-        --moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
-        --moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
-        --moe-min-capacity ${MOE_MIN_CAP} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers ${NUM_LAYERS} \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-iters ${TRAIN_ITERS} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_DIR}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [[ $EP_SIZE -gt 1 ]]; then
-megatron_options="${megatron_options} \
-        --create-moe-param-group"
-fi
-
-if [ "${MOE_DROP_TOKEN}" = "false" ]; then
-megatron_options="${megatron_options} \
-        --disable-moe-token-dropping"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_gpt_${NAME}.json"
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/0/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-	  > ${config_json}
-
-deepspeed_options=" \
-		    --deepspeed \
-		    --deepspeed_config ${config_json} \
-		    --pipeline-model-parallel-size ${PP_SIZE}"
-
-# Currently MoE is not compatible with pipeline parallel
-if [[ $EP_SIZE -gt 1 ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_PR-MoE64or128.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_PR-MoE64or128.sh
deleted file mode 100644
index f758ac69bf3bff404e63d019a07c9722360b1241..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_PR-MoE64or128.sh
+++ /dev/null
@@ -1,340 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-# MODEL_SIZE=0.125
-# NUM_LAYERS=12
-# HIDDEN_SIZE=768
-# NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-# MIN_LR=6.0e-5
-
-## GPT-3 Medium 350M
-# MODEL_SIZE=0.35
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1024
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=3.0e-4
-# MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-MODEL_SIZE=1.3
-NUM_LAYERS=24
-HIDDEN_SIZE=2048
-NUM_ATTN_HEADS=16
-GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-# MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-## For MoE model, we found sometimes training a bit more to 330B tokens helps
-TRAIN_TOKENS=300000000000
-# TRAIN_TOKENS=330000000000
-
-## TRAIN_ITERS is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some steps,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_ITERS.
-TRAIN_ITERS=$(( ${TRAIN_TOKENS} * 3 / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-## For MoE model, we found that setting the decay token to 300B helps.
-WARMUP_TOKENS=375000000
-# LR_DECAY_TOKENS=260000000000
-LR_DECAY_TOKENS=300000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=8
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism
-## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
-## to 1 and use the "--no-pipeline-parallel" arg.
-PP_SIZE=1
-NUM_GPUS=64
-###############################################################################
-### MoE configs
-## Number of experts. EP_SIZE 128 means standard MoE
-# EP_SIZE=128
-EP_SIZE="64 64 64 64 64 64 64 64 64 64 128 128"
-
-
-EP_PARALLEL_SIZE=$NUM_GPUS
-
-
-## Original GPT-3 model always set min LR at 10% of max LR. For MoE model, we
-## found that lower LR and min LR (than the base dense model) helps.
-## For 1.3B PR-MoE-64/128 model we used LR=1.2e-4 and MIN_LR=1.0e-6.
-## heavily tuned.
-LR=1.2e-4
-MIN_LR=1.0e-6
-
-## Coefficient for MoE loss. We find that 0.01 is a good value at least for
-## 1.3B MoE-128 model
-MLC=0.01
-
-## Below configs adjust the MoE expert token capacity limit during training and
-## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
-## Larger capacity factor or disabling capacity limit could improve training
-## convergence, but will also reduce training throughput.
-MOE_TRAIN_CAP_FACTOR=1.0
-MOE_EVAL_CAP_FACTOR=1.0
-MOE_MIN_CAP=4
-MOE_DROP_TOKEN="true"
-# MOE_DROP_TOKEN="false"
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=80
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_TOKENS=$((${CL_TOKENS} * 1000000000))
-CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=10000
-
-## Standard deviation for weight initialization
-## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
-## dense model. Usually larger model needs lower std.
-INIT_STD=0.014
-# INIT_STD=0.01
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-NAME="${NAME}-ep-pyramid-64+128-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
-
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
-fi
-
-OUTPUT_BASEPATH=$DIR/output
-mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
-mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
-mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
-mkdir -p ${TENSORBOARD_DIR} 
-## Note that for MoE model with billion-scale base model, the checkpoint can be
-## as large as TB-scale which normal NFS cannot handle efficiently.
-CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-
-# USE_INTERNAL_DATA="true"
-USE_INTERNAL_DATA="false"
-
-if [ "${USE_INTERNAL_DATA}" = "true" ]; then
-    ## The internal data is only accessible within Microsoft
-    ## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
-    BASE_DATA_PATH=/vc_data/Megatron-LM/data
-    DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
-    ## For cluster Lab-RR1-V100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
-    ## For cluster Azure-CentralUS-A100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME=/vc_data_1/users/amawa/blended
-
-    VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-    MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-    ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
-    BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
-    B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
-    CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
-    CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
-    GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
-    GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
-    NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
-    OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
-    PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
-    PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
-    RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
-    SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
-    ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
-    WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
-    DATA_BLEND="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
-    0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
-    0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
-    0.01359 ${ARX} 0.01588 ${GIT}"
-else
-    VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-    MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-    # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-    DATA_BLEND=/data/the_pile_public_merged_nopreprocessing/pile_text_document
-fi
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_BLEND} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
-        --num-experts ${EP_SIZE} \
-        --moe-loss-coeff ${MLC} \
-        --mlp-type residual \
-        --moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
-        --moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
-        --moe-min-capacity ${MOE_MIN_CAP} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers ${NUM_LAYERS} \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-iters ${TRAIN_ITERS} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_DIR}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-megatron_options="${megatron_options} \
-        --create-moe-param-group"
-
-
-if [ "${MOE_DROP_TOKEN}" = "false" ]; then
-megatron_options="${megatron_options} \
-        --disable-moe-token-dropping"
-fi
-
-template_json="ds_config_gpt_Zero2_TEMPLATE.json"
-config_json="ds_config_gpt_${NAME}.json"
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-	  > ${config_json}
-
-deepspeed_options=" \
-		    --deepspeed \
-		    --deepspeed_config ${config_json} \
-		    --pipeline-model-parallel-size ${PP_SIZE}"
-
-# Currently MoE is not compatible with pipeline parallel
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_PR-MoE64or128_MoS.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_PR-MoE64or128_MoS.sh
deleted file mode 100644
index 34bc60548f3591130409c2cdb27eef33a96a14af..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_PR-MoE64or128_MoS.sh
+++ /dev/null
@@ -1,354 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-# MODEL_SIZE=0.125
-# NUM_LAYERS=12
-# HIDDEN_SIZE=768
-# NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-# MIN_LR=6.0e-5
-
-## GPT-3 Medium 350M
-# MODEL_SIZE=0.35
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1024
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=3.0e-4
-# MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-MODEL_SIZE=1.3
-NUM_LAYERS=24
-HIDDEN_SIZE=2048
-NUM_ATTN_HEADS=16
-GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-# MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-## For MoE model, we found sometimes training a bit more to 330B tokens helps
-TRAIN_TOKENS=300000000000
-# TRAIN_TOKENS=330000000000
-
-## TRAIN_ITERS is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some steps,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_ITERS.
-TRAIN_ITERS=$(( ${TRAIN_TOKENS} * 3 / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-## For MoE model, we found that setting the decay token to 300B helps.
-WARMUP_TOKENS=375000000
-# LR_DECAY_TOKENS=260000000000
-LR_DECAY_TOKENS=300000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=4
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism
-## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
-## to 1 and use the "--no-pipeline-parallel" arg.
-PP_SIZE=1
-NUM_GPUS=128
-###############################################################################
-### MoE configs
-## Number of experts. EP_SIZE 128 means standard MoE
-# EP_SIZE=128
-EP_SIZE="64 64 64 64 64 64 64 64 128 128"
-EP_SIZE_TEACHER="64 64 64 64 64 64 64 64 64 64 128 128"
-
-EP_PARALLEL_SIZE=$NUM_GPUS
-
-
-## Original GPT-3 model always set min LR at 10% of max LR. For MoE model, we
-## found that lower LR and min LR (than the base dense model) helps.
-## For 1.3B PR-MoE-64/128 model we used LR=1.2e-4 and MIN_LR=1.0e-6.
-## heavily tuned.
-LR=1.2e-4
-MIN_LR=1.0e-6
-
-## Coefficient for MoE loss. We find that 0.01 is a good value at least for
-## 1.3B MoE-128 model
-MLC=0.01
-
-## Below configs adjust the MoE expert token capacity limit during training and
-## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
-## Larger capacity factor or disabling capacity limit could improve training
-## convergence, but will also reduce training throughput.
-MOE_TRAIN_CAP_FACTOR=1.0
-MOE_EVAL_CAP_FACTOR=1.0
-MOE_MIN_CAP=4
-MOE_DROP_TOKEN="true"
-# MOE_DROP_TOKEN="false"
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=80
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_TOKENS=$((${CL_TOKENS} * 1000000000))
-CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=10000
-
-## Standard deviation for weight initialization
-## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
-## dense model. Usually larger model needs lower std.
-INIT_STD=0.014
-# INIT_STD=0.01
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-NAME="${NAME}-ep-pyramid-64+128-mos-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
-
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
-fi
-
-OUTPUT_BASEPATH=$DIR/output
-mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
-mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
-mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}"
-mkdir -p ${TENSORBOARD_DIR} 
-## Note that for MoE model with billion-scale base model, the checkpoint can be
-## as large as TB-scale which normal NFS cannot handle efficiently.
-CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-
-### Mixture-of-Students (MoS) configs
-KD_BETA_CE=1
-CHECKPOINT_PATH_STUDENT="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-CHECKPOINT_PATH_TEACHER="${OUTPUT_BASEPATH}/checkpoint/gpt-1.3B-lr-1.2e-4-minlr-1.0e-6-bs-512-gpus-128-mp-1-pp-1-ep-pyramid-64+128-mlc-0.01-cap-1.0-drop-true/"
-CHECKPOINT_PATH_SAVE="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-
-USE_INTERNAL_DATA="true"
-# USE_INTERNAL_DATA="false"
-
-if [ "${USE_INTERNAL_DATA}" = "true" ]; then
-    ## The internal data is only accessible within Microsoft
-    ## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
-    BASE_DATA_PATH=/vc_data/Megatron-LM/data
-    DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
-    ## For cluster Lab-RR1-V100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
-    ## For cluster Azure-CentralUS-A100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME=/vc_data_1/users/amawa/blended
-
-    VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-    MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-    ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
-    BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
-    B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
-    CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
-    CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
-    GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
-    GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
-    NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
-    OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
-    PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
-    PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
-    RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
-    SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
-    ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
-    WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
-    DATA_BLEND="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
-    0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
-    0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
-    0.01359 ${ARX} 0.01588 ${GIT}"
-else
-    ## Placeholder, we plan to test a public dataset
-    VOCAB_PATH=""
-    MERGE_PATH=""
-    DATA_BLEND=""
-fi
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_BLEND} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
-        --num-experts ${EP_SIZE} \
-        --moe-loss-coeff ${MLC} \
-        --mlp-type residual \
-        --moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
-        --moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
-        --moe-min-capacity ${MOE_MIN_CAP} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers 21 \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-iters ${TRAIN_ITERS} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH_STUDENT} \
-        --save ${CHECKPOINT_PATH_SAVE} \
-        --mos \
-        --kd-beta-ce ${KD_BETA_CE} \
-        --num-layers-teacher ${NUM_LAYERS} \
-        --num-experts-teacher ${EP_SIZE_TEACHER} \
-        --hidden-size-teacher ${HIDDEN_SIZE} \
-        --num-attention-heads-teacher ${NUM_ATTN_HEADS} \
-        --load-teacher ${CHECKPOINT_PATH_TEACHER} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_DIR}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-megatron_options="${megatron_options} \
-        --create-moe-param-group"
-
-
-if [ "${MOE_DROP_TOKEN}" = "false" ]; then
-megatron_options="${megatron_options} \
-        --disable-moe-token-dropping"
-fi
-
-template_json="ds_config_gpt_Zero2_TEMPLATE.json"
-config_json="ds_config_gpt_${NAME}.json"
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-	  > ${config_json}
-
-deepspeed_options=" \
-		    --deepspeed \
-		    --deepspeed_config ${config_json} \
-		    --pipeline-model-parallel-size ${PP_SIZE}"
-
-# Currently MoE is not compatible with pipeline parallel
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-# run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_dense.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_dense.sh
deleted file mode 100644
index 27b546435abda16cb554da0a215ba87ba4921646..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_dense.sh
+++ /dev/null
@@ -1,349 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-# MODEL_SIZE=0.125
-# NUM_LAYERS=12
-# HIDDEN_SIZE=768
-# NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-# MIN_LR=6.0e-5
-
-## GPT-3 Medium 350M
-# MODEL_SIZE=0.35
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1024
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=3.0e-4
-# MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-MODEL_SIZE=1.3
-NUM_LAYERS=24
-HIDDEN_SIZE=2048
-NUM_ATTN_HEADS=16
-GLOBAL_BATCH_SIZE=512
-LR=2.0e-4
-MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-## For MoE model, we found sometimes training a bit more to 330B tokens helps
-TRAIN_TOKENS=300000000000
-# TRAIN_TOKENS=330000000000
-
-## TRAIN_SAMPLES is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some steps,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_SAMPLES.
-TRAIN_SAMPLES=$(( ${TRAIN_TOKENS} * 3 / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-## For MoE model, we found that setting the decay token to 300B helps.
-WARMUP_TOKENS=375000000
-LR_DECAY_TOKENS=260000000000
-# LR_DECAY_TOKENS=300000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=2
-
-## Model parallelism, 1 is no MP
-MP_SIZE=4
-
-## Pipeline parallelism
-## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
-## to 1 and use the "--no-pipeline-parallel" arg.
-PP_SIZE=1
-NUM_GPUS=64
-###############################################################################
-### MoE configs
-## Number of experts. EP_SIZE 1 means dense model without MoE
-EP_SIZE=1
-# EP_SIZE=128
-
-if [[ $EP_SIZE -gt $NUM_GPUS ]]; then
-    EP_PARALLEL_SIZE=$NUM_GPUS
-else
-    EP_PARALLEL_SIZE=$EP_SIZE
-fi
-
-## Original GPT-3 model always set min LR at 10% of max LR. For MoE model, we
-## found that lower LR and min LR (than the base dense model) helps.
-## For 1.3B MoE-128 model we used LR=1.2e-4 and MIN_LR=1.0e-6.
-## For 350M MoE-128 model we used LR=2.0e-4 and MIN_LR=2.0e-6, but they are not
-## heavily tuned.
-# LR=2.0e-4
-# MIN_LR=2e-06
-
-## Coefficient for MoE loss. We find that 0.01 is a good value at least for
-## 1.3B MoE-128 model
-MLC=0.01
-
-## Below configs adjust the MoE expert token capacity limit during training and
-## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
-## Larger capacity factor or disabling capacity limit could improve training
-## convergence, but will also reduce training throughput.
-MOE_TRAIN_CAP_FACTOR=1.0
-MOE_EVAL_CAP_FACTOR=1.0
-MOE_MIN_CAP=4
-MOE_DROP_TOKEN="true"
-# MOE_DROP_TOKEN="false"
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=80
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_TOKENS=$((${CL_TOKENS} * 1000000000))
-CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=1000
-
-## Standard deviation for weight initialization
-## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
-## dense model. Usually larger model needs lower std.
-INIT_STD=0.014
-# INIT_STD=0.01
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-if [[ $EP_SIZE -gt 1 ]]; then
-    NAME="${NAME}-ep-${EP_SIZE}-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
-fi
-
-OUTPUT_BASEPATH=$DIR/output
-mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
-mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
-mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
-mkdir -p ${TENSORBOARD_DIR} 
-## Note that for MoE model with billion-scale base model, the checkpoint can be
-## as large as TB-scale which normal NFS cannot handle efficiently.
-CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-
-# USE_INTERNAL_DATA="true"
-USE_INTERNAL_DATA="false"
-
-if [ "${USE_INTERNAL_DATA}" = "true" ]; then
-    ## The internal data is only accessible within Microsoft
-    ## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
-    # BASE_DATA_PATH=/vc_data/Megatron-LM/data
-    # DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
-    ## For cluster Lab-RR1-V100
-    BASE_DATA_PATH=/data/Megatron-LM/data
-    DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
-    ## For cluster Azure-CentralUS-A100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME=/vc_data_1/users/amawa/blended
-
-    VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-    MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-    ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
-    BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
-    B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
-    CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
-    CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
-    GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
-    GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
-    NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
-    OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
-    PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
-    PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
-    RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
-    SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
-    ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
-    WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
-    DATA_BLEND="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
-    0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
-    0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
-    0.01359 ${ARX} 0.01588 ${GIT}"
-else
-    VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-    MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-    # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-    DATA_BLEND=/data/the_pile_public_merged_nopreprocessing/pile_text_document
-fi
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_BLEND} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
-        --num-experts ${EP_SIZE} \
-        --moe-loss-coeff ${MLC} \
-        --moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
-        --moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
-        --moe-min-capacity ${MOE_MIN_CAP} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --rampup-batch-size 32 32 1953125 \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers ${NUM_LAYERS} \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-samples ${TRAIN_SAMPLES} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_DIR}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [[ $EP_SIZE -gt 1 ]]; then
-megatron_options="${megatron_options} \
-        --create-moe-param-group"
-fi
-
-if [ "${MOE_DROP_TOKEN}" = "false" ]; then
-megatron_options="${megatron_options} \
-        --disable-moe-token-dropping"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_gpt_${NAME}.json"
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/0/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-	  > ${config_json}
-
-deepspeed_options=" \
-		    --deepspeed \
-		    --deepspeed_config ${config_json} \
-		    --pipeline-model-parallel-size ${PP_SIZE}"
-
-# Currently MoE is not compatible with pipeline parallel
-if [[ $EP_SIZE -gt 1 ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_dense_cl.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_dense_cl.sh
deleted file mode 100644
index e40b55b80969698e952a05897dc0c728488fb1e2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_1.3B_dense_cl.sh
+++ /dev/null
@@ -1,285 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-# MODEL_SIZE=0.125
-# NUM_LAYERS=12
-# HIDDEN_SIZE=768
-# NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-# MIN_LR=6.0e-5
-
-## GPT-3 Medium 350M
-# MODEL_SIZE=0.35
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1024
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=3.0e-4
-# MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-MODEL_SIZE=1.3
-NUM_LAYERS=24
-HIDDEN_SIZE=2048
-NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-MIN_LR=2.0e-5
-
-# Curriculum learning (CL) enables stable large-batch training
-GLOBAL_BATCH_SIZE=4096 # 8x
-LR=8.0e-4 # 4x
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-TRAIN_TOKENS=300000000000
-
-## TRAIN_SAMPLES is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_SAMPLES.
-TRAIN_SAMPLES=$(( ${TRAIN_TOKENS} * 3 / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-WARMUP_TOKENS=375000000
-LR_DECAY_TOKENS=260000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=16
-
-## Model parallelism, 1 is no MP
-MP_SIZE=2
-
-## Pipeline parallelism. To disable PP, set PP_SIZE to 1 and NO_PP to true.
-PP_SIZE=1
-NO_PP="true"
-
-## ZeRO stage
-ZERO_STAGE=0
-
-## Total number of GPUs
-NUM_GPUS=128
-DP_SIZE=$(( ${NUM_GPUS} / ${PP_SIZE} / ${MP_SIZE} ))
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="true"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=80
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_STEP=$(( ${CL_TOKENS} * 1000000000 / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=1000
-
-## Standard deviation for weight initialization. Usually larger model needs
-## lower std. We used a heuristic equation of sqrt(1/3/HIDDEN_SIZE) from the
-## MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-INIT_STD=0.013
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-LOG_OPTIMIZER_STATE="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt3-with-pile-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-zero-${ZERO_STAGE}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-if [ "${NO_PP}" = "true" ]; then
-    NAME="${NAME}-no_pp"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-startseqlen-${CL_START_SEQLEN}-step-${CL_STEP}-token-${CL_TOKENS}B"
-fi
-
-LOG_PATH="log/"
-TENSORBOARD_PATH="tensorboard/${NAME}_${host}_${current_time}"
-CHECKPOINT_PATH="/blob/users/conglli/project/gpt3_with_pile/checkpoint/${NAME}"
-mkdir -p ${LOG_PATH}
-mkdir -p ${TENSORBOARD_PATH}
-mkdir -p ${CHECKPOINT_PATH}
-
-VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-DATA_PATH=/data/the_pile_public_merged_nopreprocessing/pile_text_document
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_PATH} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers ${NUM_LAYERS} \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-samples ${TRAIN_SAMPLES} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_PATH}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [ "${LOG_OPTIMIZER_STATE}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --log-optimizer-states-to-tensorboard"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_${NAME}.json"
-if [[ $ZERO_STAGE -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-fi
-
-deepspeed_options=" \
-            --deepspeed \
-            --deepspeed_config ${config_json} \
-            --zero-stage ${ZERO_STAGE} \
-            --pipeline-model-parallel-size ${PP_SIZE}"
-
-if [[ "${NO_PP}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${LOG_PATH}/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_125M_MoE64.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_125M_MoE64.sh
deleted file mode 100644
index f93f0b71268fcd7bd2535df9ff19c3a862969adf..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_125M_MoE64.sh
+++ /dev/null
@@ -1,372 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-MODEL_SIZE=0.125
-NUM_LAYERS=12
-HIDDEN_SIZE=768
-NUM_ATTN_HEADS=12
-GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-# MIN_LR=6.0e-5
-
-## GPT-3 Medium 350M
-# MODEL_SIZE=0.35
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1024
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=3.0e-4
-# MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-# MODEL_SIZE=1.3
-# NUM_LAYERS=24
-# HIDDEN_SIZE=2048
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-# MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-## For MoE model, we found sometimes training a bit more to 330B tokens helps
-TRAIN_TOKENS=300000000000
-# TRAIN_TOKENS=330000000000
-
-## TRAIN_ITERS is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some steps,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_ITERS.
-TRAIN_ITERS=$(( ${TRAIN_TOKENS} * 3 / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-## For MoE model, we found that setting the decay token to 300B helps.
-WARMUP_TOKENS=375000000
-# LR_DECAY_TOKENS=260000000000
-LR_DECAY_TOKENS=300000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=4
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism
-## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
-## to 1 and use the "--no-pipeline-parallel" arg.
-PP_SIZE=1
-NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
-###############################################################################
-### MoE configs
-## Number of experts. EP_SIZE 1 means dense model without MoE
-# EP_SIZE=1
-EP_SIZE=64
-
-if [[ $EP_SIZE -gt $NUM_GPUS ]]; then
-    EP_PARALLEL_SIZE=$NUM_GPUS
-else
-    EP_PARALLEL_SIZE=$EP_SIZE
-fi
-
-## Original GPT-3 model always set min LR at 10% of max LR. For MoE model, we
-## found that lower LR and min LR (than the base dense model) helps.
-## For 1.3B MoE-128 model we used LR=1.2e-4 and MIN_LR=1.0e-6.
-## For 350M MoE-128 model we used LR=2.0e-4 and MIN_LR=2.0e-6, but they are not
-## heavily tuned.
-LR=4.5e-4
-MIN_LR=4.5e-06
-
-## Coefficient for MoE loss. We find that 0.01 is a good value at least for
-## 1.3B MoE-128 model
-MLC=0.01
-
-## Below configs adjust the MoE expert token capacity limit during training and
-## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
-## Larger capacity factor or disabling capacity limit could improve training
-## convergence, but will also reduce training throughput.
-MOE_TRAIN_CAP_FACTOR=1.0
-MOE_EVAL_CAP_FACTOR=1.0
-MOE_MIN_CAP=4
-MOE_DROP_TOKEN="true"
-# MOE_DROP_TOKEN="false"
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=80
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_TOKENS=$((${CL_TOKENS} * 1000000000))
-CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=10000
-
-## Standard deviation for weight initialization
-## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
-## dense model. Usually larger model needs lower std.
-INIT_STD=0.014
-# INIT_STD=0.01
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-if [[ $EP_SIZE -gt 1 ]]; then
-    NAME="${NAME}-ep-${EP_SIZE}-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
-fi
-
-OUTPUT_BASEPATH=$DIR/output
-mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
-mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
-mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
-mkdir -p ${TENSORBOARD_DIR} 
-## Note that for MoE model with billion-scale base model, the checkpoint can be
-## as large as TB-scale which normal NFS cannot handle efficiently.
-CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-
-# USE_INTERNAL_DATA="true"
-USE_INTERNAL_DATA="false"
-
-if [ "${USE_INTERNAL_DATA}" = "true" ]; then
-    ## The internal data is only accessible within Microsoft
-    ## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
-    # BASE_DATA_PATH=/vc_data/Megatron-LM/data
-    # DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
-    ## For cluster Lab-RR1-V100
-    BASE_DATA_PATH=/data/Megatron-LM/data
-    DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
-    ## For cluster Azure-CentralUS-A100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME=/vc_data_1/users/amawa/blended
-
-    VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-    MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-    ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
-    BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
-    B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
-    CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
-    CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
-    GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
-    GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
-    NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
-    OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
-    PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
-    PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
-    RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
-    SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
-    ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
-    WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
-    DATA_PATH="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
-    0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
-    0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
-    0.01359 ${ARX} 0.01588 ${GIT}"
-else
-    VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-    MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-    # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-    # For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
-    DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
-    # For cluster Azure-WestUS3-A100
-    # DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
-fi
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_PATH} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
-        --num-experts ${EP_SIZE} \
-        --moe-loss-coeff ${MLC} \
-        --moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
-        --moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
-        --moe-min-capacity ${MOE_MIN_CAP} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers ${NUM_LAYERS} \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-iters ${TRAIN_ITERS} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_DIR}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [[ $EP_SIZE -gt 1 ]]; then
-megatron_options="${megatron_options} \
-        --create-moe-param-group"
-fi
-
-if [ "${MOE_DROP_TOKEN}" = "false" ]; then
-megatron_options="${megatron_options} \
-        --disable-moe-token-dropping"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_gpt_${NAME}.json"
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/0/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-	  > ${config_json}
-
-deepspeed_options=" \
-		    --deepspeed \
-		    --deepspeed_config ${config_json} \
-		    --pipeline-model-parallel-size ${PP_SIZE}"
-
-# Currently MoE is not compatible with pipeline parallel
-if [[ $EP_SIZE -gt 1 ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-ITERATION_FILE="$CHECKPOINT_PATH/latest_checkpointed_iteration.txt"
-ITERATION_FILE_2="$CHECKPOINT_PATH/latest"
-ITERATION=0
-for (( node = 0; node <= NUM_NODE-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$ITERATION_FILE\""); then
-        LOCAL_ITERATION=$(ssh -q worker-"$node" cat $ITERATION_FILE)
-        ITERATION=$(( ${LOCAL_ITERATION} > ${ITERATION} ? ${LOCAL_ITERATION} :  ${ITERATION} ))
-    fi
-done
-if [[ $ITERATION -gt 0 ]]; then
-    ITERATION_2="global_step${ITERATION}"
-    ds_ssh "echo $ITERATION > $ITERATION_FILE"
-    ds_ssh "echo $ITERATION_2 > $ITERATION_FILE_2"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_125M_dense_cl.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_125M_dense_cl.sh
deleted file mode 100644
index 36b654e02b91a0227afec91b6655b63bbde61c1b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_125M_dense_cl.sh
+++ /dev/null
@@ -1,309 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-MODEL_SIZE=0.125
-NUM_LAYERS=12
-HIDDEN_SIZE=768
-NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-MIN_LR=6.0e-5
-
-# Curriculum learning (CL) enables stable large-batch training
-GLOBAL_BATCH_SIZE=2048 # 8x
-LR=2.4e-3 # 4x
-
-## GPT-3 Medium 350M
-# MODEL_SIZE=0.35
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1024
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=3.0e-4
-# MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-# MODEL_SIZE=1.3
-# NUM_LAYERS=24
-# HIDDEN_SIZE=2048
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-# MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-TRAIN_TOKENS=300000000000
-
-## TRAIN_SAMPLES is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_SAMPLES.
-TRAIN_SAMPLES=$(( ${TRAIN_TOKENS} * 3 / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-WARMUP_TOKENS=375000000
-LR_DECAY_TOKENS=260000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=16
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism. To disable PP, set PP_SIZE to 1 and NO_PP to true.
-PP_SIZE=1
-NO_PP="true"
-
-## ZeRO stage
-ZERO_STAGE=0
-
-## Total number of GPUs
-NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
-DP_SIZE=$(( ${NUM_GPUS} / ${PP_SIZE} / ${MP_SIZE} ))
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="true"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=72
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_STEP=$(( ${CL_TOKENS} * 1000000000 / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=1000
-
-## Standard deviation for weight initialization. Usually larger model needs
-## lower std. We used a heuristic equation of sqrt(1/3/HIDDEN_SIZE) from the
-## MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-INIT_STD=0.02
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-LOG_OPTIMIZER_STATE="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt3-with-pile-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-zero-${ZERO_STAGE}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-if [ "${NO_PP}" = "true" ]; then
-    NAME="${NAME}-no_pp"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-startseqlen-${CL_START_SEQLEN}-step-${CL_STEP}-token-${CL_TOKENS}B"
-fi
-
-LOG_PATH="log/"
-TENSORBOARD_PATH="tensorboard/${NAME}_${host}_${current_time}"
-CHECKPOINT_PATH="/blob/users/conglli/project/gpt3_with_pile/checkpoint/${NAME}"
-mkdir -p ${LOG_PATH}
-mkdir -p ${TENSORBOARD_PATH}
-mkdir -p ${CHECKPOINT_PATH}
-
-VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-# For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
-DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
-# For cluster Azure-WestUS3-A100
-# DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_PATH} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers ${NUM_LAYERS} \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-samples ${TRAIN_SAMPLES} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_PATH}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [ "${LOG_OPTIMIZER_STATE}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --log-optimizer-states-to-tensorboard"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_${NAME}.json"
-if [[ $ZERO_STAGE -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-fi
-
-deepspeed_options=" \
-            --deepspeed \
-            --deepspeed_config ${config_json} \
-            --zero-stage ${ZERO_STAGE} \
-            --pipeline-model-parallel-size ${PP_SIZE}"
-
-if [[ "${NO_PP}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-ITERATION_FILE="$CHECKPOINT_PATH/latest_checkpointed_iteration.txt"
-ITERATION_FILE_2="$CHECKPOINT_PATH/latest"
-ITERATION=0
-for (( node = 0; node <= NUM_NODE-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$ITERATION_FILE\""); then
-        LOCAL_ITERATION=$(ssh -q worker-"$node" cat $ITERATION_FILE)
-        ITERATION=$(( ${LOCAL_ITERATION} > ${ITERATION} ? ${LOCAL_ITERATION} :  ${ITERATION} ))
-    fi
-done
-if [[ $ITERATION -gt 0 ]]; then
-    ITERATION_2="global_step${ITERATION}"
-    ds_ssh "echo $ITERATION > $ITERATION_FILE"
-    ds_ssh "echo $ITERATION_2 > $ITERATION_FILE_2"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${LOG_PATH}/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_350M_MoE128.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_350M_MoE128.sh
deleted file mode 100644
index 4f8007b01e33fa862f8a6574002cc2012729d575..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_350M_MoE128.sh
+++ /dev/null
@@ -1,348 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-# MODEL_SIZE=0.125
-# NUM_LAYERS=12
-# HIDDEN_SIZE=768
-# NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-# MIN_LR=6.0e-5
-
-## GPT-3 Medium 350M
-MODEL_SIZE=0.35
-NUM_LAYERS=24
-HIDDEN_SIZE=1024
-NUM_ATTN_HEADS=16
-GLOBAL_BATCH_SIZE=256
-# LR=3.0e-4
-# MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-# MODEL_SIZE=1.3
-# NUM_LAYERS=24
-# HIDDEN_SIZE=2048
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-# MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-## For MoE model, we found sometimes training a bit more to 330B tokens helps
-TRAIN_TOKENS=300000000000
-# TRAIN_TOKENS=330000000000
-
-## TRAIN_ITERS is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some steps,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_ITERS.
-TRAIN_ITERS=$(( ${TRAIN_TOKENS} * 3 / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-## For MoE model, we found that setting the decay token to 300B helps.
-WARMUP_TOKENS=375000000
-# LR_DECAY_TOKENS=260000000000
-LR_DECAY_TOKENS=300000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=4
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism
-## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
-## to 1 and use the "--no-pipeline-parallel" arg.
-PP_SIZE=1
-NUM_GPUS=64
-###############################################################################
-### MoE configs
-## Number of experts. EP_SIZE 1 means dense model without MoE
-# EP_SIZE=1
-EP_SIZE=128
-
-if [[ $EP_SIZE -gt $NUM_GPUS ]]; then
-    EP_PARALLEL_SIZE=$NUM_GPUS
-else
-    EP_PARALLEL_SIZE=$EP_SIZE
-fi
-
-## Original GPT-3 model always set min LR at 10% of max LR. For MoE model, we
-## found that lower LR and min LR (than the base dense model) helps.
-## For 1.3B MoE-128 model we used LR=1.2e-4 and MIN_LR=1.0e-6.
-## For 350M MoE-128 model we used LR=2.0e-4 and MIN_LR=2.0e-6, but they are not
-## heavily tuned.
-LR=2.0e-4
-MIN_LR=2e-06
-
-## Coefficient for MoE loss. We find that 0.01 is a good value at least for
-## 1.3B MoE-128 model
-MLC=0.01
-
-## Below configs adjust the MoE expert token capacity limit during training and
-## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
-## Larger capacity factor or disabling capacity limit could improve training
-## convergence, but will also reduce training throughput.
-MOE_TRAIN_CAP_FACTOR=1.0
-MOE_EVAL_CAP_FACTOR=1.0
-MOE_MIN_CAP=4
-MOE_DROP_TOKEN="true"
-# MOE_DROP_TOKEN="false"
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=80
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_TOKENS=$((${CL_TOKENS} * 1000000000))
-CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=10000
-
-## Standard deviation for weight initialization
-## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
-## dense model. Usually larger model needs lower std.
-INIT_STD=0.014
-# INIT_STD=0.01
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-if [[ $EP_SIZE -gt 1 ]]; then
-    NAME="${NAME}-ep-${EP_SIZE}-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
-fi
-
-OUTPUT_BASEPATH=$DIR/output
-mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
-mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
-mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
-mkdir -p ${TENSORBOARD_DIR} 
-## Note that for MoE model with billion-scale base model, the checkpoint can be
-## as large as TB-scale which normal NFS cannot handle efficiently.
-CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-
-# USE_INTERNAL_DATA="true"
-USE_INTERNAL_DATA="false"
-
-if [ "${USE_INTERNAL_DATA}" = "true" ]; then
-    ## The internal data is only accessible within Microsoft
-    ## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
-    # BASE_DATA_PATH=/vc_data/Megatron-LM/data
-    # DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
-    ## For cluster Lab-RR1-V100
-    BASE_DATA_PATH=/data/Megatron-LM/data
-    DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
-    ## For cluster Azure-CentralUS-A100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME=/vc_data_1/users/amawa/blended
-
-    VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-    MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-    ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
-    BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
-    B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
-    CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
-    CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
-    GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
-    GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
-    NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
-    OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
-    PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
-    PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
-    RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
-    SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
-    ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
-    WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
-    DATA_BLEND="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
-    0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
-    0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
-    0.01359 ${ARX} 0.01588 ${GIT}"
-else
-    VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-    MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-    # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-    DATA_BLEND=/data/the_pile_public_merged_nopreprocessing/pile_text_document
-fi
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_BLEND} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
-        --num-experts ${EP_SIZE} \
-        --moe-loss-coeff ${MLC} \
-        --moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
-        --moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
-        --moe-min-capacity ${MOE_MIN_CAP} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers ${NUM_LAYERS} \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-iters ${TRAIN_ITERS} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_DIR}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [[ $EP_SIZE -gt 1 ]]; then
-megatron_options="${megatron_options} \
-        --create-moe-param-group"
-fi
-
-if [ "${MOE_DROP_TOKEN}" = "false" ]; then
-megatron_options="${megatron_options} \
-        --disable-moe-token-dropping"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_gpt_${NAME}.json"
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/0/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-	  > ${config_json}
-
-deepspeed_options=" \
-		    --deepspeed \
-		    --deepspeed_config ${config_json} \
-		    --pipeline-model-parallel-size ${PP_SIZE}"
-
-# Currently MoE is not compatible with pipeline parallel
-if [[ $EP_SIZE -gt 1 ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_350M_PR-MoE32or64.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_350M_PR-MoE32or64.sh
deleted file mode 100644
index d9f8513809f6e99deca59f1f90b4d412b9a0e446..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_350M_PR-MoE32or64.sh
+++ /dev/null
@@ -1,341 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-# MODEL_SIZE=0.125
-# NUM_LAYERS=12
-# HIDDEN_SIZE=768
-# NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-# MIN_LR=6.0e-5
-
-## GPT-3 Medium 350M
-MODEL_SIZE=0.35
-NUM_LAYERS=24
-HIDDEN_SIZE=1024
-NUM_ATTN_HEADS=16
-GLOBAL_BATCH_SIZE=256
-# LR=3.0e-4
-# MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-# MODEL_SIZE=1.3
-# NUM_LAYERS=24
-# HIDDEN_SIZE=2048
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-# MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-## For MoE model, we found sometimes training a bit more to 330B tokens helps
-TRAIN_TOKENS=300000000000
-# TRAIN_TOKENS=330000000000
-
-## TRAIN_ITERS is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some steps,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_ITERS.
-TRAIN_ITERS=$(( ${TRAIN_TOKENS} * 3 / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-## For MoE model, we found that setting the decay token to 300B helps.
-WARMUP_TOKENS=375000000
-# LR_DECAY_TOKENS=260000000000
-LR_DECAY_TOKENS=300000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=4
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism
-## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
-## to 1 and use the "--no-pipeline-parallel" arg.
-PP_SIZE=1
-NUM_GPUS=64
-###############################################################################
-### MoE configs
-## Number of experts. EP_SIZE 128 means standard MoE
-# EP_SIZE=128
-EP_SIZE="32 32 32 32 32 32 32 32 32 32 64 64"
-
-EP_PARALLEL_SIZE=$NUM_GPUS
-
-## Original GPT-3 model always set min LR at 10% of max LR. For MoE model, we
-## found that lower LR and min LR (than the base dense model) helps.
-## For 1.3B PR-MoE-64/128 model we used LR=1.2e-4 and MIN_LR=1.0e-6.
-## For 350M PR-MoE-32/64 model we used LR=3.0e-4 and MIN_LR=1.0e-6, but they are not
-## heavily tuned.
-LR=3.0e-4
-MIN_LR=1.0e-06
-
-## Coefficient for MoE loss. We find that 0.01 is a good value at least for
-## 1.3B MoE-128 model
-MLC=0.01
-
-## Below configs adjust the MoE expert token capacity limit during training and
-## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
-## Larger capacity factor or disabling capacity limit could improve training
-## convergence, but will also reduce training throughput.
-MOE_TRAIN_CAP_FACTOR=1.0
-MOE_EVAL_CAP_FACTOR=1.0
-MOE_MIN_CAP=4
-MOE_DROP_TOKEN="true"
-# MOE_DROP_TOKEN="false"
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=80
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_TOKENS=$((${CL_TOKENS} * 1000000000))
-CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=10000
-
-## Standard deviation for weight initialization
-## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
-## dense model. Usually larger model needs lower std.
-INIT_STD=0.014
-# INIT_STD=0.01
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-NAME="${NAME}-ep-pyramid-32+64-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
-
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
-fi
-
-OUTPUT_BASEPATH=$DIR/output
-mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
-mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
-mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
-mkdir -p ${TENSORBOARD_DIR} 
-## Note that for MoE model with billion-scale base model, the checkpoint can be
-## as large as TB-scale which normal NFS cannot handle efficiently.
-CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-
-# USE_INTERNAL_DATA="true"
-USE_INTERNAL_DATA="false"
-
-if [ "${USE_INTERNAL_DATA}" = "true" ]; then
-    ## The internal data is only accessible within Microsoft
-    ## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
-    BASE_DATA_PATH=/vc_data/Megatron-LM/data
-    DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
-    ## For cluster Lab-RR1-V100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
-    ## For cluster Azure-CentralUS-A100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME=/vc_data_1/users/amawa/blended
-
-    VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-    MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-    ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
-    BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
-    B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
-    CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
-    CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
-    GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
-    GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
-    NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
-    OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
-    PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
-    PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
-    RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
-    SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
-    ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
-    WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
-    DATA_BLEND="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
-    0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
-    0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
-    0.01359 ${ARX} 0.01588 ${GIT}"
-else
-    VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-    MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-    # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-    DATA_BLEND=/data/the_pile_public_merged_nopreprocessing/pile_text_document
-fi
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_BLEND} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
-        --num-experts ${EP_SIZE} \
-        --moe-loss-coeff ${MLC} \
-        --mlp-type residual \
-        --moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
-        --moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
-        --moe-min-capacity ${MOE_MIN_CAP} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers ${NUM_LAYERS} \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-iters ${TRAIN_ITERS} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_DIR}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-megatron_options="${megatron_options} \
-        --create-moe-param-group"
-
-if [ "${MOE_DROP_TOKEN}" = "false" ]; then
-megatron_options="${megatron_options} \
-        --disable-moe-token-dropping"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_gpt_${NAME}.json"
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/0/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-	  > ${config_json}
-
-deepspeed_options=" \
-		    --deepspeed \
-		    --deepspeed_config ${config_json} \
-		    --pipeline-model-parallel-size ${PP_SIZE}"
-
-# Currently MoE is not compatible with pipeline parallel
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_350M_PR-MoE32or64_MoS.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_350M_PR-MoE32or64_MoS.sh
deleted file mode 100644
index a5b349b9e7fde267f39064bf072d4635057e2247..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_350M_PR-MoE32or64_MoS.sh
+++ /dev/null
@@ -1,353 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-# MODEL_SIZE=0.125
-# NUM_LAYERS=12
-# HIDDEN_SIZE=768
-# NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-# MIN_LR=6.0e-5
-
-## GPT-3 Medium 350M
-MODEL_SIZE=0.35
-NUM_LAYERS=24
-HIDDEN_SIZE=1024
-NUM_ATTN_HEADS=16
-GLOBAL_BATCH_SIZE=256
-# LR=3.0e-4
-# MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-# MODEL_SIZE=1.3
-# NUM_LAYERS=24
-# HIDDEN_SIZE=2048
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-# MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-## For MoE model, we found sometimes training a bit more to 330B tokens helps
-TRAIN_TOKENS=300000000000
-# TRAIN_TOKENS=330000000000
-
-## TRAIN_ITERS is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some steps,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_ITERS.
-TRAIN_ITERS=$(( ${TRAIN_TOKENS} * 3 / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-## For MoE model, we found that setting the decay token to 300B helps.
-WARMUP_TOKENS=375000000
-# LR_DECAY_TOKENS=260000000000
-LR_DECAY_TOKENS=300000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=4
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism
-## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
-## to 1 and use the "--no-pipeline-parallel" arg.
-PP_SIZE=1
-NUM_GPUS=64
-###############################################################################
-### MoE configs
-## Number of experts. EP_SIZE 128 means standard MoE
-# EP_SIZE=128
-EP_SIZE="32 32 32 32 32 32 32 32 64 64"
-EP_SIZE_TEACHER="32 32 32 32 32 32 32 32 32 32 64 64"
-
-EP_PARALLEL_SIZE=$NUM_GPUS
-
-## Original GPT-3 model always set min LR at 10% of max LR. For MoE model, we
-## found that lower LR and min LR (than the base dense model) helps.
-## For 1.3B PR-MoE-64/128 model we used LR=1.2e-4 and MIN_LR=1.0e-6.
-## For 350M PR-MoE-32/64 model we used LR=3.0e-4 and MIN_LR=1.0e-6, but they are not
-## heavily tuned.
-LR=3.0e-4
-MIN_LR=1.0e-06
-
-## Coefficient for MoE loss. We find that 0.01 is a good value at least for
-## 1.3B MoE-128 model
-MLC=0.01
-
-## Below configs adjust the MoE expert token capacity limit during training and
-## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
-## Larger capacity factor or disabling capacity limit could improve training
-## convergence, but will also reduce training throughput.
-MOE_TRAIN_CAP_FACTOR=1.0
-MOE_EVAL_CAP_FACTOR=1.0
-MOE_MIN_CAP=4
-MOE_DROP_TOKEN="true"
-# MOE_DROP_TOKEN="false"
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=80
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_TOKENS=$((${CL_TOKENS} * 1000000000))
-CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=10000
-
-## Standard deviation for weight initialization
-## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
-## dense model. Usually larger model needs lower std.
-INIT_STD=0.014
-# INIT_STD=0.01
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-NAME="${NAME}-ep-pyramid-32+64-mos-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
-
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
-fi
-
-OUTPUT_BASEPATH=$DIR/output
-mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
-mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
-mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
-mkdir -p ${TENSORBOARD_DIR} 
-## Note that for MoE model with billion-scale base model, the checkpoint can be
-## as large as TB-scale which normal NFS cannot handle efficiently.
-CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-
-### Mixture-of-Students (MoS) configs
-KD_BETA_CE=1
-CHECKPOINT_PATH_STUDENT="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-CHECKPOINT_PATH_TEACHER="${OUTPUT_BASEPATH}/checkpoint/gpt-1.3B-lr-1.2e-4-minlr-1.0e-6-bs-512-gpus-128-mp-1-pp-1-ep-pyramid-64+128-mlc-0.01-cap-1.0-drop-true/"
-CHECKPOINT_PATH_SAVE="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-
-USE_INTERNAL_DATA="true"
-# USE_INTERNAL_DATA="false"
-
-if [ "${USE_INTERNAL_DATA}" = "true" ]; then
-    ## The internal data is only accessible within Microsoft
-    ## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
-    BASE_DATA_PATH=/vc_data/Megatron-LM/data
-    DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
-    ## For cluster Lab-RR1-V100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
-    ## For cluster Azure-CentralUS-A100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME=/vc_data_1/users/amawa/blended
-
-    VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-    MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-    ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
-    BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
-    B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
-    CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
-    CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
-    GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
-    GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
-    NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
-    OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
-    PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
-    PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
-    RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
-    SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
-    ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
-    WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
-    DATA_BLEND="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
-    0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
-    0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
-    0.01359 ${ARX} 0.01588 ${GIT}"
-else
-    ## Placeholder, we plan to test a public dataset
-    VOCAB_PATH=""
-    MERGE_PATH=""
-    DATA_BLEND=""
-fi
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_BLEND} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
-        --num-experts ${EP_SIZE} \
-        --moe-loss-coeff ${MLC} \
-        --mlp-type residual \
-        --moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
-        --moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
-        --moe-min-capacity ${MOE_MIN_CAP} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers 21 \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-iters ${TRAIN_ITERS} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH_STUDENT} \
-        --save ${CHECKPOINT_PATH_SAVE} \
-        --mos \
-        --kd-beta-ce ${KD_BETA_CE} \
-        --num-layers-teacher ${NUM_LAYERS} \
-        --num-experts-teacher ${EP_SIZE_TEACHER} \
-        --hidden-size-teacher ${HIDDEN_SIZE} \
-        --num-attention-heads-teacher ${NUM_ATTN_HEADS} \
-        --load-teacher ${CHECKPOINT_PATH_TEACHER} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_DIR}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-megatron_options="${megatron_options} \
-        --create-moe-param-group"
-
-if [ "${MOE_DROP_TOKEN}" = "false" ]; then
-megatron_options="${megatron_options} \
-        --disable-moe-token-dropping"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_gpt_${NAME}.json"
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-	  > ${config_json}
-
-deepspeed_options=" \
-		    --deepspeed \
-		    --deepspeed_config ${config_json} \
-		    --pipeline-model-parallel-size ${PP_SIZE}"
-
-# Currently MoE is not compatible with pipeline parallel
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_350M_dense.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_350M_dense.sh
deleted file mode 100644
index 405817a06e1b2da699057acc1cd4075e5121a29d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_350M_dense.sh
+++ /dev/null
@@ -1,348 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-# MODEL_SIZE=0.125
-# NUM_LAYERS=12
-# HIDDEN_SIZE=768
-# NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-# MIN_LR=6.0e-5
-
-## GPT-3 Medium 350M
-MODEL_SIZE=0.35
-NUM_LAYERS=24
-HIDDEN_SIZE=1024
-NUM_ATTN_HEADS=16
-GLOBAL_BATCH_SIZE=256
-LR=3.0e-4
-MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-# MODEL_SIZE=1.3
-# NUM_LAYERS=24
-# HIDDEN_SIZE=2048
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-# MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-## For MoE model, we found sometimes training a bit more to 330B tokens helps
-TRAIN_TOKENS=300000000000
-# TRAIN_TOKENS=330000000000
-
-## TRAIN_SAMPLES is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some steps,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_SAMPLES.
-TRAIN_SAMPLES=$(( ${TRAIN_TOKENS} * 3 / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-## For MoE model, we found that setting the decay token to 300B helps.
-WARMUP_TOKENS=375000000
-LR_DECAY_TOKENS=260000000000
-# LR_DECAY_TOKENS=300000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=4
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism
-## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
-## to 1 and use the "--no-pipeline-parallel" arg.
-PP_SIZE=1
-NUM_GPUS=64
-###############################################################################
-### MoE configs
-## Number of experts. EP_SIZE 1 means dense model without MoE
-EP_SIZE=1
-# EP_SIZE=128
-
-if [[ $EP_SIZE -gt $NUM_GPUS ]]; then
-    EP_PARALLEL_SIZE=$NUM_GPUS
-else
-    EP_PARALLEL_SIZE=$EP_SIZE
-fi
-
-## Original GPT-3 model always set min LR at 10% of max LR. For MoE model, we
-## found that lower LR and min LR (than the base dense model) helps.
-## For 1.3B MoE-128 model we used LR=1.2e-4 and MIN_LR=1.0e-6.
-## For 350M MoE-128 model we used LR=2.0e-4 and MIN_LR=2.0e-6, but they are not
-## heavily tuned.
-# LR=2.0e-4
-# MIN_LR=2e-06
-
-## Coefficient for MoE loss. We find that 0.01 is a good value at least for
-## 1.3B MoE-128 model
-MLC=0.01
-
-## Below configs adjust the MoE expert token capacity limit during training and
-## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
-## Larger capacity factor or disabling capacity limit could improve training
-## convergence, but will also reduce training throughput.
-MOE_TRAIN_CAP_FACTOR=1.0
-MOE_EVAL_CAP_FACTOR=1.0
-MOE_MIN_CAP=4
-MOE_DROP_TOKEN="true"
-# MOE_DROP_TOKEN="false"
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=80
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_TOKENS=$((${CL_TOKENS} * 1000000000))
-CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=1000
-
-## Standard deviation for weight initialization
-## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
-## dense model. Usually larger model needs lower std.
-INIT_STD=0.014
-# INIT_STD=0.01
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-if [[ $EP_SIZE -gt 1 ]]; then
-    NAME="${NAME}-ep-${EP_SIZE}-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
-fi
-
-OUTPUT_BASEPATH=$DIR/output
-mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
-mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
-mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
-mkdir -p ${TENSORBOARD_DIR} 
-## Note that for MoE model with billion-scale base model, the checkpoint can be
-## as large as TB-scale which normal NFS cannot handle efficiently.
-CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-
-# USE_INTERNAL_DATA="true"
-USE_INTERNAL_DATA="false"
-
-if [ "${USE_INTERNAL_DATA}" = "true" ]; then
-    ## The internal data is only accessible within Microsoft
-    ## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
-    # BASE_DATA_PATH=/vc_data/Megatron-LM/data
-    # DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
-    ## For cluster Lab-RR1-V100
-    BASE_DATA_PATH=/data/Megatron-LM/data
-    DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
-    ## For cluster Azure-CentralUS-A100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME=/vc_data_1/users/amawa/blended
-
-    VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-    MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-    ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
-    BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
-    B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
-    CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
-    CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
-    GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
-    GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
-    NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
-    OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
-    PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
-    PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
-    RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
-    SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
-    ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
-    WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
-    DATA_BLEND="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
-    0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
-    0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
-    0.01359 ${ARX} 0.01588 ${GIT}"
-else
-    VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-    MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-    # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-    DATA_BLEND=/data/the_pile_public_merged_nopreprocessing/pile_text_document
-fi
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_BLEND} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
-        --num-experts ${EP_SIZE} \
-        --moe-loss-coeff ${MLC} \
-        --moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
-        --moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
-        --moe-min-capacity ${MOE_MIN_CAP} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers ${NUM_LAYERS} \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-samples ${TRAIN_SAMPLES} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_DIR}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [[ $EP_SIZE -gt 1 ]]; then
-megatron_options="${megatron_options} \
-        --create-moe-param-group"
-fi
-
-if [ "${MOE_DROP_TOKEN}" = "false" ]; then
-megatron_options="${megatron_options} \
-        --disable-moe-token-dropping"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_gpt_${NAME}.json"
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/0/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-	  > ${config_json}
-
-deepspeed_options=" \
-		    --deepspeed \
-		    --deepspeed_config ${config_json} \
-		    --pipeline-model-parallel-size ${PP_SIZE}"
-
-# Currently MoE is not compatible with pipeline parallel
-if [[ $EP_SIZE -gt 1 ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_6.7B_dense.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_6.7B_dense.sh
deleted file mode 100644
index 1fdd76cbe335a4f99512a756ee2993fe9873e441..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/ds_pretrain_gpt_6.7B_dense.sh
+++ /dev/null
@@ -1,349 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-# MODEL_SIZE=0.125
-# NUM_LAYERS=12
-# HIDDEN_SIZE=768
-# NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-# MIN_LR=6.0e-5
-
-## GPT-3 Medium 350M
-# MODEL_SIZE=0.35
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1024
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=3.0e-4
-# MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-# MODEL_SIZE=1.3
-# NUM_LAYERS=24
-# HIDDEN_SIZE=2048
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-# MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-MODEL_SIZE=6.7
-NUM_LAYERS=32
-HIDDEN_SIZE=4096
-NUM_ATTN_HEADS=32
-GLOBAL_BATCH_SIZE=1024
-LR=1.2e-4
-MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-## For MoE model, we found sometimes training a bit more to 330B tokens helps
-TRAIN_TOKENS=300000000000
-# TRAIN_TOKENS=330000000000
-
-## TRAIN_SAMPLES is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some steps,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_SAMPLES.
-TRAIN_SAMPLES=$(( ${TRAIN_TOKENS} * 3 / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-## For MoE model, we found that setting the decay token to 300B helps.
-WARMUP_TOKENS=375000000
-LR_DECAY_TOKENS=260000000000
-# LR_DECAY_TOKENS=300000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=4
-
-## Model parallelism, 1 is no MP
-MP_SIZE=8
-
-## Pipeline parallelism
-## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
-## to 1 and use the "--no-pipeline-parallel" arg.
-PP_SIZE=1
-NUM_GPUS=64
-###############################################################################
-### MoE configs
-## Number of experts. EP_SIZE 1 means dense model without MoE
-EP_SIZE=1
-# EP_SIZE=128
-
-if [[ $EP_SIZE -gt $NUM_GPUS ]]; then
-    EP_PARALLEL_SIZE=$NUM_GPUS
-else
-    EP_PARALLEL_SIZE=$EP_SIZE
-fi
-
-## Original GPT-3 model always set min LR at 10% of max LR. For MoE model, we
-## found that lower LR and min LR (than the base dense model) helps.
-## For 1.3B MoE-128 model we used LR=1.2e-4 and MIN_LR=1.0e-6.
-## For 350M MoE-128 model we used LR=2.0e-4 and MIN_LR=2.0e-6, but they are not
-## heavily tuned.
-# LR=2.0e-4
-# MIN_LR=2e-06
-
-## Coefficient for MoE loss. We find that 0.01 is a good value at least for
-## 1.3B MoE-128 model
-MLC=0.01
-
-## Below configs adjust the MoE expert token capacity limit during training and
-## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
-## Larger capacity factor or disabling capacity limit could improve training
-## convergence, but will also reduce training throughput.
-MOE_TRAIN_CAP_FACTOR=1.0
-MOE_EVAL_CAP_FACTOR=1.0
-MOE_MIN_CAP=4
-MOE_DROP_TOKEN="true"
-# MOE_DROP_TOKEN="false"
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=80
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_TOKENS=$((${CL_TOKENS} * 1000000000))
-CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=1000
-
-## Standard deviation for weight initialization
-## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
-## dense model. Usually larger model needs lower std.
-# INIT_STD=0.014
-INIT_STD=0.01
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-if [[ $EP_SIZE -gt 1 ]]; then
-    NAME="${NAME}-ep-${EP_SIZE}-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
-fi
-
-OUTPUT_BASEPATH=$DIR/output
-mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
-mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
-mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
-mkdir -p ${TENSORBOARD_DIR} 
-## Note that for MoE model with billion-scale base model, the checkpoint can be
-## as large as TB-scale which normal NFS cannot handle efficiently.
-CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-
-# USE_INTERNAL_DATA="true"
-USE_INTERNAL_DATA="false"
-
-if [ "${USE_INTERNAL_DATA}" = "true" ]; then
-    ## The internal data is only accessible within Microsoft
-    ## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
-    # BASE_DATA_PATH=/vc_data/Megatron-LM/data
-    # DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
-    ## For cluster Lab-RR1-V100
-    BASE_DATA_PATH=/data/Megatron-LM/data
-    DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
-    ## For cluster Azure-CentralUS-A100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME=/vc_data_1/users/amawa/blended
-
-    VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-    MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-    ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
-    BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
-    B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
-    CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
-    CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
-    GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
-    GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
-    NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
-    OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
-    PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
-    PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
-    RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
-    SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
-    ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
-    WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
-    DATA_BLEND="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
-    0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
-    0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
-    0.01359 ${ARX} 0.01588 ${GIT}"
-else
-    VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-    MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-    # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-    DATA_BLEND=/data/the_pile_public_merged_nopreprocessing/pile_text_document
-fi
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_BLEND} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
-        --num-experts ${EP_SIZE} \
-        --moe-loss-coeff ${MLC} \
-        --moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
-        --moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
-        --moe-min-capacity ${MOE_MIN_CAP} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --rampup-batch-size 32 32 4882812 \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers ${NUM_LAYERS} \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-samples ${TRAIN_SAMPLES} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_DIR}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [[ $EP_SIZE -gt 1 ]]; then
-megatron_options="${megatron_options} \
-        --create-moe-param-group"
-fi
-
-if [ "${MOE_DROP_TOKEN}" = "false" ]; then
-megatron_options="${megatron_options} \
-        --disable-moe-token-dropping"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_gpt_${NAME}.json"
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/0/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-	  > ${config_json}
-
-deepspeed_options=" \
-		    --deepspeed \
-		    --deepspeed_config ${config_json} \
-		    --pipeline-model-parallel-size ${PP_SIZE}"
-
-# Currently MoE is not compatible with pipeline parallel
-if [[ $EP_SIZE -gt 1 ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/readme_evalharness.md b/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/readme_evalharness.md
deleted file mode 100644
index d30075e2fc488f9dc9818565285d4a16bb58764e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/MoE/readme_evalharness.md
+++ /dev/null
@@ -1,168 +0,0 @@
-# How to run lm-eval on Megatron-DeepSpeed checkpoint using the original setup
-
-A great portion of this eval harness feature is inherited from https://github.com/bigscience-workshop/Megatron-DeepSpeed/pull/212, but with code/doc changes (e.g., to support case without pipeline parallelism and MoE models).
-
-This particular setup uses the normal deepspeed checkpoint and requires no conversion to Megatron-LM.
-
-## Prerequisites
-
-1. Install software
-
-On login console with external network
-
-Get lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) and `best-download==0.0.7` needed to download some tasks.
-Below package version numbers are what we tested that work.
-```
-(maybe need pip install --upgrade pip)
-pip install best-download==0.0.7 lm-eval==0.2.0 datasets==1.15.1 transformers==4.20.1 huggingface-hub==0.8.1
-```
-
-2. Pre-download needed datasets
-
-some symlinks due to lm-harness' issues with relative position of data
-```
-mkdir data
-cd ../../tasks/eval_harness/
-ln -s ../../examples_deepspeed/MoE/data/ data
-cd ../../examples_deepspeed/MoE/
-```
-<!-- Also make sure `data` is not on one of the limited paritions like WORKSF. -->
-
-Then install datasets for the tasks:
-```
-python ../../tasks/eval_harness/download.py --task_list hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext,logiqa,mathqa,mc_taco,mrpc,prost,pubmedqa,qnli,qqp,sciq,sst,wnli
-```
-
-Previously we set `export HF_DATASETS_OFFLINE=1` to make the dataset offline after the above manual download. But somehow now this could trigger error on some kind of online verification for some of the datasets, so it's recommended to only set offline mode when necessary.
-
-<!-- If there are things like custom tokenizers, pre-download those too, e.g.:
-
-```
-python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('bigscience/oscar_13_languages_alpha_weight')"
-```
-and make sure that `export TRANSFORMERS_OFFLINE=1` is in the script.
-You know there is a custom tokenizer if the training script had something like:
-
-```
---tokenizer-type PretrainedFromHF \
- --tokenizer-name-or-path bigscience/oscar_13_languages_alpha_weight \
-``` -->
-
-3. Prepare the script
-
-<!-- Prepare the run script, replace `variant` with a unique identifier for the current eval so that multiple evals could run in parallel and not all log into the same `results.json` file. so, e.g., `tr9c-1B3-swiglu`
-
-```
-cp examples_deepspeed/run_evalharness_deepspeed.slurm run_evalharness-variant.slurm
-```
-
-now edit `run_evalharness-variant.slurm`
-
-
-Note that the eval code knows to pull the original training args from the checkpoint, so we don't need to pass any of those. And we just need to setup the evaluation args. -->
-
-`ds_evalharness.sh` is the example script.
-
-1. Edit:
-
-```
-PP_SIZE=1
-TP_SIZE=1
-NO_PP="true"
-EP_PARALLEL_SIZE=1
-NUM_NODE=1
-NUM_GPU_PER_NODE=1
-```
-to match the eval topology. 
-
-Edit:
-```
-CHECKPOINT_PATH=
-CONFIG_PATH=
-RESULT_PATH=
-```
-to the checkpoint/ds config you want to use, and where to save the results.
-<!-- If the model fits into 1 gpu, then there is nothing to change.
-
-The eval script will automatically reshape the model if it was of a different topology. -->
-
-
-2. Adjust the following to fit the chosen GPU. As of last check for 1.3B model the settings are one of:
-```
-EVAL_MICRO_BATCH_SIZE=6  # 16GB GPU 1.3B model
-EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model
-```
-
-If you get OOM lower it further.
-
-3. If not using the Deepspeed path, disable it by removing:
-
-```
-    --deepspeed \
-    --deepspeed_config ds_config.json \
-```
-
-If you didn't disable it and the program crashed on checkpoint loading unable to find some key, disable deepspeed as explained above.
-
-Note that for MoE models and for models without pipeline parallelism, currently they might not work for the case without deepspeed.
-
-<!-- ## Eval
-
-Currently it takes 2-3 hours to run on 32GB for 1.3B model, 6-7h for 16GB GPU, so a 20h slurm job should be enough.
-
-When ready, launch:
-```
-sbatch ./run_evalharness-variant.slurm
-```
-
-To monitor progress:
-```
-tail -f tail -f $VARIANT-eval-harness.log
-```
-where the variant is what you set `$VARIANT` to in the slurm script.
-
-The template is set up for 16GB gpu since they are easier to get by. If you change to 32GB, adjust:
-```
-#SBATCH --constraint=v100-32g
-...
-EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model
-```
-
-
-Note that the original ETA at the start of the run can be 10x too longer than the actual outcome. For example it may suggest 18 hours but will complete in 2 hours.
-
-
-## Short eval
-
-if you just want to quickly test that everything can run to the end, edit `tasks/eval_harness/evaluate.py`,  e.g. to run only 10 batches:
-```
-- results = evaluator.evaluate(adaptor, task_dict, False, 0, None)
-+ results = evaluator.evaluate(adaptor, task_dict, False, 0, 10)
-```
-
-(XXX: could be a cmd line option so that code won't need to be modified)
-
-
-## Import into spreadsheet
-
-https://docs.google.com/spreadsheets/d/1CI8Q9RCblLRzUOPJ6ViqBmo284-8ojluQ-CmaEuhuv0/edit?usp=sharing
-
-Note that the spreadsheet format is quite different, so use this script:
-```
-./tasks/eval_harness/report-to-csv.py results.json
-```
-to reformat the json results into csv while changing its shape to match the spreadsheet format
-
-Since some records might be missing or extraneous here is the best way to do it:
-
-1. copy the data from first 2 columns to some place under the main spreadsheet
-
-2. put the pointer to the 3rd column next to where the 2 first columns were copied.
-
-3. import `results.csv` using file-> import -> file ->
-
-Import location: Replace data at selected cell
-
-4. Now it should be easy to align the new records with the old ones - delete irrelevant records and Insert->Cells where data is missing until the first 2 columns match
-
-5. now create 2 cols in the main table on top and now it should be safe to Copy-n-Paste the 2-col data range, without the task/metrics columns into the newly created space. -->
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/README.md b/toolbox/Megatron-DeepSpeed/examples_deepspeed/README.md
deleted file mode 100644
index 3d899816640af41aceeb18a0b6c43532bfcc77c8..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/README.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# Megatron-DeepSpeed Recipes and Scripts
-
-This folder includes various example scripts with DeepSpeed technologies integrated. Below we describe each sub-folder, sorted by last update date.
-
-## Sync with NVIDIA/Megatron-LM (last updated: Jul 2023)
-The ```rebase``` folder includes details about the recent sync with the NVIDIA/Megatron-LM repo (where this repo is forked from). It includes example scripts we used to test after the sync, together with a README documentation about what were tested.
-
-## Data Efficiency (last updated: Feb 2023)
-
-The ```data_efficiency``` folder includes GPT-3 and BERT pretraining examples for DeepSpeed Data Efficiency Library, together with examples of zero-shot evaluation for GPT models and GLUE finetuning for BERT models. Please refer to the detailed tutorials in data_efficiency/README.MD. Currently this folder includes the newest example scripts for GPT/BERT pretraining/eval/finetuning, both with and without DeepSpeed Data Efficiency Library techniques.
-
-## BERT example (last updated: Dec 2022)
-
-The ```bert_with_pile``` folder includes examples about BERT-style model pre-training (using the public Pile data or user's own data) with DeepSpeed integration. Please refer to the readme in the folder for tutorial.
-
-## Azure (last updated: Nov 2022)
-
-We strongly recommend to start with AzureML recipe in the ```azureml``` folder.
-
-If you have a custom infrastructure (e.g. HPC clusters) or Azure VM and VMSS based environments, please refer to the bash scripts in the ```azure``` folder.
-
-## Model Compression (last updated: Aug 2022)
-
-The ```compression``` folder includes examples about layer reduction for task-agnostic compression. Please refer to [this tutorial](https://www.deepspeed.ai/tutorials/model-compression/#11-layer-reduction) about the DeepSpeed Model Compression Library. These recipes are for GPT-style NLG models.
-
-## MoE (last updated: Jun 2022)
-
-Please see the ```MoE``` folder for different training recipes and scripts for Mixture-of-expert based models and dense models. These recipes are for GPT-style NLG models, and currently this is the only folder with MoE training examples.
-
-## Curriculum Learning (last updated: Oct 2021)
-
-Curriculum learning recipes are in the ```curriculum_learning``` folder. Please refer to the detailed tutorials linked inside. These recipes are for GPT-style NLG models.
-Note that the DeepSpeed Data Efficiency Library above includes a more general curriculum learning support. This legacy curriculum learning feature is still compatible, but we recommend using the DeepSpeed Data Efficiency Library above. However, the newer DeepSpeed Data Efficiency Library currently is not compatible with pipeline parallelism. So if you have to use pipeline parallelism, you would need to use this legacy curriculum learning version.
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azure/README.md b/toolbox/Megatron-DeepSpeed/examples_deepspeed/azure/README.md
deleted file mode 100644
index ef648fa29d263bf2078c003c1c7685c0280d07e7..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azure/README.md
+++ /dev/null
@@ -1,27 +0,0 @@
-## Recipes for experimentation on Azure
-
-The recipes have been tested on command line on a cluster setup using Azure VMs and VMSS as well as inside Docker based environments.
-
-To run any of the examples in this folder, please go to the base directory of Megatron-DeepSpeed and run as follows
-
-```bash examples_deepspeed/azure/run-benchmark-model.sh```
-
-### Pre-requisites
-
-To run the above script, you will need to either setup your own dataset and modify the scripts or use our helper scripts to download the publicly available Books dataset and GPT vocab files. Please use the following from the ```dataset``` folder
-
-```bash dataset/download_books.sh```
-
-```bash dataset/download_vocab.sh```
-
-### Run 175B and 1T models
-
-We have included two recipes for the 175B model and the 1T model. To train the model, we assume that the users will modify and tune hyperparameters and configurations by themselves. To facilitate initial training, we have made the recipes runnable with the Books dataset as follows.
-
-```bash examples_deepspeed/azure/run-175b.sh```
-
-```bash examples_deepspeed/azure/run-1t.sh```
-
-### Note about ZeRO stage 3 and CPU offload 
-
-By default, we have enabled ZeRO Stage 3 for both the recipes above. For the 1T model, we have also enabled the CPU-offload feature to save on memory and enable a larger batch size that offers better performance. 
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azure/run-175b.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/azure/run-175b.sh
deleted file mode 100644
index 3e6b84a85111e34b3252dc77aa9d47250ffb27e5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azure/run-175b.sh
+++ /dev/null
@@ -1,142 +0,0 @@
-#!/bin/bash
-set -ex
-
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_PATH} \
-         --data-impl mmap"
-
-BASE_PATH=$PWD/dataset/
-DATA_PATH=${BASE_PATH}/BookCorpusDataset_text_document
-DS_CONFIG=ds_config.json
-
-# Hostfile path
-HF=/job/hostfile 
-
-# Disabling tensor/pipeline parallelism
-TP=1
-PP=1
-
-# HEADS ~= HIDDEN/128
-
-# Model: 175B
-NLAYERS=96
-HIDDEN=12288
-HEADS=96
-SEQ=1024
-
-
-MICRO_BATCH=4
-NODES=1
-GPN=8
-GLOBAL_BATCH=$(( ${GPN} * ${MICRO_BATCH} * ${NODES} ))
-
-# Initial power scale for loss
-SP=15
-
-# Uncomment/comment one of the following blocks.
-
-# For 1T model, start with microbatch=1, try to get 2 and 4.  If OOM w/ 4, use cpu-offloading
-
-# Set to cpu for offloading to cpu for larger models
-#OFFLOAD_DEVICE="cpu"
-#CPU_OPTIM=" --cpu-optimizer"
-
-# Set to none and empty string for no cpu offloading
-OFFLOAD_DEVICE="none"  
-CPU_OPTIM=" "
-
-ZERO_STAGE=3
-OUTPUT_DIR=ds_z_off-${OFFLOAD_DEVICE}_stage_${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_nodes${NODES}
-#OUTPUT_DIR=baseline_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH}
-mkdir -p $OUTPUT_DIR
-
-cat <<EOT > $DS_CONFIG
-{
-  "train_batch_size" : $GLOBAL_BATCH,
-  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
-  "steps_per_print": 1,
-  "gradient_accumulation_steps": 1,
-  "zero_optimization": {
-    "stage": 3,
-    "stage3_max_live_parameters": 3e9,
-    "stage3_max_reuse_distance": 3e9,
-    "stage3_param_persistence_threshold": 1e5,
-    "stage3_prefetch_bucket_size": 5e7,
-    "contiguous_gradients": true,
-    "overlap_comm": true,
-    "reduce_bucket_size": 90000000,
-    "sub_group_size": 1e9,
-    "offload_optimizer": {
-      "device": "$OFFLOAD_DEVICE",
-      "buffer_count": 4,
-      "pipeline_read": false,
-      "pipeline_write": false,
-      "pin_memory": true
-    }
-  },
-  "gradient_clipping": 1.0,
-  "fp16": {
-    "enabled": true,
-    "initial_scale_power" : $SP,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "wall_clock_breakdown": true,
-  "zero_allow_untested_optimizer": false,
-  "aio": {
-    "block_size": 1048576,
-    "queue_depth": 16,
-    "single_submit": false,
-    "overlap_events": true,
-    "thread_count": 2
-  }
-}
-EOT
-
-export NCCL_DEBUG=warn 
-
-ds_args=" "
-ds_args=" --deepspeed ${ds_args}"
-ds_args=" --no-pipeline-parallel ${ds_args}" 
-ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
-ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
-ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
-
-
-
-deepspeed --force_multi --num_nodes=$NODES --hostfile $HF pretrain_gpt.py \
-    --tensor-model-parallel-size $TP \
-    --pipeline-model-parallel-size $PP \
-    --num-layers $NLAYERS \
-    --hidden-size $HIDDEN \
-    --num-attention-heads $HEADS \
-    --seq-length $SEQ \
-    --loss-scale $SP \
-    --max-position-embeddings $SEQ \
-    --micro-batch-size $MICRO_BATCH \
-    --global-batch-size $GLOBAL_BATCH \
-    --train-iters 1000 \
-    --lr 6.0e-5 \
-    --min-lr 6.0e-6 \
-    --lr-decay-style cosine \
-    --log-interval 1 \
-    --eval-iters 40 \
-    --eval-interval 1000 \
-    --data-path $DATA_PATH \
-    --vocab-file $BASE_PATH/gpt2-vocab.json \
-    --merge-file $BASE_PATH/gpt2-merges.txt \
-    --save-interval 1000 \
-    --split 98,2,0 \
-    --clip-grad 1.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --init-method-std 0.006 \
-    --fp16 \
-    --checkpoint-activations \
-    --tensorboard-dir $OUTPUT_DIR \
-    $CPU_OPTIM $ds_args \
-    --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azure/run-1t.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/azure/run-1t.sh
deleted file mode 100644
index 6e93bcb06e8a4f8c982441e1dd8a5da652750a1a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azure/run-1t.sh
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/bin/bash
-set -ex
-
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_PATH} \
-         --data-impl mmap"
-
-BASE_PATH=$PWD/dataset/
-DATA_PATH=${BASE_PATH}/BookCorpusDataset_text_document
-DS_CONFIG=ds_config.json
-
-# Hostfile path
-HF=/job/hostfile
-
-# Disabling tensor/pipeline parallelism
-TP=1
-PP=1
-
-# HEADS ~= HIDDEN/128
-
-# Refer to Megatron-table in the README.md file for model sizes
-# Model: 310B
-#NLAYERS=96
-#HIDDEN=16384
-#HEADS=128
-#SEQ=2048
-
-# Model 530B
-#NLAYERS=105
-#HIDDEN=20480
-#HEADS=160
-#SEQ=2048
-
-# Model 1T
-NLAYERS=128
-HIDDEN=25600
-HEADS=160
-SEQ=1024
-
-MICRO_BATCH=1
-NODES=1
-GPN=8
-GLOBAL_BATCH=$(( ${GPN} * ${MICRO_BATCH} * ${NODES} ))
-
-# Initial power scale for loss
-SP=15
-
-# Uncomment/comment one of the following blocks.
-
-# For 1T model, start with microbatch=1, try to get 2 and 4.  If OOM w/ 4, use cpu-offloading
-
-# Set to cpu for offloading to cpu for larger models
-OFFLOAD_DEVICE="cpu"
-CPU_OPTIM=" --cpu-optimizer"
-
-# Set to none and empty string for no cpu offloading
-#OFFLOAD_DEVICE="none"  
-#CPU_OPTIM=" "
-
-ZERO_STAGE=3
-OUTPUT_DIR=ds_z_off-${OFFLOAD_DEVICE}_stage_${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_nodes${NODES}
-#OUTPUT_DIR=baseline_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH}
-mkdir -p $OUTPUT_DIR
-
-cat <<EOT > $DS_CONFIG
-{
-  "train_batch_size" : $GLOBAL_BATCH,
-  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
-  "steps_per_print": 1,
-  "gradient_accumulation_steps": 1,
-  "zero_optimization": {
-    "stage": 3,
-    "stage3_max_live_parameters": 3e9,
-    "stage3_max_reuse_distance": 3e9,
-    "stage3_param_persistence_threshold": 1e5,
-    "stage3_prefetch_bucket_size": 5e7,
-    "contiguous_gradients": true,
-    "overlap_comm": true,
-    "reduce_bucket_size": 90000000,
-    "sub_group_size": 1e9,
-    "offload_optimizer": {
-      "device": "$OFFLOAD_DEVICE",
-      "buffer_count": 4,
-      "pipeline_read": false,
-      "pipeline_write": false,
-      "pin_memory": true
-    }
-  },
-  "gradient_clipping": 1.0,
-  "fp16": {
-    "enabled": true,
-    "initial_scale_power" : $SP,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "wall_clock_breakdown": true,
-  "zero_allow_untested_optimizer": false,
-  "aio": {
-    "block_size": 1048576,
-    "queue_depth": 16,
-    "single_submit": false,
-    "overlap_events": true,
-    "thread_count": 2
-  }
-}
-EOT
-
-export NCCL_DEBUG=warn 
-
-ds_args=" "
-ds_args=" --deepspeed ${ds_args}"
-ds_args=" --no-pipeline-parallel ${ds_args}" 
-ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
-ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
-ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
-
-
-
-deepspeed --force_multi --num_nodes=$NODES --hostfile $HF pretrain_gpt.py \
-    --tensor-model-parallel-size $TP \
-    --pipeline-model-parallel-size $PP \
-    --num-layers $NLAYERS \
-    --hidden-size $HIDDEN \
-    --num-attention-heads $HEADS \
-    --seq-length $SEQ \
-    --loss-scale $SP \
-    --max-position-embeddings $SEQ \
-    --micro-batch-size $MICRO_BATCH \
-    --global-batch-size $GLOBAL_BATCH \
-    --train-iters 1000 \
-    --lr 6.0e-5 \
-    --min-lr 6.0e-6 \
-    --lr-decay-style cosine \
-    --log-interval 1 \
-    --eval-iters 40 \
-    --eval-interval 1000 \
-    --data-path $DATA_PATH \
-    --vocab-file $BASE_PATH/gpt2-vocab.json \
-    --merge-file $BASE_PATH/gpt2-merges.txt \
-    --save-interval 1000 \
-    --split 98,2,0 \
-    --clip-grad 1.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --init-method-std 0.006 \
-    --fp16 \
-    --checkpoint-activations \
-    --tensorboard-dir $OUTPUT_DIR \
-    $CPU_OPTIM $ds_args \
-    --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azure/run-benchmark-model.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/azure/run-benchmark-model.sh
deleted file mode 100644
index 099519babe723ef8dbaf9d6e278d0531d9c988a0..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azure/run-benchmark-model.sh
+++ /dev/null
@@ -1,142 +0,0 @@
-#!/bin/bash
-set -ex
-
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_PATH} \
-         --data-impl mmap"
-
-BASE_PATH=$PWD/dataset/
-DATA_PATH=${BASE_PATH}/BookCorpusDataset_text_document
-DS_CONFIG=ds_config.json
-
-# Hostfile path
-HF=/job/hostfile 
-
-# Disabling tensor/pipeline parallelism
-TP=1
-PP=1
-
-# HEADS ~= HIDDEN/128
-
-# Model: Benchmark model
-NLAYERS=1
-HIDDEN=12288
-HEADS=96
-SEQ=1024
-
-
-MICRO_BATCH=4
-NODES=2
-GPN=8
-GLOBAL_BATCH=$(( ${GPN} * ${MICRO_BATCH} * ${NODES} ))
-
-# Initial power scale for loss
-SP=15
-
-# Uncomment/comment one of the following blocks.
-
-# For 1T model, start with microbatch=1, try to get 2 and 4.  If OOM w/ 4, use cpu-offloading
-
-# Set to cpu for offloading to cpu for larger models
-#OFFLOAD_DEVICE="cpu"
-#CPU_OPTIM=" --cpu-optimizer"
-
-# Set to none and empty string for no cpu offloading
-OFFLOAD_DEVICE="none"  
-CPU_OPTIM=" "
-
-ZERO_STAGE=3
-OUTPUT_DIR=ds_z_off-${OFFLOAD_DEVICE}_stage_${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_mb${MICRO_BATCH}_seq${SEQ}_gb${GLOBAL_BATCH}_nodes${NODES}
-#OUTPUT_DIR=baseline_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH}
-mkdir -p $OUTPUT_DIR
-
-cat <<EOT > $DS_CONFIG
-{
-  "train_batch_size" : $GLOBAL_BATCH,
-  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
-  "steps_per_print": 1,
-  "gradient_accumulation_steps": 1,
-  "zero_optimization": {
-    "stage": 3,
-    "stage3_max_live_parameters": 3e9,
-    "stage3_max_reuse_distance": 3e9,
-    "stage3_param_persistence_threshold": 1e5,
-    "stage3_prefetch_bucket_size": 5e7,
-    "contiguous_gradients": true,
-    "overlap_comm": true,
-    "reduce_bucket_size": 90000000,
-    "sub_group_size": 1e9,
-    "offload_optimizer": {
-      "device": "$OFFLOAD_DEVICE",
-      "buffer_count": 4,
-      "pipeline_read": false,
-      "pipeline_write": false,
-      "pin_memory": true
-    }
-  },
-  "gradient_clipping": 1.0,
-  "fp16": {
-    "enabled": true,
-    "initial_scale_power" : $SP,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "wall_clock_breakdown": true,
-  "zero_allow_untested_optimizer": false,
-  "aio": {
-    "block_size": 1048576,
-    "queue_depth": 16,
-    "single_submit": false,
-    "overlap_events": true,
-    "thread_count": 2
-  }
-}
-EOT
-
-export NCCL_DEBUG=warn 
-
-ds_args=" "
-ds_args=" --deepspeed ${ds_args}"
-ds_args=" --no-pipeline-parallel ${ds_args}" 
-ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
-ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
-ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
-
-
-
-deepspeed --force_multi --num_nodes=$NODES --hostfile $HF pretrain_gpt.py \
-    --tensor-model-parallel-size $TP \
-    --pipeline-model-parallel-size $PP \
-    --num-layers $NLAYERS \
-    --hidden-size $HIDDEN \
-    --num-attention-heads $HEADS \
-    --seq-length $SEQ \
-    --loss-scale $SP \
-    --max-position-embeddings $SEQ \
-    --micro-batch-size $MICRO_BATCH \
-    --global-batch-size $GLOBAL_BATCH \
-    --train-iters 50 \
-    --lr 6.0e-5 \
-    --min-lr 6.0e-6 \
-    --lr-decay-style cosine \
-    --log-interval 1 \
-    --eval-iters 40 \
-    --eval-interval 1000 \
-    --data-path $DATA_PATH \
-    --vocab-file $BASE_PATH/gpt2-vocab.json \
-    --merge-file $BASE_PATH/gpt2-merges.txt \
-    --save-interval 1000 \
-    --split 98,2,0 \
-    --clip-grad 1.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --init-method-std 0.006 \
-    --fp16 \
-    --checkpoint-activations \
-    --tensorboard-dir $OUTPUT_DIR \
-    $CPU_OPTIM $ds_args \
-    --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azureml/Dockerfile.dockerfile b/toolbox/Megatron-DeepSpeed/examples_deepspeed/azureml/Dockerfile.dockerfile
deleted file mode 100644
index c01ec3a5ca00c0165b05470e1bec9bd06c2c6d2d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azureml/Dockerfile.dockerfile
+++ /dev/null
@@ -1,5 +0,0 @@
-FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-cuda11.3:12
-USER root:root
-
-RUN pip install pybind11
-RUN pip install regex
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azureml/README.md b/toolbox/Megatron-DeepSpeed/examples_deepspeed/azureml/README.md
deleted file mode 100644
index 09a2faf780f097378e73beefa09ecccdcebe4e5f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azureml/README.md
+++ /dev/null
@@ -1,16 +0,0 @@
-## Megatron-DeepSpeed on AzureML
-Example script for running Megatron-DeepSpeed using Azure Machine Learning.
-
-------
-
-# Workspace Setup
-Setup an AML workspace. Refer to: [set-up doc](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk#set-up).
-
-# Dataset Preparation
-Create AML Dataset. To run remote AML job, you need to provide AML FileDataset. 
-Refer to [prepare_dataset script](prepare_dataset.py) to upload .bin and .idx files to blob store and on how to create FileDataset.
-
-> Note: The folder `bookcorpus_data` used by [prepare_dataset script](prepare_dataset.py) should not be under `azureml` directories. It is because Azure ML does not allow to include large files (limit: 100 files or 1048576 bytes) for Docker build context.
-
-# Training
-Run Megatron-DeepSpeed on Azure ML. Refer to [aml_submit script](aml_submit.py).
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azureml/aml_submit.py b/toolbox/Megatron-DeepSpeed/examples_deepspeed/azureml/aml_submit.py
deleted file mode 100644
index ebfa0a9bf6b86fd177b469a604a92fabfc3a34fd..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azureml/aml_submit.py
+++ /dev/null
@@ -1,198 +0,0 @@
-import os
-import requests
-import sys
-
-# AzureML libraries
-import azureml.core
-from azureml.core import Dataset, Environment, Experiment, ScriptRunConfig, Workspace
-from azureml.core.compute import ComputeTarget, AmlCompute
-from azureml.core.compute_target import ComputeTargetException
-from azureml.core.runconfig import PyTorchConfiguration
-from azureml.core.environment import DockerBuildContext
-
-# Check core SDK version number
-print("SDK version:", azureml.core.VERSION)
-
-# For setting up a workspace, refer to: https://github.com/Azure/azureml-examples/tree/main/python-sdk#set-up
-ws = Workspace.from_config()
-print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')
-
-#-------------------------------------------------------------------------------
-# Prepare Compute Cluster
-#-------------------------------------------------------------------------------
-cluster_name = "a100-80gb"
-
-# Verify that the cluster doesn't exist already
-try:
-    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
-    print('Found existing compute target.')
-except ComputeTargetException:
-    print('Creating a new compute target...')
-    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_ND96amsr_A100_v4', min_nodes=32, max_nodes=32)
-    
-    # create the cluster
-    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
-    compute_target.wait_for_completion(show_output=True)
-
-#-------------------------------------------------------------------------------
-# Prepare Data
-# Megatron-DeepSpeed takes in data_path, vocab_file, and merge_file.
-# For AML, we are adding a parameter aml_data_download_path which specifies how to deliver the dataset to a compute target.
-# In the submitted run, files in the datasets will be either mounted or downloaded to local path on the compute target.
-# 
-# data_path for this example is path to the .bin and .idx file, excluding extension.
-# e.g. for data/BookCorpusDataset_text_document.bin and data/BookCorpusDataset_text_document.idx,
-# data_path = "data/BookCorpusDataset_text_document"
-#
-# Once the folder is downloaded to the compute target, it will use aml_data_download_path to locate the folder
-# and data_path to locate .bin and .idx files
-#
-# vocab_file and merge_file would also be passed in a similar way.
-#-------------------------------------------------------------------------------
-datastore = ws.get_default_datastore()
-blobstore_datadir = "bookcorpus_data"
-data_path = f"BookCorpusDataset_text_document"
-# Load data folder which contains bookcorpus .bin and .idx files
-train_dataset = Dataset.File.from_files(path=[(datastore, blobstore_datadir)])
-aml_data_download_path = train_dataset.as_download(blobstore_datadir)
-
-vocab_file_dataset = Dataset.File.from_files("https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json")
-merge_file_dataset = Dataset.File.from_files("https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt")
-vocab_file = vocab_file_dataset.as_download()
-merge_file = merge_file_dataset.as_download()
-
-
-#-------------------------------------------------------------------------------
-# Setup training environment
-#-------------------------------------------------------------------------------
-
-megatron_ds_env = Environment.from_docker_build_context(name='megatron-ds-curated-acpt', docker_build_context=DockerBuildContext.from_local_directory(workspace = ws, path = '.', dockerfile_path='Dockerfile.dockerfile'))
-megatron_ds_env.register(ws).build(ws).wait_for_completion()  # Comment this out if environment already exists
-
-#-------------------------------------------------------------------------------
-# Training Settings and Arguments
-#-------------------------------------------------------------------------------
-node_count = 2
-total_processes_count = 16
-micro_batch_size = 1
-global_batch_size = micro_batch_size * total_processes_count
-tensorboard_dir = '/tmp/outputs/tensorboard'
-
-run_args = ['--tensor-model-parallel-size', 1, 
-            '--pipeline-model-parallel-size', 1, 
-            '--num-layers', 20,
-            '--hidden-size', 12288,
-            '--num-attention-heads', 96,
-            '--seq-length', 1024,
-            '--loss-scale', 15, 
-            '--max-position-embeddings', 1024, 
-            '--micro-batch-size', micro_batch_size,
-            '--global-batch-size', global_batch_size,
-            '--train-iters', 100,
-            '--lr', 6.0e-5,
-            '--min-lr', 6.0e-6, 
-            '--lr-decay-style', 'cosine',
-            '--log-interval', 1, 
-            '--eval-iters', 40, 
-            '--eval-interval', 1000,
-            '--aml-data-download-path', aml_data_download_path,
-            '--data-path', data_path,
-            '--vocab-file', vocab_file,
-            '--merge-file', merge_file,
-            '--save-interval', 1000, 
-            '--split', '98,2,0',
-            '--clip-grad', 1.0, 
-            '--weight-decay', 0.1,
-            '--adam-beta1', 0.9,
-            '--adam-beta2', 0.95,
-            '--init-method-std', 0.006,
-            '--fp16',
-            '--data-impl', 'mmap',
-            '--checkpoint-activations',
-            '--tensorboard-dir', tensorboard_dir,
-            #'--cpu-optimizer',
-            '--deepspeed',
-            '--no-pipeline-parallel',
-            '--deepspeed_config', 'ds_config.json',
-            '--zero-stage', 3,
-            '--deepspeed-activation-checkpointing',
-            '--exit-interval', 5000,
-]
-
-#-------------------------------------------------------------------------------
-# DeepSpeed ds_config.json
-#-------------------------------------------------------------------------------
-import json
-ds_config = {
-    "train_batch_size" : global_batch_size,
-    "train_micro_batch_size_per_gpu": micro_batch_size,
-    "steps_per_print": 1,
-    "gradient_accumulation_steps": 1,
-    "zero_optimization": {
-      "stage": 3,
-      "stage3_max_live_parameters": 3e9,
-      "stage3_max_reuse_distance": 3e9,
-      "stage3_param_persistence_threshold": 1e5,
-      "stage3_prefetch_bucket_size": 5e7,
-      "contiguous_gradients": True,
-      "overlap_comm": True,
-      "reduce_bucket_size": 90000000,
-      "sub_group_size": 1e9,
-      "offload_optimizer": {
-        "device": "none",
-        "buffer_count": 4,
-        "pipeline_read": False,
-        "pipeline_write": False,
-        "pin_memory": True
-      }
-    },
-    "gradient_clipping": 1.0,
-    "fp16": {
-      "enabled": True,
-      "initial_scale_power" : 15,
-      "loss_scale_window": 1000,
-      "hysteresis": 2,
-      "min_loss_scale": 1
-    },
-    "wall_clock_breakdown": True,
-    "zero_allow_untested_optimizer": False,
-    "aio": {
-      "block_size": 1048576,
-      "queue_depth": 16,
-      "single_submit": False,
-      "overlap_events": True,
-      "thread_count": 2
-    }
-  }
-
-# Place ds_config.json in the same folder as pretrain_gpt.py (script to run)
-ds_config_path = '../../ds_config.json'
-with open(ds_config_path, 'w') as fp:
-    json.dump(ds_config, fp, indent=4)
-
-#-------------------------------------------------------------------------------
-# Create ScriptRunConfig
-#-------------------------------------------------------------------------------
-distr_config = PyTorchConfiguration(process_count=total_processes_count, node_count=node_count)
-
-megatron_ds_src = ScriptRunConfig(source_directory='../../',
-                      script='pretrain_gpt.py',
-                      arguments=run_args,
-                      compute_target=compute_target,
-                      environment=megatron_ds_env,
-                      distributed_job_config=distr_config)
-
-megatron_ds_src.run_config.environment_variables['NCCL_DEBUG'] = 'WARN'
-megatron_ds_src.run_config.environment_variables['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
-megatron_ds_src.run_config.environment_variables['NCCL_SOCKET_IFNAME'] = 'eth0'
-megatron_ds_src.run_config.environment_variables['NCCL_IB_PCI_RELAXED_ORDERING']='1'
-megatron_ds_src.run_config.environment_variables['UCX_TLS']='tcp'
-megatron_ds_src.run_config.environment_variables['UCX_NET_DEVICES']='eth0'
-
-#-------------------------------------------------------------------------------
-# Submit experiment
-#-------------------------------------------------------------------------------
-experiment_name = 'megatron-ds'
-experiment = Experiment(ws, name=experiment_name)
-
-run = experiment.submit(megatron_ds_src, tags={'bs':micro_batch_size, 'gpus':total_processes_count})
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azureml/prepare_dataset.py b/toolbox/Megatron-DeepSpeed/examples_deepspeed/azureml/prepare_dataset.py
deleted file mode 100644
index dfe6bc14a960635c4eb045a1cae53e35cc7a707d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/azureml/prepare_dataset.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Use this script to upload data to blob store
-
-# AzureML libraries
-from azureml.core import Workspace
-from azureml.core.dataset import Dataset
-from azureml.data.datapath import DataPath
-
-ws = Workspace.from_config()
-print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')
-
-data_dir = "bookcorpus_data"  # Local directory for where data is located that includes .bin and .idx files
-blobstore_datadir = data_dir  # Blob store directory to store data in
-
-datastore = ws.get_default_datastore()
-
-# Book Corpus Data
-print("upload dataset to blob store")
-uploaded_data = Dataset.File.upload_directory(
-    src_dir=data_dir,
-    target=DataPath(datastore, blobstore_datadir),
-    show_progress=True
-)
-
-# Usage after uploading the directory
-# To refer to the folder directly:
-train_dataset = Dataset.File.from_files(path=[(datastore, blobstore_datadir)])
-print(train_dataset)
-# To refer to a specific file:
-# train_dataset = Dataset.File.from_files(path=[(datastore, blobstore_datadir + "/filename.ext")])
-# Create DatasetConsumptionConfig to specify how to deliver the dataset to a compute target.
-# In the submitted run, files in the datasets will be either mounted or downloaded to local path on the compute target.
-# input_data_dir = train_dataset.as_mount()
-# input_data_dir = train_dataset.as_download()
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/README.md b/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/README.md
deleted file mode 100644
index 2fa704ecf7944b8b3f23726c95b50695097d3a03..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/README.md
+++ /dev/null
@@ -1,23 +0,0 @@
-This ```bert_with_pile``` folder includes examples about BERT pre-training (using [the public Pile data](https://github.com/EleutherAI/the-pile) or user's own data) with DeepSpeed integration. We also provide scripts about preprocessing Pile data and MNLI finetuning.
-
-## Data preprocessing
-```prepare_pile_data.py``` is the script for downloading, decompressing, and preprocessing [the public Pile data](https://github.com/EleutherAI/the-pile). Users can also modify this script to preprocess their own training data.
-
-## BERT pre-training
-```ds_pretrain_bert.sh``` is the script for BERT pre-training integrated with DeepSpeed, supporting [ZeRO](https://www.deepspeed.ai/tutorials/zero/) together with Megatron's tensor-slicing model parallelism. The training hyperparameters follow the [Megatron paper](https://arxiv.org/abs/1909.08053). Note that the pipeline parallelism is currently not supported: DeepSpeed's pipeline parallelism is only integrated with the GPT case, and currently DeepSpeed is not integrated with Megatron's own pipeline parallelism.
-
-As a reference performance number, our measurements show that our example is able to achieve a throughput up to 145 TFLOPs per GPU when pre-training a 1.3B BERT model (with ZeRO stage-1, without model parallelism, with 64 NVIDIA A100 GPUs, with batch size 4096 (64 per GPU), with activation checkpointing).
-
-One thing to note is that this pre-training recipe is NOT a strict reproduction of the [original BERT paper](https://arxiv.org/abs/1810.04805): the Pile data is larger than the data used in original BERT (and the data used by Megatron paper); Megatron-LM introduces some changes to the BERT model (see details in [Megatron paper](https://arxiv.org/abs/1909.08053)); the training hyperparameters are also different. Overall these differences lead to longer training time but also better model quality than original BERT (see MNLI score below), and supporting large model scale by the combination of ZeRO and model parallelism. If you don't have enough computation budget, we recommend to reduce the total training iterations (```train_iters``` in the script) and potentially increase the learning rate at the same time. If you want to strictly reproduce original BERT, we recommend to use our [another BERT example](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert).
-
-## BERT MNLI fine-tuning
-```ds_finetune_bert_mnli.sh``` is the script for BERT MNLI fine-tuning, following the hyperparameters in the [Megatron paper](https://arxiv.org/abs/1909.08053). As a reference, table below present the scores using the model pre-trained based on the script above, comparing with the scores of original BERT and Megatron paper's BERT. Our BERT-Large's score is slightly lower than Megatron paper's, mainly due to the different data we used (Pile data is much diverse and larger than the data in Megatron paper, which potentially has negative effect on small million-scale models).
-
-| MNLI dev set accuracy | **MNLI-m** | **MNLI-mm** |
-| ---------- |---------- |---------- |
-| BERT-Base, [original BERT](https://arxiv.org/abs/1810.04805) | 84.6 | 83.4 |
-| BERT-Base, ours (median on 5 seeds) | 86.1 | 86.1 |
-| BERT-Large, [original BERT](https://arxiv.org/abs/1810.04805) | 86.7 | 85.9 |
-| BERT-Large, [Megatron paper](https://arxiv.org/abs/1909.08053) | 89.7 | 90.0 |
-| BERT-Large, ours (median on 5 seeds) | 89.1 | 89.6 |
-
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_config_bert_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_config_bert_TEMPLATE.json
deleted file mode 100644
index b00ca33f0b0ea92751d688b59746a57f663bd8ef..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_config_bert_TEMPLATE.json
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-  "train_batch_size" : CONFIG_BATCH_SIZE,
-  "train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": CONFIG_FP16_ENABLED,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "bf16": {
-    "enabled": CONFIG_BF16_ENABLED
-  },
-
-  "wall_clock_breakdown" : false
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_finetune_bert_mnli.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_finetune_bert_mnli.sh
deleted file mode 100644
index 4697b771d370d2476b15d60aaef90fa450e91b37..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_finetune_bert_mnli.sh
+++ /dev/null
@@ -1,150 +0,0 @@
-seed=1234
-pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
-
-###############################################################################
-### Main configs
-### The main configs are from Megatron-LM paper
-### https://arxiv.org/abs/1909.08053. Choose based on your desired model size
-### or build your own configs.
-seq_len=512
-
-## From Table 6 in https://arxiv.org/abs/1909.08053.
-task="MNLI"
-global_batch_size=128
-lr=1e-5
-epochs=10
-
-train_data="/blob/data/GlueData/MNLI/train.tsv"
-valid_data="/blob/data/GlueData/MNLI/dev_matched.tsv \
-            /blob/data/GlueData/MNLI/dev_mismatched.tsv"
-
-## Adjust based on number of GPUs.
-batch_size=16
-
-## BERT 110M (same config as original BERT-Base model)
-## This config is not included in Megatron-LM paper
-# model_size=0.11
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-
-## BERT 336M (same config as original BERT-Large model)
-model_size=0.336
-num_layers=24
-hidden_size=1024
-num_attn_heads=16
-
-## BERT 1.3B
-# model_size=1.3
-# num_layers=24
-# hidden_size=2048
-# num_attn_heads=32
-
-## BERT 3.9B
-# model_size=3.9
-# num_layers=48
-# hidden_size=2560
-# num_attn_heads=40
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=1
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's
-## pipeline parallelism is only integrated with the GPT case, and currently
-## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO stage
-zero_stage=0
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=50
-eval_interval=100
-save_interval=500000
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-# activation_checkpoint="true"
-activation_checkpoint="false"
-###############################################################################
-vocab_file="bert-large-uncased-vocab.txt"
-if [ ! -f "$vocab_file" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
-fi
-
-jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
-checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}"
-mkdir -p ${checkpoint_path}
-
-template_json="ds_config_bert_TEMPLATE.json"
-config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
-if [[ $zero_stage -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-fi
-
-options=" \
-    --finetune \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --task ${task} \
-    --seed ${seed} \
-    --train-data ${train_data} \
-    --valid-data ${valid_data} \
-    --tokenizer-type BertWordPieceLowerCase \
-    --vocab-file ${vocab_file} \
-    --epochs ${epochs} \
-    --pretrained-checkpoint ${pretrained_checkpoint} \
-    --tensor-model-parallel-size ${mp_size} \
-    --pipeline-model-parallel-size ${pp_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --global-batch-size ${global_batch_size} \
-    --micro-batch-size ${batch_size} \
-    --lr ${lr} \
-    --lr-decay-style linear \
-    --lr-warmup-fraction 0.065 \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --save-interval ${save_interval} \
-    --save ${checkpoint_path} \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --weight-decay 1.0e-1 \
-    --fp16"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-options="${options} \
-    --checkpoint-activations \
-    --deepspeed-activation-checkpointing"
-fi
-
-if [[ "${no_pp}" = "true" ]]; then
-options="${options} \
-    --no-pipeline-parallel"
-fi
-
-# After the fine-tuning finishes, you can find the dev set accuracy numbers by
-# "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log"
-deepspeed ../../tasks/main.py ${options} &> ${checkpoint_path}/output.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_finetune_bert_qqp.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_finetune_bert_qqp.sh
deleted file mode 100644
index 78baa6ef06ed914aa5c676493ddfeee7104b7c93..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_finetune_bert_qqp.sh
+++ /dev/null
@@ -1,158 +0,0 @@
-seed=1234
-pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
-
-###############################################################################
-### Main configs
-### The main configs are from Megatron-LM paper
-### https://arxiv.org/abs/1909.08053. Choose based on your desired model size
-### or build your own configs.
-seq_len=512
-
-## From Table 6 in https://arxiv.org/abs/1909.08053.
-task="QQP"
-
-train_data="/blob/data/GlueData/QQP/train.tsv"
-valid_data="/blob/data/GlueData/QQP/dev.tsv"
-
-## Adjust based on number of GPUs.
-batch_size=16
-
-## BERT 110M (same config as original BERT-Base model)
-## This config is not included in Megatron-LM paper
-# model_size=0.11
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=128
-# lr=5e-5
-# epochs=12
-
-## BERT 336M (same config as original BERT-Large model)
-model_size=0.336
-num_layers=24
-hidden_size=1024
-num_attn_heads=16
-global_batch_size=128
-lr=5e-5
-epochs=12
-
-## BERT 1.3B
-# model_size=1.3
-# num_layers=24
-# hidden_size=2048
-# num_attn_heads=32
-# global_batch_size=128
-# lr=3e-5
-# epochs=12
-
-## BERT 3.9B
-# model_size=3.9
-# num_layers=48
-# hidden_size=2560
-# num_attn_heads=40
-# global_batch_size=256
-# lr=4e-5
-# epochs=12
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=1
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's
-## pipeline parallelism is only integrated with the GPT case, and currently
-## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO stage
-zero_stage=0
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=50
-eval_interval=100
-save_interval=500000
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-# activation_checkpoint="true"
-activation_checkpoint="false"
-###############################################################################
-vocab_file="bert-large-uncased-vocab.txt"
-if [ ! -f "$vocab_file" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
-fi
-
-jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
-checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}"
-mkdir -p ${checkpoint_path}
-
-template_json="ds_config_bert_TEMPLATE.json"
-config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
-if [[ $zero_stage -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-fi
-
-options=" \
-    --finetune \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --task ${task} \
-    --seed ${seed} \
-    --train-data ${train_data} \
-    --valid-data ${valid_data} \
-    --tokenizer-type BertWordPieceLowerCase \
-    --vocab-file ${vocab_file} \
-    --epochs ${epochs} \
-    --pretrained-checkpoint ${pretrained_checkpoint} \
-    --tensor-model-parallel-size ${mp_size} \
-    --pipeline-model-parallel-size ${pp_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --global-batch-size ${global_batch_size} \
-    --micro-batch-size ${batch_size} \
-    --lr ${lr} \
-    --lr-decay-style linear \
-    --lr-warmup-fraction 0.065 \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --save-interval ${save_interval} \
-    --save ${checkpoint_path} \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --weight-decay 1.0e-1 \
-    --fp16"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-options="${options} \
-    --checkpoint-activations \
-    --deepspeed-activation-checkpointing"
-fi
-
-if [[ "${no_pp}" = "true" ]]; then
-options="${options} \
-    --no-pipeline-parallel"
-fi
-
-# After the fine-tuning finishes, you can find the dev set accuracy numbers by
-# "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log"
-deepspeed ../../tasks/main.py ${options} &> ${checkpoint_path}/output.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_finetune_bert_race.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_finetune_bert_race.sh
deleted file mode 100644
index 5e4a57d921cbcda14db736365f5d480b6c689788..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_finetune_bert_race.sh
+++ /dev/null
@@ -1,172 +0,0 @@
-seed=1234
-## RACE have two sub-tasks that need to be finetuned separately
-difficulty="middle"
-# difficulty="high"
-pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
-
-###############################################################################
-### Main configs
-### The main configs are from Megatron-LM paper
-### https://arxiv.org/abs/1909.08053. Choose based on your desired model size
-### or build your own configs.
-seq_len=512
-
-## From Table 6 in https://arxiv.org/abs/1909.08053.
-task="RACE"
-
-## Race dataset can be downloaded by:
-## wget http://www.cs.cmu.edu/~glai1/data/race/RACE.tar.gz
-train_data="/blob/data/RACE/train/${difficulty}"
-
-## The Megatron paper https://arxiv.org/abs/1909.08053 says: "For the test set
-## results of RACE, we first use the development set to find the checkpoint
-## that gives us the median score on the 5 random seeds and we report the
-## results from that checkpoint on the test set", which is a quite confusing
-## description. For simplicity, instead we directly get the median dev and test
-## set score on 5 random seeds from a single pretrained_checkpoint.
-valid_data="/blob/data/RACE/dev/${difficulty} \
-            /blob/data/RACE/test/${difficulty}"
-
-## Adjust based on number of GPUs.
-batch_size=4
-
-## BERT 110M (same config as original BERT-Base model)
-## This config is not included in Megatron-LM paper
-# model_size=0.11
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=32
-# lr=2e-5
-# epochs=3
-
-## BERT 336M (same config as original BERT-Large model)
-model_size=0.336
-num_layers=24
-hidden_size=1024
-num_attn_heads=16
-global_batch_size=32
-lr=2e-5
-epochs=3
-
-## BERT 1.3B
-# model_size=1.3
-# num_layers=24
-# hidden_size=2048
-# num_attn_heads=32
-# global_batch_size=16
-# lr=1e-5
-# epochs=3
-
-## BERT 3.9B
-# model_size=3.9
-# num_layers=48
-# hidden_size=2560
-# num_attn_heads=40
-# global_batch_size=32
-# lr=2e-5
-# epochs=3
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=1
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's
-## pipeline parallelism is only integrated with the GPT case, and currently
-## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO stage
-zero_stage=0
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=50
-eval_interval=100
-save_interval=100000
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-# activation_checkpoint="true"
-activation_checkpoint="false"
-###############################################################################
-vocab_file="bert-large-uncased-vocab.txt"
-if [ ! -f "$vocab_file" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
-fi
-
-jobname="${task}-${difficulty}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
-checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}"
-mkdir -p ${checkpoint_path}
-
-template_json="ds_config_bert_TEMPLATE.json"
-config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
-if [[ $zero_stage -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-fi
-
-options=" \
-    --finetune \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --task ${task} \
-    --seed ${seed} \
-    --train-data ${train_data} \
-    --valid-data ${valid_data} \
-    --tokenizer-type BertWordPieceLowerCase \
-    --vocab-file ${vocab_file} \
-    --epochs ${epochs} \
-    --pretrained-checkpoint ${pretrained_checkpoint} \
-    --tensor-model-parallel-size ${mp_size} \
-    --pipeline-model-parallel-size ${pp_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --global-batch-size ${global_batch_size} \
-    --micro-batch-size ${batch_size} \
-    --lr ${lr} \
-    --lr-decay-style linear \
-    --lr-warmup-fraction 0.06 \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --save-interval ${save_interval} \
-    --save ${checkpoint_path} \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --weight-decay 1.0e-1 \
-    --clip-grad 1.0 \
-    --fp16"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-options="${options} \
-    --checkpoint-activations \
-    --deepspeed-activation-checkpointing"
-fi
-
-if [[ "${no_pp}" = "true" ]]; then
-options="${options} \
-    --no-pipeline-parallel"
-fi
-
-# After the fine-tuning finishes, you can find the dev/test set accuracy numbers
-# by "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log"
-deepspeed ../../tasks/main.py ${options} &> ${checkpoint_path}/output.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_pretrain_bert.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_pretrain_bert.sh
deleted file mode 100644
index 397d7cb11d0c7f5dd72d37a7c38ec89e9ab122f2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/ds_pretrain_bert.sh
+++ /dev/null
@@ -1,267 +0,0 @@
-#!/bin/bash
-dir=`pwd`
-###############################################################################
-### Main configs
-### The main configs are from Megatron-LM paper
-### https://arxiv.org/abs/1909.08053. Choose based on your desired model size
-### or build your own configs.
-seq_len=512
-global_batch_size=1024
-lr=1e-4
-min_lr=1e-5
-
-## init_std is the standard deviation for weight initialization. Usually larger
-## model needs lower std. Here we roughly follow a heuristic equation of
-## sqrt(1/3/hidden_size) from https://arxiv.org/pdf/2201.11990.pdf
-
-## In addition, we find that the 3.9B model (even after tuning init_std) has
-## NaN loss issue from the beginning thus unable to train. This is probably
-## because in this example we use the public Pile data, which is a more diverse
-## (and potentially more noisy) data than what used in Megatron paper. One
-## potential solution is only use the sub datasets in Pile that are also
-## used by Megatron paper.
-
-## BERT 110M (same config as original BERT-Base model)
-## This config is not included in Megatron-LM paper
-# model_size=0.11
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# init_std=0.02
-
-## BERT 336M (same config as original BERT-Large model)
-model_size=0.336
-num_layers=24
-hidden_size=1024
-num_attn_heads=16
-init_std=0.02
-
-## BERT 1.3B
-# model_size=1.3
-# num_layers=24
-# hidden_size=2048
-# num_attn_heads=32
-# init_std=0.013
-
-## BERT 3.9B
-# model_size=3.9
-# num_layers=48
-# hidden_size=2560
-# num_attn_heads=40
-# init_std=0.011
-###############################################################################
-### Training duration configs
-## The main termination condition, original Megatron paper trains for 2M iters.
-train_iters_in_million=2
-train_iters=$((${train_iters_in_million} * 1000000))
-###############################################################################
-### lr configs
-## lr warmup and decay duration. Original Megatron paper uses 10000 warmup
-## iters. Decay iters is the same as train iters.
-lr_warmup_iters=10000
-lr_decay_iters_in_million=${train_iters_in_million}
-lr_decay_iters=$((${lr_decay_iters_in_million} * 1000000))
-lr_decay_style="linear"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=1
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's
-## pipeline parallelism is only integrated with the GPT case, and currently
-## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO stage
-zero_stage=0
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Below batch_size calculation assumes the case without gradient accumulation.
-## Manually set it to a lower value if you hit out of memory during training.
-batch_size=$(( ${global_batch_size} / ${dp_size} ))
-###############################################################################
-### Misc configs
-log_interval=100
-eval_iters=10
-eval_interval=1000
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-save_interval=$((${train_iters} / ${num_save}))
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-# activation_checkpoint="true"
-activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-
-## Public the Pile dataset, see prepare_pile_data.py in the same directory
-## about how to download and preprocess the data.
-jobname="bert-pile"
-## For internal use. Change data_home to your own training data path.
-data_home="/vc_data_blob/users/conglli/the_pile_bert"
-if [[ "$host" == *"webxt"* ]]; then
-    data_home="/blob/data/the_pile_bert"
-fi
-data_path="${data_home}/pile_bert_train_text_sentence"
-
-vocab_path="bert-large-uncased-vocab.txt"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
-fi
-
-## Number of workers for dataloader. We found that for BERT pre-training,
-## num_workers will greatly affect data loading time and overall training
-## time. In our experiment with 64 GPUs, the performance reaches peak at
-## num_workers = 4 but it may differ depending on hardware. Also note that
-## larger num_workers add more CPU computation/memory overhead.
-num_workers=4
-
-jobname="${jobname}-${model_size}B-iters-${train_iters_in_million}M"
-jobname="${jobname}-lr-${lr}-min-${min_lr}-wmup-${lr_warmup_iters}-dcy-${lr_decay_iters_in_million}M-sty-${lr_decay_style}"
-jobname="${jobname}-gbs-${global_batch_size}-mbs-${batch_size}-gpu-${num_gpus}-zero-${zero_stage}-mp-${mp_size}-pp-${pp_size}"
-if [ "${no_pp}" = "true" ]; then
-    jobname="${jobname}-nopp"
-fi
-
-username=$(whoami)
-output_home="/vc_data_blob/users/${username}/project/bert_with_pile"
-if [[ "$host" == *"webxt"* ]]; then
-    output_home="/blob/users/${username}/project/bert_with_pile"
-fi
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-## Microsoft internal constraint: because tensorboard is logged by last rank,
-## it's better to put the path in NFS instead of Blob.
-tensorboard_dir="/vc_data/users/${username}/project/bert_with_pile/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.999 \
-    --init-method-std ${init_std} \
-    --tensor-model-parallel-size ${mp_size} \
-    --lr-decay-iters ${lr_decay_iters} \
-    --lr-warmup-iters ${lr_warmup_iters} \
-    --micro-batch-size ${batch_size} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-iters ${train_iters} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 1e-2 \
-    --clip-grad 1.0 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --checkpoint-activations"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-template_json="ds_config_bert_TEMPLATE.json"
-config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
-if [[ $zero_stage -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-fi
-
-deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-    --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-deepspeed ${dir}/../../pretrain_bert.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/prepare_pile_data.py b/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/prepare_pile_data.py
deleted file mode 100644
index d3428b1d93bab469e93fdf4ec2920e086815b9fc..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/bert_with_pile/prepare_pile_data.py
+++ /dev/null
@@ -1,128 +0,0 @@
-import zstandard
-import sys
-import time
-import os
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir,os.path.pardir)))
-from megatron_ds.data import indexed_dataset
-
-def pile_download(download_url, file_path, i):
-    start = time.time()
-    zstd_file_path = f"{file_path}{i:02}.jsonl.zst"
-    download_path = f"{download_url}{i:02}.jsonl.zst"
-    if not os.path.exists(zstd_file_path):
-        os.system(f"wget -P {file_path} {download_path}")
-        print(f"Finished downloading chunk {i} in {time.time() - start} sec")
-
-def pile_decompress(download_url, file_path, i):
-    zstd_file_path = f"{file_path}{i:02}.jsonl.zst"
-    output_path = f"{file_path}{i:02}.jsonl"
-    if not os.path.exists(output_path):
-        if not os.path.exists(zstd_file_path):
-            pile_download(download_url, file_path, i)
-        start = time.time()
-        with open(zstd_file_path, 'rb') as compressed:
-            decomp = zstandard.ZstdDecompressor()
-            with open(output_path, 'wb') as destination:
-                decomp.copy_stream(compressed, destination)
-        os.remove(zstd_file_path)
-        print(f"Finished decompressing chunk {i} in {time.time() - start} sec")
-
-def pile_preprocess(download_url, file_path, vocab_file, num_workers, i):
-    json_file_path = f"{file_path}{i:02}.jsonl"
-    output_prefix = f"{file_path}pile_bert_train_{i:02}"
-    if not os.path.exists(f"{output_prefix}_text_sentence.idx"):
-        if not os.path.exists(json_file_path):
-            pile_decompress(download_url, file_path, i)
-        start = time.time()
-        cmd = f"python ../../tools/preprocess_data.py \
-                --input {json_file_path} \
-                --output-prefix {output_prefix} \
-                --vocab {vocab_file} \
-                --dataset-impl mmap \
-                --tokenizer-type BertWordPieceLowerCase \
-                --split-sentences \
-                --workers {num_workers} "
-        # It's possible to hit MemoryError during above cmd since the memory
-        # usage is proportional to num_workers. In this case we delete the
-        # incomplete output and user shall retry with smaller num_workers.
-        # Our experience show that chunk 6, 7, 9, 17, 18, 20, 21, 24, 27
-        # particularly have large memory usage.
-        if os.system(cmd) == 0: # Success
-            os.remove(json_file_path)
-        else:
-            print(f"Error: chunk {i} preprocessing got error, delete \
-                    incomplete output. If MemoryError appeared, please retry \
-                    with num_workers smaller than {num_workers}.")
-            if os.path.exists(f"{output_prefix}_text_sentence.idx"):
-                os.remove(f"{output_prefix}_text_sentence.idx")
-            if os.path.exists(f"{output_prefix}_text_sentence.bin"):
-                os.remove(f"{output_prefix}_text_sentence.bin")
-        print(f"Finished preprocessing chunk {i} in {time.time() - start} sec")
-
-def pile_merge(file_path):
-    start = time.time()
-    num_chunks = 30
-    vocab_size = 30524
-    for i in range(num_chunks):
-        output_prefix = f"{file_path}pile_bert_train_{i:02}"
-        assert os.path.exists(f"{output_prefix}_text_sentence.idx")
-        assert os.path.exists(f"{output_prefix}_text_sentence.bin")
-    builder = indexed_dataset.make_builder(
-        f"{file_path}pile_bert_train_text_sentence.bin", impl="mmap",
-        vocab_size=vocab_size)
-    for i in range(num_chunks):
-        chunk_file = f"{file_path}pile_bert_train_{i:02}_text_sentence"
-        print(f"Merging file {chunk_file}")
-        builder.merge_file_(chunk_file)
-    print("Finalizing merged file ...")
-    builder.finalize(f"{file_path}pile_bert_train_text_sentence.idx")
-    print(f"Finished merging in {time.time() - start} sec")
-    # After verifying the merged data with real training, you may want to
-    # delete the data chunks.
-    # for i in range(num_chunks):
-    #     output_prefix = f"{file_path}pile_bert_train_{i:02}"
-    #     os.remove(f"{output_prefix}_text_sentence.idx")
-    #     os.remove(f"{output_prefix}_text_sentence.bin")
-
-if __name__ == '__main__':
-    # Path to download and store all the output files during the whole process.
-    # Estimated max storage usage would be around 1.6 TB (or 780GB if skip the
-    # final merge). Memory usage is proportional to the num_workers below (can
-    # be as high as O(300GB) if num_workers is around 20).
-    file_path = "/blob/data/the_pile_bert/"
-    # The raw Pile data has 30 compressed .zst chunks. To run on single
-    # machine for all chunks, run "python prepare_pile_data.py range 0 30".
-    # You can also split and run on multiple machines to speed up, since
-    # processing one chunk can take hours. The whole process only uses CPU.
-    if sys.argv[1] == "merge":
-        # "python prepare_pile_data.py merge" means merge all 30 processed data
-        # chunks. Run it only after all 30 chunks are preprocessed. The memory
-        # usage during merge is about 600GB. If you don't have enough memory,
-        # one solution is to directly use the 30 data chunks as multiple
-        # datasets. See '--data-path' in
-        # github.com/microsoft/Megatron-DeepSpeed/blob/main/megatron/arguments.py
-        pile_merge(file_path)
-    else:
-        if sys.argv[1] == "range":
-            # "python prepare_pile_data.py range 0 30" means process chunk 0-29
-            selected_chunk = range(int(sys.argv[2]), int(sys.argv[3]))
-        else:
-            # "python prepare_pile_data.py 2 5 8" means process chunk 2, 5, 8
-            selected_chunk = [int(x) for x in sys.argv[1:]]
-        print("selected_chunk: ", selected_chunk)
-        # Number of process. Adjust based on your CPU/Memory.
-        num_workers = 20
-        # Where the raw Pile data can be downloaded. The url may change in
-        # future. Contact EleutherAI (https://github.com/EleutherAI/the-pile)
-        # if this url does not work.
-        download_url = "https://the-eye.eu/public/AI/pile/train/"
-        vocab_file = "bert-large-uncased-vocab.txt"
-        vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt"
-        if not os.path.exists(vocab_file):
-            os.system(f"wget {vocab_url}")
-        os.makedirs(file_path, exist_ok=True)
-
-        for i in selected_chunk:
-            pile_preprocess(download_url, file_path, vocab_file, num_workers, i)
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/125M-Int8-test-64gpu-distilled-group48.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/125M-Int8-test-64gpu-distilled-group48.sh
deleted file mode 100644
index 5e84883f925aef78be4a3fee587987dc1424884f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/125M-Int8-test-64gpu-distilled-group48.sh
+++ /dev/null
@@ -1,253 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-MODEL_SIZE=0.125
-NUM_LAYERS=12
-HIDDEN_SIZE=768
-NUM_ATTN_HEADS=12
-GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-LR=6.0e-5
-MIN_LR=6.0e-5
-
-# Curriculum learning (CL) enables stable large-batch training
-# GLOBAL_BATCH_SIZE=16 # 8x
-# LR=6e-4 # 4x
-
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-# TRAIN_TOKENS=300000000000
-TRAIN_TOKENS=5250000000
-
-## TRAIN_SAMPLES is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_SAMPLES.
-TRAIN_SAMPLES=$(( ${TRAIN_TOKENS} * 3 / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-WARMUP_TOKENS=375000000
-LR_DECAY_TOKENS=260000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=4
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism. To disable PP, set PP_SIZE to 1 and NO_PP to true.
-PP_SIZE=1
-NO_PP="true"
-
-## ZeRO stage
-ZERO_STAGE=0
-
-## Total number of GPUs
-NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
-DP_SIZE=$(( ${NUM_GPUS} / ${PP_SIZE} / ${MP_SIZE} ))
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=72
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_STEP=$(( ${CL_TOKENS} * 1000000000 / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=1000
-
-## Standard deviation for weight initialization. Usually larger model needs
-## lower std. We used a heuristic equation of sqrt(1/3/HIDDEN_SIZE) from the
-## MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-INIT_STD=0.02
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-# ACTIVATION_CHECKPOINT="true"
-ACTIVATION_CHECKPOINT="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-LOG_OPTIMIZER_STATE="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="125M10L_Compression_Test_INT8_64gpu_lr6e-5_tokens5.25B_nocl"
-if [ "${NO_PP}" = "true" ]; then
-    NAME="${NAME}-no_pp"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-startseqlen-${CL_START_SEQLEN}-step-${CL_STEP}-token-${CL_TOKENS}B"
-fi
-
-LOG_PATH="log/"
-TENSORBOARD_PATH="tensorboard/${NAME}_${host}_${current_time}"
-CHECKPOINT_PATH="/blob/users/zheweiyao/compression_library/checkpoint/${NAME}"
-mkdir -p ${LOG_PATH}
-mkdir -p ${TENSORBOARD_PATH}
-mkdir -p ${CHECKPOINT_PATH}
-
-VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-# For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
-# DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
-# For cluster Azure-WestUS3-A100
-DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_PATH} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers 10 \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-samples ${TRAIN_SAMPLES} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load /blob/users/minjiaz/project/gpt3_distillation/checkpoint/gpt3-kd-staged-alpha1-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-32-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B/ \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --no-load-lr-state \
-        --reset-iteration \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_PATH}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [ "${LOG_OPTIMIZER_STATE}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --log-optimizer-states-to-tensorboard"
-fi
-
-template_json="ds_config_gpt_TEMPLATE_compression.json"
-config_json="ds_config_${NAME}.json"
-if [[ $ZERO_STAGE -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-fi
-
-deepspeed_options=" \
-            --deepspeed \
-            --deepspeed_config ${config_json} \
-            --zero-stage ${ZERO_STAGE} \
-            --pipeline-model-parallel-size ${PP_SIZE}"
-
-if [[ "${NO_PP}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-ITERATION_FILE="$CHECKPOINT_PATH/latest_checkpointed_iteration.txt"
-ITERATION_FILE_2="$CHECKPOINT_PATH/latest"
-ITERATION=0
-for (( node = 0; node <= NUM_NODE-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$ITERATION_FILE\""); then
-        LOCAL_ITERATION=$(ssh -q worker-"$node" cat $ITERATION_FILE)
-        ITERATION=$(( ${LOCAL_ITERATION} > ${ITERATION} ? ${LOCAL_ITERATION} :  ${ITERATION} ))
-    fi
-done
-if [[ $ITERATION -gt 0 ]]; then
-    ITERATION_2="global_step${ITERATION}"
-    ds_ssh "echo $ITERATION > $ITERATION_FILE"
-    ds_ssh "echo $ITERATION_2 > $ITERATION_FILE_2"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${LOG_PATH}/${NAME}.log"
-# run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options}"
-
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/125M-L10-Int8-test-64gpu-distilled-group48.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/125M-L10-Int8-test-64gpu-distilled-group48.sh
deleted file mode 100644
index a15c805d88efa2cb1fdd74d6a1a441ec0add95e9..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/125M-L10-Int8-test-64gpu-distilled-group48.sh
+++ /dev/null
@@ -1,253 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-MODEL_SIZE=0.125
-NUM_LAYERS=12
-HIDDEN_SIZE=768
-NUM_ATTN_HEADS=12
-GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-LR=6.0e-5
-MIN_LR=6.0e-5
-
-# Curriculum learning (CL) enables stable large-batch training
-# GLOBAL_BATCH_SIZE=16 # 8x
-# LR=6e-4 # 4x
-
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-# TRAIN_TOKENS=300000000000
-TRAIN_TOKENS=5250000000
-
-## TRAIN_SAMPLES is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_SAMPLES.
-TRAIN_SAMPLES=$(( ${TRAIN_TOKENS} * 3 / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-WARMUP_TOKENS=375000000
-LR_DECAY_TOKENS=260000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=4
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism. To disable PP, set PP_SIZE to 1 and NO_PP to true.
-PP_SIZE=1
-NO_PP="true"
-
-## ZeRO stage
-ZERO_STAGE=0
-
-## Total number of GPUs
-NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
-DP_SIZE=$(( ${NUM_GPUS} / ${PP_SIZE} / ${MP_SIZE} ))
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=72
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_STEP=$(( ${CL_TOKENS} * 1000000000 / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=1000
-
-## Standard deviation for weight initialization. Usually larger model needs
-## lower std. We used a heuristic equation of sqrt(1/3/HIDDEN_SIZE) from the
-## MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-INIT_STD=0.02
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-# ACTIVATION_CHECKPOINT="true"
-ACTIVATION_CHECKPOINT="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-LOG_OPTIMIZER_STATE="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="125M10L_Compression_Test_INT8_64gpu_lr6e-5_tokens5.25B_nocl_alpha"
-if [ "${NO_PP}" = "true" ]; then
-    NAME="${NAME}-no_pp"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-startseqlen-${CL_START_SEQLEN}-step-${CL_STEP}-token-${CL_TOKENS}B"
-fi
-
-LOG_PATH="log/"
-TENSORBOARD_PATH="tensorboard/${NAME}_${host}_${current_time}"
-CHECKPOINT_PATH="/blob/users/minjiaz/compression_library/checkpoint/${NAME}"
-mkdir -p ${LOG_PATH}
-mkdir -p ${TENSORBOARD_PATH}
-mkdir -p ${CHECKPOINT_PATH}
-
-VOCAB_PATH=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-MERGE_PATH=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-# For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
-# DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
-# For cluster Azure-WestUS3-A100
-DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_PATH} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers 10 \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-samples ${TRAIN_SAMPLES} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load /blob/users/minjiaz/project/gpt3_distillation/checkpoint/gpt3-kd-staged-alpha1-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-32-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B/ \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --no-load-lr-state \
-        --reset-iteration \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_PATH}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [ "${LOG_OPTIMIZER_STATE}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --log-optimizer-states-to-tensorboard"
-fi
-
-template_json="ds_config_gpt_TEMPLATE_compression.json"
-config_json="ds_config_${NAME}.json"
-if [[ $ZERO_STAGE -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-fi
-
-deepspeed_options=" \
-            --deepspeed \
-            --deepspeed_config ${config_json} \
-            --zero-stage ${ZERO_STAGE} \
-            --pipeline-model-parallel-size ${PP_SIZE}"
-
-if [[ "${NO_PP}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-ITERATION_FILE="$CHECKPOINT_PATH/latest_checkpointed_iteration.txt"
-ITERATION_FILE_2="$CHECKPOINT_PATH/latest"
-ITERATION=0
-for (( node = 0; node <= NUM_NODE-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$ITERATION_FILE\""); then
-        LOCAL_ITERATION=$(ssh -q worker-"$node" cat $ITERATION_FILE)
-        ITERATION=$(( ${LOCAL_ITERATION} > ${ITERATION} ? ${LOCAL_ITERATION} :  ${ITERATION} ))
-    fi
-done
-if [[ $ITERATION -gt 0 ]]; then
-    ITERATION_2="global_step${ITERATION}"
-    ds_ssh "echo $ITERATION > $ITERATION_FILE"
-    ds_ssh "echo $ITERATION_2 > $ITERATION_FILE_2"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${LOG_PATH}/${NAME}.log"
-# run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options}"
-
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/125M-L12-Int8-test-64gpu-distilled-group48.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/125M-L12-Int8-test-64gpu-distilled-group48.sh
deleted file mode 100644
index 013fbb4a1655a9f3b719c75f575a73a2d29199af..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/125M-L12-Int8-test-64gpu-distilled-group48.sh
+++ /dev/null
@@ -1,253 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-MODEL_SIZE=0.125
-NUM_LAYERS=12
-HIDDEN_SIZE=768
-NUM_ATTN_HEADS=12
-GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-LR=6.0e-5
-MIN_LR=6.0e-5
-
-# Curriculum learning (CL) enables stable large-batch training
-# GLOBAL_BATCH_SIZE=16 # 8x
-# LR=6e-4 # 4x
-
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-# TRAIN_TOKENS=300000000000
-TRAIN_TOKENS=5250000000
-
-## TRAIN_SAMPLES is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_SAMPLES.
-TRAIN_SAMPLES=$(( ${TRAIN_TOKENS} * 3 / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-WARMUP_TOKENS=375000000
-LR_DECAY_TOKENS=260000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=4
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism. To disable PP, set PP_SIZE to 1 and NO_PP to true.
-PP_SIZE=1
-NO_PP="true"
-
-## ZeRO stage
-ZERO_STAGE=0
-
-## Total number of GPUs
-NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
-DP_SIZE=$(( ${NUM_GPUS} / ${PP_SIZE} / ${MP_SIZE} ))
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=72
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_STEP=$(( ${CL_TOKENS} * 1000000000 / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=1000
-
-## Standard deviation for weight initialization. Usually larger model needs
-## lower std. We used a heuristic equation of sqrt(1/3/HIDDEN_SIZE) from the
-## MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-INIT_STD=0.02
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-# ACTIVATION_CHECKPOINT="true"
-ACTIVATION_CHECKPOINT="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-LOG_OPTIMIZER_STATE="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="125M12L_Compression_Test_INT8_64gpu_lr6e-5_tokens5.25B_nocl_alpha"
-if [ "${NO_PP}" = "true" ]; then
-    NAME="${NAME}-no_pp"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-startseqlen-${CL_START_SEQLEN}-step-${CL_STEP}-token-${CL_TOKENS}B"
-fi
-
-LOG_PATH="log/"
-TENSORBOARD_PATH="tensorboard/${NAME}_${host}_${current_time}"
-CHECKPOINT_PATH="/blob/users/minjiaz/compression_library/checkpoint/${NAME}"
-mkdir -p ${LOG_PATH}
-mkdir -p ${TENSORBOARD_PATH}
-mkdir -p ${CHECKPOINT_PATH}
-
-VOCAB_PATH=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-MERGE_PATH=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-# For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
-# DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
-# For cluster Azure-WestUS3-A100
-DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_PATH} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers 12 \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-samples ${TRAIN_SAMPLES} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load /blob/users/conglli/project/gpt3_with_pile/checkpoint/gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-64-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B/ \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --no-load-lr-state \
-        --reset-iteration \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_PATH}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [ "${LOG_OPTIMIZER_STATE}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --log-optimizer-states-to-tensorboard"
-fi
-
-template_json="ds_config_gpt_TEMPLATE_compression.json"
-config_json="ds_config_${NAME}.json"
-if [[ $ZERO_STAGE -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-fi
-
-deepspeed_options=" \
-            --deepspeed \
-            --deepspeed_config ${config_json} \
-            --zero-stage ${ZERO_STAGE} \
-            --pipeline-model-parallel-size ${PP_SIZE}"
-
-if [[ "${NO_PP}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-ITERATION_FILE="$CHECKPOINT_PATH/latest_checkpointed_iteration.txt"
-ITERATION_FILE_2="$CHECKPOINT_PATH/latest"
-ITERATION=0
-for (( node = 0; node <= NUM_NODE-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$ITERATION_FILE\""); then
-        LOCAL_ITERATION=$(ssh -q worker-"$node" cat $ITERATION_FILE)
-        ITERATION=$(( ${LOCAL_ITERATION} > ${ITERATION} ? ${LOCAL_ITERATION} :  ${ITERATION} ))
-    fi
-done
-if [[ $ITERATION -gt 0 ]]; then
-    ITERATION_2="global_step${ITERATION}"
-    ds_ssh "echo $ITERATION > $ITERATION_FILE"
-    ds_ssh "echo $ITERATION_2 > $ITERATION_FILE_2"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${LOG_PATH}/${NAME}.log"
-# run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options}"
-
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_config_gpt_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_config_gpt_TEMPLATE.json
deleted file mode 100644
index 5a14931cb99d667078a36ffac07b7b8ff9a470e6..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_config_gpt_TEMPLATE.json
+++ /dev/null
@@ -1,38 +0,0 @@
-{
-  "train_batch_size" : CONFIG_BATCH_SIZE,
-  "train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": CONFIG_FP16_ENABLED,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "bf16": {
-    "enabled": CONFIG_BF16_ENABLED
-  },
-  "curriculum_learning": {
-    "enabled": CONFIG_CL_ENABLED,
-    "curriculum_type": "seqlen",
-    "min_difficulty": CONFIG_CL_MIN,
-    "max_difficulty": CONFIG_CL_MAX,
-    "schedule_type": "fixed_linear",
-    "schedule_config": {
-      "total_curriculum_step": CONFIG_CL_DURATION,
-      "difficulty_step": 8
-    }
-  },
-
-  "wall_clock_breakdown" : false
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_config_gpt_TEMPLATE_compression.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_config_gpt_TEMPLATE_compression.json
deleted file mode 100644
index 083838a387b488e3e6ae73e55895b412e591a4e5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_config_gpt_TEMPLATE_compression.json
+++ /dev/null
@@ -1,86 +0,0 @@
-{
-  "train_batch_size" : CONFIG_BATCH_SIZE,
-  "train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": CONFIG_FP16_ENABLED,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "bf16": {
-    "enabled": CONFIG_BF16_ENABLED
-  },
-  "curriculum_learning": {
-    "enabled": CONFIG_CL_ENABLED,
-    "curriculum_type": "seqlen",
-    "min_difficulty": CONFIG_CL_MIN,
-    "max_difficulty": CONFIG_CL_MAX,
-    "schedule_type": "fixed_linear",
-    "schedule_config": {
-      "total_curriculum_step": CONFIG_CL_DURATION,
-      "difficulty_step": 8
-    }
-  },
-
-  "wall_clock_breakdown" : false,
-
-  "compression_training": {
-    "weight_quantization": {
-      "shared_parameters":{
-        "enabled": true,
-        "quantizer_kernel": false,
-        "schedule_offset": 50,
-        "quantize_groups": 48,
-        "quantize_verbose": false,
-        "quantization_type": "symmetric",
-        "rounding": "nearest",
-        "fp16_mixed_quantize":{
-          "enabled": false,
-          "quantize_change_ratio": 0.001
-        }
-      },
-      "different_groups":{
-        "wq1": {
-          "params": {
-              "start_bits": 12, 
-              "target_bits": 4,
-              "quantization_period": 50
-          },
-          "modules": [
-            "encoder.layers"
-          ]
-        }
-      }
-    },
-    "activation_quantization": {
-      "shared_parameters":{
-        "enabled": true,
-        "quantization_type": "asymmetric",
-        "range_calibration": "static",
-        "schedule_offset": 50
-      },
-      "different_groups":{
-        "aq1": {
-          "params": { 
-              "bits": 8
-          },
-          "modules": [
-            "encoder.layers"
-          ]
-        }
-      }
-    }
-  }
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_evalharness.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_evalharness.sh
deleted file mode 100644
index 0922dc033945ddce6de4316d46a9066c3abebfb1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_evalharness.sh
+++ /dev/null
@@ -1,75 +0,0 @@
-# This is an example zero-shot eval script. Please first read the readme_evalharness.md under the ../MoE directory.
-
-# CHECKPOINT_PATH=/blob/users/minjiaz/compression_library/checkpoint/125M10L_Compression_Test_INT8_64gpu_lr6e-5_tokens5.25B_nocl_alpha-no_pp/global_step2000/
-# CHECKPOINT_PATH=/blob/users/conglli/project/gpt3_with_pile/checkpoint/gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-64-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B/global_step71000/
-# CHECKPOINT_PATH=/blob/users/minjiaz/compression_library/checkpoint/125M12L_Compression_Test_INT8_64gpu_lr6e-5_tokens5.25B_nocl_alpha-no_pp/global_step5000/
-CHECKPOINT_PATH=/blob/users/minjiaz/project/gpt3_distillation/checkpoint/gpt3-kd-test2-alpha1-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-15-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B/global_step71426/
-CONFIG_PATH=ds_config_gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus--1-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B.json
-RESULT_PATH=gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B_global_step81566.log
-
-PP_SIZE=1
-TP_SIZE=1
-NO_PP="true"
-EP_PARALLEL_SIZE=1
-# Currently eval harness does not support data parallel
-# However, for MoE models it's possible to enable a "fake data parallel"
-# in order to load experts on multiple gpus. At the same time, it's not
-# real data parallel because we load the same data on all gpus.
-# On the other hand, it's better to use less number of gpus than training,
-# to reduce communication overhead.
-NUM_NODE=1
-NUM_GPU_PER_NODE=1
-
-# TASKS="lambada"
-# WikiText-2, not used in GPT-3 paper but used in GPT-2 paper
-TASKS="lambada,wikitext"
-# Tasks that appeared in GPT-3 paper (sorted based on the order in paper), plus WikiText-2.
-# TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext"
-# All tasks that confirmed to work, there are more tasks on https://github.com/EleutherAI/lm-evaluation-harness that we didn't test.
-# TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext,logiqa,mathqa,mc_taco,mrpc,prost,pubmedqa,qnli,qqp,sciq,sst,wnli"
-
-VOCAB_FILE=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-MERGE_FILE=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-
-# export HF_DATASETS_OFFLINE=1
-
-# Dummy arguments to make megatron happy. No need to configure them.
-# The reason we don't need to configure them and many other arguments is
-# because the eval framework will read the arguments from checkpoint file.
-MEGATRON_REQUIRED_ARGS="\
-    --num-layers -1\
-    --hidden-size -1\
-    --num-attention-heads -1\
-    --seq-length -1 \
-    --max-position-embeddings -1
-"
-
-CMD="../../tasks/eval_harness/evaluate.py \
-    --load $CHECKPOINT_PATH\
-    --tensor-model-parallel-size $TP_SIZE \
-    --pipeline-model-parallel-size $PP_SIZE\
-    --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
-    --vocab-file $VOCAB_FILE\
-    --merge-file $MERGE_FILE\
-    --micro-batch-size 12\
-    --no-load-optim \
-    --no-load-rng \
-    --inference \
-    --disable-moe-token-dropping \
-    --tokenizer-type GPT2BPETokenizer \
-    --adaptive_seq_len\
-    --eval_fp32\
-    --task_list $TASKS\
-    --results_path $RESULT_PATH \
-    --deepspeed \
-    --deepspeed_config $CONFIG_PATH \
-    $MEGATRON_REQUIRED_ARGS\
-    "
-
-if [[ "${NO_PP}" = "true" ]]; then
-CMD="${CMD} \
-    --no-pipeline-parallel"
-fi
-
-LAUNCHER="deepspeed --num_nodes $NUM_NODE --num_gpus $NUM_GPU_PER_NODE"
-$LAUNCHER $CMD
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_pretrain_gpt_1.3B_dense_cl_kd.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_pretrain_gpt_1.3B_dense_cl_kd.sh
deleted file mode 100644
index 9ffa240db03103d17a422cb4a7f3c955f26fb780..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_pretrain_gpt_1.3B_dense_cl_kd.sh
+++ /dev/null
@@ -1,322 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-# MODEL_SIZE=0.125
-# NUM_LAYERS=12
-# HIDDEN_SIZE=768
-# NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-# MIN_LR=6.0e-5
-
-## GPT-3 Medium 350M
-# MODEL_SIZE=0.35
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1024
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=3.0e-4
-# MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-MODEL_SIZE=1.3
-NUM_LAYERS=24
-HIDDEN_SIZE=2048
-NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-MIN_LR=2.0e-5
-
-# Curriculum learning (CL) enables stable large-batch training
-GLOBAL_BATCH_SIZE=4096 # 8x
-LR=8.0e-4 # 4x
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-TRAIN_TOKENS=300000000000
-
-## TRAIN_SAMPLES is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_SAMPLES.
-TRAIN_SAMPLES=$(( ${TRAIN_TOKENS} * 3 / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-WARMUP_TOKENS=375000000
-LR_DECAY_TOKENS=260000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=16
-
-## Model parallelism, 1 is no MP
-MP_SIZE=2
-
-## Pipeline parallelism. To disable PP, set PP_SIZE to 1 and NO_PP to true.
-PP_SIZE=1
-NO_PP="true"
-
-## ZeRO stage
-ZERO_STAGE=0
-
-## Total number of GPUs
-NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
-DP_SIZE=$(( ${NUM_GPUS} / ${PP_SIZE} / ${MP_SIZE} ))
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="true"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=80
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_STEP=$(( ${CL_TOKENS} * 1000000000 / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=10000
-
-## Standard deviation for weight initialization. Usually larger model needs
-## lower std. We used a heuristic equation of sqrt(1/3/HIDDEN_SIZE) from the
-## MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-INIT_STD=0.013
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-LOG_OPTIMIZER_STATE="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt3-kd-with-pile-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-zero-${ZERO_STAGE}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-if [ "${NO_PP}" = "true" ]; then
-    NAME="${NAME}-no_pp"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-startseqlen-${CL_START_SEQLEN}-step-${CL_STEP}-token-${CL_TOKENS}B"
-fi
-
-LOG_PATH="log/"
-TENSORBOARD_PATH="tensorboard/${NAME}_${host}_${current_time}"
-CHECKPOINT_PATH="/blob/users/minjiaz/project/gpt3_distillation/checkpoint/${NAME}"
-mkdir -p ${LOG_PATH}
-mkdir -p ${TENSORBOARD_PATH}
-mkdir -p ${CHECKPOINT_PATH}
-
-### KD configs
-KD_BETA_CE=1
-CHECKPOINT_PATH_TEACHER="/blob/users/conglli/project/gpt3_with_pile/checkpoint/gpt3-with-pile-1.3B-lr-8.0e-4-minlr-2.0e-5-bs-4096-gpus-128-zero-0-mp-2-pp-1-no_pp-cl-startseqlen-80-step-13767-token-60B/"
-CHECKPOINT_PATH_SAVE="/blob/users/minjiaz/project/gpt3_distillation/checkpoint/${NAME}"
-
-mkdir -p ${CHECKPOINT_PATH_SAVE}
-
-VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-# DATA_PATH=/data/the_pile_public_merged_nopreprocessing/pile_text_document
-# For cluster Azure-WestUS3-A100
-DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
-
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_PATH} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers 21 \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-samples ${TRAIN_SAMPLES} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH_SAVE} \
-        --kd \
-        --kd-beta-ce ${KD_BETA_CE} \
-        --num-layers-teacher ${NUM_LAYERS} \
-        --hidden-size-teacher ${HIDDEN_SIZE} \
-        --num-attention-heads-teacher ${NUM_ATTN_HEADS} \
-        --load-teacher ${CHECKPOINT_PATH_TEACHER} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_PATH}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [ "${LOG_OPTIMIZER_STATE}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --log-optimizer-states-to-tensorboard"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_${NAME}.json"
-if [[ $ZERO_STAGE -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-fi
-
-deepspeed_options=" \
-            --deepspeed \
-            --deepspeed_config ${config_json} \
-            --zero-stage ${ZERO_STAGE} \
-            --pipeline-model-parallel-size ${PP_SIZE}"
-
-if [[ "${NO_PP}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-ITERATION_FILE="$CHECKPOINT_PATH/latest_checkpointed_iteration.txt"
-ITERATION_FILE_2="$CHECKPOINT_PATH/latest"
-ITERATION=0
-for (( node = 0; node <= NUM_NODE-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$ITERATION_FILE\""); then
-        LOCAL_ITERATION=$(ssh -q worker-"$node" cat $ITERATION_FILE)
-        ITERATION=$(( ${LOCAL_ITERATION} > ${ITERATION} ? ${LOCAL_ITERATION} :  ${ITERATION} ))
-    fi
-done
-if [[ $ITERATION -gt 0 ]]; then
-    ITERATION_2="global_step${ITERATION}"
-    ds_ssh "echo $ITERATION > $ITERATION_FILE"
-    ds_ssh "echo $ITERATION_2 > $ITERATION_FILE_2"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${LOG_PATH}/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_pretrain_gpt_125M_dense_cl_kd.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_pretrain_gpt_125M_dense_cl_kd.sh
deleted file mode 100644
index a34ce282ce93aa9327cd68c7fa29b27ace5ab26d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_pretrain_gpt_125M_dense_cl_kd.sh
+++ /dev/null
@@ -1,323 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-MODEL_SIZE=0.125
-NUM_LAYERS=12
-HIDDEN_SIZE=768
-NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-MIN_LR=6.0e-5
-
-# Curriculum learning (CL) enables stable large-batch training
-GLOBAL_BATCH_SIZE=2048 # 8x
-LR=2.4e-3 # 4x
-
-## GPT-3 Medium 350M
-# MODEL_SIZE=0.35
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1024
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=3.0e-4
-# MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-# MODEL_SIZE=1.3
-# NUM_LAYERS=24
-# HIDDEN_SIZE=2048
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-# MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-TRAIN_TOKENS=300000000000
-
-## TRAIN_SAMPLES is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_SAMPLES.
-TRAIN_SAMPLES=$(( ${TRAIN_TOKENS} * 3 / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-WARMUP_TOKENS=375000000
-LR_DECAY_TOKENS=260000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=8
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism. To disable PP, set PP_SIZE to 1 and NO_PP to true.
-PP_SIZE=1
-NO_PP="true"
-
-## ZeRO stage
-ZERO_STAGE=0
-
-## Total number of GPUs
-NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
-DP_SIZE=$(( ${NUM_GPUS} / ${PP_SIZE} / ${MP_SIZE} ))
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="true"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=72
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_STEP=$(( ${CL_TOKENS} * 1000000000 / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=10000
-
-## Standard deviation for weight initialization. Usually larger model needs
-## lower std. We used a heuristic equation of sqrt(1/3/HIDDEN_SIZE) from the
-## MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-INIT_STD=0.02
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-LOG_OPTIMIZER_STATE="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt3-kd-test1-alpha1-with-pile-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-zero-${ZERO_STAGE}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-if [ "${NO_PP}" = "true" ]; then
-    NAME="${NAME}-no_pp"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-startseqlen-${CL_START_SEQLEN}-step-${CL_STEP}-token-${CL_TOKENS}B"
-fi
-
-LOG_PATH="log/"
-TENSORBOARD_PATH="tensorboard/${NAME}_${host}_${current_time}"
-CHECKPOINT_PATH="/blob/users/minjiaz/project/gpt3_distillation/checkpoint/${NAME}"
-mkdir -p ${LOG_PATH}
-mkdir -p ${TENSORBOARD_PATH}
-mkdir -p ${CHECKPOINT_PATH}
-
-### KD configs
-KD_BETA_CE=1
-CHECKPOINT_PATH_TEACHER="/blob/users/conglli/project/gpt3_with_pile/checkpoint/gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-64-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B/"
-CHECKPOINT_PATH_SAVE="/blob/users/minjiaz/project/gpt3_distillation/checkpoint/${NAME}"
-
-mkdir -p ${CHECKPOINT_PATH_SAVE}
-
-
-VOCAB_PATH=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-MERGE_PATH=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-# For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
-# DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
-# For cluster Azure-WestUS3-A100
-DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_PATH} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers 10 \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-samples ${TRAIN_SAMPLES} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH_SAVE} \
-        --kd \
-        --kd-beta-ce ${KD_BETA_CE} \
-        --num-layers-teacher ${NUM_LAYERS} \
-        --hidden-size-teacher ${HIDDEN_SIZE} \
-        --num-attention-heads-teacher ${NUM_ATTN_HEADS} \
-        --load-teacher ${CHECKPOINT_PATH_TEACHER} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_PATH}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [ "${LOG_OPTIMIZER_STATE}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --log-optimizer-states-to-tensorboard"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_${NAME}.json"
-if [[ $ZERO_STAGE -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-fi
-
-deepspeed_options=" \
-            --deepspeed \
-            --deepspeed_config ${config_json} \
-            --zero-stage ${ZERO_STAGE} \
-            --pipeline-model-parallel-size ${PP_SIZE}"
-
-if [[ "${NO_PP}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-ITERATION_FILE="$CHECKPOINT_PATH/latest_checkpointed_iteration.txt"
-ITERATION_FILE_2="$CHECKPOINT_PATH/latest"
-ITERATION=0
-for (( node = 0; node <= NUM_NODE-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$ITERATION_FILE\""); then
-        LOCAL_ITERATION=$(ssh -q worker-"$node" cat $ITERATION_FILE)
-        ITERATION=$(( ${LOCAL_ITERATION} > ${ITERATION} ? ${LOCAL_ITERATION} :  ${ITERATION} ))
-    fi
-done
-if [[ $ITERATION -gt 0 ]]; then
-    ITERATION_2="global_step${ITERATION}"
-    ds_ssh "echo $ITERATION > $ITERATION_FILE"
-    ds_ssh "echo $ITERATION_2 > $ITERATION_FILE_2"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${LOG_PATH}/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_pretrain_gpt_125M_dense_kd.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_pretrain_gpt_125M_dense_kd.sh
deleted file mode 100644
index 54f912271247fa2b9719d842c09cbfeedc4610d2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_pretrain_gpt_125M_dense_kd.sh
+++ /dev/null
@@ -1,323 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-MODEL_SIZE=0.125
-NUM_LAYERS=12
-HIDDEN_SIZE=768
-NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-MIN_LR=6.0e-5
-
-# Curriculum learning (CL) enables stable large-batch training
-GLOBAL_BATCH_SIZE=2048 # 8x
-LR=2.4e-3 # 4x
-
-## GPT-3 Medium 350M
-# MODEL_SIZE=0.35
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1024
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=3.0e-4
-# MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-# MODEL_SIZE=1.3
-# NUM_LAYERS=24
-# HIDDEN_SIZE=2048
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-# MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-TRAIN_TOKENS=300000000000
-
-## TRAIN_SAMPLES is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_SAMPLES.
-TRAIN_SAMPLES=$(( ${TRAIN_TOKENS} * 3 / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-WARMUP_TOKENS=375000000
-LR_DECAY_TOKENS=260000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=8
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism. To disable PP, set PP_SIZE to 1 and NO_PP to true.
-PP_SIZE=1
-NO_PP="true"
-
-## ZeRO stage
-ZERO_STAGE=0
-
-## Total number of GPUs
-NUM_GPUS=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-NUM_GPUS_PERNODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-NUM_NODE=$(( ${NUM_GPUS} / ${NUM_GPUS_PERNODE} ))
-DP_SIZE=$(( ${NUM_GPUS} / ${PP_SIZE} / ${MP_SIZE} ))
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=72
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_STEP=$(( ${CL_TOKENS} * 1000000000 / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=10000
-
-## Standard deviation for weight initialization. Usually larger model needs
-## lower std. We used a heuristic equation of sqrt(1/3/HIDDEN_SIZE) from the
-## MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-INIT_STD=0.02
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-LOG_OPTIMIZER_STATE="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt3-kd-test1-alpha1-with-pile-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-zero-${ZERO_STAGE}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-if [ "${NO_PP}" = "true" ]; then
-    NAME="${NAME}-no_pp"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-startseqlen-${CL_START_SEQLEN}-step-${CL_STEP}-token-${CL_TOKENS}B"
-fi
-
-LOG_PATH="log/"
-TENSORBOARD_PATH="tensorboard/${NAME}_${host}_${current_time}"
-CHECKPOINT_PATH="/blob/users/minjiaz/project/gpt3_distillation/checkpoint/${NAME}"
-mkdir -p ${LOG_PATH}
-mkdir -p ${TENSORBOARD_PATH}
-mkdir -p ${CHECKPOINT_PATH}
-
-### KD configs
-KD_BETA_CE=1
-CHECKPOINT_PATH_TEACHER="/blob/users/conglli/project/gpt3_with_pile/checkpoint/gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-64-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-27638-token-60B/"
-CHECKPOINT_PATH_SAVE="/blob/users/minjiaz/project/gpt3_distillation/checkpoint/${NAME}"
-
-mkdir -p ${CHECKPOINT_PATH_SAVE}
-
-
-VOCAB_PATH=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-MERGE_PATH=/blob/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-# For cluster Azure-EastUS-V100-32GB-4, Lab-RR1-V100
-# DATA_PATH=/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing/pile_text_document
-# For cluster Azure-WestUS3-A100
-DATA_PATH=/blob/data/the_pile_public_merged_nopreprocessing/pile_text_document
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_PATH} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers 10 \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-samples ${TRAIN_SAMPLES} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH_SAVE} \
-        --kd \
-        --kd-beta-ce ${KD_BETA_CE} \
-        --num-layers-teacher ${NUM_LAYERS} \
-        --hidden-size-teacher ${HIDDEN_SIZE} \
-        --num-attention-heads-teacher ${NUM_ATTN_HEADS} \
-        --load-teacher ${CHECKPOINT_PATH_TEACHER} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_PATH}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [ "${LOG_OPTIMIZER_STATE}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --log-optimizer-states-to-tensorboard"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_${NAME}.json"
-if [[ $ZERO_STAGE -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/${ZERO_STAGE}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-      > ${config_json}
-fi
-
-deepspeed_options=" \
-            --deepspeed \
-            --deepspeed_config ${config_json} \
-            --zero-stage ${ZERO_STAGE} \
-            --pipeline-model-parallel-size ${PP_SIZE}"
-
-if [[ "${NO_PP}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-ITERATION_FILE="$CHECKPOINT_PATH/latest_checkpointed_iteration.txt"
-ITERATION_FILE_2="$CHECKPOINT_PATH/latest"
-ITERATION=0
-for (( node = 0; node <= NUM_NODE-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$ITERATION_FILE\""); then
-        LOCAL_ITERATION=$(ssh -q worker-"$node" cat $ITERATION_FILE)
-        ITERATION=$(( ${LOCAL_ITERATION} > ${ITERATION} ? ${LOCAL_ITERATION} :  ${ITERATION} ))
-    fi
-done
-if [[ $ITERATION -gt 0 ]]; then
-    ITERATION_2="global_step${ITERATION}"
-    ds_ssh "echo $ITERATION > $ITERATION_FILE"
-    ds_ssh "echo $ITERATION_2 > $ITERATION_FILE_2"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${LOG_PATH}/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_pretrain_gpt_350M_dense_kd.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_pretrain_gpt_350M_dense_kd.sh
deleted file mode 100644
index 4366be67eff1ddcb78e3c1c8e289febd6fbc2dc9..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/compression/ds_pretrain_gpt_350M_dense_kd.sh
+++ /dev/null
@@ -1,348 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-# MODEL_SIZE=0.125
-# NUM_LAYERS=12
-# HIDDEN_SIZE=768
-# NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-# MIN_LR=6.0e-5
-
-## GPT-3 Medium 350M
-MODEL_SIZE=0.35
-NUM_LAYERS=24
-HIDDEN_SIZE=1024
-NUM_ATTN_HEADS=16
-GLOBAL_BATCH_SIZE=256
-LR=3.0e-4
-MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-# MODEL_SIZE=1.3
-# NUM_LAYERS=24
-# HIDDEN_SIZE=2048
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-# MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-## For MoE model, we found sometimes training a bit more to 330B tokens helps
-TRAIN_TOKENS=300000000000
-# TRAIN_TOKENS=330000000000
-
-## TRAIN_SAMPLES is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some steps,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_SAMPLES.
-TRAIN_SAMPLES=$(( ${TRAIN_TOKENS} * 3 / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-## For MoE model, we found that setting the decay token to 300B helps.
-WARMUP_TOKENS=375000000
-LR_DECAY_TOKENS=260000000000
-# LR_DECAY_TOKENS=300000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=4
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism
-## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
-## to 1 and use the "--no-pipeline-parallel" arg.
-PP_SIZE=1
-NUM_GPUS=64
-###############################################################################
-### MoE configs
-## Number of experts. EP_SIZE 1 means dense model without MoE
-EP_SIZE=1
-# EP_SIZE=128
-
-if [[ $EP_SIZE -gt $NUM_GPUS ]]; then
-    EP_PARALLEL_SIZE=$NUM_GPUS
-else
-    EP_PARALLEL_SIZE=$EP_SIZE
-fi
-
-## Original GPT-3 model always set min LR at 10% of max LR. For MoE model, we
-## found that lower LR and min LR (than the base dense model) helps.
-## For 1.3B MoE-128 model we used LR=1.2e-4 and MIN_LR=1.0e-6.
-## For 350M MoE-128 model we used LR=2.0e-4 and MIN_LR=2.0e-6, but they are not
-## heavily tuned.
-# LR=2.0e-4
-# MIN_LR=2e-06
-
-## Coefficient for MoE loss. We find that 0.01 is a good value at least for
-## 1.3B MoE-128 model
-MLC=0.01
-
-## Below configs adjust the MoE expert token capacity limit during training and
-## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
-## Larger capacity factor or disabling capacity limit could improve training
-## convergence, but will also reduce training throughput.
-MOE_TRAIN_CAP_FACTOR=1.0
-MOE_EVAL_CAP_FACTOR=1.0
-MOE_MIN_CAP=4
-MOE_DROP_TOKEN="true"
-# MOE_DROP_TOKEN="false"
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=80
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_TOKENS=$((${CL_TOKENS} * 1000000000))
-CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=10
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=1000
-
-## Standard deviation for weight initialization
-## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
-## dense model. Usually larger model needs lower std.
-INIT_STD=0.014
-# INIT_STD=0.01
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt-kd-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-if [[ $EP_SIZE -gt 1 ]]; then
-    NAME="${NAME}-ep-${EP_SIZE}-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
-fi
-
-OUTPUT_BASEPATH=$DIR/output
-mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
-mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
-mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
-mkdir -p ${TENSORBOARD_DIR} 
-## Note that for MoE model with billion-scale base model, the checkpoint can be
-## as large as TB-scale which normal NFS cannot handle efficiently.
-CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-
-# USE_INTERNAL_DATA="true"
-USE_INTERNAL_DATA="false"
-
-if [ "${USE_INTERNAL_DATA}" = "true" ]; then
-    ## The internal data is only accessible within Microsoft
-    ## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
-    # BASE_DATA_PATH=/vc_data/Megatron-LM/data
-    # DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
-    ## For cluster Lab-RR1-V100
-    BASE_DATA_PATH=/data/Megatron-LM/data
-    DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
-    ## For cluster Azure-CentralUS-A100
-    # BASE_DATA_PATH=/data/Megatron-LM/data
-    # DATA_HOME=/vc_data_1/users/amawa/blended
-
-    VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-    MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-    ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
-    BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
-    B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
-    CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
-    CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
-    GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
-    GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
-    NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
-    OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
-    PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
-    PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
-    RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
-    SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
-    ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
-    WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
-    DATA_BLEND="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
-    0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
-    0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
-    0.01359 ${ARX} 0.01588 ${GIT}"
-else
-    VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
-    MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
-    # Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-    DATA_BLEND=/data/the_pile_public_merged_nopreprocessing/pile_text_document
-fi
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_BLEND} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
-        --num-experts ${EP_SIZE} \
-        --moe-loss-coeff ${MLC} \
-        --moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
-        --moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
-        --moe-min-capacity ${MOE_MIN_CAP} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers ${NUM_LAYERS} \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-samples ${TRAIN_SAMPLES} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_DIR}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [[ $EP_SIZE -gt 1 ]]; then
-megatron_options="${megatron_options} \
-        --create-moe-param-group"
-fi
-
-if [ "${MOE_DROP_TOKEN}" = "false" ]; then
-megatron_options="${megatron_options} \
-        --disable-moe-token-dropping"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_gpt_${NAME}.json"
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/0/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-	  > ${config_json}
-
-deepspeed_options=" \
-		    --deepspeed \
-		    --deepspeed_config ${config_json} \
-		    --pipeline-model-parallel-size ${PP_SIZE}"
-
-# Currently MoE is not compatible with pipeline parallel
-if [[ $EP_SIZE -gt 1 ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/README.md b/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/README.md
deleted file mode 100644
index a80e3510cc7c2c4435c5fadc98c1e7dd17239d20..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/README.md
+++ /dev/null
@@ -1 +0,0 @@
-This is an example of how to use DeepSpeed's curriculum learning (CL) feature which provides faster and more stable language model pre-training. Currently it is only integrated for GPT pre-training. Note that there are two curriculum learning examples in two different repos for Megatron-LM GPT-2 pre-training. Both of them have some unique features and limitations. See details in our [tutorial](https://www.deepspeed.ai/tutorials/curriculum-learning/). For technical details please refer to our [paper](https://arxiv.org/abs/2108.06084).
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_config_gpt_slw_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_config_gpt_slw_TEMPLATE.json
deleted file mode 100644
index f1abcedcb2b187bd2200df8e0a1f6824a84a1f57..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_config_gpt_slw_TEMPLATE.json
+++ /dev/null
@@ -1,34 +0,0 @@
-{
-  "train_batch_size": GBSIZE,
-  "train_micro_batch_size_per_gpu": MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "wall_clock_breakdown" : false,
-  "curriculum_learning": {
-    "enabled": true,
-    "curriculum_type": "seqlen",
-    "min_difficulty": CONFIG_CL_MIN,
-    "max_difficulty": CONFIG_CL_MAX,
-    "schedule_type": "fixed_linear",
-    "schedule_config": {
-      "total_curriculum_step": CONFIG_CL_DURATION,
-      "difficulty_step": 8
-    }
-  }
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_pretrain_gpt2.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_pretrain_gpt2.sh
deleted file mode 100644
index 96a6186661a06bbdeef3813a735d5219b9b27db7..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_pretrain_gpt2.sh
+++ /dev/null
@@ -1,150 +0,0 @@
-#! /bin/bash
-
-CONFIG=$1
-TAG=$2
-MODEL_SIZE=$3
-LR=$4
-TOTAL_BATCHSIZE=$5
-SEQ_LEN=$6
-MP_SIZE=$7
-SEED=$8
-SAVE_INTERVAL=$9
-NUM_ITER=${10}
-NUM_TOKEN=${11}
-LR_DECAY_TOKEN=${12}
-LR_WARMUP_ITER=${13}
-CONFIG_TEMPLATE=${14}
-CURRICULUM_STEP=${15}
-CURRICULUM_MIN=${16}
-
-# 12-layer, 768-hidden, 12-heads, 117M parameters
-# 24-layer, 1024-hidden, 16-heads, 345M parameters
-# 36-layer, 1280-hidden, 20-heads, 774M parameters
-# 48-layer, 1600-hidden, 25-heads, 1558M parameters
-if [[ $MODEL_SIZE -eq 117 ]]; then
-        NUM_LAYERS=12
-        HIDDEN_SIZE=768
-        NUM_ATTN_HEADS=12
-elif [[ $MODEL_SIZE -eq 345 ]]; then
-        NUM_LAYERS=24
-        HIDDEN_SIZE=1024
-        NUM_ATTN_HEADS=16
-elif [[ $MODEL_SIZE -eq 774 ]]; then
-        NUM_LAYERS=36
-        HIDDEN_SIZE=1280
-        NUM_ATTN_HEADS=20
-elif [[ $MODEL_SIZE -eq 1558 ]]; then
-        NUM_LAYERS=48
-        HIDDEN_SIZE=1600
-        NUM_ATTN_HEADS=25
-else
-        echo "Model size not supported."
-        exit 1
-fi
-
-# Pipeline parallelism. 1 means no pipelines.
-PP_SIZE=1
-
-# Change for multinode config
-NUM_WORKERS=16
-NUM_GPUS_PER_WORKER=8
-NUM_GPUS=$(( ${NUM_WORKERS} * ${NUM_GPUS_PER_WORKER} ))
-if [[ $PP_SIZE -gt 0 ]]; then
-    DP_SIZE=$(( ${NUM_GPUS} / (${PP_SIZE} * ${MP_SIZE}) ))
-else
-    DP_SIZE=$(( ${NUM_GPUS} / ${MP_SIZE} ))
-fi
-# Batch size per gpu, here we assume grad accumulation step 1
-# you can reduce this if gpu OOM
-BATCHSIZE=$((TOTAL_BATCHSIZE/DP_SIZE))
-
-DATA_PATH=/vc_data/Megatron-LM/data/indexed_datasets/megatron
-VOCAB_PATH=/vc_data/Megatron-LM/data/gpt2-vocab.json
-MERGE_PATH=/vc_data/Megatron-LM/data/gpt2-merges.txt
-
-#ZeRO Configs
-stage=1
-
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-script_path=$(realpath $0)
-script_dir=$(dirname $script_path)
-host="${HOSTNAME}"
-
-if [ "${CONFIG_TEMPLATE}" = "true" ]; then
-template_json="$script_dir/ds_zero_stage_${stage}_config_${CONFIG}.json"
-config_json="$script_dir/ds_zero_stage_${stage}_config_${CONFIG}_min${CURRICULUM_MIN}_max${SEQ_LEN}_step${CURRICULUM_STEP}.json"
-sed "s/CONFIG_CL_MIN/${CURRICULUM_MIN}/" ${template_json} \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CURRICULUM_STEP}/" \
-	  > ${config_json}
-else
-config_json="$script_dir/ds_zero_stage_${stage}_config_${CONFIG}.json"
-fi
-
-JOB_NAME="gpt2_${MODEL_SIZE}M_bsz${TOTAL_BATCHSIZE}_seq${SEQ_LEN}_lr${LR}_warmup${LR_WARMUP_ITER}_decay${LR_DECAY_TOKEN}_seed${SEED}_${TAG}_stage${stage}_n${NUM_WORKERS}_g${NUM_GPUS_PER_WORKER}_mp${MP_SIZE}"
-LOG_NAME="${JOB_NAME}_${host}_${current_time}"
-
-OUTPUT_BASEPATH="/vc_data_blob/users/conglli"
-mkdir -p "${OUTPUT_BASEPATH}/tensorboard/curriculum/"
-mkdir -p "${OUTPUT_BASEPATH}/checkpoint/curriculum/"
-mkdir -p "${OUTPUT_BASEPATH}/log/curriculum/"
-LOGDIR="${OUTPUT_BASEPATH}/tensorboard/curriculum/${LOG_NAME}"
-CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/curriculum/${JOB_NAME}"
-
-gpt_options=" \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --num-layers $NUM_LAYERS \
-        --hidden-size $HIDDEN_SIZE \
-        --num-attention-heads $NUM_ATTN_HEADS \
-        --seq-length $SEQ_LEN \
-        --max-position-embeddings $SEQ_LEN \
-        --micro-batch-size $BATCHSIZE \
-        --global-batch-size ${TOTAL_BATCHSIZE} \
-        --train-iters $NUM_ITER \
-        --train-tokens $NUM_TOKEN \
-        --lr-decay-tokens $LR_DECAY_TOKEN \
-        --save $CHECKPOINT_PATH \
-        --load $CHECKPOINT_PATH \
-        --data-path $DATA_PATH \
-        --vocab-file $VOCAB_PATH \
-        --merge-file $MERGE_PATH \
-        --data-impl mmap \
-        --split 949,50,1 \
-        --distributed-backend nccl \
-        --override-opt_param-scheduler \
-        --lr $LR \
-        --lr-decay-style cosine \
-        --min-lr 1.0e-5 \
-        --weight-decay 1e-2 \
-        --clip-grad 1.0 \
-        --lr-warmup-iters $LR_WARMUP_ITER \
-        --checkpoint-activations \
-        --log-interval 100 \
-        --save-interval $SAVE_INTERVAL \
-        --eval-interval 100 \
-        --eval-iters 10 \
-        --fp16 \
-        --seed $SEED \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --no-masked-softmax-fusion \
-        --tensorboard-dir ${LOGDIR}
-"
-
-deepspeed_options=" \
-        --deepspeed \
-        --deepspeed_config ${config_json} \
-        --zero-stage ${stage} \
-        --pipeline-model-parallel-size ${PP_SIZE} \
-        --deepspeed-activation-checkpointing
-"
-
-full_options="${gpt_options} ${deepspeed_options}"
-
-run_cmd="deepspeed --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER}  ../../pretrain_gpt.py ${full_options} &>> ${OUTPUT_BASEPATH}/log/curriculum/${JOB_NAME}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_pretrain_gpt_1.3B_rope_slw.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_pretrain_gpt_1.3B_rope_slw.sh
deleted file mode 100644
index 209021a39273fcdd2e421da4e694ffed53de5c72..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_pretrain_gpt_1.3B_rope_slw.sh
+++ /dev/null
@@ -1,347 +0,0 @@
-#!/bin/bash
-dir=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-seq_len=2048
-
-## The "GPT-3 XXX" below are configs from GPT-3 paper
-## https://arxiv.org/abs/2005.14165, choose based on
-## your desired model size or build your own configs
-
-## init_std is standard deviation for weight initialization. Usually larger
-## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size)
-## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-
-## We changed min_lr to a lower number (1.0e-6), which we found is able to
-## provide better zero-shot eval results.
-
-## GPT-3 Small 125M
-# model_size=0.125
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=256
-# lr=6.0e-4
-# min_lr=1.0e-6
-# init_std=0.02
-
-## GPT-3 Medium 350M
-# model_size=0.35
-# num_layers=24
-# hidden_size=1024
-# num_attn_heads=16
-# global_batch_size=256
-# lr=3.0e-4
-# min_lr=1.0e-6
-# init_std=0.018
-
-## GPT-3 Large 760M
-# model_size=0.76
-# num_layers=24
-# hidden_size=1536
-# num_attn_heads=16
-# global_batch_size=256
-# lr=2.5e-4
-# min_lr=1.0e-6
-# init_std=0.015
-
-## GPT-3 XL 1.3B
-model_size=1.3
-num_layers=24
-hidden_size=2048
-num_attn_heads=16
-global_batch_size=512
-lr=2.0e-4
-min_lr=1.0e-6
-init_std=0.013
-
-## GPT-3 2.7B
-# model_size=2.7
-# num_layers=32
-# hidden_size=2560
-# num_attn_heads=32
-# global_batch_size=512
-# lr=1.6e-4
-# min_lr=1.0e-6
-# init_std=0.011
-
-## GPT-3 6.7B
-# model_size=6.7
-# num_layers=32
-# hidden_size=4096
-# num_attn_heads=32
-# global_batch_size=1024
-# lr=1.2e-4
-# min_lr=1.0e-6
-# init_std=0.009
-
-## GPT-3 13B
-# model_size=13
-# num_layers=40
-# hidden_size=5120
-# num_attn_heads=40
-# global_batch_size=1024
-# lr=1.0e-4
-# min_lr=1.0e-6
-# init_std=0.008
-
-## GPT-3 175B
-# model_size=175
-# num_layers=96
-# hidden_size=12288
-# num_attn_heads=96
-# global_batch_size=1536
-# lr=0.6e-4
-# min_lr=1.0e-6
-# init_std=0.005
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens.
-train_tokens_in_billion=300
-train_tokens=$((${train_tokens_in_billion} * 1000000000))
-
-## train_samples is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the train_tokens
-## above, and data efficiency techniques may change num tokens in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by train_samples.
-train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
-
-## Another wall-clock time termination condition in minutes. Set it large
-## enough to avoid undesired early termination.
-exit_duration=30000000
-###############################################################################
-### lr configs
-## lr warmup and decay duration.
-## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
-## Here we increase the warmup tokens to 3B since when batch size warmup is not
-## used, there are more tokens per step. Thus we need to increase warmup tokens
-## to make sure there are enough warmup steps, which is important for training
-## stability.
-lr_warmup_tokens_in_million=3000
-lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
-## Here we changed the LR decay tokens to align with total train tokens, since
-## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
-## learning rate schedule to match the number of training tokens results in the
-## best final model quality 
-lr_decay_tokens_in_billion=${train_tokens_in_billion}
-lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
-lr_decay_style="cosine"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=4
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Note that currently both curriculum learning and random-LTD are NOT
-## compatible with pipeline parallelism.
-pp_size=8
-no_pp="false"
-
-## ZeRO-based data parallelism, stage=0 will disable ZeRO
-zero_stage=1
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Reduce it manually if GPU OOM
-# batch_size=$(( ${global_batch_size} / ${dp_size} ))
-batch_size=2
-###############################################################################
-### curriculum learning (sequence length warmup) configs
-# The "divided by 3" means we use 1/3 of baseline's total steps for sequence length warmup.
-# This is not always the best config, but usually a reasonable choice to start with.
-cl_step=$(( ${lr_warmup_tokens} / 3 / ${global_batch_size} / ${seq_len} ))
-# Starting sequence length during sequence length warmup. If the train/validation loss is
-# unstable at the beginning of training, need to increase this but also need to keep as multiples
-# of 8 in order to enable Tensor Core acceleration.
-cl_min=64
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=10
-eval_interval=100
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
-# save_interval=$((${estimated_train_iter} / ${num_save}))
-save_interval=100
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-activation_checkpoint="true"
-# activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d_%H.%M.%S")
-host="${HOSTNAME}"
-seed=1234
-num_workers=0
-
-## Public the Pile dataset, can be downloaded at
-## https://mystic.the-eye.eu/public/AI/pile_neox/ or 
-## https://the-eye.eu/public/AI/pile_neox/ Change data_home to where you
-## store the pile_text_document.bin and pile_text_document.idx.
-data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing"
-data_path="${data_home}/pile_text_document"
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-prescale_grad="true"
-jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
-jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
-jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
-if [[ $zero_stage -gt 0 ]]; then
-    jobname="${jobname}_z${zero_stage}"
-    prescale_grad="false"
-fi
-if [[ $mp_size -gt 1 ]]; then
-    jobname="${jobname}_mp${mp_size}"
-fi
-if [ "${no_pp}" = "false" ]; then
-    jobname="${jobname}_pp${pp_size}"
-fi
-jobname="${jobname}_seed${seed}_rebase_rope0.25"
-jobname="${jobname}_cl_step${cl_step}_cl_min${cl_min}"
-
-username=$(whoami)
-output_home="/blob/users/${username}/project/data_efficient_gpt"
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-## Microsoft internal constraint: because tensorboard is logged by last rank,
-## it's better to put the path in NFS instead of Blob.
-tensorboard_dir="/vc_data/users/${username}/project/data_efficient_gpt/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-## If CL is used, make sure to set "--split" the same as what you used during
-## offline data analysis&indexing.
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --tensor-model-parallel-size ${mp_size} \
-    --init-method-std ${init_std} \
-    --lr-decay-tokens ${lr_decay_tokens} \
-    --lr-warmup-tokens ${lr_warmup_tokens} \
-    --micro-batch-size ${batch_size} \
-    --exit-duration-in-mins ${exit_duration} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-tokens ${train_tokens} \
-    --train-samples ${train_samples} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 0.1 \
-    --clip-grad 1.0 \
-    --hysteresis 2 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --seed ${seed} \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --no-async-tensor-model-parallel-allreduce \
-    --use-rotary-position-embeddings \
-    --rotary-percent 0.25 \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --checkpoint-activations"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}_cl_step${cl_step}_cl_min${cl_min}.json"
-template_json="ds_config_gpt_slw_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-    | sed "s/CONFIG_CL_MIN/${cl_min}/" \
-    | sed "s/CONFIG_CL_MAX/${seq_len}/" \
-    | sed "s/CONFIG_CL_DURATION/${cl_step}/" \
-      > ${config_json}
-
-deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-    --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_train.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_train.sh
deleted file mode 100644
index aac11ab034bd075dec482d611556f4ee7191c70f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_train.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-# # baseline
-# CONFIG=baseline
-# TAG=baseline
-# MODEL_SIZE=1558
-# LR=1.5e-4
-# BSZ=512
-# SEQ_LEN=1024
-# MP_SIZE=1
-# SEED=1234
-# SAVE_INTERVAL=5000
-# NUM_ITER=600000
-# NUM_TOKEN=157286400000
-# LR_DECAY_TOKEN=157286400000
-# LR_WARMUP_ITER=3000
-# CONFIG_TEMPLATE=false
-# CURRICULUM_STEP=0
-# CURRICULUM_MIN=0
-
-# curriculum learning
-CONFIG=curriculum_fixed_linear
-MODEL_SIZE=1558
-LR=6e-4
-BSZ=4096
-SEQ_LEN=1024
-MP_SIZE=1
-SEED=1234
-SAVE_INTERVAL=1000
-NUM_ITER=75000
-NUM_TOKEN=157286400000
-LR_DECAY_TOKEN=157286400000
-LR_WARMUP_ITER=3000
-CONFIG_TEMPLATE=true
-CURRICULUM_STEP=45000
-CURRICULUM_MIN=64
-TAG="${CONFIG}_s${CURRICULUM_MIN}to${SEQ_LEN}_step${CURRICULUM_STEP}"
-
-bash ds_pretrain_gpt2.sh $CONFIG $TAG $MODEL_SIZE $LR $BSZ $SEQ_LEN $MP_SIZE $SEED $SAVE_INTERVAL $NUM_ITER $NUM_TOKEN $LR_DECAY_TOKEN $LR_WARMUP_ITER $CONFIG_TEMPLATE $CURRICULUM_STEP $CURRICULUM_MIN
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_zero_stage_1_config_baseline.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_zero_stage_1_config_baseline.json
deleted file mode 100644
index 71494f3748e790df5592f09bf17839dd1db7af64..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_zero_stage_1_config_baseline.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
-  "train_batch_size": 512,
-  "gradient_accumulation_steps": 1,
-  "steps_per_print": 1,
-  "zero_optimization": {
-    "stage": 1
-  },
-  "optimizer": {
-    "type": "Adam",
-    "params": {
-      "lr": 0.00015,
-      "max_grad_norm": 1.0,
-      "betas": [0.9, 0.95]
-    }
-  },
-  "gradient_clipping": 1.0,
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "wall_clock_breakdown": false,
-  "zero_allow_untested_optimizer": false
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_zero_stage_1_config_curriculum_fixed_linear.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_zero_stage_1_config_curriculum_fixed_linear.json
deleted file mode 100644
index e2f9478308735ed111ce735a0c22cb5e2eb305c7..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/curriculum_learning/ds_zero_stage_1_config_curriculum_fixed_linear.json
+++ /dev/null
@@ -1,37 +0,0 @@
-{
-  "train_batch_size": 512,
-  "gradient_accumulation_steps": 1,
-  "steps_per_print": 1,
-  "zero_optimization": {
-    "stage": 1
-  },
-  "optimizer": {
-    "type": "Adam",
-    "params": {
-      "lr": 0.00015,
-      "max_grad_norm": 1.0,
-      "betas": [0.9, 0.95]
-    }
-  },
-  "gradient_clipping": 1.0,
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "wall_clock_breakdown": false,
-  "zero_allow_untested_optimizer": false,
-  "curriculum_learning": {
-    "enabled": true,
-    "curriculum_type": "seqlen",
-    "min_difficulty": CONFIG_CL_MIN,
-    "max_difficulty": CONFIG_CL_MAX,
-    "schedule_type": "fixed_linear",
-    "schedule_config": {
-      "total_curriculum_step": CONFIG_CL_DURATION,
-      "difficulty_step": 8
-    }
-  }
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/README.md b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/README.md
deleted file mode 100644
index 7ed96ae723bd37a48238884587a51a67adc04711..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/README.md
+++ /dev/null
@@ -1,23 +0,0 @@
-This directory includes GPT-3/BERT pretraining example scripts for DeepSpeed Data Efficiency Library technologies (curriculum learning, random-LTD, and the two composed together).
-
-You need to install updated DeepSpeed version (>=0.8.0), which contains the DeepSpeed Data Efficiency Library.
-
-Additional tutorial can be found at [DeepSpeed website](https://www.deepspeed.ai/tutorials/data-efficiency/).
-
-Additional technical details can be found in our [random-LTD paper](https://arxiv.org/abs/2211.11586) and [data efficiency paper](https://arxiv.org/abs/2212.03597).
-
-## GPT-3 pretraining and evaluation
-Inside ``gpt`` folder, first the ``ds_analyze_gpt_data_map.sh`` and ``ds_analyze_gpt_data_reduce.sh`` are used for curriculum learning's offline data analysis and indexing.
-
-``gpt/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_gpt_1.3B_dense_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale peak learning rate when using less than 100% data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we find that scaling LR based on used percentage of data helps improve model quality.
-
-``gpt/eval`` includes the zero-/few-shot evaluation example scripts. ``ds_evalharness_parallel_run.sh`` is for zero-shot, and ``ds_evalharness_parallel_run_10shot.sh`` is for 10-shot.
-
-## BERT pretraining and finetuning
-Inside ``bert`` folder, first the ``pile_data_download_preprocess.py`` can be used to download and preprocess the public Pile dataset.
-
-The ``ds_analyze_bert_data_map.sh`` and ``ds_analyze_bert_data_reduce.sh`` are used for curriculum learning's offline data analysis and indexing.
-
-``bert/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_bert_336M_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale peak learning rate when using less than 100% data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we find that scaling LR based on used percentage of data helps improve model quality.
-
-``bert/finetune`` includes the MNLI/QQP/RACE finetuning example scripts following the [Megatron-LM paper](https://arxiv.org/abs/1909.08053). However, we found that the RACE task's accuracy is not very stable and the Megatron-LM paper used a very long number of epochs for MNLI/QQP which is not necessary. Thus we added capability of finetuning other GLUE tasks, and switched to follow the hyperparameters of the [original BERT paper](https://arxiv.org/abs/1810.04805). The corresponding scripts are at ``bert/finetune_glue``, which we recommend to use instead of ``bert/finetune``. Our [data efficiency paper](https://arxiv.org/abs/2212.03597) also uses the scripts under ``bert/finetune_glue`` for GLUE finetuning.
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/analyze_data.py b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/analyze_data.py
deleted file mode 100644
index fec6aa5c7fd3fd428018c04d1f8778d9c65e09e0..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/analyze_data.py
+++ /dev/null
@@ -1,239 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-'''
-Copyright 2022 The Microsoft DeepSpeed Team
-'''
-
-import os
-import time
-import sys
-import math
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir,os.path.pardir)))
-from datetime import datetime
-import numpy as np
-import torch
-
-from deepspeed.runtime.data_pipeline.data_sampling.data_analyzer \
-    import DataAnalyzer
-from deepspeed.runtime.data_pipeline.data_sampling.indexed_dataset \
-    import MMapIndexedDataset
-
-from megatron_ds import get_args
-from megatron_ds import print_rank_0
-from megatron_ds.initialize import initialize_megatron
-
-def get_tasks_args(parser):
-    """Provide extra arguments required for data analyzing."""
-    group = parser.add_argument_group(title='data_analyzing')
-
-    group.add_argument('--analyzing-task', type=str, required=True,
-                       default=None,
-                       choices=['map',
-                                'reduce'],
-                       help='What type of analyzing task to perform.')
-    group.add_argument('--analyzing-data-type', type=str, required=True,
-                       default=None,
-                       choices=['BERT',
-                                'GPT'],
-                       help='What type of data.')
-    group.add_argument('--analyzing-metric', type=str, nargs='+', default=[],
-                       help='What kinds of metrics to analyze.')
-    group.add_argument('--analyzing-num-workers', type=int, default=1,
-                       help='Number of workers. Each worker could be a single CPU node.')
-    group.add_argument('--analyzing-worker-id', type=int, default=0,
-                       help='Worker id of current node.')
-    group.add_argument('--analyzing-num-threads', type=int, default=1,
-                       help='Number of threads for each worker.')
-    group.add_argument('--analyzing-num-threads-reduce', type=int, default=1,
-                       help='Number of threads for each worker.')
-    group.add_argument('--analyzing-specific-threads', type=int, nargs='+', default=[],
-                       help='Which specific threads to run. Helpful when there are specific thread failed in previous run.')
-    return parser
-
-def train_valid_test_datasets_provider_gpt():
-    """Build train, valid, and test datasets."""
-    args = get_args()
-
-    print_rank_0('> building train, validation, and test datasets '
-                 'for GPT ...')
-    from megatron_ds.data.gpt_dataset import build_train_valid_test_datasets
-    train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
-        data_prefix=args.data_path,
-        data_impl=args.data_impl,
-        splits_string=args.split,
-        train_valid_test_num_samples=[1,1,1], # Just dummy numbers since we assume args.train_data_exact_num_epochs will override them
-        seq_length=args.seq_length,
-        seed=args.seed,
-        skip_warmup=(not args.mmap_warmup))
-    print_rank_0("> finished creating GPT datasets ...")
-
-    return train_ds, valid_ds, test_ds
-
-def train_valid_test_datasets_provider_bert():
-    """Build train, valid, and test datasets."""
-    args = get_args()
-
-    print_rank_0('> building train, validation, and test datasets '
-                 'for BERT ...')
-    from megatron_ds.data.dataset_utils import build_train_valid_test_datasets
-    train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
-        data_prefix=args.data_path,
-        data_impl=args.data_impl,
-        splits_string=args.split,
-        train_valid_test_num_samples=[1,1,1], # Just dummy numbers since we assume args.train_data_exact_num_epochs will override them
-        max_seq_length=args.seq_length,
-        masked_lm_prob=args.mask_prob,
-        short_seq_prob=args.short_seq_prob,
-        seed=args.seed,
-        skip_warmup=(not args.mmap_warmup),
-        binary_head=args.bert_binary_head)
-    print_rank_0("> finished creating BERT datasets ...")
-
-    return train_ds, valid_ds, test_ds
-
-def metric_seqlen(data):
-    metric = torch.count_nonzero(data['padding_mask'], dim=1)
-    return metric
-
-def metric_total_vocab_freq(data):
-    args = get_args()
-    if args.analyzing_data_type == 'BERT':
-        frequency = torch.bincount(data['text'].view(-1),
-            minlength=args.padded_vocab_size+1,
-            weights=data['padding_mask'].view(-1))
-    elif args.analyzing_data_type == 'GPT':
-        frequency = torch.bincount(data['text'].view(-1),
-            minlength=args.padded_vocab_size+1)
-    return frequency
-
-def metric_vocab_rarity(data):
-    args = get_args()
-    if args.analyzing_data_type == 'BERT':
-        rarity = torch.sum(data['padding_mask'] * \
-            args.total_vocab_freq[data['text']], dim=1).to(torch.long)
-    elif args.analyzing_data_type == 'GPT':
-        rarity = []
-        # Do one by one to avoid too high memory consumption
-        for row in range(data['text'].size()[0]):
-            rarity.append(int(torch.sum(args.total_vocab_freq[data['text'][row]]).item()))
-        rarity = torch.tensor(rarity, dtype=torch.long)
-    print(f"rarity min {min(rarity)}, max {max(rarity)}, len {len(rarity)}, avg {sum(rarity)/len(rarity)}")
-    return rarity
-
-def metric_seqlen_vocab_rarity(data):
-    args = get_args()
-    metric = torch.count_nonzero(data['padding_mask'], dim=1).to(torch.long) * args.seqlen_coeff
-    metric += torch.sum(data['padding_mask'] * \
-        args.total_vocab_freq[data['text']], dim=1).to(torch.long)
-    print(f"metric min {min(metric)}, max {max(metric)}, len {len(metric)}, avg {sum(metric)/len(metric)}")
-    return metric
-
-def get_metric_function(metric_name):
-    if metric_name == 'seqlen':
-        return metric_seqlen
-    if metric_name == 'total_vocab_freq':
-        return metric_total_vocab_freq
-    if metric_name == 'vocab_rarity':
-        return metric_vocab_rarity
-    if metric_name == 'seqlen_vocab_rarity':
-        return metric_seqlen_vocab_rarity
-
-def get_metric_type(metric_name):
-    if metric_name == 'seqlen':
-        return 'single_value_per_sample'
-    if metric_name == 'total_vocab_freq':
-        return 'accumulate_value_over_samples'
-    if metric_name == 'vocab_rarity':
-        return 'single_value_per_sample'
-    if metric_name == 'seqlen_vocab_rarity':
-        return 'single_value_per_sample'
-
-def run_map():
-    args = get_args()
-    if args.analyzing_data_type == 'BERT':
-        args.mask_prob = 0 # When analyzing data, we don't want any mask.
-        train_ds, _, _ = train_valid_test_datasets_provider_bert()
-    elif args.analyzing_data_type == 'GPT':
-        train_ds, _, _ = train_valid_test_datasets_provider_gpt()
-        assert 'seqlen' not in args.analyzing_metric, 'GPT data has fixed seqlen, thus unnecessary to analyze seqlen metric.'
-        assert 'seqlen_vocab_rarity' not in args.analyzing_metric, 'GPT data has fixed seqlen, thus unnecessary to analyze seqlen metric.'
-    if 'vocab_rarity' in args.analyzing_metric or 'seqlen_vocab_rarity' in args.analyzing_metric:
-        total_vocab_freq_fname = f"{args.save}/total_vocab_freq/total_vocab_freq_metric_value"
-        assert os.path.isfile(f"{total_vocab_freq_fname}.bin") and os.path.isfile(f"{total_vocab_freq_fname}.idx"), "To analyze vocab rarity, first need to analyze the total vocab freq."
-        total_vocab_freq = MMapIndexedDataset(total_vocab_freq_fname, skip_warmup=True)
-        total_vocab_freq = np.copy(total_vocab_freq[0])
-        total_vocab_freq[total_vocab_freq == 0] = 1 # Avoid log(0) error
-        total_vocab_freq = np.log(total_vocab_freq/sum(total_vocab_freq)) * -1
-        args.total_vocab_freq = torch.tensor(total_vocab_freq, dtype=torch.double)
-        if 'seqlen_vocab_rarity' in args.analyzing_metric:
-            # Use large coeff to make seqlen dominates vocab_rarity
-            max_possible_rarity = args.seq_length * torch.max(args.total_vocab_freq).item()
-            args.seqlen_coeff = 10 ** (math.ceil(math.log(max_possible_rarity, 10)) + 1)
-            print(f"Metric seqlen_vocab_rarity: using {args.seqlen_coeff} as coefficient for seqlen.")
-    metric_functions = [get_metric_function(x) for x in args.analyzing_metric]
-    metric_types = [get_metric_type(x) for x in args.analyzing_metric]
-    # For metric_dtypes we int64 by default since it could be hard to estimate
-    # the appropriate dtype before the mapping analysis. During reduce where
-    # we merge the analysis results, the DataAnalyzer will automatically choose
-    # the dtype of merged result file as the smallest one that meet the range
-    # requirement.
-    metric_dtypes = [np.int64 for x in args.analyzing_metric]
-    start = time.time()
-    data_analyzer = DataAnalyzer(train_ds,
-        num_workers=args.analyzing_num_workers,
-        worker_id=args.analyzing_worker_id,
-        num_threads=args.analyzing_num_threads,
-        specific_threads=args.analyzing_specific_threads,
-        batch_size=args.global_batch_size, metric_names=args.analyzing_metric,
-        metric_functions=metric_functions, metric_types=metric_types,
-        metric_dtypes=metric_dtypes, save_path=args.save)
-    data_analyzer.run_map()
-    duration = (time.time() - start) / 3600.0
-    print(f"map job finished in {duration} hr.")
-
-def run_reduce():
-    args = get_args()
-    if args.analyzing_data_type == 'BERT':
-        args.mask_prob = 0 # When analyzing data, we don't want any mask.
-        train_ds, _, _ = train_valid_test_datasets_provider_bert()
-    elif args.analyzing_data_type == 'GPT':
-        train_ds, _, _ = train_valid_test_datasets_provider_gpt()
-    metric_functions = [get_metric_function(x) for x in args.analyzing_metric]
-    metric_types = [get_metric_type(x) for x in args.analyzing_metric]
-    metric_dtypes = [np.int64 for x in args.analyzing_metric]
-    start = time.time()
-    data_analyzer = DataAnalyzer(train_ds,
-        num_workers=args.analyzing_num_workers,
-        num_threads=args.analyzing_num_threads,
-        num_threads_reduce=args.analyzing_num_threads_reduce,
-        batch_size=args.global_batch_size, metric_names=args.analyzing_metric,
-        metric_functions=metric_functions, metric_types=metric_types,
-        metric_dtypes=metric_dtypes, save_path=args.save)
-    data_analyzer.run_reduce()
-    duration = (time.time() - start) / 3600.0
-    print(f"reduce job finished in {duration} hr.")
-
-if __name__ == "__main__":
-    initialize_megatron(extra_args_provider=get_tasks_args, allow_no_cuda=True)
-    args = get_args()
-    if args.analyzing_task == 'map':
-        run_map()
-    elif args.analyzing_task == 'reduce':
-        run_reduce()
-    else:
-        raise NotImplementedError('Task {} is not implemented.'.format(
-            args.analyzing_task))
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/ds_analyze_bert_data_map.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/ds_analyze_bert_data_map.sh
deleted file mode 100644
index 7f23e361573165df18147626d0e7d31f6b8da7aa..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/ds_analyze_bert_data_map.sh
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/bin/bash
-
-num_workers=1 # Num nodes to run the map job
-num_threads=40 # Num threads on each node. Set this based on #CPU cores
-
-# If different data epochs have slightly different data samples (e.g., due
-# to randomness), then you need to specify large enough num_epochs that cover
-# whole pretraining. If different data epochs are the same, set num_epochs to
-# 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency
-# library will automatically handle reshuffling when reaching another epoch.
-num_epochs=5
-
-# Which node is this node (start with 0 and end with num_workers-1). This
-# script only launch the map job on 1 worker node, since we don't expect
-# running on many nodes and workers don't need any communication. But you
-# can modify this script to add a MPI/torch distributed launcher.
-worker_id=$1
-save_path="/blob/users/conglli/data/analysis_pile_bert_${num_epochs}epoch/"
-
-metric='total_vocab_freq'
-# metric='vocab_rarity' # this requires the result of total_vocab_freq
-# metric='seqlen_vocab_rarity' # this requires the result of total_vocab_freq
-# metric='seqlen'
-
-seq_len=512
-batch_size=10000
-
-jobname="bert-pile-analyzing-${metric}-${num_epochs}epoch-map-worker${worker_id}"
-## Public the Pile dataset, see prepare_pile_data.py in the same directory
-## about how to download and preprocess the data.
-## Change data_home to your own training data path.
-# data_home="/vc_data_blob/users/conglli/the_pile_bert"
-data_home="/blob/data/the_pile_bert"
-data_path="${data_home}/pile_bert_train_text_sentence"
-
-vocab_path="bert-large-uncased-vocab.txt"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
-fi
-
-# Make sure the "--split" is the same as what you will use for pre-training.
-options=" \
-    --analyzing-task map \
-    --analyzing-data-type BERT \
-    --analyzing-metric ${metric} \
-    --analyzing-num-workers ${num_workers} \
-    --analyzing-worker-id ${worker_id} \
-    --analyzing-num-threads ${num_threads} \
-    --vocab-file ${vocab_path} \
-    --data-path ${data_path} \
-    --data-impl mmap \
-    --tokenizer-type BertWordPieceLowerCase \
-    --micro-batch-size ${batch_size} \
-    --global-batch-size ${batch_size} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --num-layers 1 \
-    --hidden-size 1 \
-    --num-attention-heads 1 \
-    --split 949,50,1 \
-    --distributed-backend gloo \
-    --train-data-exact-num-epochs ${num_epochs} \
-    --return-data-index \
-    --save-interval 1 \
-    --save ${save_path}"
-
-python ../analyze_data.py ${options} &> ${jobname}.log
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/ds_analyze_bert_data_reduce.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/ds_analyze_bert_data_reduce.sh
deleted file mode 100644
index f0d14df96a52bbb7391e12c3140ac5536fcacacd..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/ds_analyze_bert_data_reduce.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/bash
-
-# Set these 2 to the same as what you used during map job. We need these 2
-# configs to know how many map job result files do we have.
-num_workers=1
-num_threads=40
-# Reduce job only has 1 worker but can accelerate by multithreading.
-num_threads_reduce=40
-
-# If different data epochs have slightly different data samples (e.g., due
-# to randomness), then you need to specify large enough num_epochs that cover
-# whole pretraining. If different data epochs are the same, set num_epochs to
-# 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency
-# library will automatically handle reshuffling when reaching another epoch.
-num_epochs=5
-
-save_path="/blob/users/conglli/data/analysis_pile_bert_${num_epochs}epoch/"
-
-metric='total_vocab_freq'
-# metric='vocab_rarity' # this requires the result of total_vocab_freq
-# metric='seqlen_vocab_rarity' # this requires the result of total_vocab_freq
-# metric='seqlen'
-
-seq_len=512
-batch_size=10000
-
-jobname="bert-pile-analyzing-${metric}-${num_epochs}epoch-reduce"
-## Public the Pile dataset, see prepare_pile_data.py in the same directory
-## about how to download and preprocess the data.
-## Change data_home to your own training data path.
-# data_home="/vc_data_blob/users/conglli/the_pile_bert"
-data_home="/blob/data/the_pile_bert"
-data_path="${data_home}/pile_bert_train_text_sentence"
-
-vocab_path="bert-large-uncased-vocab.txt"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
-fi
-
-# Make sure the "--split" is the same as what you will use for pre-training.
-options=" \
-    --analyzing-task reduce \
-    --analyzing-data-type BERT \
-    --analyzing-metric ${metric} \
-    --analyzing-num-workers ${num_workers} \
-    --analyzing-num-threads ${num_threads} \
-    --analyzing-num-threads-reduce ${num_threads_reduce} \
-    --vocab-file ${vocab_path} \
-    --data-path ${data_path} \
-    --data-impl mmap \
-    --tokenizer-type BertWordPieceLowerCase \
-    --micro-batch-size ${batch_size} \
-    --global-batch-size ${batch_size} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --num-layers 1 \
-    --hidden-size 1 \
-    --num-attention-heads 1 \
-    --split 949,50,1 \
-    --distributed-backend gloo \
-    --train-data-exact-num-epochs ${num_epochs} \
-    --return-data-index \
-    --save-interval 1 \
-    --save ${save_path}"
-
-python ../analyze_data.py ${options} &> ${jobname}.log
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json
deleted file mode 100644
index 1ee35d7ae57d71ecfee31018f5d6aae39d5a8ec1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "train_batch_size" : CONFIG_BATCH_SIZE,
-  "train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "wall_clock_breakdown" : false
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_finetune_bert_mnli.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_finetune_bert_mnli.sh
deleted file mode 100644
index e88f7beb0cf0349b81f149783c92b4b51ff0f157..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_finetune_bert_mnli.sh
+++ /dev/null
@@ -1,150 +0,0 @@
-seed=1234
-pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
-
-###############################################################################
-### Main configs
-### The main configs are from Megatron-LM paper
-### https://arxiv.org/abs/1909.08053. Choose based on your desired model size
-### or build your own configs.
-seq_len=512
-
-## From Table 6 in https://arxiv.org/abs/1909.08053.
-task="MNLI"
-global_batch_size=128
-lr=1e-5
-epochs=10
-
-train_data="/blob/data/GlueData/MNLI/train.tsv"
-valid_data="/blob/data/GlueData/MNLI/dev_matched.tsv \
-            /blob/data/GlueData/MNLI/dev_mismatched.tsv"
-
-## Adjust based on number of GPUs.
-batch_size=16
-
-## BERT 110M (same config as original BERT-Base model)
-## This config is not included in Megatron-LM paper
-# model_size=0.11
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-
-## BERT 336M (same config as original BERT-Large model)
-model_size=0.336
-num_layers=24
-hidden_size=1024
-num_attn_heads=16
-
-## BERT 1.3B
-# model_size=1.3
-# num_layers=24
-# hidden_size=2048
-# num_attn_heads=32
-
-## BERT 3.9B
-# model_size=3.9
-# num_layers=48
-# hidden_size=2560
-# num_attn_heads=40
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=1
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's
-## pipeline parallelism is only integrated with the GPT case, and currently
-## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO stage
-zero_stage=0
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=50
-eval_interval=100
-save_interval=500000
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-# activation_checkpoint="true"
-activation_checkpoint="false"
-###############################################################################
-vocab_file="bert-large-uncased-vocab.txt"
-if [ ! -f "$vocab_file" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
-fi
-
-jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
-checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}"
-mkdir -p ${checkpoint_path}
-
-template_json="ds_config_bert_TEMPLATE.json"
-config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
-if [[ $zero_stage -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-fi
-
-options=" \
-    --finetune \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --task ${task} \
-    --seed ${seed} \
-    --train-data ${train_data} \
-    --valid-data ${valid_data} \
-    --tokenizer-type BertWordPieceLowerCase \
-    --vocab-file ${vocab_file} \
-    --epochs ${epochs} \
-    --pretrained-checkpoint ${pretrained_checkpoint} \
-    --tensor-model-parallel-size ${mp_size} \
-    --pipeline-model-parallel-size ${pp_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --global-batch-size ${global_batch_size} \
-    --micro-batch-size ${batch_size} \
-    --lr ${lr} \
-    --lr-decay-style linear \
-    --lr-warmup-fraction 0.065 \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --save-interval ${save_interval} \
-    --save ${checkpoint_path} \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --weight-decay 1.0e-1 \
-    --fp16"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-options="${options} \
-    --checkpoint-activations \
-    --deepspeed-activation-checkpointing"
-fi
-
-if [[ "${no_pp}" = "true" ]]; then
-options="${options} \
-    --no-pipeline-parallel"
-fi
-
-# After the fine-tuning finishes, you can find the dev set accuracy numbers by
-# "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log"
-deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_finetune_bert_qqp.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_finetune_bert_qqp.sh
deleted file mode 100644
index 8083e1024d607e4bc37f6cdb560f762b5fabc490..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_finetune_bert_qqp.sh
+++ /dev/null
@@ -1,158 +0,0 @@
-seed=1234
-pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
-
-###############################################################################
-### Main configs
-### The main configs are from Megatron-LM paper
-### https://arxiv.org/abs/1909.08053. Choose based on your desired model size
-### or build your own configs.
-seq_len=512
-
-## From Table 6 in https://arxiv.org/abs/1909.08053.
-task="QQP"
-
-train_data="/blob/data/GlueData/QQP/train.tsv"
-valid_data="/blob/data/GlueData/QQP/dev.tsv"
-
-## Adjust based on number of GPUs.
-batch_size=16
-
-## BERT 110M (same config as original BERT-Base model)
-## This config is not included in Megatron-LM paper
-# model_size=0.11
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=128
-# lr=5e-5
-# epochs=12
-
-## BERT 336M (same config as original BERT-Large model)
-model_size=0.336
-num_layers=24
-hidden_size=1024
-num_attn_heads=16
-global_batch_size=128
-lr=5e-5
-epochs=12
-
-## BERT 1.3B
-# model_size=1.3
-# num_layers=24
-# hidden_size=2048
-# num_attn_heads=32
-# global_batch_size=128
-# lr=3e-5
-# epochs=12
-
-## BERT 3.9B
-# model_size=3.9
-# num_layers=48
-# hidden_size=2560
-# num_attn_heads=40
-# global_batch_size=256
-# lr=4e-5
-# epochs=12
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=1
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's
-## pipeline parallelism is only integrated with the GPT case, and currently
-## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO stage
-zero_stage=0
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=50
-eval_interval=100
-save_interval=500000
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-# activation_checkpoint="true"
-activation_checkpoint="false"
-###############################################################################
-vocab_file="bert-large-uncased-vocab.txt"
-if [ ! -f "$vocab_file" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
-fi
-
-jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
-checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}"
-mkdir -p ${checkpoint_path}
-
-template_json="ds_config_bert_TEMPLATE.json"
-config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
-if [[ $zero_stage -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-fi
-
-options=" \
-    --finetune \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --task ${task} \
-    --seed ${seed} \
-    --train-data ${train_data} \
-    --valid-data ${valid_data} \
-    --tokenizer-type BertWordPieceLowerCase \
-    --vocab-file ${vocab_file} \
-    --epochs ${epochs} \
-    --pretrained-checkpoint ${pretrained_checkpoint} \
-    --tensor-model-parallel-size ${mp_size} \
-    --pipeline-model-parallel-size ${pp_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --global-batch-size ${global_batch_size} \
-    --micro-batch-size ${batch_size} \
-    --lr ${lr} \
-    --lr-decay-style linear \
-    --lr-warmup-fraction 0.065 \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --save-interval ${save_interval} \
-    --save ${checkpoint_path} \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --weight-decay 1.0e-1 \
-    --fp16"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-options="${options} \
-    --checkpoint-activations \
-    --deepspeed-activation-checkpointing"
-fi
-
-if [[ "${no_pp}" = "true" ]]; then
-options="${options} \
-    --no-pipeline-parallel"
-fi
-
-# After the fine-tuning finishes, you can find the dev set accuracy numbers by
-# "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log"
-deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_finetune_bert_race.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_finetune_bert_race.sh
deleted file mode 100644
index 15658e3d213bd48ede46d341c37849a6bd3bcaa6..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_finetune_bert_race.sh
+++ /dev/null
@@ -1,172 +0,0 @@
-seed=1234
-## RACE have two sub-tasks that need to be finetuned separately
-difficulty="middle"
-# difficulty="high"
-pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
-
-###############################################################################
-### Main configs
-### The main configs are from Megatron-LM paper
-### https://arxiv.org/abs/1909.08053. Choose based on your desired model size
-### or build your own configs.
-seq_len=512
-
-## From Table 6 in https://arxiv.org/abs/1909.08053.
-task="RACE"
-
-## Race dataset can be downloaded by:
-## wget http://www.cs.cmu.edu/~glai1/data/race/RACE.tar.gz
-train_data="/blob/data/RACE/train/${difficulty}"
-
-## The Megatron paper https://arxiv.org/abs/1909.08053 says: "For the test set
-## results of RACE, we first use the development set to find the checkpoint
-## that gives us the median score on the 5 random seeds and we report the
-## results from that checkpoint on the test set", which is a quite confusing
-## description. For simplicity, instead we directly get the median dev and test
-## set score on 5 random seeds from a single pretrained_checkpoint.
-valid_data="/blob/data/RACE/dev/${difficulty} \
-            /blob/data/RACE/test/${difficulty}"
-
-## Adjust based on number of GPUs.
-batch_size=4
-
-## BERT 110M (same config as original BERT-Base model)
-## This config is not included in Megatron-LM paper
-# model_size=0.11
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=32
-# lr=2e-5
-# epochs=3
-
-## BERT 336M (same config as original BERT-Large model)
-model_size=0.336
-num_layers=24
-hidden_size=1024
-num_attn_heads=16
-global_batch_size=32
-lr=2e-5
-epochs=3
-
-## BERT 1.3B
-# model_size=1.3
-# num_layers=24
-# hidden_size=2048
-# num_attn_heads=32
-# global_batch_size=16
-# lr=1e-5
-# epochs=3
-
-## BERT 3.9B
-# model_size=3.9
-# num_layers=48
-# hidden_size=2560
-# num_attn_heads=40
-# global_batch_size=32
-# lr=2e-5
-# epochs=3
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=1
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's
-## pipeline parallelism is only integrated with the GPT case, and currently
-## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO stage
-zero_stage=0
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=50
-eval_interval=100
-save_interval=100000
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-# activation_checkpoint="true"
-activation_checkpoint="false"
-###############################################################################
-vocab_file="bert-large-uncased-vocab.txt"
-if [ ! -f "$vocab_file" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
-fi
-
-jobname="${task}-${difficulty}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
-checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}"
-mkdir -p ${checkpoint_path}
-
-template_json="ds_config_bert_TEMPLATE.json"
-config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
-if [[ $zero_stage -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-fi
-
-options=" \
-    --finetune \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --task ${task} \
-    --seed ${seed} \
-    --train-data ${train_data} \
-    --valid-data ${valid_data} \
-    --tokenizer-type BertWordPieceLowerCase \
-    --vocab-file ${vocab_file} \
-    --epochs ${epochs} \
-    --pretrained-checkpoint ${pretrained_checkpoint} \
-    --tensor-model-parallel-size ${mp_size} \
-    --pipeline-model-parallel-size ${pp_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --global-batch-size ${global_batch_size} \
-    --micro-batch-size ${batch_size} \
-    --lr ${lr} \
-    --lr-decay-style linear \
-    --lr-warmup-fraction 0.06 \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --save-interval ${save_interval} \
-    --save ${checkpoint_path} \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --weight-decay 1.0e-1 \
-    --clip-grad 1.0 \
-    --fp16"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-options="${options} \
-    --checkpoint-activations \
-    --deepspeed-activation-checkpointing"
-fi
-
-if [[ "${no_pp}" = "true" ]]; then
-options="${options} \
-    --no-pipeline-parallel"
-fi
-
-# After the fine-tuning finishes, you can find the dev/test set accuracy numbers
-# by "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log"
-deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_finetune_gather_result.py b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_finetune_gather_result.py
deleted file mode 100644
index 6fffe829dda28e9a4466cf694e826004ac5e6ce5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune/ds_finetune_gather_result.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import os
-import statistics
-
-def gather_numbers(fname, match_keywords, index_keywords, index_offsets):
-    results = {}
-    for k in index_keywords:
-        results[k] = []
-    file1 = open(fname, 'r')
-    while True:
-        line = file1.readline()
-        if not line:
-            break
-        splits = line.split(' ')
-        for i in range(len(match_keywords)):
-            if match_keywords[i] in line:
-                ref_idx = splits.index(index_keywords[i])
-                results[index_keywords[i]].append(float(splits[ref_idx+index_offsets[i]]))
-    file1.close()
-    return results
-
-def gather_MNLI_results(result_path):
-    overall = []
-    matched = []
-    mismatched = []
-    for file in os.listdir(result_path):
-        if file.startswith('MNLI'):
-            fname = f'{result_path}/{file}/output.log'
-            if os.path.exists(fname):
-                results = gather_numbers(fname,
-                    ['overall:', 'metrics for dev-matched:', 'metrics for dev-mismatched:'],
-                    ['overall:', 'dev-matched:', 'dev-mismatched:'],
-                    [9, 9, 9])
-                overall_candidate = results['overall:']
-                matched_candidate = results['dev-matched:']
-                mismatched_candidate = results['dev-mismatched:']
-                if len(overall_candidate) > 0:
-                    assert len(overall_candidate) == len(matched_candidate) and len(overall_candidate) == len(mismatched_candidate)
-                    best_index = overall_candidate.index(max(overall_candidate))
-                    overall.append(overall_candidate[best_index])
-                    matched.append(matched_candidate[best_index])
-                    mismatched.append(mismatched_candidate[best_index])
-    if len(overall) > 0:
-        if len(overall) % 2 == 1:
-            median_idx = overall.index(statistics.median(overall))
-        else:
-            median_idx = overall.index(statistics.median_high(overall))
-        print(f'MNLI how Megatron paper reported: overall results median {statistics.median(overall)}, corresponding matched/mismatched: {matched[median_idx]}/{mismatched[median_idx]}')
-        print(f'MNLI other results:')
-        print(f'MNLI overall results {overall}, median {statistics.median(overall)} (corresponding matched/mismatched {matched[median_idx]}/{mismatched[median_idx]}), mean {statistics.mean(overall)}, std {statistics.stdev(overall)}')
-        print(f'MNLI matched results {matched}, median {statistics.median(matched)}, mean {statistics.mean(matched)}, std {statistics.stdev(matched)}')
-        print(f'MNLI mismatched results {mismatched}, median {statistics.median(mismatched)}, mean {statistics.mean(mismatched)}, std {statistics.stdev(mismatched)}')
-    else:
-        print("Didn't find any MNLI result")
-
-def gather_QQP_results(result_path):
-    overall = []
-    for file in os.listdir(result_path):
-        if file.startswith('QQP'):
-            fname = f'{result_path}/{file}/output.log'
-            if os.path.exists(fname):
-                results = gather_numbers(fname, ['overall:'], ['overall:'], [9])
-                overall_candidate = results['overall:']
-                if len(overall_candidate) > 0:
-                    best_index = overall_candidate.index(max(overall_candidate))
-                    overall.append(overall_candidate[best_index])
-    if len(overall) > 0:
-        print(f'QQP how Megatron paper reported: overall results median {statistics.median(overall)}')
-        print(f'QQP other results:')
-        print(f'QQP overall results {overall}, median {statistics.median(overall)}, mean {statistics.mean(overall)}, std {statistics.stdev(overall)}')
-    else:
-        print("Didn't find any QQP result")
-
-def gather_RACE_results(result_path, task):
-    dev = []
-    test = []
-    for file in os.listdir(result_path):
-        if file.startswith(f'RACE-{task}'):
-            fname = f'{result_path}/{file}/output.log'
-            if os.path.exists(fname):
-                results = gather_numbers(fname,
-                    [f'metrics for dev-{task}:', f'metrics for test-{task}:'],
-                    [f'dev-{task}:', f'test-{task}:'],
-                    [9, 9])
-                dev_candidate = results[f'dev-{task}:']
-                test_candidate = results[f'test-{task}:']
-                if len(dev_candidate) > 0:
-                    assert len(dev_candidate) == len(test_candidate)
-                    dev.append(max(dev_candidate))
-                    test.append(max(test_candidate))
-    if len(dev) > 0:
-        if len(dev) % 2 == 1:
-            median_idx = dev.index(statistics.median(dev))
-        else:
-            median_idx = dev.index(statistics.median_high(dev))
-        print(f'RACE-{task} how Megatron paper reported: test result from the median of dev results {test[median_idx]}')
-        print(f'RACE-{task} other results:')
-        print(f'RACE-{task} dev results {dev}, median {statistics.median(dev)}, mean {statistics.mean(dev)}, std {statistics.stdev(dev)}')
-        print(f'RACE-{task} test results {test}, median {statistics.median(test)}, mean {statistics.mean(test)}, std {statistics.stdev(test)}')
-    else:
-        print(f"Didn't find any RACE-{task} result")
-
-def gather_finetune_results(result_path):
-    print(f'Gather finetune results for {result_path}')
-    gather_MNLI_results(result_path)
-    gather_QQP_results(result_path)
-    gather_RACE_results(result_path, 'middle')
-    gather_RACE_results(result_path, 'high')
-
-if __name__ == '__main__':
-    result_path='/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp-finetune/'
-    gather_finetune_results(result_path)
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json
deleted file mode 100644
index 1ee35d7ae57d71ecfee31018f5d6aae39d5a8ec1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "train_batch_size" : CONFIG_BATCH_SIZE,
-  "train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "wall_clock_breakdown" : false
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue.sh
deleted file mode 100644
index 0e0c571a4293c96cd3d3c361f8f9b714afc8b825..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue.sh
+++ /dev/null
@@ -1,156 +0,0 @@
-hostname_and_rank=$1
-master_port=$2
-seed=$3
-task=$4
-lr=$5
-pretrained_checkpoint=$6
-
-# hostname_and_rank="worker-0:0,1,2,3"
-# master_port=12345
-# seed=1234
-# task="MNLI"
-# lr=2e-5
-# pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
-
-###############################################################################
-### Main configs
-seq_len=512
-
-global_batch_size=32
-epochs=3
-
-train_data="/blob/data/GlueData/${task}/train.tsv"
-valid_data="/blob/data/GlueData/${task}/dev.tsv"
-if [[ "${task}" = "MNLI" ]]; then
-valid_data="/blob/data/GlueData/MNLI/dev_matched.tsv \
-            /blob/data/GlueData/MNLI/dev_mismatched.tsv"
-fi
-
-## Adjust based on number of GPUs.
-batch_size=8
-
-## BERT 110M (BERT-Base)
-# model_size=0.11
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-
-## BERT 336M (BERT-Large)
-model_size=0.336
-num_layers=24
-hidden_size=1024
-num_attn_heads=16
-
-## BERT 1.3B
-# model_size=1.3
-# num_layers=24
-# hidden_size=2048
-# num_attn_heads=32
-
-## BERT 3.9B
-# model_size=3.9
-# num_layers=48
-# hidden_size=2560
-# num_attn_heads=40
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=1
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's
-## pipeline parallelism is only integrated with the GPT case, and currently
-## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO stage
-zero_stage=0
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=50
-eval_interval=100
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-# activation_checkpoint="true"
-activation_checkpoint="false"
-###############################################################################
-vocab_file="bert-large-uncased-vocab.txt"
-if [ ! -f "$vocab_file" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
-fi
-
-jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}"
-# output_path="${pretrained_checkpoint}-finetune-glue-4v100/${jobname}"
-output_path=$(basename "$pretrained_checkpoint")
-output_path="glue-results/${output_path}-finetune-glue-4v100/${jobname}"
-mkdir -p ${output_path}
-
-template_json="ds_config_bert_TEMPLATE.json"
-config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json"
-if [[ $zero_stage -gt 0 ]]; then
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/false/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-else
-sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/true/" \
-    | sed "s/CONFIG_BF16_ENABLED/false/" \
-      > ${config_json}
-fi
-
-options=" \
-    --finetune \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --task ${task} \
-    --seed ${seed} \
-    --train-data ${train_data} \
-    --valid-data ${valid_data} \
-    --tokenizer-type BertWordPieceLowerCase \
-    --vocab-file ${vocab_file} \
-    --epochs ${epochs} \
-    --pretrained-checkpoint ${pretrained_checkpoint} \
-    --tensor-model-parallel-size ${mp_size} \
-    --pipeline-model-parallel-size ${pp_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --global-batch-size ${global_batch_size} \
-    --micro-batch-size ${batch_size} \
-    --lr ${lr} \
-    --lr-decay-style linear \
-    --lr-warmup-fraction 0.1 \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --weight-decay 1.0e-1 \
-    --fp16"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-options="${options} \
-    --checkpoint-activations \
-    --deepspeed-activation-checkpointing"
-fi
-
-if [[ "${no_pp}" = "true" ]]; then
-options="${options} \
-    --no-pipeline-parallel"
-fi
-
-# After the fine-tuning finishes, you can find the dev set accuracy numbers by
-# "grep -e "overall:" -e "metrics for" ${output_path}/output.log"
-deepspeed --include=${hostname_and_rank} --master_port=${master_port} ../../../../tasks/main.py ${options} &> ${output_path}/output.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh
deleted file mode 100644
index 10e04f2c7a1b678ccf4b941c5b3e6b51ec2aae2e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-hostname_and_rank=$1
-master_port=$2
-pretrained_checkpoint=$3
-
-# hostname_and_rank="worker-0:0,1,2,3"
-# master_port=12345
-# pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp"
-
-tasks=(
-    RTE
-    MRPC
-    STS-B
-    CoLA
-    SST-2
-    QNLI
-    QQP
-    MNLI
-)
-
-seeds=(
-    1234
-    1235
-    1236
-    1237
-    1238
-)
-
-lrs=(
-    2e-5
-    3e-5
-    4e-5
-    5e-5
-)
-
-for ((i=0;i<${#tasks[@]};++i)); do
-    task=${tasks[i]}
-    for ((j=0;j<${#seeds[@]};++j)); do
-        seed=${seeds[j]}
-        for ((k=0;k<${#lrs[@]};++k)); do
-            lr=${lrs[k]}
-            bash ds_finetune_bert_glue.sh ${hostname_and_rank} ${master_port} ${seed} ${task} ${lr} ${pretrained_checkpoint}
-        done
-    done
-done
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune_glue/ds_finetune_gather_result.py b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune_glue/ds_finetune_gather_result.py
deleted file mode 100644
index b359ecb6fbc7b646c5d3142d20086a4238bd3d92..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/finetune_glue/ds_finetune_gather_result.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import os
-import statistics
-
-def gather_numbers(fname, match_keywords, index_keywords, index_offsets):
-    results = {}
-    for k in index_keywords:
-        results[k] = []
-    file1 = open(fname, 'r')
-    while True:
-        line = file1.readline()
-        if not line:
-            break
-        splits = line.split(' ')
-        for i in range(len(match_keywords)):
-            if match_keywords[i] in line:
-                ref_idx = splits.index(index_keywords[i])
-                results[index_keywords[i]].append(float(splits[ref_idx+index_offsets[i]]))
-    file1.close()
-    return results
-
-def gather_GLUE_results(result_path, key, lr):
-    result = []
-    mnli_matched_result = []
-    mnli_mismatched_result = []
-    for file in os.listdir(result_path):
-        if file.startswith(key) and lr in file:
-            fname = f'{result_path}/{file}/output.log'
-            if os.path.exists(fname):
-                if key == "STS-B":
-                    results = gather_numbers(fname, ['metrics for'], ['spearmanr'], [2])
-                    overall_candidate = results['spearmanr']
-                    overall_candidate = [x * 100.0 for x in overall_candidate]
-                elif key == "CoLA":
-                    results = gather_numbers(fname, ['metrics for'], ['mcc'], [2])
-                    overall_candidate = results['mcc']
-                    overall_candidate = [x * 100.0 for x in overall_candidate]
-                elif key == "MNLI":
-                    results = gather_numbers(fname,
-                        ['overall:', 'metrics for dev-matched:', 'metrics for dev-mismatched:'],
-                        ['overall:', 'dev-matched:', 'dev-mismatched:'],
-                        [9, 9, 9])
-                    overall_candidate = results['overall:']
-                    matched_candidate = results['dev-matched:']
-                    mismatched_candidate = results['dev-mismatched:']
-                else:
-                    results = gather_numbers(fname, ['overall:'], ['overall:'], [9])
-                    overall_candidate = results['overall:']
-                if len(overall_candidate) > 0:
-                    if len(overall_candidate) != 3:
-                        print(f"{result_path} task {key} lr {lr} only has {len(overall_candidate)} epoch")
-                    best_index = overall_candidate.index(max(overall_candidate))
-                    result.append(overall_candidate[best_index])
-                    if key == "MNLI":
-                        mnli_matched_result.append(matched_candidate[best_index])
-                        mnli_mismatched_result.append(mismatched_candidate[best_index])
-    if len(result) > 0:
-        if len(result) != 5:
-            print(f"{result_path} task {key} lr {lr} only has {len(result)} seed")
-        if key == "MNLI":
-            best_index = result.index(statistics.median_high(result))
-            return round(mnli_matched_result[best_index],2), round(statistics.stdev(mnli_matched_result),2), round(mnli_mismatched_result[best_index],2), round(statistics.stdev(mnli_mismatched_result),2)
-        else:
-            return round(statistics.median_high(result),2), round(statistics.stdev(result),2)
-    else:
-        if key == "MNLI":
-            return None, None, None, None
-        else:
-            return None, None
-
-def gather_finetune_results(result_path, extra_col=[], lr="2e-5"):
-    output = ""
-    for field in extra_col:
-        output += f"{field} &"
-    task_output = ""
-    median_list, std_list = [], []
-    m_median, m_std, mm_median, mm_std = gather_GLUE_results(result_path, "MNLI", lr)
-    if m_median is not None:
-        median_list += [m_median, mm_median]
-        std_list += [m_std, mm_std]
-    task_output += f"{m_median}±{m_std} & {mm_median}±{mm_std} &"
-    tasks = ["QQP", "QNLI", "SST-2", "CoLA", "STS-B", "MRPC", "RTE"]
-    for task in tasks:
-        t_median, t_std = gather_GLUE_results(result_path, task, lr)
-        if t_median is not None:
-            median_list += [t_median]
-            std_list += [t_std]
-        if task == "RTE":
-            task_output += f"{t_median}±{t_std} "
-        else:
-            task_output += f"{t_median}±{t_std} &"
-    overall_median = round(sum(median_list) / len(median_list), 2)
-    overall_std = round(sum(std_list) / len(std_list), 2)
-    output += f"{overall_median}±{overall_std} &"
-    output += task_output
-    output += " \\\\"
-    print(output)
-
-if __name__ == '__main__':
-    print("\\begin{table}")
-    print("\centering")
-    print("\\tiny")
-    text = "\\begin{tabular}{@{}l|"
-    for _ in range(11):
-        text += "c"
-    text += "@{}}"
-    print(text)
-    print("\\toprule")
-    print("Case & Train tokens & Average & MNLI-m & MNLI-mm & QQP & QNLI & SST-2 & CoLA & STS-B & MRPC & RTE \\\\")
-    print("\midrule")
-    
-    result_path='/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp-finetune/'
-    gather_finetune_results(result_path)
-    
-    print("\\bottomrule")
-    print("\end{tabular}")
-    print("\end{table}")
-    print("")
-    print("")
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pile_data_download_preprocess.py b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pile_data_download_preprocess.py
deleted file mode 100644
index 5a020359d8cd0b4f0ded4e8b69a20e33b808df26..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pile_data_download_preprocess.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import zstandard
-import sys
-import time
-import os
-
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir,os.path.pardir,os.path.pardir)))
-from megatron_ds.data import indexed_dataset
-
-def pile_download(download_url, file_path, i):
-    start = time.time()
-    zstd_file_path = f"{file_path}{i:02}.jsonl.zst"
-    download_path = f"{download_url}{i:02}.jsonl.zst"
-    if not os.path.exists(zstd_file_path):
-        os.system(f"wget -P {file_path} {download_path}")
-        print(f"Finished downloading chunk {i} in {time.time() - start} sec")
-
-def pile_decompress(download_url, file_path, i):
-    zstd_file_path = f"{file_path}{i:02}.jsonl.zst"
-    output_path = f"{file_path}{i:02}.jsonl"
-    if not os.path.exists(output_path):
-        if not os.path.exists(zstd_file_path):
-            pile_download(download_url, file_path, i)
-        start = time.time()
-        with open(zstd_file_path, 'rb') as compressed:
-            decomp = zstandard.ZstdDecompressor()
-            with open(output_path, 'wb') as destination:
-                decomp.copy_stream(compressed, destination)
-        os.remove(zstd_file_path)
-        print(f"Finished decompressing chunk {i} in {time.time() - start} sec")
-
-def pile_preprocess(download_url, file_path, vocab_file, num_workers, i):
-    json_file_path = f"{file_path}{i:02}.jsonl"
-    output_prefix = f"{file_path}pile_bert_train_{i:02}"
-    if not os.path.exists(f"{output_prefix}_text_sentence.idx"):
-        if not os.path.exists(json_file_path):
-            pile_decompress(download_url, file_path, i)
-        start = time.time()
-        cmd = f"python ../../tools/preprocess_data.py \
-                --input {json_file_path} \
-                --output-prefix {output_prefix} \
-                --vocab {vocab_file} \
-                --dataset-impl mmap \
-                --tokenizer-type BertWordPieceLowerCase \
-                --split-sentences \
-                --workers {num_workers} "
-        # It's possible to hit MemoryError during above cmd since the memory
-        # usage is proportional to num_workers. In this case we delete the
-        # incomplete output and user shall retry with smaller num_workers.
-        # Our experience show that chunk 6, 7, 9, 17, 18, 20, 21, 24, 27
-        # particularly have large memory usage.
-        if os.system(cmd) == 0: # Success
-            os.remove(json_file_path)
-        else:
-            print(f"Error: chunk {i} preprocessing got error, delete \
-                    incomplete output. If MemoryError appeared, please retry \
-                    with num_workers smaller than {num_workers}.")
-            if os.path.exists(f"{output_prefix}_text_sentence.idx"):
-                os.remove(f"{output_prefix}_text_sentence.idx")
-            if os.path.exists(f"{output_prefix}_text_sentence.bin"):
-                os.remove(f"{output_prefix}_text_sentence.bin")
-        print(f"Finished preprocessing chunk {i} in {time.time() - start} sec")
-
-def pile_merge(file_path):
-    start = time.time()
-    num_chunks = 30
-    vocab_size = 30524
-    for i in range(num_chunks):
-        output_prefix = f"{file_path}pile_bert_train_{i:02}"
-        assert os.path.exists(f"{output_prefix}_text_sentence.idx")
-        assert os.path.exists(f"{output_prefix}_text_sentence.bin")
-    builder = indexed_dataset.make_builder(
-        f"{file_path}pile_bert_train_text_sentence.bin", impl="mmap",
-        vocab_size=vocab_size)
-    for i in range(num_chunks):
-        chunk_file = f"{file_path}pile_bert_train_{i:02}_text_sentence"
-        print(f"Merging file {chunk_file}")
-        builder.merge_file_(chunk_file)
-    print("Finalizing merged file ...")
-    builder.finalize(f"{file_path}pile_bert_train_text_sentence.idx")
-    print(f"Finished merging in {time.time() - start} sec")
-    # After verifying the merged data with real training, you may want to
-    # delete the data chunks.
-    # for i in range(num_chunks):
-    #     output_prefix = f"{file_path}pile_bert_train_{i:02}"
-    #     os.remove(f"{output_prefix}_text_sentence.idx")
-    #     os.remove(f"{output_prefix}_text_sentence.bin")
-
-if __name__ == '__main__':
-    # Path to download and store all the output files during the whole process.
-    # Estimated max storage usage would be around 1.6 TB (or 780GB if skip the
-    # final merge). Memory usage is proportional to the num_workers below (can
-    # be as high as O(300GB) if num_workers is around 20).
-    file_path = "/blob/data/the_pile_bert/"
-    # The raw Pile data has 30 compressed .zst chunks. To run on single
-    # machine for all chunks, run "python prepare_pile_data.py range 0 30".
-    # You can also split and run on multiple machines to speed up, since
-    # processing one chunk can take hours. The whole process only uses CPU.
-    if sys.argv[1] == "merge":
-        # "python prepare_pile_data.py merge" means merge all 30 processed data
-        # chunks. Run it only after all 30 chunks are preprocessed. The memory
-        # usage during merge is about 600GB. If you don't have enough memory,
-        # one solution is to directly use the 30 data chunks as multiple
-        # datasets. See '--data-path' in
-        # github.com/microsoft/Megatron-DeepSpeed/blob/main/megatron/arguments.py
-        pile_merge(file_path)
-    else:
-        if sys.argv[1] == "range":
-            # "python prepare_pile_data.py range 0 30" means process chunk 0-29
-            selected_chunk = range(int(sys.argv[2]), int(sys.argv[3]))
-        else:
-            # "python prepare_pile_data.py 2 5 8" means process chunk 2, 5, 8
-            selected_chunk = [int(x) for x in sys.argv[1:]]
-        print("selected_chunk: ", selected_chunk)
-        # Number of process. Adjust based on your CPU/Memory.
-        num_workers = 20
-        # Where the raw Pile data can be downloaded. The url may change in
-        # future. Contact EleutherAI (https://github.com/EleutherAI/the-pile)
-        # if this url does not work.
-        download_url = "https://the-eye.eu/public/AI/pile/train/"
-        vocab_file = "bert-large-uncased-vocab.txt"
-        vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt"
-        if not os.path.exists(vocab_file):
-            os.system(f"wget {vocab_url}")
-        os.makedirs(file_path, exist_ok=True)
-
-        for i in selected_chunk:
-            pile_preprocess(download_url, file_path, vocab_file, num_workers, i)
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json
deleted file mode 100644
index cca845096a0af2c65d6cdf25a76b30b5239198df..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json
+++ /dev/null
@@ -1,73 +0,0 @@
-{
-  "train_batch_size": GBSIZE,
-  "train_micro_batch_size_per_gpu": MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "wall_clock_breakdown" : false,
-  "dataloader_drop_last": true,
-  "data_efficiency": {
-    "enabled": true,
-    "seed": DATA_EFFICIENCY_SEED,
-    "data_routing": {
-      "enabled": LTD_ENABLED,
-      "random_ltd":{
-        "enabled": LTD_ENABLED,
-        "total_layer_num": 24,
-        "random_ltd_layer_num": 22,
-        "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22],
-        "model_mask_name": "attention_mask",
-        "model_type": "encoder",
-        "hidden_state_order": "seq_batch_dim",
-        "random_ltd_schedule": {
-          "min_value": LTD_MIN,
-          "max_value": LTD_MAX,
-          "schedule_type":"fixed_linear",
-          "schedule_config": {
-            "require_steps": LTD_STEP,
-            "seq_per_step": 16
-          }
-        }
-      } 
-    },
-    "data_sampling": {
-      "enabled": CL_ENABLED,
-      "num_workers": DATA_SAMPLING_NUM_WORKERS,
-      "curriculum_learning": {
-        "enabled": CL_ENABLED,
-        "data_cluster_path": "CL_CLUSTER_PATH",
-        "curriculum_metrics": {
-          "CL_1st_METRIC_NAME": {
-            "index_to_sample_path": "CL_1st_SAMPLE_PATH",
-            "index_to_metric_path": "CL_1st_METRIC_PATH",
-            "difficulty_type": "CL_1st_DIFF_TYPE",
-            "clustering_type": "CL_1st_CLUSTER_TYPE",
-            "min_difficulty": CL_1st_MIN,
-            "max_difficulty": CL_1st_MAX,
-            "schedule_type": "fixed_root",
-            "schedule_config": {
-              "total_curriculum_step": CL_1st_TOTAL_STEP,
-              "difficulty_step": CL_1st_DIFF_STEP,
-              "root_degree": CL_1st_ROOT
-            }
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pretrain/ds_config_bert_2clmetrics_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pretrain/ds_config_bert_2clmetrics_TEMPLATE.json
deleted file mode 100644
index 9461d6d5d73f6196970119444791e2e17aa175c6..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pretrain/ds_config_bert_2clmetrics_TEMPLATE.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "train_batch_size": GBSIZE,
-  "train_micro_batch_size_per_gpu": MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "wall_clock_breakdown" : false,
-  "dataloader_drop_last": true,
-  "data_efficiency": {
-    "enabled": true,
-    "seed": DATA_EFFICIENCY_SEED,
-    "data_routing": {
-      "enabled": LTD_ENABLED,
-      "random_ltd":{
-        "enabled": LTD_ENABLED,
-        "total_layer_num": 24,
-        "random_ltd_layer_num": 22,
-        "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22],
-        "model_mask_name": "attention_mask",
-        "model_type": "encoder",
-        "hidden_state_order": "seq_batch_dim",
-        "random_ltd_schedule": {
-          "min_value": LTD_MIN,
-          "max_value": LTD_MAX,
-          "schedule_type":"fixed_linear",
-          "schedule_config": {
-            "require_steps": LTD_STEP,
-            "seq_per_step": 16
-          }
-        }
-      } 
-    },
-    "data_sampling": {
-      "enabled": CL_ENABLED,
-      "num_workers": DATA_SAMPLING_NUM_WORKERS,
-      "curriculum_learning": {
-        "enabled": CL_ENABLED,
-        "data_cluster_path": "CL_CLUSTER_PATH",
-        "curriculum_metrics": {
-          "CL_1st_METRIC_NAME": {
-            "index_to_sample_path": "CL_1st_SAMPLE_PATH",
-            "index_to_metric_path": "CL_1st_METRIC_PATH",
-            "difficulty_type": "CL_1st_DIFF_TYPE",
-            "clustering_type": "CL_1st_CLUSTER_TYPE",
-            "min_difficulty": CL_1st_MIN,
-            "max_difficulty": CL_1st_MAX,
-            "schedule_type": "fixed_root",
-            "schedule_config": {
-              "total_curriculum_step": CL_1st_TOTAL_STEP,
-              "difficulty_step": CL_1st_DIFF_STEP,
-              "root_degree": CL_1st_ROOT
-            }
-          },
-          "CL_2nd_METRIC_NAME": {
-            "index_to_sample_path": "CL_2nd_SAMPLE_PATH",
-            "index_to_metric_path": "CL_2nd_METRIC_PATH",
-            "difficulty_type": "CL_2nd_DIFF_TYPE",
-            "clustering_type": "CL_2nd_CLUSTER_TYPE",
-            "min_difficulty": CL_2nd_MIN,
-            "max_difficulty": CL_2nd_MAX,
-            "schedule_type": "fixed_root",
-            "schedule_config": {
-              "total_curriculum_step": CL_2nd_TOTAL_STEP,
-              "difficulty_step": CL_2nd_DIFF_STEP,
-              "root_degree": CL_2nd_ROOT
-            }
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh
deleted file mode 100644
index cded1584375d6f0b4788427dca5fce8b43b1baf2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh
+++ /dev/null
@@ -1,472 +0,0 @@
-#!/bin/bash
-dir=`pwd`
-###############################################################################
-### Main configs
-### The main configs are from Megatron-LM paper
-### https://arxiv.org/abs/1909.08053. Choose based on your desired model size
-### or build your own configs.
-seq_len=512
-global_batch_size=1024
-# lr=1e-4
-lr=$1
-min_lr=1e-5
-
-## init_std is the standard deviation for weight initialization. Usually larger
-## model needs lower std. Here we roughly follow a heuristic equation of
-## sqrt(1/3/hidden_size) from https://arxiv.org/pdf/2201.11990.pdf
-
-## In addition, we find that the 3.9B model (even after tuning init_std) has
-## NaN loss issue from the beginning thus unable to train. This is probably
-## because in this example we use the public Pile data, which is a more diverse
-## (and potentially more noisy) data than what used in Megatron paper. One
-## potential solution is only use the sub datasets in Pile that are also
-## used by Megatron paper.
-
-## BERT 110M (same config as original BERT-Base model)
-## This config is not included in Megatron-LM paper
-# model_size=0.11
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# init_std=0.02
-
-## BERT 336M (same config as original BERT-Large model)
-model_size=0.336
-num_layers=24
-hidden_size=1024
-num_attn_heads=16
-init_std=0.02
-
-## BERT 1.3B
-# model_size=1.3
-# num_layers=24
-# hidden_size=2048
-# num_attn_heads=32
-# init_std=0.013
-
-## BERT 3.9B
-# model_size=3.9
-# num_layers=48
-# hidden_size=2560
-# num_attn_heads=40
-# init_std=0.011
-###############################################################################
-### Training duration configs
-## The main termination condition, original Megatron paper trains for 2M iters.
-## We changed to token-based termination since data efficiency techniques could
-## change token per step.
-calc() { awk "BEGIN{ printf \"%.0f\n\", $* }"; }
-# train_iters_in_million=2
-train_iters_in_million=$2
-train_tokens=$(calc $train_iters_in_million*1000000*$seq_len*$global_batch_size)
-train_tokens_in_billion=$(calc $train_tokens/1000000000)
-
-## A large enough number of iters, just to make sure we index enough data. The
-## only effective termination condition is the train_tokens above.
-train_iters=4000000
-
-## Another wall-clock time termination condition in minutes. Set it large
-## enough to avoid undesired early termination.
-exit_duration=30000000
-###############################################################################
-### lr configs
-## lr warmup and decay duration. Original Megatron paper uses 10000 warmup
-## iters. We changed lr decay to token based since data efficiency techniques
-## could change token per step.
-lr_warmup_iters=10000
-lr_decay_tokens_in_billion=${train_tokens_in_billion}
-lr_decay_tokens=${train_tokens}
-lr_decay_style="linear"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=1
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's
-## pipeline parallelism is only integrated with the GPT case, and currently
-## DeepSpeed is not integrated with Megatron's own pipeline parallelism.
-## Note that currently both curriculum learning and random-LTD are NOT
-## compatible with pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO-based data parallelism, stage=0 will disable ZeRO
-zero_stage=0
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Reduce it manually if GPU OOM
-batch_size=$(( ${global_batch_size} / ${dp_size} ))
-###############################################################################
-### Random layerwise token dropping (random-LTD) configs
-## random-LTD's main switch. "false" means disabled. "true" means enabled.
-ltd_enabled=${3:-'false'}
-## How much dropping ratio to start with. The value denotes the seqlen after
-## dropping.
-ltd_start=${4:-512}
-## How many steps for random-LTD to gradually reduce dropping ratio to zero.
-ltd_step_in_million=${5:-1}
-
-# ltd_enabled="true"
-# ltd_start=200
-# ltd_step_in_million=1.8
-ltd_step=$(calc $ltd_step_in_million*1000000)
-
-## For BERT pretraining, we observe that random-LTD when combined with zero
-## dropout can achieve better finetune accuracy on certain tasks. However, this
-## is not guaranteed for all models/tasks. It is still recommend to try both
-## with and without dropout for random-LTD.
-dropout=${6:-0.1}
-###############################################################################
-### Curriculum learning (CL) configs
-## CL's main switch. "false" means disabled. "true" means enabled.
-cl_enabled=${7:-'false'}
-## Number of CL metrics to use.
-cl_num_metric=${8:-1}
-
-## Name of difficulty metric
-cl_1st_metric=${9:-'dummy'}
-## Path to the data indexes for this difficulty metric. Samples on ith row of
-## index_to_sample have the difficulty value equals to ith row of
-## index_to_metric.
-cl_1st_index_to_sample_path=${10:-'dummy'}
-cl_1st_index_to_metric_path=${11:-'dummy'}
-## During training, whether increase difficulty by value- or percentile-based.
-cl_1st_difficulty_type=${12:-'value'}
-## "single_cluster" means no clustering required and probably CL is achieved by
-## data postprocessing. "schedule_based" means will cluster data based on the
-## difficulty schedule (pacing function) below.
-cl_1st_clustering_type=${13:-'single_cluster'}
-## Start difficulty
-cl_1st_min=${14:-512}
-## End difficulty
-cl_1st_max=${15:-512}
-## Total step to reach end difficulty
-cl_1st_total_step_in_million=${16:-1}
-## When changing difficulty, always make sure it's a multiple of the
-## difficulty_step below.
-cl_1st_difficulty_step=${17:-1}
-## Root degree of the schedule (pacing function).
-cl_1st_root=${18:-1}
-
-cl_2nd_metric=${19:-'dummy'}
-cl_2nd_index_to_sample_path=${20:-'dummy'}
-cl_2nd_index_to_metric_path=${21:-'dummy'}
-cl_2nd_difficulty_type=${22:-'value'}
-cl_2nd_clustering_type=${23:-'single_cluster'}
-cl_2nd_min=${24:-2048}
-cl_2nd_max=${25:-2048}
-cl_2nd_total_step_in_million=${26:-1}
-cl_2nd_difficulty_step=${27:-1}
-cl_2nd_root=${28:-1}
-
-# cl_enabled="true"
-# cl_num_metric=2
-# cl_1st_metric="voc"
-# ## The *_index_to_sample_percentile_merged is a concatenated index for perf
-# ## improvement, but it only works when you set difficulty_type="percentile" in
-# ## ds_config. If you use difficulty_type="value", you need to change this to
-# ## *_index_to_sample
-# # cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged"
-# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample"
-# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="value"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=600
-# cl_1st_max=9069
-# cl_1st_total_step_in_million=0.96
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-
-# cl_2nd_metric="seqlen_truncate"
-# cl_2nd_index_to_sample_path="dummy"
-# cl_2nd_index_to_metric_path="dummy"
-# cl_2nd_difficulty_type="value"
-# cl_2nd_clustering_type="single_cluster"
-# cl_2nd_min=128
-# cl_2nd_max=512
-# cl_2nd_total_step_in_million=0.96
-# cl_2nd_difficulty_step=8
-# cl_2nd_root=1
-
-cl_1st_total_step=$(calc $cl_1st_total_step_in_million*1000000)
-cl_2nd_total_step=$(calc $cl_2nd_total_step_in_million*1000000)
-###############################################################################
-### Misc configs
-log_interval=100
-eval_iters=10
-eval_interval=1000
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
-save_interval=$((${estimated_train_iter} / ${num_save}))
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-# activation_checkpoint="true"
-activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d_%H.%M.%S")
-host="${HOSTNAME}"
-seed=1234
-## Number of workers for dataloader. We found that for BERT pre-training,
-## num_workers will greatly affect data loading time and overall training
-## time. In our experiment with 64 GPUs, the performance reaches peak at
-## num_workers = 4 but it may differ depending on hardware. Also note that
-## larger num_workers add more CPU computation/memory overhead.
-num_workers=4
-
-## Public the Pile dataset, see ../pile_data_download_preprocess.py about how
-## to download and preprocess the data. Change data_home to where you store the
-## pile_bert_train_text_sentence.bin and pile_bert_train_text_sentence.idx.
-data_home="/vc_data_blob/users/conglli/the_pile_bert"
-if [[ "$host" == *"webxt"* ]]; then
-    data_home="/blob/data/the_pile_bert"
-fi
-data_path="${data_home}/pile_bert_train_text_sentence"
-## train_idx_path forces Megatron to use a specific data index file generated
-## when we analyze data. This is needed because our index for curriculum
-## learning difficulty metric is based on this data index.
-train_idx_path="${data_home}/pile_bert_train_text_sentence_train_indexmap_exact5ep_509msl_0.10ssp_1234s.npy"
-
-vocab_path="bert-large-uncased-vocab.txt"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt
-fi
-
-prescale_grad="true"
-jobname="bert_${model_size}B_tok${train_tokens_in_billion}B"
-jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_iters}_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
-jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
-if [[ $zero_stage -gt 0 ]]; then
-    jobname="${jobname}_z${zero_stage}"
-    prescale_grad="false"
-fi
-if [[ $mp_size -gt 1 ]]; then
-    jobname="${jobname}_mp${mp_size}"
-fi
-if [ "${no_pp}" = "false" ]; then
-    jobname="${jobname}_pp${pp_size}"
-fi
-jobname="${jobname}_seed${seed}"
-if [ "${ltd_enabled}" = "true" ]; then
-    jobname="${jobname}_ltd_${ltd_start}_${ltd_step_in_million}M_drop${dropout}"
-fi
-if [ "${cl_enabled}" = "true" ]; then
-    jobname="${jobname}_cl_${cl_1st_metric}_${cl_1st_min}_${cl_1st_max}_${cl_1st_total_step_in_million}M_${cl_1st_root}"
-    if [[ $cl_num_metric -gt 1 ]]; then
-        jobname="${jobname}_${cl_2nd_metric}_${cl_2nd_min}_${cl_2nd_max}_${cl_2nd_total_step_in_million}M_${cl_2nd_root}"
-    fi
-fi
-
-username=$(whoami)
-output_home="/blob/users/${username}/project/data_efficient_bert"
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-## Microsoft internal constraint: because tensorboard is logged by last rank,
-## it's better to put the path in NFS instead of Blob.
-tensorboard_dir="/vc_data/users/${username}/project/data_efficient_bert/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-if [ "${cl_enabled}" = "true" ]; then
-    data_cluster_path="${output_home}/data_cluster/${jobname}"
-    mkdir -p ${data_cluster_path}
-fi
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-## If CL is used, make sure to set "--split" the same as what you used during
-## offline data analysis&indexing.
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.999 \
-    --tensor-model-parallel-size ${mp_size} \
-    --init-method-std ${init_std} \
-    --lr-decay-tokens ${lr_decay_tokens} \
-    --lr-warmup-iters ${lr_warmup_iters} \
-    --micro-batch-size ${batch_size} \
-    --exit-duration-in-mins ${exit_duration} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-tokens ${train_tokens} \
-    --train-iters ${train_iters} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 1e-2 \
-    --clip-grad 1.0 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --seed ${seed} \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --checkpoint-activations"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-if [ "${ltd_enabled}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --attention-dropout ${dropout} \
-    --hidden-dropout ${dropout} \
-    --random-ltd"
-fi
-
-if [ "${cl_enabled}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --train-idx-path ${train_idx_path} \
-    --data-efficiency-curriculum-learning"
-fi
-
-config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}_seed${seed}"
-if [ "${ltd_enabled}" = "true" ]; then
-    config_json="${config_json}_ltd_${ltd_start}_${ltd_step}"
-fi
-if [ "${cl_enabled}" = "true" ]; then
-    config_json="${config_json}_cl_${cl_1st_metric}_${cl_1st_min}_${cl_1st_max}_${cl_1st_total_step}_${cl_1st_root}"
-    if [[ $cl_num_metric -gt 1 ]]; then
-        config_json="${config_json}_${cl_2nd_metric}_${cl_2nd_min}_${cl_2nd_max}_${cl_2nd_total_step}_${cl_2nd_root}"
-    fi
-fi
-config_json="${config_json}.json"
-if [[ $cl_num_metric -gt 1 ]]; then
-template_json="ds_config_bert_2clmetrics_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-    | sed "s/DATA_EFFICIENCY_SEED/${seed}/" \
-    | sed "s/LTD_ENABLED/${ltd_enabled}/" \
-    | sed "s/LTD_MIN/${ltd_start}/" \
-    | sed "s/LTD_MAX/${seq_len}/" \
-    | sed "s/LTD_STEP/${ltd_step}/" \
-    | sed "s/CL_ENABLED/${cl_enabled}/" \
-    | sed "s/DATA_SAMPLING_NUM_WORKERS/${num_workers}/" \
-    | sed "s#CL_CLUSTER_PATH#${data_cluster_path}#" \
-    | sed "s#CL_1st_METRIC_NAME#${cl_1st_metric}#" \
-    | sed "s#CL_1st_SAMPLE_PATH#${cl_1st_index_to_sample_path}#" \
-    | sed "s#CL_1st_METRIC_PATH#${cl_1st_index_to_metric_path}#" \
-    | sed "s#CL_1st_DIFF_TYPE#${cl_1st_difficulty_type}#" \
-    | sed "s#CL_1st_CLUSTER_TYPE#${cl_1st_clustering_type}#" \
-    | sed "s/CL_1st_MIN/${cl_1st_min}/" \
-    | sed "s/CL_1st_MAX/${cl_1st_max}/" \
-    | sed "s/CL_1st_TOTAL_STEP/${cl_1st_total_step}/" \
-    | sed "s/CL_1st_DIFF_STEP/${cl_1st_difficulty_step}/" \
-    | sed "s/CL_1st_ROOT/${cl_1st_root}/" \
-    | sed "s#CL_2nd_METRIC_NAME#${cl_2nd_metric}#" \
-    | sed "s#CL_2nd_SAMPLE_PATH#${cl_2nd_index_to_sample_path}#" \
-    | sed "s#CL_2nd_METRIC_PATH#${cl_2nd_index_to_metric_path}#" \
-    | sed "s#CL_2nd_DIFF_TYPE#${cl_2nd_difficulty_type}#" \
-    | sed "s#CL_2nd_CLUSTER_TYPE#${cl_2nd_clustering_type}#" \
-    | sed "s/CL_2nd_MIN/${cl_2nd_min}/" \
-    | sed "s/CL_2nd_MAX/${cl_2nd_max}/" \
-    | sed "s/CL_2nd_TOTAL_STEP/${cl_2nd_total_step}/" \
-    | sed "s/CL_2nd_DIFF_STEP/${cl_2nd_difficulty_step}/" \
-    | sed "s/CL_2nd_ROOT/${cl_2nd_root}/" \
-      > ${config_json}
-else
-template_json="ds_config_bert_1clmetric_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-    | sed "s/DATA_EFFICIENCY_SEED/${seed}/" \
-    | sed "s/LTD_ENABLED/${ltd_enabled}/" \
-    | sed "s/LTD_MIN/${ltd_start}/" \
-    | sed "s/LTD_MAX/${seq_len}/" \
-    | sed "s/LTD_STEP/${ltd_step}/" \
-    | sed "s/CL_ENABLED/${cl_enabled}/" \
-    | sed "s/DATA_SAMPLING_NUM_WORKERS/${num_workers}/" \
-    | sed "s#CL_CLUSTER_PATH#${data_cluster_path}#" \
-    | sed "s#CL_1st_METRIC_NAME#${cl_1st_metric}#" \
-    | sed "s#CL_1st_SAMPLE_PATH#${cl_1st_index_to_sample_path}#" \
-    | sed "s#CL_1st_METRIC_PATH#${cl_1st_index_to_metric_path}#" \
-    | sed "s#CL_1st_DIFF_TYPE#${cl_1st_difficulty_type}#" \
-    | sed "s#CL_1st_CLUSTER_TYPE#${cl_1st_clustering_type}#" \
-    | sed "s/CL_1st_MIN/${cl_1st_min}/" \
-    | sed "s/CL_1st_MAX/${cl_1st_max}/" \
-    | sed "s/CL_1st_TOTAL_STEP/${cl_1st_total_step}/" \
-    | sed "s/CL_1st_DIFF_STEP/${cl_1st_difficulty_step}/" \
-    | sed "s/CL_1st_ROOT/${cl_1st_root}/" \
-      > ${config_json}
-fi
-
-deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-    --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-deepspeed ${dir}/../../../../pretrain_bert.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh
deleted file mode 100644
index c771a0e27726b4bfc23e3999f8b590a90c1f0699..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh
+++ /dev/null
@@ -1,363 +0,0 @@
-###############################################################################
-### Each block below is one pretraining setup. Uncomment one block to try.
-###############################################################################
-### Baseline cases, mostly based on Megatron-LM's BERT-Large hyperparameters,
-### but with some changes (different LR schedule).
-## Baseline 1049B tokens (100%):
-# lr=1e-4
-# train_iters_in_million=2
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million}
-###############################################################################
-## Baseline 703B tokens (67%):
-# lr=1.5e-4
-# train_iters_in_million=134e-2
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million}
-###############################################################################
-## Baseline 524B tokens (50%):
-# lr=2e-4
-# train_iters_in_million=1
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million}
-###############################################################################
-### Curriculum learning (CL) + Random layerwise token dropping (random-LTD).
-### DeepSpeed Data Efficiency's composed solution.
-### BERT pretraining.
-## CL+random-LTD 1049B tokens (100%):
-# lr=1e-4
-# train_iters_in_million=2
-# ltd_enabled="true"
-# ltd_start=128
-# ltd_step_in_million=2
-# dropout=1e-1
-# cl_enabled="true"
-# cl_num_metric=2
-# cl_1st_metric="voc"
-# cl_1st_index_to_sample_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged"
-# cl_1st_index_to_metric_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=5
-# cl_1st_max=100
-# cl_1st_total_step_in_million=96e-2
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-# cl_2nd_metric="seqlen_truncate"
-# cl_2nd_index_to_sample_path="dummy"
-# cl_2nd_index_to_metric_path="dummy"
-# cl_2nd_difficulty_type="value"
-# cl_2nd_clustering_type="single_cluster"
-# cl_2nd_min=128
-# cl_2nd_max=512
-# cl_2nd_total_step_in_million=96e-2
-# cl_2nd_difficulty_step=8
-# cl_2nd_root=1
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \
-#     ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \
-#     ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \
-#     ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \
-#     ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root}
-###############################################################################
-## CL+random-LTD 524B tokens (50%):
-# lr=2e-4
-# train_iters_in_million=1
-# ltd_enabled="true"
-# ltd_start=128
-# ltd_step_in_million=1
-# dropout=1e-1
-# cl_enabled="true"
-# cl_num_metric=2
-# cl_1st_metric="voc"
-# cl_1st_index_to_sample_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged"
-# cl_1st_index_to_metric_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=5
-# cl_1st_max=100
-# cl_1st_total_step_in_million=48e-2
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-# cl_2nd_metric="seqlen_truncate"
-# cl_2nd_index_to_sample_path="dummy"
-# cl_2nd_index_to_metric_path="dummy"
-# cl_2nd_difficulty_type="value"
-# cl_2nd_clustering_type="single_cluster"
-# cl_2nd_min=128
-# cl_2nd_max=512
-# cl_2nd_total_step_in_million=48e-2
-# cl_2nd_difficulty_step=8
-# cl_2nd_root=1
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \
-#     ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \
-#     ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \
-#     ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \
-#     ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root}
-###############################################################################
-### Random layerwise token dropping (random-LTD).
-## random-LTD 1049B tokens (100%):
-# lr=1e-4
-# train_iters_in_million=2
-# ltd_enabled="true"
-# ltd_start=128
-# ltd_step_in_million=2
-# dropout=1e-1
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \
-#     ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout}
-###############################################################################
-## random-LTD 703B tokens (67%):
-# lr=1.5e-4
-# train_iters_in_million=134e-2
-# ltd_enabled="true"
-# ltd_start=128
-# ltd_step_in_million=134e-2
-# dropout=1e-1
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \
-#     ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout}
-###############################################################################
-## random-LTD 524B tokens (50%):
-# lr=2e-4
-# train_iters_in_million=1
-# ltd_enabled="true"
-# ltd_start=128
-# ltd_step_in_million=1
-# dropout=1e-1
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \
-#     ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout}
-###############################################################################
-### Curriculum learning (CL).
-## CL vocab rarity + seqlen truncation 524B tokens (50%):
-# lr=2e-4
-# train_iters_in_million=1
-# ltd_enabled="false"
-# ltd_start=512
-# ltd_step_in_million=1
-# dropout=1e-1
-# cl_enabled="true"
-# cl_num_metric=2
-# cl_1st_metric="voc"
-# cl_1st_index_to_sample_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged"
-# cl_1st_index_to_metric_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=5
-# cl_1st_max=100
-# cl_1st_total_step_in_million=48e-2
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-# cl_2nd_metric="seqlen_truncate"
-# cl_2nd_index_to_sample_path="dummy"
-# cl_2nd_index_to_metric_path="dummy"
-# cl_2nd_difficulty_type="value"
-# cl_2nd_clustering_type="single_cluster"
-# cl_2nd_min=128
-# cl_2nd_max=512
-# cl_2nd_total_step_in_million=48e-2
-# cl_2nd_difficulty_step=8
-# cl_2nd_root=1
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \
-#     ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \
-#     ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \
-#     ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \
-#     ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root}
-###############################################################################
-## CL vocab rarity + seqlen truncation 703B tokens (67%):
-# lr=1.5e-4
-# train_iters_in_million=134e-2
-# ltd_enabled="false"
-# ltd_start=512
-# ltd_step_in_million=1
-# dropout=1e-1
-# cl_enabled="true"
-# cl_num_metric=2
-# cl_1st_metric="voc"
-# cl_1st_index_to_sample_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged"
-# cl_1st_index_to_metric_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=5
-# cl_1st_max=100
-# cl_1st_total_step_in_million=64e-2
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-# cl_2nd_metric="seqlen_truncate"
-# cl_2nd_index_to_sample_path="dummy"
-# cl_2nd_index_to_metric_path="dummy"
-# cl_2nd_difficulty_type="value"
-# cl_2nd_clustering_type="single_cluster"
-# cl_2nd_min=128
-# cl_2nd_max=512
-# cl_2nd_total_step_in_million=64e-2
-# cl_2nd_difficulty_step=8
-# cl_2nd_root=1
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \
-#     ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \
-#     ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \
-#     ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \
-#     ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root}
-###############################################################################
-## CL vocab rarity + seqlen truncation 1049B tokens (100%):
-# lr=1e-4
-# train_iters_in_million=2
-# ltd_enabled="false"
-# ltd_start=512
-# ltd_step_in_million=1
-# dropout=1e-1
-# cl_enabled="true"
-# cl_num_metric=2
-# cl_1st_metric="voc"
-# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample"
-# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=5
-# cl_1st_max=100
-# cl_1st_total_step_in_million=96e-2
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-# cl_2nd_metric="seqlen_truncate"
-# cl_2nd_index_to_sample_path="dummy"
-# cl_2nd_index_to_metric_path="dummy"
-# cl_2nd_difficulty_type="value"
-# cl_2nd_clustering_type="single_cluster"
-# cl_2nd_min=128
-# cl_2nd_max=512
-# cl_2nd_total_step_in_million=96e-2
-# cl_2nd_difficulty_step=8
-# cl_2nd_root=1
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \
-#     ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \
-#     ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \
-#     ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \
-#     ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root}
-###############################################################################
-## CL vocab rarity + seqlen reorder 1049B tokens (100%):
-# lr=1e-4
-# train_iters_in_million=2
-# ltd_enabled="false"
-# ltd_start=512
-# ltd_step_in_million=1
-# dropout=1e-1
-# cl_enabled="true"
-# cl_num_metric=1
-# cl_1st_metric="seqlenvocabrarity"
-# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/seqlen_vocab_rarity/seqlen_vocab_rarity_index_to_sample_percentile_merged"
-# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/seqlen_vocab_rarity/seqlen_vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=5
-# cl_1st_max=100
-# cl_1st_total_step_in_million=96e-2
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \
-#     ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root}
-###############################################################################
-## CL vocab rarity 1049B tokens (100%):
-# lr=1e-4
-# train_iters_in_million=2
-# ltd_enabled="false"
-# ltd_start=512
-# ltd_step_in_million=1
-# dropout=1e-1
-# cl_enabled="true"
-# cl_num_metric=1
-# cl_1st_metric="voc"
-# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample"
-# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=5
-# cl_1st_max=100
-# cl_1st_total_step_in_million=96e-2
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \
-#     ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root}
-###############################################################################
-## CL seqlen truncation 1049B tokens (100%):
-# lr=1e-4
-# train_iters_in_million=2
-# ltd_enabled="false"
-# ltd_start=512
-# ltd_step_in_million=1
-# dropout=1e-1
-# cl_enabled="true"
-# cl_num_metric=1
-# cl_1st_metric="seqlen_truncate"
-# cl_1st_index_to_sample_path="dummy"
-# cl_1st_index_to_metric_path="dummy"
-# cl_1st_difficulty_type="value"
-# cl_1st_clustering_type="single_cluster"
-# cl_1st_min=128
-# cl_1st_max=512
-# cl_1st_total_step_in_million=96e-2
-# cl_1st_difficulty_step=8
-# cl_1st_root=1
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \
-#     ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root}
-###############################################################################
-## CL seqlen reorder 1049B tokens (100%):
-# lr=1e-4
-# train_iters_in_million=2
-# ltd_enabled="false"
-# ltd_start=512
-# ltd_step_in_million=1
-# dropout=1e-1
-# cl_enabled="true"
-# cl_num_metric=1
-# cl_1st_metric="seqlen"
-# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/seqlen/seqlen_index_to_sample_percentile_merged"
-# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/seqlen/seqlen_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="single_cluster"
-# cl_1st_min=5
-# cl_1st_max=100
-# cl_1st_total_step_in_million=96e-2
-# cl_1st_difficulty_step=8
-# cl_1st_root=2
-# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \
-#     ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root}
-###############################################################################
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/ds_analyze_gpt_data_map.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/ds_analyze_gpt_data_map.sh
deleted file mode 100644
index 3b1caf06f3f6630fac9ce189b810f909ae54d62a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/ds_analyze_gpt_data_map.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/bash
-
-num_workers=1 # Num nodes to run the map job
-num_threads=40 # Num threads on each node. Set this based on #CPU cores
-
-# If different data epochs have slightly different data samples (e.g., due
-# to randomness), then you need to specify large enough num_epochs that cover
-# whole pretraining. If different data epochs are the same, set num_epochs to
-# 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency
-# library will automatically handle reshuffling when reaching another epoch.
-num_epochs=1
-
-# Which node is this node (start with 0 and end with num_workers-1). This
-# script only launch the map job on 1 worker node, since we don't expect
-# running on many nodes and workers don't need any communication. But you
-# can modify this script to add a MPI/torch distributed launcher.
-worker_id=$1
-save_path="/blob/users/conglli/data/analysis_pile_gpt_${num_epochs}epoch/"
-
-metric='total_vocab_freq'
-# metric='vocab_rarity' # this requires the result of total_vocab_freq
-
-seq_len=2048
-batch_size=10000
-
-jobname="gpt-pile-analyzing-${metric}-${num_epochs}epoch-map-worker${worker_id}"
-# Public the Pile dataset, can be downloaded at
-# https://mystic.the-eye.eu/public/AI/pile_neox/
-## Change data_home to your own training data path.
-# data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing"
-data_home="/blob/data/the_pile_public_merged_nopreprocessing"
-data_path="${data_home}/pile_text_document"
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-# Make sure the "--split" is the same as what you will use for pre-training.
-options=" \
-    --analyzing-task map \
-    --analyzing-data-type GPT \
-    --analyzing-metric ${metric} \
-    --analyzing-num-workers ${num_workers} \
-    --analyzing-worker-id ${worker_id} \
-    --analyzing-num-threads ${num_threads} \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap \
-    --tokenizer-type GPT2BPETokenizer \
-    --micro-batch-size ${batch_size} \
-    --global-batch-size ${batch_size} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --num-layers 1 \
-    --hidden-size 1 \
-    --num-attention-heads 1 \
-    --split 949,50,1 \
-    --distributed-backend gloo \
-    --train-data-exact-num-epochs ${num_epochs} \
-    --return-data-index \
-    --save-interval 1 \
-    --save ${save_path}"
-
-python ../analyze_data.py ${options} &> ${jobname}.log
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/ds_analyze_gpt_data_reduce.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/ds_analyze_gpt_data_reduce.sh
deleted file mode 100644
index a1242ea94d8f2ff80c6ec8db4416629f83007e3c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/ds_analyze_gpt_data_reduce.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/bin/bash
-
-# Set these 2 to the same as what you used during map job. We need these 2
-# configs to know how many map job result files do we have.
-num_workers=1
-num_threads=40
-# Reduce job only has 1 worker but can accelerate by multithreading.
-num_threads_reduce=40
-
-# If different data epochs have slightly different data samples (e.g., due
-# to randomness), then you need to specify large enough num_epochs that cover
-# whole pretraining. If different data epochs are the same, set num_epochs to
-# 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency
-# library will automatically handle reshuffling when reaching another epoch.
-num_epochs=1
-
-save_path="/blob/users/conglli/data/analysis_pile_gpt_${num_epochs}epoch/"
-
-metric='total_vocab_freq'
-# metric='vocab_rarity' # this requires the result of total_vocab_freq
-
-seq_len=2048
-batch_size=10000
-
-jobname="gpt-pile-analyzing-${metric}-${num_epochs}epoch-reduce"
-# Public the Pile dataset, can be downloaded at
-# https://mystic.the-eye.eu/public/AI/pile_neox/
-## Change data_home to your own training data path.
-# data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing"
-data_home="/blob/data/the_pile_public_merged_nopreprocessing"
-data_path="${data_home}/pile_text_document"
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-# Make sure the "--split" is the same as what you will use for pre-training.
-options=" \
-    --analyzing-task reduce \
-    --analyzing-data-type GPT \
-    --analyzing-metric ${metric} \
-    --analyzing-num-workers ${num_workers} \
-    --analyzing-num-threads ${num_threads} \
-    --analyzing-num-threads-reduce ${num_threads_reduce} \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap \
-    --tokenizer-type GPT2BPETokenizer \
-    --micro-batch-size ${batch_size} \
-    --global-batch-size ${batch_size} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --num-layers 1 \
-    --hidden-size 1 \
-    --num-attention-heads 1 \
-    --split 949,50,1 \
-    --distributed-backend gloo \
-    --train-data-exact-num-epochs ${num_epochs} \
-    --return-data-index \
-    --save-interval 1 \
-    --save ${save_path}"
-
-python ../analyze_data.py ${options} &> ${jobname}.log
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_config_eval_dummy.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_config_eval_dummy.json
deleted file mode 100644
index 72ffd2a7a0fd7cdb9e9bf3ec955f5d22e5f046bb..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_config_eval_dummy.json
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-"train_batch_size" : 2048,
-"train_micro_batch_size_per_gpu": 16,
-"steps_per_print": 10,
-
-"zero_optimization": {
-    "stage": 0
-},
-
-"gradient_clipping": 1.0,
-"prescale_gradients": true,
-
-"fp16": {
-    "enabled": false,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-},
-
-"bf16": {
-    "enabled": false
-},
-
-"wall_clock_breakdown" : false
-}
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_evalharness_1gpu.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_evalharness_1gpu.sh
deleted file mode 100644
index 32ade49172fbc4495eb247221eca2a60d4b94501..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_evalharness_1gpu.sh
+++ /dev/null
@@ -1,78 +0,0 @@
-## CAUTION: first read Megatron-DeepSpeed/blob/main/examples_deepspeed/MoE/readme_evalharness.md
-## and follow the steps of installation/data downloading.
-
-## Code below only works when you run each evalharness task on a single GPU.
-## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples_deepspeed/MoE/ds_evalharness.sh
-checkpoint_path=$1
-config_path=$2
-result_path=$3
-rank=$4
-tasks=$5
-hostname=$6
-master_port=$(( 12345 + ${rank} ))
-batch_size=$7
-num_fewshot=$8
-
-mp_size=1
-pp_size=1
-no_pp="true"
-ep_size=1
-
-vocab_file="gpt2-vocab.json"
-if [ ! -f "$vocab_file" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_file="gpt2-merges.txt"
-if [ ! -f "$merge_file" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-# export HF_DATASETS_OFFLINE=1
-
-dir2=$(dirname "$checkpoint_path")
-dirname=$(basename "$dir2")/$(basename "$checkpoint_path")
-result_path="${result_path}/${dirname}"
-mkdir -p $result_path
-result_file="${result_path}/${tasks}_${num_fewshot}shot.json"
-
-# Dummy arguments to make megatron happy. No need to configure them.
-# The reason we don't need to configure them and many other arguments is
-# because the eval framework will read the arguments from checkpoint file.
-megatron_required_args="\
-    --num-layers -1 \
-    --hidden-size -1 \
-    --num-attention-heads -1 \
-    --seq-length -1 \
-    --max-position-embeddings -1
-"
-
-command="../../../../tasks/eval_harness/evaluate.py \
-    --load ${checkpoint_path} \
-    --tensor-model-parallel-size ${mp_size} \
-    --pipeline-model-parallel-size ${pp_size} \
-    --moe-expert-parallel-size ${ep_size} \
-    --vocab-file ${vocab_file} \
-    --merge-file ${merge_file} \
-    --micro-batch-size ${batch_size} \
-    --no-load-optim \
-    --no-load-rng \
-    --inference \
-    --disable-moe-token-dropping \
-    --tokenizer-type GPT2BPETokenizer \
-    --adaptive_seq_len \
-    --eval_fp32 \
-    --num_fewshot ${num_fewshot} \
-    --task_list ${tasks} \
-    --results_path ${result_file} \
-    --deepspeed \
-    --deepspeed_config ${config_path} \
-    ${megatron_required_args} \
-    "
-
-if [[ "${no_pp}" = "true" ]]; then
-command="${command} \
-    --no-pipeline-parallel"
-fi
-
-launcher="deepspeed --include=$hostname:$rank --master_port=${master_port}"
-$launcher $command &> "${result_path}/${tasks}_${num_fewshot}shot.log"
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_evalharness_gather_result.py b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_evalharness_gather_result.py
deleted file mode 100644
index e0c0c332c3c28e5c35f3c37d30a2d5bf32dc55a0..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_evalharness_gather_result.py
+++ /dev/null
@@ -1,358 +0,0 @@
-import json
-import os
-import math
-from math import log10, floor
-import copy
-
-def mean(arr):
-    return sum(arr) / len(arr)
-
-
-def pop_stddev(arr):
-    mu = mean(arr)
-    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr))
-
-
-def sample_stddev(arr):
-    mu = mean(arr)
-    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
-
-
-def mean_stderr(arr):
-    return sample_stddev(arr) / math.sqrt(len(arr))
-
-
-def median(arr):
-    return arr[len(arr) // 2]
-
-metric_dict = {
-    "hellaswag":"acc_norm",
-    "lambada":"acc",
-    "triviaqa":"acc",
-    "webqs":"acc",
-    "winogrande":"acc",
-    "piqa":"acc_norm",
-    "arc_challenge":"acc_norm",
-    "arc_easy":"acc_norm",
-    "openbookqa":"acc_norm",
-    "race":"acc",
-    "boolq":"acc",
-    "cb":"acc",
-    "copa":"acc",
-    "rte":"acc",
-    "wic":"acc",
-    "wsc":"acc",
-    "multirc":"acc",
-    "record":"f1",
-    "anli_r1":"acc",
-    "anli_r2":"acc",
-    "anli_r3":"acc",
-    "wikitext":"word_perplexity",
-    "logiqa":"acc_norm",
-    "mathqa":"acc_norm",
-    "mc_taco":"f1",
-    "mrpc":"acc",
-    "prost":"acc_norm",
-    "pubmedqa":"acc",
-    "qnli":"acc",
-    "qqp":"acc",
-    "sciq":"acc_norm",
-    "sst":"acc",
-    "wnli":"acc"
-}
-
-official_dict = {
-    "hellaswag":["HellaSwag","acc"],
-    "lambada":["LAMBADA","acc"],
-    "triviaqa":["TriviaQA","acc"],
-    "webqs":["WebQs","acc"],
-    "winogrande":["Winogrande","acc"],
-    "piqa":["PIQA","acc"],
-    "arc_challenge":["ARC Challenge","acc"],
-    "arc_easy":["ARC Easy","acc"],
-    "openbookqa":["OpenBookQA","acc"],
-    "race":["RACE-h","acc"],
-    "boolq":["BoolQ","acc"],
-    "cb":["CB","acc"],
-    "copa":["Copa","acc"],
-    "rte":["RTE","acc"],
-    "wic":["WiC","acc"],
-    "wsc":["WSC","acc"],
-    "multirc":["MultiRC","acc"],
-    "record":["ReCoRD","f1"],
-    "anli_r1":["ANLI R1","acc"],
-    "anli_r2":["ANLI R2","acc"],
-    "anli_r3":["ANLI R3","acc"],
-    "wikitext":["WikiText-2","ppl"],
-    "logiqa":["LogiQA","acc"],
-    "mathqa":["MathQA","acc"],
-    "mc_taco":["MC-TACO","f1"],
-    "mrpc":["MRPC","acc"],
-    "prost":["PROST","acc"],
-    "pubmedqa":["PubMedQA","acc"],
-    "qnli":["QNLI","acc"],
-    "qqp":["QQP","acc"],
-    "sciq":["SciQ","acc"],
-    "sst":["SST-2","acc"],
-    "wnli":["WNLI","acc"]
-}
-
-# When comparing with gpt3 paper, the most trustful tasks are the hellaswag to
-# anli_r3, who have >= 1000 samples (less variation), and have <= 43% data
-# contamination in the paper.
-gpt3paper_zeroshoteval = {
-    "hellaswag":[33.7,43.6,51.0,54.7,62.8,67.4,70.9,78.9],
-    "lambada":[42.7,54.3,60.4,63.6,67.1,70.3,72.5,76.2],
-    "triviaqa":[4.15,7.61,14.0,19.7,31.3,38.7,41.8,64.3],
-    "webqs":[1.77,3.20,4.33,4.63,7.92,7.73,8.22,14.4],
-    "winogrande":[52.0,52.1,57.4,58.7,62.3,64.5,67.9,70.2],
-    "piqa":[64.6,70.2,72.9,75.1,75.6,78.0,78.5,81.0],
-    "arc_challenge":[26.6,29.5,31.8,35.5,38.0,41.4,43.7,51.4],
-    "arc_easy":[43.6,46.5,53.0,53.8,58.2,60.2,63.8,68.8],
-    "anli_r1":[33.4,34.2,33.4,33.4,34.2,32.3,33.2,34.6],
-    "anli_r2":[33.2,31.9,33.3,33.3,33.8,33.5,33.5,35.4],
-    "anli_r3":[33.6,34.0,33.8,33.4,35.3,34.8,34.4,34.5],
-    "openbookqa":[35.6,43.2,45.2,46.8,53.0,50.4,55.6,57.6],
-    "race":[35.2,37.9,40.1,40.9,42.4,44.1,44.6,45.5],
-    "boolq":[49.7,60.3,58.9,62.4,67.1,65.4,66.2,60.5],
-    "cb":[0.00,32.1,8.93,19.6,19.6,28.6,19.6,46.4],
-    "copa":[66.0,68.0,73.0,77.0,76.0,80.0,84.0,91.0],
-    "rte":[47.7,49.8,48.4,56.0,46.6,55.2,62.8,63.5],
-    "wic":[0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00],
-    "wsc":[59.6,56.7,65.4,61.5,66.3,60.6,64.4,65.4],
-    "multirc":[4.72,9.65,12.3,13.6,14.3,18.4,24.2,27.6],
-    "record":[71.9,79.2,82.8,85.2,87.3,89.5,90.4,91.0]
-}
-
-gpt3paper_fewshoteval = {
-    "hellaswag":[33.5,43.1,51.3,54.9,62.9,67.3,71.3,79.3],
-    "lambada":[22.0,40.4,63.2,57.0,78.1,79.1,81.3,86.4],
-    "triviaqa":[6.96,16.3,26.5,32.1,42.3,51.6,57.5,71.2],
-    "webqs":[5.46,12.6,15.9,19.6,24.8,27.7,33.5,41.5],
-    "winogrande":[51.3,52.6,57.5,59.1,62.6,67.4,70.0,77.7],
-    "piqa":[64.3,69.4,72.0,74.3,75.4,77.8,79.9,82.3],
-    "arc_challenge":[25.5,28.4,32.3,36.7,39.5,43.7,44.8,51.5],
-    "arc_easy":[42.7,51.0,58.1,59.1,62.1,65.8,69.1,70.1],
-    "anli_r1":[32.1,32.5,30.9,32.5,33.5,33.1,33.3,36.8],
-    "anli_r2":[35.7,33.8,32.1,31.4,32.6,33.3,32.6,34.0],
-    "anli_r3":[35.0,34.4,35.1,36.0,32.7,33.9,34.5,40.2],
-    "openbookqa":[37.0,43.6,48.0,50.6,55.6,55.2,60.8,65.4],
-    "race":[34.3,37.0,40.4,41.4,42.3,44.7,45.1,46.8],
-    "boolq":[43.1,60.6,62.0,64.1,70.3,70.0,70.2,77.5],
-    "cb":[42.9,58.9,53.6,69.6,67.9,60.7,66.1,82.1],
-    "copa":[67.0,64.0,72.0,77.0,83.0,83.0,86.0,92.0],
-    "rte":[52.3,48.4,46.9,50.9,56.3,49.5,60.6,72.9],
-    "wic":[49.8,55.0,53.0,53.0,51.6,53.1,51.1,55.3],
-    "wsc":[58.7,60.6,54.8,49.0,62.5,67.3,75.0,75.0],
-    "multirc":[6.09,11.8,16.8,20.8,24.7,23.8,25.0,32.5],
-    "record":[70.7,77.9,82.1,84.0,87.5,88.8,89.8,90.1]
-}
-
-gpt3paper_zeroshoteval_index = {
-    "125M":0, # Small
-    "350M":1, # Medium
-    "760M":2, # Large
-    "1.3B":3, # XL
-    "2.7B":4,
-    "6.7B":5,
-    "13B":6,
-    "175B":7
-}
-
-def round_sig(x, sig=3):
-    if x == 0:
-        return 0
-    return round(x, sig-int(floor(log10(abs(x))))-1)
-
-def generate_result_table(tab_header, configs, task_order, caption, avg_range,
-    avg_tag, avg_only=False, fontsize="\\footnotesize", find_best=False,
-    candidate_range=None, candidate_task=None, split_name_by_space=False,
-    print_stderr=False, few_shot=False):
-    # Gather results
-    result_list = []
-    for i in range(len(configs)):
-        result_dict = {}
-        eval_path = configs[i][-1]
-        if "paper" in configs[i][0]:
-            assert eval_path is None
-        if eval_path is None:
-            assert "paper" in configs[i][0]
-            assert configs[i][1] in gpt3paper_zeroshoteval_index, "the second element has to be the model size"
-            paper_result_idx = gpt3paper_zeroshoteval_index[configs[i][1]]
-            if few_shot:
-                for task in gpt3paper_fewshoteval:
-                    result_dict[task] = [gpt3paper_fewshoteval[task][paper_result_idx]]
-            else:
-                for task in gpt3paper_zeroshoteval:
-                    result_dict[task] = [gpt3paper_zeroshoteval[task][paper_result_idx]]
-        else:
-            for file in os.listdir(eval_path):
-                if file.endswith(".json"):
-                    result = json.load(open(eval_path+"/"+file, "r"))
-                    for task in result['results']:
-                        if task != "wikitext":
-                            result_dict[task] = [100.0*result['results'][task][metric_dict[task]]]
-                        else:
-                            result_dict[task] = [result['results'][task][metric_dict[task]]]
-        result_list.append(result_dict)
-    avg_list = []
-    for i in range(len(configs)):
-        average_results = []
-        for j in range(len(avg_range)):
-            results = []
-            for k in range(avg_range[j]+1):
-                if task_order[k] in result_list[i]:
-                    results.append(result_list[i][task_order[k]][0])
-            if len(results) > 0:
-                average_results.append(float(sum(results))/len(results))
-            else:
-                average_results.append(0)
-        avg_list.append(average_results)
-
-    if find_best:
-        best_avg_value = [0 for _ in range(len(avg_range))]
-        best_avg_idx = [0 for _ in range(len(avg_range))]
-        best_task_value = [0 for _ in range(len(candidate_task))]
-        best_task_idx = [0 for _ in range(len(candidate_task))]
-        for i in range(candidate_range, len(configs)):
-            for j in range(len(avg_range)):
-                if avg_list[i][j] > best_avg_value[j]:
-                    best_avg_value[j] = avg_list[i][j]
-                    best_avg_idx[j] = i
-            for j in range(len(candidate_task)):
-                if result_list[i][candidate_task[j]] > best_task_value[j]:
-                    best_task_value[j] = result_list[i][candidate_task[j]]
-                    best_task_idx[j] = i
-        # reorder configs, result_list, avg_list to only keep the best cases
-        new_configs = configs[:candidate_range]
-        new_result_list = result_list[:candidate_range]
-        new_avg_list = avg_list[:candidate_range]
-        for i in range(len(avg_range)):
-            selected_config = copy.deepcopy(configs[best_avg_idx[i]])
-            selected_config[0] = "({})Best Avg{}".format(len(new_configs),
-                avg_tag[i])
-            new_configs.append(selected_config)
-            new_result_list.append(result_list[best_avg_idx[i]])
-            new_avg_list.append(avg_list[best_avg_idx[i]])
-
-        for i in range(len(candidate_task)):
-            selected_config = copy.deepcopy(configs[best_task_idx[i]])
-            selected_config[0] = "({})Best {}".format(len(new_configs),
-                official_dict[candidate_task[i]][0])
-            new_configs.append(selected_config)
-            new_result_list.append(result_list[best_task_idx[i]])
-            new_avg_list.append(avg_list[best_task_idx[i]])
-        configs = new_configs
-        result_list = new_result_list
-        avg_list = new_avg_list
-
-    # split the case names by space
-    if split_name_by_space:
-        max_num_row = 1
-        splitted_names = []
-        for i in range(len(configs)):
-            new_name = configs[i][0].split()
-            max_num_row = max(max_num_row, len(new_name))
-            splitted_names.append(new_name)
-        tab_header = ["" for _ in range(max_num_row-1)] + tab_header
-        for i in range(len(configs)):
-            padding = ["" for _ in range(max_num_row-len(splitted_names[i]))]
-            configs[i] = padding + splitted_names[i] + configs[i][1:]
-    
-    # generate the table
-    print("\\begin{table}")
-    print("\centering")
-    print(fontsize)
-    print("\caption{"+caption+"}")
-    text = "\\begin{tabular}{@{}l|"
-    for _ in range(len(configs)):
-        text += "c"
-    text += "@{}}"
-    print(text)
-    print("\\toprule")
-    for i in range(len(tab_header)):
-        text = "{} &".format(tab_header[i])
-        for j in range(len(configs)):
-            if j != len(configs) - 1:
-                text += (configs[j][i] + "& ")
-            else:
-                text += (configs[j][i] + "\\\\")
-        print(text)
-    print("\midrule")
-    for i in range(len(avg_range)):
-        text = ("Avg. " + avg_tag[i])
-        arr = []
-        for j in range(len(configs)):
-            arr.append(avg_list[j][i])
-            text += " & {}".format(round_sig(avg_list[j][i]))
-        text += "\\\\"
-        if print_stderr:
-            arr_mean = mean(arr)
-            arr_std = sample_stddev(arr)
-            text += " % mean {:.3f}, std {:.3f}, mean+1std {:.3f}, mean+2std {:.3f}, mean+3std {:.3f}".format(
-                arr_mean, arr_std, arr_mean+arr_std, arr_mean+arr_std*2, arr_mean+arr_std*3)
-        print(text)
-    if not avg_only:
-        print("\midrule")
-        for i in range(len(task_order)):
-            task = task_order[i]
-            text = "({}) {}".format(i, official_dict[task][0])
-            arr = []
-            for j in range(len(configs)):
-                result_dict = result_list[j]
-                if task in result_dict:
-                    text += " & {}".format(round_sig(result_dict[task][0]))
-                    arr.append(result_dict[task][0])
-                else:
-                    text += " & N/A"
-            text += "\\\\"
-            if print_stderr:
-                arr_mean = mean(arr)
-                arr_std = sample_stddev(arr)
-                if task != "wikitext":
-                    text += " % mean {:.3f}, std {:.3f}, mean+1std {:.3f}, mean+2std {:.3f}, mean+3std {:.3f}".format(
-                        arr_mean, arr_std, arr_mean+arr_std, arr_mean+arr_std*2, arr_mean+arr_std*3)
-                else:
-                    text += " % mean {:.3f}, std {:.3f}, mean-1std {:.3f}, mean-2std {:.3f}, mean-3std {:.3f}".format(
-                        arr_mean, arr_std, arr_mean-arr_std, arr_mean-arr_std*2, arr_mean-arr_std*3)
-            print(text)
-    print("\\bottomrule")
-    print("\end{tabular}")
-    print("\end{table}")
-    print("")
-    print("")
-
-if __name__ == '__main__':
-    task_order = ["hellaswag","lambada","triviaqa","webqs","winogrande","piqa",
-        "arc_challenge","arc_easy","anli_r1","anli_r2","anli_r3","openbookqa",
-        "race","boolq","copa","rte","wsc","multirc","record","wikitext"]
-    avg_range = [18]
-    avg_tag = ["0-18"]
-    tab_header = ["Case","Model size","Train tokens","Batch size","Bsz warmup","LR","min LR","LR warmup","LR decay","decay style"]
-
-    configs = [
-        ["(0)paper","125M","300B","256","4B","6e-4","6e-5","375M","260B","cosine", None], # gpt3 paper orig results, thus result path is None
-        ["(1)repro","125M","300B","256","4B","6e-4","6e-5","375M","260B","cosine",
-         '/blob/users/conglli/project/data_efficiency_gpt/eval_results/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/'],
-        ["(2)fixedBsz","125M","300B","256","N/A","6e-4","6e-5","3000M","260B","cosine",
-         '/blob/users/conglli/project/data_efficiency_gpt/eval_results/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/'],
-        ["(3)fixedBsz 300B+minLR","125M","300B","256","N/A","6e-4","1e-6","3000M","300B","cosine",
-         '/blob/users/conglli/project/data_efficiency_gpt/eval_results/gpt-pile-0.125B-tok300B-lr6.0e-4-min1.0e-6-wup3000M-dcy300B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/']
-    ]
-    caption = 'Conglong: GPT-3 125M results zero-shot'
-    generate_result_table(tab_header, configs, task_order, caption, avg_range,
-        avg_tag, split_name_by_space=True, fontsize="\\tiny")
-
-    configs = [
-        ["(0)paper","125M","300B","256","4B","6e-4","6e-5","375M","260B","cosine", None], # gpt3 paper orig results, thus result path is None
-        ["(1)repro","125M","300B","256","4B","6e-4","6e-5","375M","260B","cosine",
-         '/blob/users/conglli/project/data_efficiency_gpt/eval_results_fewshot/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/'],
-        ["(2)fixedBsz","125M","300B","256","N/A","6e-4","6e-5","3000M","260B","cosine",
-         '/blob/users/conglli/project/data_efficiency_gpt/eval_results_fewshot/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/'],
-        ["(3)fixedBsz 300B+minLR","125M","300B","256","N/A","6e-4","1e-6","3000M","300B","cosine",
-         '/blob/users/conglli/project/data_efficiency_gpt/eval_results_fewshot/gpt-pile-0.125B-tok300B-lr6.0e-4-min1.0e-6-wup3000M-dcy300B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/'],
-    ]
-    caption = 'Conglong: GPT-3 125M results few-shot'
-    generate_result_table(tab_header, configs, task_order, caption, avg_range,
-        avg_tag, split_name_by_space=True, fontsize="\\tiny", few_shot=True)
-
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_evalharness_parallel_run.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_evalharness_parallel_run.sh
deleted file mode 100644
index 2bfbec3a130e30cad95bc99ca7a53a4a650c7aaa..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_evalharness_parallel_run.sh
+++ /dev/null
@@ -1,67 +0,0 @@
-## CAUTION: first read Megatron-DeepSpeed/blob/main/examples_deepspeed/MoE/readme_evalharness.md
-## and follow the steps of installation/data downloading.
-checkpoint_paths=(
-    /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/
-    /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/
-)
-
-## No need to use the exact training config json, just use this dummy is fine
-config_path=ds_config_eval_dummy.json
-username=$(whoami)
-result_path="/blob/users/${username}/project/data_efficient_gpt/eval_results"
-
-## Task(s) on the same row will be performed together in the same process.
-## There exist other tasks that can run but we skip because they didn't appear
-## or have strange scores in GPT-3 paper: qqp, prost, cb, wic, mrpc, sst, wnli
-## pubmedqa, logiqa, qnli, sciq, mc_taco, mathqa. For wikitext, it didn't
-## appear in paper but we include it for a perplexity task.
-tasks=(
-    record
-    triviaqa
-    hellaswag
-    arc_challenge
-    arc_easy
-    race
-    multirc
-    openbookqa
-    lambada
-    webqs
-    winogrande
-    piqa
-    anli_r1,anli_r2,anli_r3
-    boolq,copa
-    rte,wsc
-    wikitext
-)
-
-## Use localhost if you didn't setup hostfile as described in
-## https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node.
-## If hostfile exist, use hostname (e.g., worker-0) in hostfile.
-# hostname="localhost"
-hostname="worker-0"
-
-batch_size=32
-
-## This script is for zero-shot
-num_fewshot=0
-
-num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-cuda_id=-1
-total_mem=$(nvidia-smi --query-gpu=memory.total --format=csv -i 0 | grep -Eo [0-9]+)
-total_mem=$(( ${total_mem}*99/100 )) # somehow there could exist tiny (4MB or so) gpu memory leak
-
-## Code below only works when you run each evalharness task on a single GPU.
-## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples_deepspeed/MoE/ds_evalharness.sh
-for l in "${!checkpoint_paths[@]}"; do 
-    checkpoint_path=${checkpoint_paths[l]}
-    for ((i=0;i<${#tasks[@]};++i)); do
-        task=${tasks[i]}
-        free_mem=0
-        while [ $free_mem -lt $total_mem ]; do
-            cuda_id=$(((cuda_id+1)%num_gpus))
-            free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $cuda_id | grep -Eo [0-9]+)
-            sleep 60s
-        done
-        bash ds_evalharness_1gpu.sh $checkpoint_path $config_path $result_path $cuda_id $task $hostname $batch_size $num_fewshot &
-    done
-done
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_evalharness_parallel_run_10shot.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_evalharness_parallel_run_10shot.sh
deleted file mode 100644
index 8e6406477060e08f73ce240704b57a6061623ae5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/eval/ds_evalharness_parallel_run_10shot.sh
+++ /dev/null
@@ -1,62 +0,0 @@
-## CAUTION: first read Megatron-DeepSpeed/blob/main/examples_deepspeed/MoE/readme_evalharness.md
-## and follow the steps of installation/data downloading.
-checkpoint_paths=(
-    /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/
-    /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/
-)
-
-## No need to use the exact training config json, just use this dummy is fine
-config_path=ds_config_eval_dummy.json
-username=$(whoami)
-result_path="/blob/users/${username}/project/data_efficient_gpt/eval_results_10shot"
-
-## Task(s) on the same row will be performed together in the same process.
-tasks=(
-    record
-    triviaqa
-    hellaswag
-    arc_challenge
-    arc_easy
-    race
-    multirc
-    openbookqa
-    lambada
-    webqs
-    winogrande
-    piqa
-    anli_r1,anli_r2
-    anli_r3
-    boolq,copa
-    rte,wsc
-)
-
-num_fewshot=10
-
-## Use localhost if you didn't setup hostfile as described in
-## https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node.
-## If hostfile exist, use hostname (e.g., worker-0) in hostfile.
-# hostname="localhost"
-hostname="worker-0"
-
-batch_size=16
-
-num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-cuda_id=-1
-total_mem=$(nvidia-smi --query-gpu=memory.total --format=csv -i 0 | grep -Eo [0-9]+)
-total_mem=$(( ${total_mem}*99/100 )) # somehow there could exist tiny (4MB or so) gpu memory leak
-
-## Code below only works when you run each evalharness task on a single GPU.
-## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples_deepspeed/MoE/ds_evalharness.sh
-for l in "${!checkpoint_paths[@]}"; do 
-    checkpoint_path=${checkpoint_paths[l]}
-    for ((i=0;i<${#tasks[@]};++i)); do
-        task=${tasks[i]}
-        free_mem=0
-        while [ $free_mem -lt $total_mem ]; do
-            cuda_id=$(((cuda_id+1)%num_gpus))
-            free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $cuda_id | grep -Eo [0-9]+)
-            sleep 60s
-        done
-        bash ds_evalharness_1gpu.sh $checkpoint_path $config_path $result_path $cuda_id $task $hostname $batch_size $num_fewshot &
-    done
-done
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json
deleted file mode 100644
index c542c7cf32397d1292ea6382a3ac7fc82667d41e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json
+++ /dev/null
@@ -1,73 +0,0 @@
-{
-  "train_batch_size": GBSIZE,
-  "train_micro_batch_size_per_gpu": MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "wall_clock_breakdown" : false,
-  "dataloader_drop_last": true,
-  "data_efficiency": {
-    "enabled": true,
-    "seed": DATA_EFFICIENCY_SEED,
-    "data_routing": {
-      "enabled": LTD_ENABLED,
-      "random_ltd":{
-        "enabled": LTD_ENABLED,
-        "total_layer_num": 24,
-        "random_ltd_layer_num": 22,
-        "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22],
-        "model_mask_name": "attention_mask",
-        "model_type": "decoder",
-        "hidden_state_order": "seq_batch_dim",
-        "random_ltd_schedule": {
-          "min_value": LTD_MIN,
-          "max_value": LTD_MAX,
-          "schedule_type":"fixed_linear",
-          "schedule_config": {
-            "require_steps": LTD_STEP,
-            "seq_per_step": 16
-          }
-        }
-      } 
-    },
-    "data_sampling": {
-      "enabled": CL_ENABLED,
-      "num_workers": DATA_SAMPLING_NUM_WORKERS,
-      "curriculum_learning": {
-        "enabled": CL_ENABLED,
-        "data_cluster_path": "CL_CLUSTER_PATH",
-        "curriculum_metrics": {
-          "CL_1st_METRIC_NAME": {
-            "index_to_sample_path": "CL_1st_SAMPLE_PATH",
-            "index_to_metric_path": "CL_1st_METRIC_PATH",
-            "difficulty_type": "CL_1st_DIFF_TYPE",
-            "clustering_type": "CL_1st_CLUSTER_TYPE",
-            "min_difficulty": CL_1st_MIN,
-            "max_difficulty": CL_1st_MAX,
-            "schedule_type": "fixed_root",
-            "schedule_config": {
-              "total_curriculum_step": CL_1st_TOTAL_STEP,
-              "difficulty_step": CL_1st_DIFF_STEP,
-              "root_degree": CL_1st_ROOT
-            }
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/pretrain/ds_config_gpt_2clmetrics_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/pretrain/ds_config_gpt_2clmetrics_TEMPLATE.json
deleted file mode 100644
index a556aa7af366c7bbdeea82d63b65d82345c24263..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/pretrain/ds_config_gpt_2clmetrics_TEMPLATE.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "train_batch_size": GBSIZE,
-  "train_micro_batch_size_per_gpu": MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "wall_clock_breakdown" : false,
-  "dataloader_drop_last": true,
-  "data_efficiency": {
-    "enabled": true,
-    "seed": DATA_EFFICIENCY_SEED,
-    "data_routing": {
-      "enabled": LTD_ENABLED,
-      "random_ltd":{
-        "enabled": LTD_ENABLED,
-        "total_layer_num": 24,
-        "random_ltd_layer_num": 22,
-        "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22],
-        "model_mask_name": "attention_mask",
-        "model_type": "decoder",
-        "hidden_state_order": "seq_batch_dim",
-        "random_ltd_schedule": {
-          "min_value": LTD_MIN,
-          "max_value": LTD_MAX,
-          "schedule_type":"fixed_linear",
-          "schedule_config": {
-            "require_steps": LTD_STEP,
-            "seq_per_step": 16
-          }
-        }
-      } 
-    },
-    "data_sampling": {
-      "enabled": CL_ENABLED,
-      "num_workers": DATA_SAMPLING_NUM_WORKERS,
-      "curriculum_learning": {
-        "enabled": CL_ENABLED,
-        "data_cluster_path": "CL_CLUSTER_PATH",
-        "curriculum_metrics": {
-          "CL_1st_METRIC_NAME": {
-            "index_to_sample_path": "CL_1st_SAMPLE_PATH",
-            "index_to_metric_path": "CL_1st_METRIC_PATH",
-            "difficulty_type": "CL_1st_DIFF_TYPE",
-            "clustering_type": "CL_1st_CLUSTER_TYPE",
-            "min_difficulty": CL_1st_MIN,
-            "max_difficulty": CL_1st_MAX,
-            "schedule_type": "fixed_root",
-            "schedule_config": {
-              "total_curriculum_step": CL_1st_TOTAL_STEP,
-              "difficulty_step": CL_1st_DIFF_STEP,
-              "root_degree": CL_1st_ROOT
-            }
-          },
-          "CL_2nd_METRIC_NAME": {
-            "index_to_sample_path": "CL_2nd_SAMPLE_PATH",
-            "index_to_metric_path": "CL_2nd_METRIC_PATH",
-            "difficulty_type": "CL_2nd_DIFF_TYPE",
-            "clustering_type": "CL_2nd_CLUSTER_TYPE",
-            "min_difficulty": CL_2nd_MIN,
-            "max_difficulty": CL_2nd_MAX,
-            "schedule_type": "fixed_root",
-            "schedule_config": {
-              "total_curriculum_step": CL_2nd_TOTAL_STEP,
-              "difficulty_step": CL_2nd_DIFF_STEP,
-              "root_degree": CL_2nd_ROOT
-            }
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_base_script.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_base_script.sh
deleted file mode 100644
index fe2144c6d678cfdd1d008c209a87b640797cfe8e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_base_script.sh
+++ /dev/null
@@ -1,515 +0,0 @@
-#!/bin/bash
-dir=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-seq_len=2048
-
-## The "GPT-3 XXX" below are configs from GPT-3 paper
-## https://arxiv.org/abs/2005.14165, choose based on
-## your desired model size or build your own configs
-
-## init_std is standard deviation for weight initialization. Usually larger
-## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size)
-## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-
-## We changed min_lr to a lower number (1.0e-6), which we found is able to
-## provide better zero-shot eval results.
-
-## GPT-3 Small 125M
-# model_size=0.125
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=256
-# lr=6.0e-4
-# min_lr=1.0e-6
-# init_std=0.02
-
-## GPT-3 Medium 350M
-# model_size=0.35
-# num_layers=24
-# hidden_size=1024
-# num_attn_heads=16
-# global_batch_size=256
-# lr=3.0e-4
-# min_lr=1.0e-6
-# init_std=0.018
-
-## GPT-3 Large 760M
-# model_size=0.76
-# num_layers=24
-# hidden_size=1536
-# num_attn_heads=16
-# global_batch_size=256
-# lr=2.5e-4
-# min_lr=1.0e-6
-# init_std=0.015
-
-## GPT-3 XL 1.3B
-model_size=1.3
-num_layers=24
-hidden_size=2048
-num_attn_heads=16
-global_batch_size=512
-# lr=2.0e-4
-lr=$1
-min_lr=1.0e-6
-init_std=0.013
-
-## GPT-3 2.7B
-# model_size=2.7
-# num_layers=32
-# hidden_size=2560
-# num_attn_heads=32
-# global_batch_size=512
-# lr=1.6e-4
-# min_lr=1.0e-6
-# init_std=0.011
-
-## GPT-3 6.7B
-# model_size=6.7
-# num_layers=32
-# hidden_size=4096
-# num_attn_heads=32
-# global_batch_size=1024
-# lr=1.2e-4
-# min_lr=1.0e-6
-# init_std=0.009
-
-## GPT-3 13B
-# model_size=13
-# num_layers=40
-# hidden_size=5120
-# num_attn_heads=40
-# global_batch_size=1024
-# lr=1.0e-4
-# min_lr=1.0e-6
-# init_std=0.008
-
-## GPT-3 175B
-# model_size=175
-# num_layers=96
-# hidden_size=12288
-# num_attn_heads=96
-# global_batch_size=1536
-# lr=0.6e-4
-# min_lr=1.0e-6
-# init_std=0.005
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens.
-# train_tokens_in_billion=300
-train_tokens_in_billion=$2
-train_tokens=$((${train_tokens_in_billion} * 1000000000))
-
-## train_samples is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the train_tokens
-## above, and data efficiency techniques may change num tokens in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by train_samples.
-train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
-
-## Another wall-clock time termination condition in minutes. Set it large
-## enough to avoid undesired early termination.
-exit_duration=30000000
-###############################################################################
-### lr configs
-## lr warmup and decay duration.
-## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
-## Here we increase the warmup tokens to 3B since when batch size warmup is not
-## used, there are more tokens per step. Thus we need to increase warmup tokens
-## to make sure there are enough warmup steps, which is important for training
-## stability.
-lr_warmup_tokens_in_million=3000
-lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
-## Here we changed the LR decay tokens to align with total train tokens, since
-## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
-## learning rate schedule to match the number of training tokens results in the
-## best final model quality 
-lr_decay_tokens_in_billion=${train_tokens_in_billion}
-lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
-lr_decay_style="cosine"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=1
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Note that currently both curriculum learning and random-LTD are NOT
-## compatible with pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO-based data parallelism, stage=0 will disable ZeRO
-zero_stage=1
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Reduce it manually if GPU OOM
-batch_size=$(( ${global_batch_size} / ${dp_size} ))
-###############################################################################
-### Random layerwise token dropping (random-LTD) configs
-## random-LTD's main switch. "false" means disabled. "true" means enabled.
-ltd_enabled=${3:-'false'}
-## How much dropping ratio to start with. The value denotes the seqlen after
-## dropping.
-ltd_start=${4:-2048}
-## How many steps for random-LTD to gradually reduce dropping ratio to zero.
-ltd_step=${5:-1}
-
-# ltd_enabled="true"
-# ltd_start=128
-# ltd_step=200000
-###############################################################################
-### Curriculum learning (CL) configs
-## CL's main switch. "false" means disabled. "true" means enabled.
-cl_enabled=${6:-'false'}
-## Number of CL metrics to use.
-cl_num_metric=${7:-1}
-
-## Name of difficulty metric
-cl_1st_metric=${8:-'dummy'}
-## Path to the data indexes for this difficulty metric. Samples on ith row of
-## index_to_sample have the difficulty value equals to ith row of
-## index_to_metric.
-cl_1st_index_to_sample_path=${9:-'dummy'}
-cl_1st_index_to_metric_path=${10:-'dummy'}
-## During training, whether increase difficulty by value- or percentile-based.
-cl_1st_difficulty_type=${11:-'value'}
-## "single_cluster" means no clustering required and probably CL is achieved by
-## data postprocessing. "schedule_based" means will cluster data based on the
-## difficulty schedule (pacing function) below.
-cl_1st_clustering_type=${12:-'single_cluster'}
-## Start difficulty
-cl_1st_min=${13:-2048}
-## End difficulty
-cl_1st_max=${14:-2048}
-## Total step to reach end difficulty
-cl_1st_total_step=${15:-1}
-## When changing difficulty, always make sure it's a multiple of the
-## difficulty_step below.
-cl_1st_difficulty_step=${16:-1}
-## Root degree of the schedule (pacing function).
-cl_1st_root=${17:-1}
-
-cl_2nd_metric=${18:-'dummy'}
-cl_2nd_index_to_sample_path=${19:-'dummy'}
-cl_2nd_index_to_metric_path=${20:-'dummy'}
-cl_2nd_difficulty_type=${21:-'value'}
-cl_2nd_clustering_type=${22:-'single_cluster'}
-cl_2nd_min=${23:-2048}
-cl_2nd_max=${24:-2048}
-cl_2nd_total_step=${25:-1}
-cl_2nd_difficulty_step=${26:-1}
-cl_2nd_root=${27:-1}
-
-# cl_enabled="true"
-# cl_num_metric=2
-# cl_1st_metric="voc"
-# ## The *_index_to_sample_percentile_merged is a concatenated index for perf
-# ## improvement, but it only works when you set difficulty_type="percentile" in
-# ## ds_config. If you use difficulty_type="value", you need to change this to
-# ## *_index_to_sample
-# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged"
-# # cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample"
-# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=1
-# cl_1st_max=100
-# cl_1st_total_step=110000
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-
-# cl_2nd_metric="seqlen_truncate"
-# cl_2nd_index_to_sample_path="dummy"
-# cl_2nd_index_to_metric_path="dummy"
-# cl_2nd_difficulty_type="value"
-# cl_2nd_clustering_type="single_cluster"
-# cl_2nd_min=80
-# cl_2nd_max=2048
-# cl_2nd_total_step=110000
-# cl_2nd_difficulty_step=8
-# cl_2nd_root=1
-###############################################################################
-### Misc configs
-log_interval=100
-eval_iters=10
-eval_interval=100
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
-save_interval=$((${estimated_train_iter} / ${num_save}))
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-activation_checkpoint="true"
-# activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d_%H.%M.%S")
-host="${HOSTNAME}"
-seed=1234
-num_workers=0
-
-## Public the Pile dataset, can be downloaded at
-## https://mystic.the-eye.eu/public/AI/pile_neox/ Change data_home to where you
-## store the pile_text_document.bin and pile_text_document.idx.
-data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing"
-if [[ "$host" == *"webxt"* ]]; then
-    data_home="/blob/data/the_pile_public_merged_nopreprocessing"
-fi
-data_path="${data_home}/pile_text_document"
-## *_idx_path force Megatron to use a specific data index file generated when
-## we analyze data. This is needed because our index for curriculum learning
-## difficulty metric is based on this data index.
-doc_idx_path="${data_home}/pile_text_document_train_indexmap_exact1ep_2048sl_1234s_doc_idx.npy"
-sample_idx_path="${data_home}/pile_text_document_train_indexmap_exact1ep_2048sl_1234s_sample_idx.npy"
-shuffle_idx_path="${data_home}/pile_text_document_train_indexmap_exact1ep_2048sl_1234s_shuffle_idx.npy"
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-prescale_grad="true"
-jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
-jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
-jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
-if [[ $zero_stage -gt 0 ]]; then
-    jobname="${jobname}_z${zero_stage}"
-    prescale_grad="false"
-fi
-if [[ $mp_size -gt 1 ]]; then
-    jobname="${jobname}_mp${mp_size}"
-fi
-if [ "${no_pp}" = "false" ]; then
-    jobname="${jobname}_pp${pp_size}"
-fi
-jobname="${jobname}_seed${seed}"
-if [ "${ltd_enabled}" = "true" ]; then
-    jobname="${jobname}_ltd_${ltd_start}_${ltd_step}"
-fi
-if [ "${cl_enabled}" = "true" ]; then
-    jobname="${jobname}_cl_${cl_1st_metric}_${cl_1st_min}_${cl_1st_max}_${cl_1st_total_step}_${cl_1st_root}"
-    if [[ $cl_num_metric -gt 1 ]]; then
-        jobname="${jobname}_${cl_2nd_metric}_${cl_2nd_min}_${cl_2nd_max}_${cl_2nd_total_step}_${cl_2nd_root}"
-    fi
-fi
-
-username=$(whoami)
-output_home="/blob/users/${username}/project/data_efficient_gpt"
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-## Microsoft internal constraint: because tensorboard is logged by last rank,
-## it's better to put the path in NFS instead of Blob.
-tensorboard_dir="/vc_data/users/${username}/project/data_efficient_gpt/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-if [ "${cl_enabled}" = "true" ]; then
-    data_cluster_path="${output_home}/data_cluster/${jobname}"
-    mkdir -p ${data_cluster_path}
-fi
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-## If CL is used, make sure to set "--split" the same as what you used during
-## offline data analysis&indexing.
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --tensor-model-parallel-size ${mp_size} \
-    --init-method-std ${init_std} \
-    --lr-decay-tokens ${lr_decay_tokens} \
-    --lr-warmup-tokens ${lr_warmup_tokens} \
-    --micro-batch-size ${batch_size} \
-    --exit-duration-in-mins ${exit_duration} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-tokens ${train_tokens} \
-    --train-samples ${train_samples} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 0.1 \
-    --clip-grad 1.0 \
-    --hysteresis 2 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --seed ${seed} \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --checkpoint-activations"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-if [ "${ltd_enabled}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --random-ltd"
-fi
-
-if [ "${cl_enabled}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --train-doc-idx-path ${doc_idx_path} \
-    --train-sample-idx-path ${sample_idx_path} \
-    --train-shuffle-idx-path ${shuffle_idx_path} \
-    --data-efficiency-curriculum-learning"
-fi
-
-config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}_seed${seed}"
-if [ "${ltd_enabled}" = "true" ]; then
-    config_json="${config_json}_ltd_${ltd_start}_${ltd_step}"
-fi
-if [ "${cl_enabled}" = "true" ]; then
-    config_json="${config_json}_cl_${cl_1st_metric}_${cl_1st_min}_${cl_1st_max}_${cl_1st_total_step}_${cl_1st_root}"
-    if [[ $cl_num_metric -gt 1 ]]; then
-        config_json="${config_json}_${cl_2nd_metric}_${cl_2nd_min}_${cl_2nd_max}_${cl_2nd_total_step}_${cl_2nd_root}"
-    fi
-fi
-config_json="${config_json}.json"
-if [[ $cl_num_metric -gt 1 ]]; then
-template_json="ds_config_gpt_2clmetrics_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-    | sed "s/DATA_EFFICIENCY_SEED/${seed}/" \
-    | sed "s/LTD_ENABLED/${ltd_enabled}/" \
-    | sed "s/LTD_MIN/${ltd_start}/" \
-    | sed "s/LTD_MAX/${seq_len}/" \
-    | sed "s/LTD_STEP/${ltd_step}/" \
-    | sed "s/CL_ENABLED/${cl_enabled}/" \
-    | sed "s/DATA_SAMPLING_NUM_WORKERS/${num_workers}/" \
-    | sed "s#CL_CLUSTER_PATH#${data_cluster_path}#" \
-    | sed "s#CL_1st_METRIC_NAME#${cl_1st_metric}#" \
-    | sed "s#CL_1st_SAMPLE_PATH#${cl_1st_index_to_sample_path}#" \
-    | sed "s#CL_1st_METRIC_PATH#${cl_1st_index_to_metric_path}#" \
-    | sed "s#CL_1st_DIFF_TYPE#${cl_1st_difficulty_type}#" \
-    | sed "s#CL_1st_CLUSTER_TYPE#${cl_1st_clustering_type}#" \
-    | sed "s/CL_1st_MIN/${cl_1st_min}/" \
-    | sed "s/CL_1st_MAX/${cl_1st_max}/" \
-    | sed "s/CL_1st_TOTAL_STEP/${cl_1st_total_step}/" \
-    | sed "s/CL_1st_DIFF_STEP/${cl_1st_difficulty_step}/" \
-    | sed "s/CL_1st_ROOT/${cl_1st_root}/" \
-    | sed "s#CL_2nd_METRIC_NAME#${cl_2nd_metric}#" \
-    | sed "s#CL_2nd_SAMPLE_PATH#${cl_2nd_index_to_sample_path}#" \
-    | sed "s#CL_2nd_METRIC_PATH#${cl_2nd_index_to_metric_path}#" \
-    | sed "s#CL_2nd_DIFF_TYPE#${cl_2nd_difficulty_type}#" \
-    | sed "s#CL_2nd_CLUSTER_TYPE#${cl_2nd_clustering_type}#" \
-    | sed "s/CL_2nd_MIN/${cl_2nd_min}/" \
-    | sed "s/CL_2nd_MAX/${cl_2nd_max}/" \
-    | sed "s/CL_2nd_TOTAL_STEP/${cl_2nd_total_step}/" \
-    | sed "s/CL_2nd_DIFF_STEP/${cl_2nd_difficulty_step}/" \
-    | sed "s/CL_2nd_ROOT/${cl_2nd_root}/" \
-      > ${config_json}
-else
-template_json="ds_config_gpt_1clmetric_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-    | sed "s/DATA_EFFICIENCY_SEED/${seed}/" \
-    | sed "s/LTD_ENABLED/${ltd_enabled}/" \
-    | sed "s/LTD_MIN/${ltd_start}/" \
-    | sed "s/LTD_MAX/${seq_len}/" \
-    | sed "s/LTD_STEP/${ltd_step}/" \
-    | sed "s/CL_ENABLED/${cl_enabled}/" \
-    | sed "s/DATA_SAMPLING_NUM_WORKERS/${num_workers}/" \
-    | sed "s#CL_CLUSTER_PATH#${data_cluster_path}#" \
-    | sed "s#CL_1st_METRIC_NAME#${cl_1st_metric}#" \
-    | sed "s#CL_1st_SAMPLE_PATH#${cl_1st_index_to_sample_path}#" \
-    | sed "s#CL_1st_METRIC_PATH#${cl_1st_index_to_metric_path}#" \
-    | sed "s#CL_1st_DIFF_TYPE#${cl_1st_difficulty_type}#" \
-    | sed "s#CL_1st_CLUSTER_TYPE#${cl_1st_clustering_type}#" \
-    | sed "s/CL_1st_MIN/${cl_1st_min}/" \
-    | sed "s/CL_1st_MAX/${cl_1st_max}/" \
-    | sed "s/CL_1st_TOTAL_STEP/${cl_1st_total_step}/" \
-    | sed "s/CL_1st_DIFF_STEP/${cl_1st_difficulty_step}/" \
-    | sed "s/CL_1st_ROOT/${cl_1st_root}/" \
-      > ${config_json}
-fi
-
-deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-    --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-deepspeed ${dir}/../../../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh
deleted file mode 100644
index 8878c1792a9400173492b8a746936aed0e8eb8c6..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh
+++ /dev/null
@@ -1,366 +0,0 @@
-###############################################################################
-### Each block below is one pretraining setup. Uncomment one block to try.
-###############################################################################
-### Baseline cases, mostly based on OpenAI's GPT-3 hyperparameters, but with
-### some changes (without batch size warmup, and different LR schedule).
-## Baseline 300B tokens (100%):
-# lr=2.0e-4
-# train_tokens_in_billion=300
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion}
-###############################################################################
-## Baseline 200B tokens (67%):
-# lr=3.0e-4 # scaled based on train token reduction ratio
-# train_tokens_in_billion=200
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion}
-###############################################################################
-## Baseline 150B tokens (50%):
-# lr=4.0e-4
-# train_tokens_in_billion=150
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion}
-###############################################################################
-### Curriculum learning (CL) + Random layerwise token dropping (random-LTD).
-### DeepSpeed Data Efficiency's best composed solution.
-## CL+random-LTD 300B tokens (100%):
-# lr=2.0e-4
-# train_tokens_in_billion=300
-# ltd_enabled="true"
-# ltd_start=128
-# ltd_step=200000
-# cl_enabled="true"
-# cl_num_metric=2
-# cl_1st_metric="voc"
-# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged"
-# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=1
-# cl_1st_max=100
-# cl_1st_total_step=110000
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-# cl_2nd_metric="seqlen_truncate"
-# cl_2nd_index_to_sample_path="dummy"
-# cl_2nd_index_to_metric_path="dummy"
-# cl_2nd_difficulty_type="value"
-# cl_2nd_clustering_type="single_cluster"
-# cl_2nd_min=80
-# cl_2nd_max=2048
-# cl_2nd_total_step=110000
-# cl_2nd_difficulty_step=8
-# cl_2nd_root=1
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \
-#     ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \
-#     ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \
-#     ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root}
-###############################################################################
-## CL+random-LTD 150B tokens (50%):
-# lr=4.0e-4
-# train_tokens_in_billion=150
-# ltd_enabled="true"
-# ltd_start=128
-# ltd_step=100000
-# cl_enabled="true"
-# cl_num_metric=2
-# cl_1st_metric="voc"
-# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged"
-# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=1
-# cl_1st_max=100
-# cl_1st_total_step=55000
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-# cl_2nd_metric="seqlen_truncate"
-# cl_2nd_index_to_sample_path="dummy"
-# cl_2nd_index_to_metric_path="dummy"
-# cl_2nd_difficulty_type="value"
-# cl_2nd_clustering_type="single_cluster"
-# cl_2nd_min=80
-# cl_2nd_max=2048
-# cl_2nd_total_step=55000
-# cl_2nd_difficulty_step=8
-# cl_2nd_root=1
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \
-#     ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \
-#     ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \
-#     ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root}
-###############################################################################
-### Random layerwise token dropping (random-LTD).
-## random-LTD 300B tokens (100%):
-# lr=2.0e-4
-# train_tokens_in_billion=300
-# ltd_enabled="true"
-# ltd_start=128
-# ltd_step=200000
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step}
-###############################################################################
-## random-LTD 200B tokens (67%):
-# lr=3.0e-4
-# train_tokens_in_billion=200
-# ltd_enabled="true"
-# ltd_start=128
-# ltd_step=133333
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step}
-###############################################################################
-## random-LTD 150B tokens (50%):
-# lr=4.0e-4
-# train_tokens_in_billion=150
-# ltd_enabled="true"
-# ltd_start=128
-# ltd_step=100000
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step}
-###############################################################################
-### Curriculum learning (CL).
-## CL vocab rarity + seqlen truncation 300B tokens (100%):
-# lr=2.0e-4
-# train_tokens_in_billion=300
-# ltd_enabled="false"
-# ltd_start=2048
-# ltd_step=1
-# cl_enabled="true"
-# cl_num_metric=2
-# cl_1st_metric="voc"
-# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged"
-# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=1
-# cl_1st_max=100
-# cl_1st_total_step=110000
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-# cl_2nd_metric="seqlen_truncate"
-# cl_2nd_index_to_sample_path="dummy"
-# cl_2nd_index_to_metric_path="dummy"
-# cl_2nd_difficulty_type="value"
-# cl_2nd_clustering_type="single_cluster"
-# cl_2nd_min=80
-# cl_2nd_max=2048
-# cl_2nd_total_step=110000
-# cl_2nd_difficulty_step=8
-# cl_2nd_root=1
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \
-#     ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \
-#     ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \
-#     ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root}
-###############################################################################
-## CL vocab rarity + seqlen truncation 200B tokens (67%):
-# lr=3.0e-4
-# train_tokens_in_billion=200
-# ltd_enabled="false"
-# ltd_start=2048
-# ltd_step=1
-# cl_enabled="true"
-# cl_num_metric=2
-# cl_1st_metric="voc"
-# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged"
-# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=1
-# cl_1st_max=100
-# cl_1st_total_step=73000
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-# cl_2nd_metric="seqlen_truncate"
-# cl_2nd_index_to_sample_path="dummy"
-# cl_2nd_index_to_metric_path="dummy"
-# cl_2nd_difficulty_type="value"
-# cl_2nd_clustering_type="single_cluster"
-# cl_2nd_min=80
-# cl_2nd_max=2048
-# cl_2nd_total_step=73000
-# cl_2nd_difficulty_step=8
-# cl_2nd_root=1
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \
-#     ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \
-#     ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \
-#     ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root}
-###############################################################################
-## CL vocab rarity + seqlen truncation 150B tokens (50%):
-# lr=4.0e-4
-# train_tokens_in_billion=150
-# ltd_enabled="false"
-# ltd_start=2048
-# ltd_step=1
-# cl_enabled="true"
-# cl_num_metric=2
-# cl_1st_metric="voc"
-# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged"
-# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=1
-# cl_1st_max=100
-# cl_1st_total_step=55000
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-# cl_2nd_metric="seqlen_truncate"
-# cl_2nd_index_to_sample_path="dummy"
-# cl_2nd_index_to_metric_path="dummy"
-# cl_2nd_difficulty_type="value"
-# cl_2nd_clustering_type="single_cluster"
-# cl_2nd_min=80
-# cl_2nd_max=2048
-# cl_2nd_total_step=55000
-# cl_2nd_difficulty_step=8
-# cl_2nd_root=1
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \
-#     ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \
-#     ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \
-#     ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root}
-###############################################################################
-## CL vocab rarity + seqlen reshape 300B tokens (100%):
-# lr=2.0e-4
-# train_tokens_in_billion=300
-# ltd_enabled="false"
-# ltd_start=2048
-# ltd_step=1
-# cl_enabled="true"
-# cl_num_metric=2
-# cl_1st_metric="voc"
-# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged"
-# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=1
-# cl_1st_max=100
-# cl_1st_total_step=110000
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-# cl_2nd_metric="seqlen_reshape"
-# cl_2nd_index_to_sample_path="dummy"
-# cl_2nd_index_to_metric_path="dummy"
-# cl_2nd_difficulty_type="value"
-# cl_2nd_clustering_type="single_cluster"
-# cl_2nd_min=80
-# cl_2nd_max=2048
-# cl_2nd_total_step=110000
-# cl_2nd_difficulty_step=8
-# cl_2nd_root=1
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \
-#     ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \
-#     ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \
-#     ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root}
-###############################################################################
-## CL vocab rarity 300B tokens (100%):
-# lr=2.0e-4
-# train_tokens_in_billion=300
-# ltd_enabled="false"
-# ltd_start=2048
-# ltd_step=1
-# cl_enabled="true"
-# cl_num_metric=1
-# cl_1st_metric="voc"
-# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged"
-# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric"
-# cl_1st_difficulty_type="percentile"
-# cl_1st_clustering_type="schedule_based"
-# cl_1st_min=1
-# cl_1st_max=100
-# cl_1st_total_step=110000
-# cl_1st_difficulty_step=1
-# cl_1st_root=2
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root}
-###############################################################################
-## CL seqlen truncation 300B tokens (100%):
-# lr=2.0e-4
-# train_tokens_in_billion=300
-# ltd_enabled="false"
-# ltd_start=2048
-# ltd_step=1
-# cl_enabled="true"
-# cl_num_metric=1
-# cl_1st_metric="seqlen_truncate"
-# cl_1st_index_to_sample_path="dummy"
-# cl_1st_index_to_metric_path="dummy"
-# cl_1st_difficulty_type="value"
-# cl_1st_clustering_type="single_cluster"
-# cl_1st_min=80
-# cl_1st_max=2048
-# cl_1st_total_step=110000
-# cl_1st_difficulty_step=8
-# cl_1st_root=1
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root}
-###############################################################################
-## CL seqlen reshape 300B tokens (100%):
-# lr=2.0e-4
-# train_tokens_in_billion=300
-# ltd_enabled="false"
-# ltd_start=2048
-# ltd_step=1
-# cl_enabled="true"
-# cl_num_metric=1
-# cl_1st_metric="seqlen_reshape"
-# cl_1st_index_to_sample_path="dummy"
-# cl_1st_index_to_metric_path="dummy"
-# cl_1st_difficulty_type="value"
-# cl_1st_clustering_type="single_cluster"
-# cl_1st_min=80
-# cl_1st_max=2048
-# cl_1st_total_step=110000
-# cl_1st_difficulty_step=8
-# cl_1st_root=1
-# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \
-#     ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \
-#     ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \
-#     ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \
-#     ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \
-#     ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \
-#     ${cl_1st_root}
-###############################################################################
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/README.md b/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/README.md
deleted file mode 100644
index 540763fdd125dff11cc026ca59711f0948ac725e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/README.md
+++ /dev/null
@@ -1,107 +0,0 @@
-# Megatron-DeepSpeed Rebase with Optimizations
-
-We rebased and enabled DeepSpeed with the latest Megatron repo. This folder contains examples that demonstrate how to use the new Megatron-DeepSpeed for training GPT like models with new features.
-
-## Rebasing Efforts/Achievements
-New features:
-- Enabled Megatron-LM's sequence parallel.
-- Enabled rotary positional embedding.
-- Enabled FlashAttention v1 and v2.
-- Enabled new fused kernels from NVIDIA.
-
-New optimizations:
-- Enabled attention map memory optimization, where we first generated attention mask on CPU memory and then moved it into GPU memory to avoid out-of-memory errors when training with very large sequence lengths.
-- Position embedding partitioning, where we split weights of position encoding across all GPUs when enabling sequence parallel to further reduce the memory footprint.
-
-Resolved Issues:
-- Fixed the conflicts related to activation checkpointing when DeepSpeed was used with the newest Megatron-LM. NVIDIA introduced new fine-grained partial checkpointing technique, which DeepSpeed was not compatible with. Support for fine-grained checkpointing will be left as future work.
-- Major refactoring to DeepSpeed pipeline parallelism implementation for GPT model in order to work with the newest Megatron-LM.
-- Fixed model checkpoint save/load when DeepSpeed was used with the newest Megatron-LM.
-- Fully verified the performance and correctness of GPT pretraining after rebasing.
-
-## Setting Up the Virtual Environment
-
-```shell
-# clone source code
-git clone https://github.com/microsoft/DeepSpeed.git
-git clone https://github.com/microsoft/Megatron-DeepSpeed.git
-git clone https://github.com/NVIDIA/apex
-
-# creat a new virtual environment
-cd Megatron-DeepSpeed
-python3 -m venv ./venvs/megatron-deepspeed --system-site-packages
-source ./venvs/megatron-deepspeed/bin/activate
-
-# install the newest DeepSpeed
-cd ../DeepSpeed/
-pip install -e .
-
-# install apex
-cd ../apex/
-pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" -e ./
-
-# install pybind11
-cd ../
-pip install pybind11
-```
-
-Megatron-DeepSpeed's sequence parallelism can be combined with the following types of attention.
-
-- Classic attention
-- FlashAttention version 1.x (enabled by `--use-flash-attn-v1`)
-- FlashAttention version 2.x (enabled by `--use-flash-attn-v2`)
-- FlashAttention + Triton (enabled by `--use-flash-attn-triton`)
-
-FlashAttention version 2.x may have numerical stability issues. For the best performance, we recommend using FlashAttention + Triton. 
-We show installation steps of thoes 3 types of FlashAttention
-
-```shell
-
-# install FlashAttention version 1.x
-pip install flash-attn==1.0.4
-
-# install FlashAttention version 2.x
-cd ../
-git clone https://github.com/Dao-AILab/flash-attention.git
-cd flash-attention
-python setup.py install
-
-# install Triton-based FlashAttention
-git clone -b legacy-backend https://github.com/openai/triton
-cd triton/python/
-pip install cmake
-pip install .
-
-cd ../
-git clone -b v1.0.4 https://github.com/HazyResearch/flash-attention
-cd flash-attention
-python setup.py install
-```
-
-## Example Showcase
-
-One of the optimizations enabled from this rebase is to enable Megatron-style long sequence parallelism. To enable sequence parallelism, add the `--sequence-parallel` flag in the training script. We provide two training scripts for ([GPT1.3B](pretrain_gpt_1.3B_seq_parallel.sh) and [GPT30B](pretrain_gpt_13B_seq_parallel.sh)) that enable sequence parallelism, which are available in this foloder.
-
-By default, the degree of sequence parallelism is equal to the degree of model tensor parallelism. The users may also want to ensure that the sequence length is divisible by the degree of sequence parallelism to avoid performance penalties. 
-Please also ensure that your model dimension is compliant with FlashAttention's requirements. For instance, to achieve the optimal performance, the head size should be divisible by 8. Refer to the document of [FlashAttention](https://github.com/Dao-AILab/flash-attention/tree/v1.0.4) for more details.
-
-## Performance Comparison between Old Megatron-DeepSpeed and New Megatron-DeepSpeed
-
-The following experiments are performed on 4 NVIDIA DGX A100-40GB nodes, connected through 8 HDR InfiniBand (200Gb/s per HDR). TP stands for tensor parallelism.
-
-| Sequence Length | Old Megatron-DeepSpeed  (TFLOPS) | New Megatron-DeepSpeed  (TFLOPS) |
-|-----------------|----------------------------------|----------------------------------|
-| 2k              | 25 (TP=32)                       | 68 (TP size=32)                  |
-| 4k              | 28 (TP=32)                       | 80 (TP size=32)                  |
-| 8k              | OoM                              | 86 (TP size=32)                  |
-| 16k             | OoM                              | 92 (TP size=32)                  |
-| 32k             | OoM                              | 100 (TP size=32)                 |
-| 64k             | OoM                              | 106 (TP size=32)                 |
-| 128k            | OoM                              | 119 (TP size=32)                 |
-| 256k            | OoM                              | 94 (TP size=32)                  |
-
-The new Megatron-DeepSpeed is able to support longer sequence lengths without triggering out-of-memory errors because it enables sequence parallelism, which partitions the activation memory when sequence lengths are massive. The new Megatron-DeepSpeed supports FlashAttention, which reduces the memory consumption of the attention map calculation from quadratic to linear complexity with respect to the sequence length. It supports position embedding partitioning, which further reduces the memory consumption. The new Megatron-DeepSpeed can achieve higher TFLPOS because it includes new fused kernels from NVIDIA and supports larger batch sizes using the memory optimizations without triggering out-of-memory errors.
-
-## Acknowledgements
-
-We would like to acknowledge the use of the supercomputing resources of the Argonne Leadership Computing Facility (ALCF), which is a DOE Office of Science User Facility supported under Contract DE-AC02-06CH11357.  The resources provided by ALCF(Argonne) have been invaluable in helping us to conduct this work and achieve our goals.
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/ds_config_gpt_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/ds_config_gpt_TEMPLATE.json
deleted file mode 100644
index 14290ec036bc8e106ed3e5dc5ca8b00400f20972..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/ds_config_gpt_TEMPLATE.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "train_batch_size": GBSIZE,
-  "train_micro_batch_size_per_gpu": MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "flops_profiler": {
-    "enabled": true,
-    "profile_step": 1,
-    "module_depth": -1,
-    "top_modules": 3,
-    "detailed": true,
-    "output_file": null
-  },
-
-  "wall_clock_breakdown" : false
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/host_file b/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/host_file
deleted file mode 100644
index 91fe1ab431a01e04dfd40fae6d51e50e6db5983f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/host_file
+++ /dev/null
@@ -1 +0,0 @@
-worker-1 slots=4
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/pretrain_gpt_1.3B_seq_parallel.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/pretrain_gpt_1.3B_seq_parallel.sh
deleted file mode 100644
index 410a047b11744a3bf733e4b7aa569d3e65428364..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/pretrain_gpt_1.3B_seq_parallel.sh
+++ /dev/null
@@ -1,349 +0,0 @@
-#!/bin/bash
-
-dir=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-n_k=2
-seq_len=$(( 1024 * $n_k ))
-
-## The "GPT-3 XXX" below are configs from GPT-3 paper
-## https://arxiv.org/abs/2005.14165, choose based on
-## your desired model size or build your own configs
-
-## init_std is standard deviation for weight initialization. Usually larger
-## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size)
-## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-
-## We changed min_lr to a lower number (1.0e-6), which we found is able to
-## provide better zero-shot eval results.
-
-## GPT-3 Small 125M
-# model_size=0.125
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=256
-# lr=6.0e-4
-# min_lr=1.0e-6
-# init_std=0.02
-
-## GPT-3 Medium 350M
-# model_size=0.35
-# num_layers=24
-# hidden_size=1024
-# num_attn_heads=16
-# global_batch_size=256
-# lr=3.0e-4
-# min_lr=1.0e-6
-# init_std=0.018
-
-## GPT-3 Large 760M
-# model_size=0.76
-# num_layers=24
-# hidden_size=1536
-# num_attn_heads=16
-# global_batch_size=256
-# lr=2.5e-4
-# min_lr=1.0e-6
-# init_std=0.015
-
-## GPT-3 XL 1.3B
-model_size=1.3
-num_layers=24
-hidden_size=2048
-num_attn_heads=16
-global_batch_size=2
-lr=2.0e-4
-min_lr=1.0e-6
-init_std=0.013
-
-## GPT-3 2.7B
-# model_size=2.7
-# num_layers=32
-# hidden_size=2560
-# num_attn_heads=32
-# global_batch_size=512
-# lr=1.6e-4
-# min_lr=1.0e-6
-# init_std=0.011
-
-## GPT-3 6.7B
-# model_size=6.7
-# num_layers=32
-# hidden_size=4096
-# num_attn_heads=32
-# global_batch_size=1024
-# lr=1.2e-4
-# min_lr=1.0e-6
-# init_std=0.009
-
-## GPT-3 13B
-# model_size=13
-# num_layers=40
-# hidden_size=5120
-# num_attn_heads=40
-# global_batch_size=1024
-# lr=1.0e-4
-# min_lr=1.0e-6
-# init_std=0.008
-
-## GPT-3 175B
-# model_size=175
-# num_layers=96
-# hidden_size=12288
-# num_attn_heads=96
-# global_batch_size=1536
-# lr=0.6e-4
-# min_lr=1.0e-6
-# init_std=0.005
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens.
-train_tokens_in_billion=300
-train_tokens=$((${train_tokens_in_billion} * 1000000000))
-
-## train_samples is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the train_tokens
-## above, and data efficiency techniques may change num tokens in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by train_samples.
-train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
-
-## Another wall-clock time termination condition in minutes. Set it large
-## enough to avoid undesired early termination.
-exit_duration=30000000
-###############################################################################
-### lr configs
-## lr warmup and decay duration.
-## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
-## Here we increase the warmup tokens to 3B since when batch size warmup is not
-## used, there are more tokens per step. Thus we need to increase warmup tokens
-## to make sure there are enough warmup steps, which is important for training
-## stability.
-lr_warmup_tokens_in_million=3000
-lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
-## Here we changed the LR decay tokens to align with total train tokens, since
-## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
-## learning rate schedule to match the number of training tokens results in the
-## best final model quality 
-lr_decay_tokens_in_billion=${train_tokens_in_billion}
-lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
-lr_decay_style="cosine"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=8
-
-## Sequence parallelism, 0 is no SP, 1 enable SP
-enable_sequence_parallel=1
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Note that currently both curriculum learning and random-LTD are NOT
-## compatible with pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO-based data parallelism, stage=0 will disable ZeRO
-zero_stage=0
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Reduce it manually if GPU OOM
-# batch_size=$(( ${global_batch_size} / ${dp_size} ))
-batch_size=1
-
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=10
-eval_interval=100
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-# estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
-estimated_train_iter=6
-# save_interval=$((${estimated_train_iter} / ${num_save}))
-save_interval=100
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-activation_checkpoint="true"
-# activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d_%H.%M.%S")
-host="${HOSTNAME}"
-seed=1234
-num_workers=0
-
-data_path="BookCorpusDataset_text_document"
-if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
-fi
-if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
-fi
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-prescale_grad="true"
-jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
-jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
-jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
-if [[ $zero_stage -gt 0 ]]; then
-    jobname="${jobname}_z${zero_stage}"
-    prescale_grad="false"
-fi
-if [[ $mp_size -gt 1 ]]; then
-    jobname="${jobname}_mp${mp_size}"
-fi
-if [ "${no_pp}" = "false" ]; then
-    jobname="${jobname}_pp${pp_size}"
-fi
-jobname="${jobname}_seed${seed}_rebase"
-
-username=$(whoami)
-output_home="output"
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-tensorboard_dir="${output_home}/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-## If CL is used, make sure to set "--split" the same as what you used during
-## offline data analysis&indexing.
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --tensor-model-parallel-size ${mp_size} \
-    --pipeline-model-parallel-size ${pp_size} \
-    --init-method-std ${init_std} \
-    --lr-decay-tokens ${lr_decay_tokens} \
-    --lr-warmup-tokens ${lr_warmup_tokens} \
-    --micro-batch-size ${batch_size} \
-    --exit-duration-in-mins ${exit_duration} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-tokens ${train_tokens} \
-    --train-samples ${train_samples} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 0.1 \
-    --clip-grad 1.0 \
-    --hysteresis 2 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --seed ${seed} \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --no-async-tensor-model-parallel-allreduce \
-    --use-flash-attn-triton \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-if [[ "$enable_sequence_parallel" == 1 ]]; then
-megatron_options="\
-    --sequence-parallel \
-    ${megatron_options}"
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --checkpoint-activations"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}.json"
-template_json="ds_config_gpt_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-      > ${config_json}
-
-deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-    --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-deepspeed ${dir}/../../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/pretrain_gpt_30B_seq_parallel.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/pretrain_gpt_30B_seq_parallel.sh
deleted file mode 100644
index 12d49d5702c4c734e0e0d4e8963ac4efb19e92ed..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/deepspeed4science/megatron_long_seq_support/pretrain_gpt_30B_seq_parallel.sh
+++ /dev/null
@@ -1,360 +0,0 @@
-#!/bin/bash
-
-dir=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-n_k=2
-seq_len=$(( 1024 * $n_k ))
-
-## The "GPT-3 XXX" below are configs from GPT-3 paper
-## https://arxiv.org/abs/2005.14165, choose based on
-## your desired model size or build your own configs
-
-## init_std is standard deviation for weight initialization. Usually larger
-## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size)
-## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-
-## We changed min_lr to a lower number (1.0e-6), which we found is able to
-## provide better zero-shot eval results.
-
-## GPT-3 Small 125M
-# model_size=0.125
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=256
-# lr=6.0e-4
-# min_lr=1.0e-6
-# init_std=0.02
-
-## GPT-3 Medium 350M
-# model_size=0.35
-# num_layers=24
-# hidden_size=1024
-# num_attn_heads=16
-# global_batch_size=256
-# lr=3.0e-4
-# min_lr=1.0e-6
-# init_std=0.018
-
-## GPT-3 Large 760M
-# model_size=0.76
-# num_layers=24
-# hidden_size=1536
-# num_attn_heads=16
-# global_batch_size=256
-# lr=2.5e-4
-# min_lr=1.0e-6
-# init_std=0.015
-
-## GPT-3 XL 1.3B
-# model_size=1.3
-# num_layers=24
-# hidden_size=2048
-# num_attn_heads=16
-# global_batch_size=2
-# lr=2.0e-4
-# min_lr=1.0e-6
-# init_std=0.013
-
-## GPT-3 2.7B
-# model_size=2.7
-# num_layers=32
-# hidden_size=2560
-# num_attn_heads=32
-# global_batch_size=512
-# lr=1.6e-4
-# min_lr=1.0e-6
-# init_std=0.011
-
-## GPT-3 6.7B
-# model_size=6.7
-# num_layers=32
-# hidden_size=4096
-# num_attn_heads=32
-# global_batch_size=1024
-# lr=1.2e-4
-# min_lr=1.0e-6
-# init_std=0.009
-
-## GPT-3 13B
-# model_size=13
-# num_layers=40
-# hidden_size=5120
-# num_attn_heads=40
-# global_batch_size=1024
-# lr=1.0e-4
-# min_lr=1.0e-6
-# init_std=0.008
-
-## GPT-3 30B
-model_size=30
-num_layers=64
-hidden_size=6144
-num_attn_heads=64
-global_batch_size=2
-lr=1.0e-4
-min_lr=1.0e-6
-init_std=0.008
-
-## GPT-3 175B
-# model_size=175
-# num_layers=96
-# hidden_size=12288
-# num_attn_heads=96
-# global_batch_size=1536
-# lr=0.6e-4
-# min_lr=1.0e-6
-# init_std=0.005
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens.
-train_tokens_in_billion=300
-train_tokens=$((${train_tokens_in_billion} * 1000000000))
-
-## train_samples is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the train_tokens
-## above, and data efficiency techniques may change num tokens in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by train_samples.
-train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
-
-## Another wall-clock time termination condition in minutes. Set it large
-## enough to avoid undesired early termination.
-exit_duration=30000000
-###############################################################################
-### lr configs
-## lr warmup and decay duration.
-## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
-## Here we increase the warmup tokens to 3B since when batch size warmup is not
-## used, there are more tokens per step. Thus we need to increase warmup tokens
-## to make sure there are enough warmup steps, which is important for training
-## stability.
-lr_warmup_tokens_in_million=3000
-lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
-## Here we changed the LR decay tokens to align with total train tokens, since
-## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
-## learning rate schedule to match the number of training tokens results in the
-## best final model quality 
-lr_decay_tokens_in_billion=${train_tokens_in_billion}
-lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
-lr_decay_style="cosine"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=32
-
-## Sequence parallelism, 0 is no SP, 1 enable SP
-enable_sequence_parallel=1
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Note that currently both curriculum learning and random-LTD are NOT
-## compatible with pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO-based data parallelism, stage=0 will disable ZeRO
-zero_stage=0
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Reduce it manually if GPU OOM
-# batch_size=$(( ${global_batch_size} / ${dp_size} ))
-batch_size=1
-
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=10
-eval_interval=100
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-# estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
-estimated_train_iter=6
-# save_interval=$((${estimated_train_iter} / ${num_save}))
-save_interval=100
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-activation_checkpoint="true"
-# activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d_%H.%M.%S")
-host="${HOSTNAME}"
-seed=1234
-num_workers=0
-
-data_path="BookCorpusDataset_text_document"
-if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
-fi
-if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
-fi
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-prescale_grad="true"
-jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
-jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
-jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
-if [[ $zero_stage -gt 0 ]]; then
-    jobname="${jobname}_z${zero_stage}"
-    prescale_grad="false"
-fi
-if [[ $mp_size -gt 1 ]]; then
-    jobname="${jobname}_mp${mp_size}"
-fi
-if [ "${no_pp}" = "false" ]; then
-    jobname="${jobname}_pp${pp_size}"
-fi
-jobname="${jobname}_seed${seed}_rebase"
-
-username=$(whoami)
-output_home="output"
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-tensorboard_dir="${output_home}/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-## If CL is used, make sure to set "--split" the same as what you used during
-## offline data analysis&indexing.
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --tensor-model-parallel-size ${mp_size} \
-    --pipeline-model-parallel-size ${pp_size} \
-    --init-method-std ${init_std} \
-    --lr-decay-tokens ${lr_decay_tokens} \
-    --lr-warmup-tokens ${lr_warmup_tokens} \
-    --micro-batch-size ${batch_size} \
-    --exit-duration-in-mins ${exit_duration} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-tokens ${train_tokens} \
-    --train-samples ${train_samples} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 0.1 \
-    --clip-grad 1.0 \
-    --hysteresis 2 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --seed ${seed} \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --no-async-tensor-model-parallel-allreduce \
-    --use-flash-attn-triton \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-if [[ "$enable_sequence_parallel" == 1 ]]; then
-megatron_options="\
-    --sequence-parallel \
-    ${megatron_options}"
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --checkpoint-activations"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}.json"
-template_json="ds_config_gpt_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-      > ${config_json}
-
-deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-    --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-# Since mp_size=32 involving multi-node compute resources. Users may need to specify hostfile via "--hostfile=myhostfile" command line option.
-deepspeed ${dir}/../../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/finetune_hf_llama/README.md b/toolbox/Megatron-DeepSpeed/examples_deepspeed/finetune_hf_llama/README.md
deleted file mode 100644
index e8641ced21abad16b18235fce44f053cb5f6db56..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/finetune_hf_llama/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-## Example of Finetuning LLAMA-7B from Hugging Face Weights
-
-### Dataset
-You can access the dataset from [here](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json).
-
-### Pre-trained Weights
-The pre-trained weights can be found at [Hugging Face - LLAMA-7B](https://huggingface.co/huggyllama/llama-7b).
-
-### Usage:
-
-#### 1. Converting Hugging Face Model Weights to Megatron-Deepspeed Model
-```bash
-bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert
-```
-This command writes the Hugging Face model weights into the Megatron-Deepspeed model and saves it. You can adjust the parallel configuration in the script.
-
-#### 2. Fine-tuning Process
-```bash
-bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh
-```
-Execute this command to initiate the finetuning process. The task originates from [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca.git).
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/finetune_hf_llama/ds_config.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/finetune_hf_llama/ds_config.json
deleted file mode 100644
index 9c0b332473ed7132b1a488f3b4a4ddabed73893c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/finetune_hf_llama/ds_config.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "train_batch_size" : 256,
-  "train_micro_batch_size_per_gpu": 16,
-  "steps_per_print": 100,
-  "zero_optimization": {
-    "stage": 0
-  },
-  "bf16": {
-    "enabled": true
-  }
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/finetune_hf_llama/finetune_llama.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/finetune_hf_llama/finetune_llama.sh
deleted file mode 100644
index c48ea11b93d29427ae00c684f276ff847ea31663..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/finetune_hf_llama/finetune_llama.sh
+++ /dev/null
@@ -1,110 +0,0 @@
-DS_CONFIG=./examples_deepspeed/finetune_hf_llama/ds_config.json
-DATASET_PATH=./alpaca_data.json
-# dataset link: https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json
-
-HF_LLAMA_PATH=/data/llama-7b/
-# weights link: https://huggingface.co/huggyllama/llama-7b
-
-MICRO_BATCH_SIZE=16
-GLOBAL_BATCH_SIZE=256
-TP=2
-PP=2
-# require to align with weight dimensions
-HIDDEN_SIZE=4096
-FFN_HIDDEN_SIZE=11008
-NUM_LAYERS=32
-NUM_HEADS=32
-SEQ_LENGTH=512
-######################################
-
-MEGA_DS_LLAMA_PATH=./"llama-7b-mega-ds-T${TP}P${PP}"
-
-# Below configuration required for llama model as per llama paper
-# --no-query-key-layer-scaling \
-# --attention-dropout 0 \
-# --hidden-dropout 0 \
-# --use-rotary-position-embeddings \
-# --untie-embeddings-and-output-weights \
-# --swiglu \
-# --normalization rmsnorm \
-# --disable-bias-linear \
-######################################
-cat <<EOT > $DS_CONFIG
-{
-  "train_batch_size" : $GLOBAL_BATCH_SIZE,
-  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
-  "steps_per_print": 100,
-  "zero_optimization": {
-    "stage": 0
-  },
-  "bf16": {
-    "enabled": true
-  }
-}
-EOT
-
-
-covert_args="deepspeed tools/hf2megads_weight_converter.py \
---hf-ckpt-num-shards 2 \
---origin-hf-ckpt-dir $HF_LLAMA_PATH \
---save $MEGA_DS_LLAMA_PATH"
-
-finetune_args="deepspeed finetune_llama.py \
---load $MEGA_DS_LLAMA_PATH"
-
-comm_args="--tensor-model-parallel-size $TP \
---pipeline-model-parallel-size $PP \
---lr-warmup-iters 2000 \
---weight-decay 0.1 \
---clip-grad 1 \
---num-layers $NUM_LAYERS \
---hidden-size $HIDDEN_SIZE \
---num-attention-heads $NUM_HEADS \
---ffn-hidden-size $FFN_HIDDEN_SIZE \
---attention-dropout 0 \
---hidden-dropout 0 \
---no-query-key-layer-scaling \
---disable-bias-linear \
---normalization rmsnorm \
---use-rotary-position-embeddings \
---untie-embeddings-and-output-weights \
---swiglu \
---seq-length $SEQ_LENGTH \
---max-position-embeddings $SEQ_LENGTH \
---micro-batch-size $MICRO_BATCH_SIZE \
---global-batch-size $GLOBAL_BATCH_SIZE \
---train-iters 3500 \
---lr 2e-5 \
---tensorboard-dir tensorboard_output \
---lr-decay-iters 320000 \
---lr-decay-style cosine \
---log-interval 1 \
---eval-iters 100 \
---eval-interval 100 \
---data-path $DATASET_PATH \
---save-interval 1500 \
---split 100,0,0 \
---bf16 \
---zero-stage 0 \
---tokenizer-type HFTokenizer \
---tokenizer-model $HF_LLAMA_PATH \
---deepspeed_config ./examples_deepspeed/finetune_hf_llama/ds_config.json \
---deepspeed \
---distributed-backend nccl \
---num-workers 0 \
---no-masked-softmax-fusion \
---no-bias-gelu-fusion \
---no-bias-dropout-fusion \
---no-gradient-accumulation-fusion \
---repeated-dataloader"
-
-if [ "$1" = "convert" ]; then
-    task_args="$covert_args"
-else
-    task_args="$finetune_args"
-fi
-
-full_cmd="$task_args $comm_args"
-
-eval "$full_cmd"
-
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/generate_text.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/generate_text.sh
deleted file mode 100644
index e29d521e1ccb0c98198c9040781e4fc38841b072..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/generate_text.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-export TORCH_CUDA_ARCH_LIST=8.6+PTX
-CHECKPOINT_PATH=dataset/checkpoints/gpt2_345m
-VOCAB_FILE=dataset/gpt2-vocab.json
-MERGE_FILE=dataset/gpt2-merges.txt
-b=8
-mp=1
-experts=1
-nodes=1
-gpus=1
-
-
-use_tutel=""
-#use_tutel="--use-tutel"
-
-
-ds_inference=""
-#ds_inference="--ds-inference"
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-launch_cmd="deepspeed --num_nodes $nodes --num_gpus $gpus"
-L=24
-H=1024
-A=16
-#experts1=${experts[$k]}
-program_cmd="tools/generate_samples_gpt.py \
-       --tensor-model-parallel-size $mp \
-       --num-layers $L \
-       --hidden-size $H \
-       --num-attention-heads $A \
-       --max-position-embeddings 1024 \
-       --tokenizer-type GPT2BPETokenizer \
-       --fp16 \
-       --num-experts ${experts} \
-       --mlp-type standard \
-       --micro-batch-size $b \
-       --seq-length 1024 \
-       --out-seq-length 1024 \
-       --temperature 1.0 \
-       --vocab-file $VOCAB_FILE \
-       --merge-file $MERGE_FILE \
-       --genfile unconditional_samples.json \
-       --top_p 0.9 \
-       --log-interval 1 \
-       --num-samples 0 \
-       --load $CHECKPOINT_PATH \
-       $use_tutel $ds_inference"
-
-echo $launch_cmd $program_cmd
-$launch_cmd $program_cmd
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/offload_pp/README.md b/toolbox/Megatron-DeepSpeed/examples_deepspeed/offload_pp/README.md
deleted file mode 100644
index eb5fb415a3f3a1af753fe8c7b8f43b710d5f7c8c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/offload_pp/README.md
+++ /dev/null
@@ -1,81 +0,0 @@
-# ZeRO-Offload++ Tutorials
-
-This folder contains examples that demonstrate how to use the new ZeRO-Offload++ features. 
-
-ZeRO-Offload++ now supports **Twin-Flow** feature.
-
-## Twin-Flow
-
-Instead of all-or-nothing offloading strategy, **Twin-Flow** allows a portion of data to run on CPU and the other part on GPU simultaneously. Thus, we not only mitigate the memory pressure on GPU side by offloading data to CPU, but also utilize both CPU and GPU computation resources more efficiently. 
-
-![Twin-Flow-img](./twin-offload.png)
-
-As shown in above Figure, when ZeRO-Offload is triggered, **Twin-Flow** now allow user to set a new configuration arguement called `ratio` (default value == 1) to adjust the portion of parameter updates on CPU optimizer. For example, if this `ratio==0.4`, it means 0-40% of parameters are updated using CPUAdam on CPU side, while the rest 60% parameters are updatedusing FusedAdam on GPU side.
-
-## How to use
-
-Now **Twin-Flow** can be used at ZeRO stage 3 with Offload. Below we provide two tutorial examples on how to use **Twin-Flow**.
-
-### DeepSpeed Toy Example
-
-Here is a toy example for using **Twin-Flow** inside DeepSpeed repo. 
-
-Under `/tests/small_model_debugging/` folder, Run 
-
-```
-deepspeed partial_offload_test.py --zero 3
-```
-
-### GPT Model Training in Megatron-DeepSpeed
-
-To enable **Twin-Flow** here, we need to add two flags for Megatron configs as follows: 
-
-#### Megatron Configurations
-```
---no-pipeline-parallel \
---cpu-optimizer \
-```
-which have been added to `ds_pretrain_gpt_350M.sh`
-
-#### DeepSpeed Configurations
-On the DeepSpeed side, we need to add follow configurations:
-
-```
-    "offload_optimizer": {
-      "device": "cpu",
-      "pin_memory": true,
-      "ratio": 0.3
-    }
-```
-
-Basically, we need to first enable CPU Offload. Then user can adjust the portion of parameter updating on CPU by adjusting `ratio` here. Its default value is 1, which means all parameter updates happen on CPU side. The above config example with ` "ratio" : 0.3` meaning 0-30% parameters are updating on CPU side, while the other 70% parameter updates happens on GPU side.
-
-#### Tuning suggestion on ratio
-
-To get best performance, we recommend to set this `ratio` value as low as possible without causing GPU memory Out-Ouf-Memory issue.
-
-One additional config on DeepSpeed side is 
-
-```
-  "prescale_gradients": false,
-```
-mainly because right now ZeRO-3 does not support prescale gradients.
-
-All above configs have been added to `ds_config_gpt_TEMPLATE.json`
-
-#### End-to-end Training
-
-To run a sample training of GPT-350M model using Megatron-Deepspeed, simply run as follows:
-
-```
-bash ds_pretrain_gpt_350M.sh
-```
-
-Now the training start running with **Twin-Flow**. Enjoy!
-
-## On-going optimizations
-
-We have some other features inside ZeRO-Offload++ which will come soon, stay tuned!
-
-* Removing uncessary D2H memcpy in ZeRO-offload
-* On-the-fly fp16 to fp32 data casting inside CPUAdam
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/offload_pp/ds_config_gpt_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/offload_pp/ds_config_gpt_TEMPLATE.json
deleted file mode 100644
index ebcefa09e74c70b80e21c7becbbfad845d26dfcc..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/offload_pp/ds_config_gpt_TEMPLATE.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "train_batch_size" : CONFIG_BATCH_SIZE,
-  "train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": 3,
-    "offload_optimizer": {
-      "device": "cpu",
-      "pin_memory": true,
-      "ratio": 0.3
-    }
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients":false,
-
-  "fp16": {
-    "enabled": CONFIG_FP16_ENABLED,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "bf16": {
-    "enabled": CONFIG_BF16_ENABLED
-  },
-
-  "wall_clock_breakdown" : false
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/offload_pp/ds_pretrain_gpt_350M.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/offload_pp/ds_pretrain_gpt_350M.sh
deleted file mode 100644
index 0a8a5ce9b3f86de9cf5b8adef944f6b1b6065318..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/offload_pp/ds_pretrain_gpt_350M.sh
+++ /dev/null
@@ -1,316 +0,0 @@
-#!/bin/bash
-DIR=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-SEQ_LEN=2048
-
-### The "GPT-3 XXX" below are configs from GPT-3 paper
-### https://arxiv.org/abs/2005.14165, choose based on
-### your desired model size or build your own configs
-
-## GPT-3 Small 125M
-# MODEL_SIZE=0.125
-# NUM_LAYERS=12
-# HIDDEN_SIZE=768
-# NUM_ATTN_HEADS=12
-# GLOBAL_BATCH_SIZE=256
-# LR=6.0e-4
-# MIN_LR=6.0e-5
-
-## GPT-3 Medium 350M
-MODEL_SIZE=0.35
-NUM_LAYERS=24
-HIDDEN_SIZE=1024
-NUM_ATTN_HEADS=16
-GLOBAL_BATCH_SIZE=256
-LR=3.0e-4
-MIN_LR=3.0e-5
-
-## GPT-3 Large 760M
-# MODEL_SIZE=0.76
-# NUM_LAYERS=24
-# HIDDEN_SIZE=1536
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=256
-# LR=2.5e-4
-# MIN_LR=2.5e-5
-
-## GPT-3 XL 1.3B
-# MODEL_SIZE=1.3
-# NUM_LAYERS=24
-# HIDDEN_SIZE=2048
-# NUM_ATTN_HEADS=16
-# GLOBAL_BATCH_SIZE=512
-# LR=2.0e-4
-# MIN_LR=2.0e-5
-
-## GPT-3 2.7B
-# MODEL_SIZE=2.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=2560
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=512
-# LR=1.6e-4
-# MIN_LR=1.6e-5
-
-## GPT-3 6.7B
-# MODEL_SIZE=6.7
-# NUM_LAYERS=32
-# HIDDEN_SIZE=4096
-# NUM_ATTN_HEADS=32
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.2e-4
-# MIN_LR=1.2e-5
-
-## GPT-3 13B
-# MODEL_SIZE=13
-# NUM_LAYERS=40
-# HIDDEN_SIZE=5120
-# NUM_ATTN_HEADS=40
-# GLOBAL_BATCH_SIZE=1024
-# LR=1.0e-4
-# MIN_LR=1.0e-5
-
-## GPT-3 175B
-# MODEL_SIZE=175
-# NUM_LAYERS=96
-# HIDDEN_SIZE=12288
-# NUM_ATTN_HEADS=96
-# GLOBAL_BATCH_SIZE=1536
-# LR=0.6e-4
-# MIN_LR=0.6e-5
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens
-## For MoE model, we found sometimes training a bit more to 330B tokens helps
-TRAIN_TOKENS=300000000000
-# TRAIN_TOKENS=330000000000
-
-## TRAIN_SAMPLES is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the TRAIN_TOKENS
-## above, and techniques like curriculum learning has less token in some steps,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by TRAIN_SAMPLES.
-TRAIN_SAMPLES=$(( ${TRAIN_TOKENS} * 3 / ${SEQ_LEN} ))
-
-## Another termination condition in minutes. Set it large enough to avoid
-## undesired early termination.
-EXIT_DURATION=30000000
-###############################################################################
-### LR configs
-## LR warmup and decay duration, this token-based config is preferable since
-## no need to readjust when the batch size/seqlen is changed.
-## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
-## For MoE model, we found that setting the decay token to 300B helps.
-WARMUP_TOKENS=375000000
-LR_DECAY_TOKENS=260000000000
-# LR_DECAY_TOKENS=300000000000
-###############################################################################
-### Parallelism configs
-## Micro batch size per GPU
-## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
-BATCH_SIZE=2
-
-## Model parallelism, 1 is no MP
-MP_SIZE=1
-
-## Pipeline parallelism
-## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
-## to 1 and use the "--no-pipeline-parallel" arg.
-PP_SIZE=1
-NUM_GPUS=16
-###############################################################################
-### MoE configs
-## Number of experts. EP_SIZE 1 means dense model without MoE
-EP_SIZE=1
-# EP_SIZE=128
-
-if [[ $EP_SIZE -gt $NUM_GPUS ]]; then
-    EP_PARALLEL_SIZE=$NUM_GPUS
-else
-    EP_PARALLEL_SIZE=$EP_SIZE
-fi
-
-## Original GPT-3 model always set min LR at 10% of max LR. For MoE model, we
-## found that lower LR and min LR (than the base dense model) helps.
-## For 1.3B MoE-128 model we used LR=1.2e-4 and MIN_LR=1.0e-6.
-## For 350M MoE-128 model we used LR=2.0e-4 and MIN_LR=2.0e-6, but they are not
-## heavily tuned.
-# LR=2.0e-4
-# MIN_LR=2e-06
-
-## Coefficient for MoE loss. We find that 0.01 is a good value at least for
-## 1.3B MoE-128 model
-MLC=0.01
-
-## Below configs adjust the MoE expert token capacity limit during training and
-## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
-## Larger capacity factor or disabling capacity limit could improve training
-## convergence, but will also reduce training throughput.
-MOE_TRAIN_CAP_FACTOR=1.0
-MOE_EVAL_CAP_FACTOR=1.0
-MOE_MIN_CAP=4
-MOE_DROP_TOKEN="true"
-# MOE_DROP_TOKEN="false"
-###############################################################################
-### Curriculum learning (CL) configs
-## Enable/disable CL
-CL_ENABLED="false"
-## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
-## for tuning the following configs
-CL_START_SEQLEN=80
-CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
-CL_TOKENS=60
-CL_TOKENS=$((${CL_TOKENS} * 1000000000))
-CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
-###############################################################################
-### Misc configs
-LOG_INTERVAL=1
-EVAL_ITERS=10
-EVAL_INTERVAL=100
-SAVE_INTERVAL=1000
-
-## Standard deviation for weight initialization
-## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
-## dense model. Usually larger model needs lower std.
-INIT_STD=0.014
-# INIT_STD=0.01
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-ACTIVATION_CHECKPOINT="true"
-# ACTIVATION_CHECKPOINT="false"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d-%H.%M.%S")
-host="${HOSTNAME}"
-NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
-if [[ $EP_SIZE -gt 1 ]]; then
-    NAME="${NAME}-ep-${EP_SIZE}-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
-fi
-if [ "${CL_ENABLED}" = "true" ]; then
-    NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
-fi
-
-OUTPUT_BASEPATH=$DIR/output
-mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
-mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
-mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
-mkdir -p ${TENSORBOARD_DIR} 
-## Note that for MoE model with billion-scale base model, the checkpoint can be
-## as large as TB-scale which normal NFS cannot handle efficiently.
-CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
-
-
-VOCAB_PATH=/data/users/guanhua/Megatron-DeepSpeed/dataset/gpt2-vocab.json
-MERGE_PATH=/data/users/guanhua/Megatron-DeepSpeed/dataset/gpt2-merges.txt
-# Public the Pile dataset, can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
-DATA_BLEND=/data/users/guanhua/Megatron-DeepSpeed/dataset/BookCorpusDataset_text_document
-
-###############################################################################
-data_options=" \
-         --vocab-file ${VOCAB_PATH} \
-         --merge-file ${MERGE_PATH} \
-         --data-path ${DATA_BLEND} \
-         --data-impl mmap"
-        
-megatron_options=" \
-        --override-opt_param-scheduler \
-        --adam-beta1 0.9 \
-        --adam-beta2 0.95 \
-        --tensor-model-parallel-size ${MP_SIZE} \
-        --moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
-        --num-experts ${EP_SIZE} \
-        --moe-loss-coeff ${MLC} \
-        --moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
-        --moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
-        --moe-min-capacity ${MOE_MIN_CAP} \
-        --init-method-std ${INIT_STD} \
-        --lr-decay-tokens ${LR_DECAY_TOKENS} \
-        --lr-warmup-tokens ${WARMUP_TOKENS} \
-        --micro-batch-size ${BATCH_SIZE} \
-        --exit-duration-in-mins ${EXIT_DURATION} \
-        --rampup-batch-size 32 32 1953125 \
-        --global-batch-size ${GLOBAL_BATCH_SIZE} \
-        --num-layers ${NUM_LAYERS} \
-        --hidden-size ${HIDDEN_SIZE} \
-        --num-attention-heads ${NUM_ATTN_HEADS} \
-        --seq-length ${SEQ_LEN} \
-        --max-position-embeddings ${SEQ_LEN} \
-        --train-tokens ${TRAIN_TOKENS} \
-        --train-samples ${TRAIN_SAMPLES} \
-        --lr ${LR} \
-        --min-lr ${MIN_LR} \
-        --lr-decay-style cosine \
-        --split 98,2,0 \
-        --log-interval ${LOG_INTERVAL} \
-        --eval-interval ${EVAL_INTERVAL} \
-        --eval-iters ${EVAL_ITERS} \
-        --save-interval ${SAVE_INTERVAL} \
-        --weight-decay 0.1 \
-        --clip-grad 1.0 \
-        --hysteresis 2 \
-        --num-workers 0 \
-        --fp16 \
-        --load ${CHECKPOINT_PATH} \
-        --save ${CHECKPOINT_PATH} \
-        --tensorboard-queue-size 1 \
-        --log-timers-to-tensorboard \
-        --timing-log-level 1 \
-        --no-pipeline-parallel \
-        --cpu-optimizer \
-        --log-batch-size-to-tensorboard \
-        --log-validation-ppl-to-tensorboard \
-        --tensorboard-dir ${TENSORBOARD_DIR}"
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-megatron_options="${megatron_options} \
-        --checkpoint-activations"
-fi
-
-if [[ $EP_SIZE -gt 1 ]]; then
-megatron_options="${megatron_options} \
-        --create-moe-param-group"
-fi
-
-if [ "${MOE_DROP_TOKEN}" = "false" ]; then
-megatron_options="${megatron_options} \
-        --disable-moe-token-dropping"
-fi
-
-template_json="ds_config_gpt_TEMPLATE.json"
-config_json="ds_config_gpt_${NAME}.json"
-sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
-    | sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
-    | sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
-    | sed "s/ZERO_STAGE/3/" \
-    | sed "s/PRESCALE_GRAD/true/" \
-    | sed "s/CONFIG_FP16_ENABLED/false/" \
-    | sed "s/CONFIG_BF16_ENABLED/true/" \
-    | sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
-    | sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
-    | sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
-    | sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
-	  > ${config_json}
-
-deepspeed_options=" \
-		    --deepspeed \
-		    --deepspeed_config ${config_json} \
-		    --pipeline-model-parallel-size ${PP_SIZE}"
-
-# Currently MoE is not compatible with pipeline parallel
-if [[ $EP_SIZE -gt 1 ]]; then
-deepspeed_options="${deepspeed_options} \
-        --no-pipeline-parallel"
-fi
-
-if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-        --deepspeed-activation-checkpointing"
-fi
-
-run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
-echo ${run_cmd}
-eval ${run_cmd}
-set +x
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/offload_pp/twin-offload.png b/toolbox/Megatron-DeepSpeed/examples_deepspeed/offload_pp/twin-offload.png
deleted file mode 100644
index 1c8c3ef92454bc6443f9fbbe9a8a1940f9669f2f..0000000000000000000000000000000000000000
Binary files a/toolbox/Megatron-DeepSpeed/examples_deepspeed/offload_pp/twin-offload.png and /dev/null differ
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/pretrain_llama2_distributed.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/pretrain_llama2_distributed.sh
deleted file mode 100644
index f275ea636a5e3756995484fe71b5070ef65089dd..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/pretrain_llama2_distributed.sh
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/bin/bash
-# This example script is contributed by external user https://github.com/nrailgun
-set -ex
-
-######################################
-# Change the below configurations here
-BASE_PATH=./tmp
-DS_CONFIG=${BASE_PATH}/deepspeed.json
-DATASET_1="./tmp/data/bookcorpus_train_1m_text_sentence"
-DATASET="1 ${DATASET_1}"
-CHECKPOINT_PATH=./tmp
-TOKENIZER_PATH=./tmp/tokenizer.model # offical llama tokenizer.model
-
-TP=2
-PP=2
-ZERO_STAGE=0
-
-GPUS_PER_NODE=8
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NNODES=1
-NODE_RANK=0
-
-HIDDEN_SIZE=2048 # e.g. llama-13b: 5120
-FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824
-NUM_LAYERS=24 # e.g. llama-13b: 40
-NUM_HEADS=16 # e.g. llama-13b: 40
-SEQ_LENGTH=2048
-NUM_KV_HEADS=4 # llama2 70B uses GQA
-
-MICRO_BATCH_SIZE=4
-GLOBAL_BATCH_SIZE=32 # e.g. llama: 4M tokens
-TRAIN_STEPS=250000 # e.g. llama: 1T tokens / 4M tokens_per_batch = 250000 steps
-LR=3e-4
-MIN_LR=3e-5
-LR_WARMUP_STEPS=2000
-WEIGHT_DECAY=0.1
-GRAD_CLIP=1
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-# activation_checkpoint="true"
-activation_checkpoint="false"
-
-# Below configuration required for llama model as per llama paper
-# --no-query-key-layer-scaling \
-# --attention-dropout 0 \
-# --hidden-dropout 0 \
-# --use-rotary-position-embeddings \
-# --untie-embeddings-and-output-weights \
-# --swiglu \
-# --normalization rmsnorm \
-# --disable-bias-linear \
-######################################
-
-
-
-cat <<EOT > $DS_CONFIG
-{
-  "train_batch_size" : $GLOBAL_BATCH_SIZE,
-  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
-  "steps_per_print": 1,
-  "zero_optimization": {
-    "stage": $ZERO_STAGE
-  },
-  "bf16": {
-    "enabled": true
-  }
-}
-EOT
-
-ds_args=""
-ds_args=" --deepspeed ${ds_args}"
-ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
-ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-  ds_args="--deepspeed-activation-checkpointing ${ds_args}"
-
-  ## old argument for recomputing the transformer layer
-  # ds_args="--checkpoint-activations ${ds_args}"
-
-  ## new argument for recomputing the transformer layer
-  ds_args="--recompute-granularity full --recompute-method uniform ${ds_args}"
-  ## new argument for recomputing only the attention layer
-  # ds_args="--recompute-granularity selective ${ds_args}"
-fi
-
-
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
-
-torchrun $DISTRIBUTED_ARGS \
-       pretrain_gpt.py \
-       --tensor-model-parallel-size $TP \
-       --pipeline-model-parallel-size $PP \
-       --num-layers $NUM_LAYERS \
-       --hidden-size $HIDDEN_SIZE \
-       --ffn-hidden-size $FFN_HIDDEN_SIZE \
-       --num-attention-heads $NUM_HEADS \
-       --micro-batch-size $MICRO_BATCH_SIZE \
-       --global-batch-size $GLOBAL_BATCH_SIZE \
-       --seq-length $SEQ_LENGTH \
-       --max-position-embeddings $SEQ_LENGTH \
-       --train-iters $TRAIN_STEPS \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATASET \
-       --data-impl mmap \
-       --tokenizer-type GPTSentencePieceTokenizer \
-       --tokenizer-model $TOKENIZER_PATH \
-       --split 949,50,1 \
-       --distributed-backend nccl \
-       --lr $LR \
-       --lr-decay-style cosine \
-       --min-lr $MIN_LR \
-       --weight-decay $WEIGHT_DECAY \
-       --clip-grad $GRAD_CLIP \
-       --lr-warmup-iters $LR_WARMUP_STEPS \
-       --optimizer adam \
-       --adam-beta1 0.9 \
-       --adam-beta2 0.95 \
-       --log-interval 1 \
-       --save-interval 10000 \
-       --eval-interval 1000 \
-       --eval-iters 10 \
-       --bf16 \
-       --no-query-key-layer-scaling \
-       --attention-dropout 0 \
-       --hidden-dropout 0 \
-       --use-rotary-position-embeddings \
-       --untie-embeddings-and-output-weights \
-       --swiglu \
-       --normalization rmsnorm \
-       --disable-bias-linear \
-       --num-key-value-heads $NUM_KV_HEADS \
-       $ds_args
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/pretrain_llama_distributed.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/pretrain_llama_distributed.sh
deleted file mode 100644
index b7bf890236fe4d4b04912d0fba7b26814de8159d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/pretrain_llama_distributed.sh
+++ /dev/null
@@ -1,132 +0,0 @@
-#!/bin/bash
-# This example script is contributed by external user https://github.com/LydiaXiaohongLi
-set -ex
-
-######################################
-# Change the below configurations here
-BASE_PATH=./tmp
-DS_CONFIG=${BASE_PATH}/deepspeed.json
-DATASET_1="./tmp/data/bookcorpus_train_1m_text_sentence"
-DATASET="1 ${DATASET_1}"
-CHECKPOINT_PATH=./tmp
-TOKENIZER_PATH=./tmp/tokenizer.model # offical llama tokenizer.model
-
-TP=2
-PP=2
-ZERO_STAGE=0
-
-GPUS_PER_NODE=8
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NNODES=1
-NODE_RANK=0
-
-HIDDEN_SIZE=2048 # e.g. llama-13b: 5120
-FFN_HIDDEN_SIZE=5504 # e.g. llama-13b: 13824
-NUM_LAYERS=24 # e.g. llama-13b: 40
-NUM_HEADS=16 # e.g. llama-13b: 40
-SEQ_LENGTH=2048
-
-MICRO_BATCH_SIZE=4
-GLOBAL_BATCH_SIZE=32 # e.g. llama: 4M tokens
-TRAIN_STEPS=250000 # e.g. llama: 1T tokens / 4M tokens_per_batch = 250000 steps
-LR=3e-4
-MIN_LR=3e-5
-LR_WARMUP_STEPS=2000
-WEIGHT_DECAY=0.1
-GRAD_CLIP=1
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-# activation_checkpoint="true"
-activation_checkpoint="false"
-
-# Below configuration required for llama model as per llama paper
-# --no-query-key-layer-scaling \
-# --attention-dropout 0 \
-# --hidden-dropout 0 \
-# --use-rotary-position-embeddings \
-# --untie-embeddings-and-output-weights \
-# --swiglu \
-# --normalization rmsnorm \
-# --disable-bias-linear \
-######################################
-
-
-
-cat <<EOT > $DS_CONFIG
-{
-  "train_batch_size" : $GLOBAL_BATCH_SIZE,
-  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
-  "steps_per_print": 1,
-  "zero_optimization": {
-    "stage": $ZERO_STAGE
-  },
-  "bf16": {
-    "enabled": true
-  }
-}
-EOT
-
-ds_args=""
-ds_args=" --deepspeed ${ds_args}"
-ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
-ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-  ds_args="--deepspeed-activation-checkpointing ${ds_args}"
-
-  ## old argument for recomputing the transformer layer
-  # ds_args="--checkpoint-activations ${ds_args}"
-
-  ## new argument for recomputing the transformer layer
-  ds_args="--recompute-granularity full --recompute-method uniform ${ds_args}"
-  ## new argument for recomputing only the attention layer
-  # ds_args="--recompute-granularity selective ${ds_args}"
-fi
-
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
-
-torchrun $DISTRIBUTED_ARGS \
-       pretrain_gpt.py \
-       --tensor-model-parallel-size $TP \
-       --pipeline-model-parallel-size $PP \
-       --num-layers $NUM_LAYERS \
-       --hidden-size $HIDDEN_SIZE \
-       --ffn-hidden-size $FFN_HIDDEN_SIZE \
-       --num-attention-heads $NUM_HEADS \
-       --micro-batch-size $MICRO_BATCH_SIZE \
-       --global-batch-size $GLOBAL_BATCH_SIZE \
-       --seq-length $SEQ_LENGTH \
-       --max-position-embeddings $SEQ_LENGTH \
-       --train-iters $TRAIN_STEPS \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATASET \
-       --data-impl mmap \
-       --tokenizer-type GPTSentencePieceTokenizer \
-       --tokenizer-model $TOKENIZER_PATH \
-       --split 949,50,1 \
-       --distributed-backend nccl \
-       --lr $LR \
-       --lr-decay-style cosine \
-       --min-lr $MIN_LR \
-       --weight-decay $WEIGHT_DECAY \
-       --clip-grad $GRAD_CLIP \
-       --lr-warmup-iters $LR_WARMUP_STEPS \
-       --optimizer adam \
-       --adam-beta1 0.9 \
-       --adam-beta2 0.95 \
-       --log-interval 1 \
-       --save-interval 10000 \
-       --eval-interval 1000 \
-       --eval-iters 10 \
-       --bf16 \
-       --no-query-key-layer-scaling \
-       --attention-dropout 0 \
-       --hidden-dropout 0 \
-       --use-rotary-position-embeddings \
-       --untie-embeddings-and-output-weights \
-       --swiglu \
-       --normalization rmsnorm \
-       --disable-bias-linear \
-       $ds_args
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/README.md b/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/README.md
deleted file mode 100644
index 004469bd44f2cadfa2659501f84e8c0a8c7c3f0b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/README.md
+++ /dev/null
@@ -1,47 +0,0 @@
-# July 2023 sync with NVIDIA/Megatron-LM
-This folder includes details about the recent sync with the NVIDIA/Megatron-LM repo (where this repo is forked from). It includes example scripts we used to test after the sync, together with this README documentation about what were tested.
-
-We also created a [backup branch](https://github.com/microsoft/Megatron-DeepSpeed/tree/before_rebase) which is the version before this sync. This branch is just for comparison tests and for temporary use when debugging the main branch. We do not plan to continue supporting the version before sync.
-
-## List of rebase efforts/achievements
-* Enabling Megatron-LM's sequence parallel.
-* Enabling rotary positional embedding.
-* Enabling FlashAttention v1 and v2.
-* Fix the conflicts related to activation checkpointing when DeepSpeed is used with the newest Megatron-LM since NVIDIA introduced some new fine-grained partial checkpointing techniques which DeepSpeed is currently not compatible.
-* Major refactor to DeepSpeed pipeline parallelism implementation for GPT model in order to work with newest Megatron-LM.
-* Fix model checkpoint save/load when DeepSpeed is used with the newest Megatron-LM.
-* Fully verified the performance and correctness of GPT pretraining after rebasing.
-
-## Test environment
-We used 128 V100 GPUs (8 DGX-2 nodes, 16 GPU per node, inter-node network is InfiniBand with around 660 Gbps measured bandwidth) for the tests. For software, we used DeepSpeed v0.9.5.
-
-## Verified cases and results
-We verified the following cases (matching training/validation curves before/after sync, checkpoint save/load works) for GPT-3 pretraining:
-
-* With DeepSpeed ZeRO stage 1
-* With DeepSpeed ZeRO stage 1 and Megatron-LM's tensor parallelism
-* With DeepSpeed ZeRO stage 1, Megatron-LM's tensor parallelism, and DeepSpeed's pipeline parallelism (i.e., 3D parallelism)
-
-In addition, below is a performance/convergence comparison between before and after this sync.
-
-| Case | TFLOPs (per GPU) | Validation loss at step 200 | Training script |
-| ---- | ---------------- | --------------------------- | --------------- |
-| Before sync, GPT-3 13B, 3D parallelism | 50 | 5.73 | [script (in the backup branch)](https://github.com/microsoft/Megatron-DeepSpeed/blob/before_rebase/examples/before_rebase_test/ds_pretrain_gpt_13B.sh) |
-| After sync, GPT-3 13B, 3D parallelism | 55.6 | 5.71 | [script](ds_pretrain_gpt_13B.sh) |
-
-At last, we provide a [toy example script](ds_pretrain_gpt_125M.sh) that users can try as the first test.
-
-## Flash attention
-We tested and verified that flash attention feature introduced by this sync works properly for GPT pretraining. 
-Our code automatically uses [FlashAttention-2](https://github.com/Dao-AILab/flash-attention) when avaiable.
-
-We compared the training using the [toy example script](ds_pretrain_gpt_125M.sh) and the [toy example script with flash attention](ds_pretrain_gpt_125M_flashattn.sh) on 8 A100 GPUs, and found that FlashAttention (1.0,4) increased training throughput (TFLOPs per GPU) from 25 to 32. When scaling up the model to 2.7B using the same script, FlashAttention-2 improved the training throughput 121 TFLOPs to 132 TFLOPs in comparison to FlashAttention 1.x.
-
-For installation instructions, please refer to [FlashAttention's repository](https://github.com/Dao-AILab/flash-attention).
-
-## Rotary Positional Embedding (RoPE)
-We also tested and verified that the Rotary Positional Embedding (RoPE) introduced by this sync works properly for GPT pretraining. By comparing the training between [without RoPE](ds_pretrain_gpt_1.3B.sh) and [with RoPE](ds_pretrain_gpt_1.3B_rope.sh), we are able to observe that RoPE helps improving the model convergence just like [previous observation](https://blog.eleuther.ai/rotary-embeddings/).
-
-## Notes/TODOs
-* After the sync, DeepSpeed still relies on the older activation checkpointing mechanism (see function ```_checkpointed_forward``` in ```Megatron-DeepSpeed/megatron/model/transformer.py```) since we didn't have time to integrate with the new version yet. Contribution is very welcomed.
-* (Aug 2023 update) With the contribution from 3P users (https://github.com/microsoft/Megatron-DeepSpeed/pull/225), now it's also possible to use Megatron-LM's newer activation checkpointing mechanism. However, currently it's still not compatible with DeepSpeed, so you won't be able to combine it with any DeepSpeed technologies. We DeepSpeed team compared the [older mechanism](ds_pretrain_gpt_1.3B.sh) and [newer mechanism](ds_pretrain_gpt_1.3B_megatron_checkpointing.sh) on 1 DGX-2 node (16 V100), and found that the older mechanism has less memory saving (older max allocated 15241 MB, newer 12924 MB) and higher throughput (older 23.11 TFLOPs newer 17.26 TFLOPs). Thus currently we still recommend using the older mechanism both because of the similar checkpointing performance, and (more importantly) because only older mechnaism is compatible with DeepSpeed (and in this case you can combine with ZeRO to achieve more memeory saving).
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_config_gpt_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_config_gpt_TEMPLATE.json
deleted file mode 100644
index 3526aae85f0465ff7ec017f70b3e145d651da2f2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_config_gpt_TEMPLATE.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "train_batch_size": GBSIZE,
-  "train_micro_batch_size_per_gpu": MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "wall_clock_breakdown" : false
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_config_gpt_slw_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_config_gpt_slw_TEMPLATE.json
deleted file mode 100644
index f1abcedcb2b187bd2200df8e0a1f6824a84a1f57..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_config_gpt_slw_TEMPLATE.json
+++ /dev/null
@@ -1,34 +0,0 @@
-{
-  "train_batch_size": GBSIZE,
-  "train_micro_batch_size_per_gpu": MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "wall_clock_breakdown" : false,
-  "curriculum_learning": {
-    "enabled": true,
-    "curriculum_type": "seqlen",
-    "min_difficulty": CONFIG_CL_MIN,
-    "max_difficulty": CONFIG_CL_MAX,
-    "schedule_type": "fixed_linear",
-    "schedule_config": {
-      "total_curriculum_step": CONFIG_CL_DURATION,
-      "difficulty_step": 8
-    }
-  }
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_1.3B.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_1.3B.sh
deleted file mode 100644
index ccc2e581a7581eadd0a3ccc909ba05f031a05311..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_1.3B.sh
+++ /dev/null
@@ -1,332 +0,0 @@
-#!/bin/bash
-dir=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-seq_len=2048
-
-## The "GPT-3 XXX" below are configs from GPT-3 paper
-## https://arxiv.org/abs/2005.14165, choose based on
-## your desired model size or build your own configs
-
-## init_std is standard deviation for weight initialization. Usually larger
-## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size)
-## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-
-## We changed min_lr to a lower number (1.0e-6), which we found is able to
-## provide better zero-shot eval results.
-
-## GPT-3 Small 125M
-# model_size=0.125
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=256
-# lr=6.0e-4
-# min_lr=1.0e-6
-# init_std=0.02
-
-## GPT-3 Medium 350M
-# model_size=0.35
-# num_layers=24
-# hidden_size=1024
-# num_attn_heads=16
-# global_batch_size=256
-# lr=3.0e-4
-# min_lr=1.0e-6
-# init_std=0.018
-
-## GPT-3 Large 760M
-# model_size=0.76
-# num_layers=24
-# hidden_size=1536
-# num_attn_heads=16
-# global_batch_size=256
-# lr=2.5e-4
-# min_lr=1.0e-6
-# init_std=0.015
-
-## GPT-3 XL 1.3B
-model_size=1.3
-num_layers=24
-hidden_size=2048
-num_attn_heads=16
-global_batch_size=512
-lr=2.0e-4
-min_lr=1.0e-6
-init_std=0.013
-
-## GPT-3 2.7B
-# model_size=2.7
-# num_layers=32
-# hidden_size=2560
-# num_attn_heads=32
-# global_batch_size=512
-# lr=1.6e-4
-# min_lr=1.0e-6
-# init_std=0.011
-
-## GPT-3 6.7B
-# model_size=6.7
-# num_layers=32
-# hidden_size=4096
-# num_attn_heads=32
-# global_batch_size=1024
-# lr=1.2e-4
-# min_lr=1.0e-6
-# init_std=0.009
-
-## GPT-3 13B
-# model_size=13
-# num_layers=40
-# hidden_size=5120
-# num_attn_heads=40
-# global_batch_size=1024
-# lr=1.0e-4
-# min_lr=1.0e-6
-# init_std=0.008
-
-## GPT-3 175B
-# model_size=175
-# num_layers=96
-# hidden_size=12288
-# num_attn_heads=96
-# global_batch_size=1536
-# lr=0.6e-4
-# min_lr=1.0e-6
-# init_std=0.005
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens.
-train_tokens_in_billion=300
-train_tokens=$((${train_tokens_in_billion} * 1000000000))
-
-## train_samples is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the train_tokens
-## above, and data efficiency techniques may change num tokens in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by train_samples.
-train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
-
-## Another wall-clock time termination condition in minutes. Set it large
-## enough to avoid undesired early termination.
-exit_duration=30000000
-###############################################################################
-### lr configs
-## lr warmup and decay duration.
-## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
-## Here we increase the warmup tokens to 3B since when batch size warmup is not
-## used, there are more tokens per step. Thus we need to increase warmup tokens
-## to make sure there are enough warmup steps, which is important for training
-## stability.
-lr_warmup_tokens_in_million=3000
-lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
-## Here we changed the LR decay tokens to align with total train tokens, since
-## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
-## learning rate schedule to match the number of training tokens results in the
-## best final model quality 
-lr_decay_tokens_in_billion=${train_tokens_in_billion}
-lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
-lr_decay_style="cosine"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=2
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Note that currently both curriculum learning and random-LTD are NOT
-## compatible with pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO-based data parallelism, stage=0 will disable ZeRO
-zero_stage=0
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Reduce it manually if GPU OOM
-# batch_size=$(( ${global_batch_size} / ${dp_size} ))
-batch_size=2
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=10
-eval_interval=100
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
-# save_interval=$((${estimated_train_iter} / ${num_save}))
-save_interval=100
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-activation_checkpoint="true"
-# activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d_%H.%M.%S")
-host="${HOSTNAME}"
-seed=1234
-num_workers=0
-
-## Public the Pile dataset, can be downloaded at
-## https://mystic.the-eye.eu/public/AI/pile_neox/ or 
-## https://the-eye.eu/public/AI/pile_neox/ Change data_home to where you
-## store the pile_text_document.bin and pile_text_document.idx.
-data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing"
-data_path="${data_home}/pile_text_document"
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-prescale_grad="true"
-jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
-jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
-jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
-if [[ $zero_stage -gt 0 ]]; then
-    jobname="${jobname}_z${zero_stage}"
-    prescale_grad="false"
-fi
-if [[ $mp_size -gt 1 ]]; then
-    jobname="${jobname}_mp${mp_size}"
-fi
-if [ "${no_pp}" = "false" ]; then
-    jobname="${jobname}_pp${pp_size}"
-fi
-jobname="${jobname}_seed${seed}_rebase"
-
-username=$(whoami)
-output_home="/blob/users/${username}/project/data_efficient_gpt"
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-## Microsoft internal constraint: because tensorboard is logged by last rank,
-## it's better to put the path in NFS instead of Blob.
-tensorboard_dir="/vc_data/users/${username}/project/data_efficient_gpt/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-## If CL is used, make sure to set "--split" the same as what you used during
-## offline data analysis&indexing.
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --tensor-model-parallel-size ${mp_size} \
-    --init-method-std ${init_std} \
-    --lr-decay-tokens ${lr_decay_tokens} \
-    --lr-warmup-tokens ${lr_warmup_tokens} \
-    --micro-batch-size ${batch_size} \
-    --exit-duration-in-mins ${exit_duration} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-tokens ${train_tokens} \
-    --train-samples ${train_samples} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 0.1 \
-    --clip-grad 1.0 \
-    --hysteresis 2 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --seed ${seed} \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --no-async-tensor-model-parallel-allreduce \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --checkpoint-activations"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}.json"
-template_json="ds_config_gpt_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-      > ${config_json}
-
-deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-    --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_1.3B_megatron_checkpointing.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_1.3B_megatron_checkpointing.sh
deleted file mode 100644
index 343dc9f0e8079858fd64218669ee42307296d6d1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_1.3B_megatron_checkpointing.sh
+++ /dev/null
@@ -1,345 +0,0 @@
-#!/bin/bash
-###############################################################################
-###############################################################################
-###############################################################################
-## WARNING: This script is only for evaluating Megatron-LM's activation
-## checkpointing. We do not recommend using it for actual training because
-## you are not able to use any DeepSpeed technologies.
-###############################################################################
-###############################################################################
-###############################################################################
-dir=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-seq_len=2048
-
-## The "GPT-3 XXX" below are configs from GPT-3 paper
-## https://arxiv.org/abs/2005.14165, choose based on
-## your desired model size or build your own configs
-
-## init_std is standard deviation for weight initialization. Usually larger
-## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size)
-## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-
-## We changed min_lr to a lower number (1.0e-6), which we found is able to
-## provide better zero-shot eval results.
-
-## GPT-3 Small 125M
-# model_size=0.125
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=256
-# lr=6.0e-4
-# min_lr=1.0e-6
-# init_std=0.02
-
-## GPT-3 Medium 350M
-# model_size=0.35
-# num_layers=24
-# hidden_size=1024
-# num_attn_heads=16
-# global_batch_size=256
-# lr=3.0e-4
-# min_lr=1.0e-6
-# init_std=0.018
-
-## GPT-3 Large 760M
-# model_size=0.76
-# num_layers=24
-# hidden_size=1536
-# num_attn_heads=16
-# global_batch_size=256
-# lr=2.5e-4
-# min_lr=1.0e-6
-# init_std=0.015
-
-## GPT-3 XL 1.3B
-model_size=1.3
-num_layers=24
-hidden_size=2048
-num_attn_heads=16
-global_batch_size=512
-lr=2.0e-4
-min_lr=1.0e-6
-init_std=0.013
-
-## GPT-3 2.7B
-# model_size=2.7
-# num_layers=32
-# hidden_size=2560
-# num_attn_heads=32
-# global_batch_size=512
-# lr=1.6e-4
-# min_lr=1.0e-6
-# init_std=0.011
-
-## GPT-3 6.7B
-# model_size=6.7
-# num_layers=32
-# hidden_size=4096
-# num_attn_heads=32
-# global_batch_size=1024
-# lr=1.2e-4
-# min_lr=1.0e-6
-# init_std=0.009
-
-## GPT-3 13B
-# model_size=13
-# num_layers=40
-# hidden_size=5120
-# num_attn_heads=40
-# global_batch_size=1024
-# lr=1.0e-4
-# min_lr=1.0e-6
-# init_std=0.008
-
-## GPT-3 175B
-# model_size=175
-# num_layers=96
-# hidden_size=12288
-# num_attn_heads=96
-# global_batch_size=1536
-# lr=0.6e-4
-# min_lr=1.0e-6
-# init_std=0.005
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens.
-train_tokens_in_billion=300
-train_tokens=$((${train_tokens_in_billion} * 1000000000))
-
-## train_samples is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the train_tokens
-## above, and data efficiency techniques may change num tokens in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by train_samples.
-train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
-
-## Another wall-clock time termination condition in minutes. Set it large
-## enough to avoid undesired early termination.
-exit_duration=30000000
-###############################################################################
-### lr configs
-## lr warmup and decay duration.
-## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
-## Here we increase the warmup tokens to 3B since when batch size warmup is not
-## used, there are more tokens per step. Thus we need to increase warmup tokens
-## to make sure there are enough warmup steps, which is important for training
-## stability.
-lr_warmup_tokens_in_million=3000
-lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
-## Here we changed the LR decay tokens to align with total train tokens, since
-## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
-## learning rate schedule to match the number of training tokens results in the
-## best final model quality 
-lr_decay_tokens_in_billion=${train_tokens_in_billion}
-lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
-lr_decay_style="cosine"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=2
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Note that currently both curriculum learning and random-LTD are NOT
-## compatible with pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO-based data parallelism, stage=0 will disable ZeRO
-zero_stage=0
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Reduce it manually if GPU OOM
-# batch_size=$(( ${global_batch_size} / ${dp_size} ))
-batch_size=2
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=10
-eval_interval=100
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
-# save_interval=$((${estimated_train_iter} / ${num_save}))
-save_interval=100
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-activation_checkpoint="true"
-# activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d_%H.%M.%S")
-host="${HOSTNAME}"
-seed=1234
-num_workers=0
-
-## Public the Pile dataset, can be downloaded at
-## https://mystic.the-eye.eu/public/AI/pile_neox/ or 
-## https://the-eye.eu/public/AI/pile_neox/ Change data_home to where you
-## store the pile_text_document.bin and pile_text_document.idx.
-data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing"
-data_path="${data_home}/pile_text_document"
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-prescale_grad="true"
-jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
-jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
-jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
-if [[ $zero_stage -gt 0 ]]; then
-    jobname="${jobname}_z${zero_stage}"
-    prescale_grad="false"
-fi
-if [[ $mp_size -gt 1 ]]; then
-    jobname="${jobname}_mp${mp_size}"
-fi
-if [ "${no_pp}" = "false" ]; then
-    jobname="${jobname}_pp${pp_size}"
-fi
-jobname="${jobname}_seed${seed}_rebase_megatron_checkpointing"
-
-username=$(whoami)
-output_home="/blob/users/${username}/project/data_efficient_gpt"
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-## Microsoft internal constraint: because tensorboard is logged by last rank,
-## it's better to put the path in NFS instead of Blob.
-tensorboard_dir="/vc_data/users/${username}/project/data_efficient_gpt/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-## If CL is used, make sure to set "--split" the same as what you used during
-## offline data analysis&indexing.
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --tensor-model-parallel-size ${mp_size} \
-    --init-method-std ${init_std} \
-    --lr-decay-tokens ${lr_decay_tokens} \
-    --lr-warmup-tokens ${lr_warmup_tokens} \
-    --micro-batch-size ${batch_size} \
-    --exit-duration-in-mins ${exit_duration} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-tokens ${train_tokens} \
-    --train-samples ${train_samples} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 0.1 \
-    --clip-grad 1.0 \
-    --hysteresis 2 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --seed ${seed} \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --no-async-tensor-model-parallel-allreduce \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-# test megatron activation checkpointing
-# we fixed bug in the code of this activation checkpointing, i.e., --recompute-granularity full --recompute-method uniform
-# the two arguments can be found in megatron/arguments.py
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --recompute-granularity full \
-    --recompute-method uniform \
-    --recompute-num-layers 1"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}.json"
-template_json="ds_config_gpt_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-      > ${config_json}
-
-deepspeed_options=" \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-# disable the deepspeed activation checkpointing
-
-# if [ "${activation_checkpoint}" = "true" ]; then
-# deepspeed_options="${deepspeed_options} \
-#     --deepspeed-activation-checkpointing"
-# fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_1.3B_rope.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_1.3B_rope.sh
deleted file mode 100644
index a3d6918ef1e8f3d8982ad837a57352313e45a1f1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_1.3B_rope.sh
+++ /dev/null
@@ -1,334 +0,0 @@
-#!/bin/bash
-dir=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-seq_len=2048
-
-## The "GPT-3 XXX" below are configs from GPT-3 paper
-## https://arxiv.org/abs/2005.14165, choose based on
-## your desired model size or build your own configs
-
-## init_std is standard deviation for weight initialization. Usually larger
-## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size)
-## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-
-## We changed min_lr to a lower number (1.0e-6), which we found is able to
-## provide better zero-shot eval results.
-
-## GPT-3 Small 125M
-# model_size=0.125
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=256
-# lr=6.0e-4
-# min_lr=1.0e-6
-# init_std=0.02
-
-## GPT-3 Medium 350M
-# model_size=0.35
-# num_layers=24
-# hidden_size=1024
-# num_attn_heads=16
-# global_batch_size=256
-# lr=3.0e-4
-# min_lr=1.0e-6
-# init_std=0.018
-
-## GPT-3 Large 760M
-# model_size=0.76
-# num_layers=24
-# hidden_size=1536
-# num_attn_heads=16
-# global_batch_size=256
-# lr=2.5e-4
-# min_lr=1.0e-6
-# init_std=0.015
-
-## GPT-3 XL 1.3B
-model_size=1.3
-num_layers=24
-hidden_size=2048
-num_attn_heads=16
-global_batch_size=512
-lr=2.0e-4
-min_lr=1.0e-6
-init_std=0.013
-
-## GPT-3 2.7B
-# model_size=2.7
-# num_layers=32
-# hidden_size=2560
-# num_attn_heads=32
-# global_batch_size=512
-# lr=1.6e-4
-# min_lr=1.0e-6
-# init_std=0.011
-
-## GPT-3 6.7B
-# model_size=6.7
-# num_layers=32
-# hidden_size=4096
-# num_attn_heads=32
-# global_batch_size=1024
-# lr=1.2e-4
-# min_lr=1.0e-6
-# init_std=0.009
-
-## GPT-3 13B
-# model_size=13
-# num_layers=40
-# hidden_size=5120
-# num_attn_heads=40
-# global_batch_size=1024
-# lr=1.0e-4
-# min_lr=1.0e-6
-# init_std=0.008
-
-## GPT-3 175B
-# model_size=175
-# num_layers=96
-# hidden_size=12288
-# num_attn_heads=96
-# global_batch_size=1536
-# lr=0.6e-4
-# min_lr=1.0e-6
-# init_std=0.005
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens.
-train_tokens_in_billion=300
-train_tokens=$((${train_tokens_in_billion} * 1000000000))
-
-## train_samples is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the train_tokens
-## above, and data efficiency techniques may change num tokens in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by train_samples.
-train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
-
-## Another wall-clock time termination condition in minutes. Set it large
-## enough to avoid undesired early termination.
-exit_duration=30000000
-###############################################################################
-### lr configs
-## lr warmup and decay duration.
-## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
-## Here we increase the warmup tokens to 3B since when batch size warmup is not
-## used, there are more tokens per step. Thus we need to increase warmup tokens
-## to make sure there are enough warmup steps, which is important for training
-## stability.
-lr_warmup_tokens_in_million=3000
-lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
-## Here we changed the LR decay tokens to align with total train tokens, since
-## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
-## learning rate schedule to match the number of training tokens results in the
-## best final model quality 
-lr_decay_tokens_in_billion=${train_tokens_in_billion}
-lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
-lr_decay_style="cosine"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=4
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Note that currently both curriculum learning and random-LTD are NOT
-## compatible with pipeline parallelism.
-pp_size=8
-no_pp="false"
-
-## ZeRO-based data parallelism, stage=0 will disable ZeRO
-zero_stage=1
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Reduce it manually if GPU OOM
-# batch_size=$(( ${global_batch_size} / ${dp_size} ))
-batch_size=2
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=10
-eval_interval=100
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
-# save_interval=$((${estimated_train_iter} / ${num_save}))
-save_interval=100
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-activation_checkpoint="true"
-# activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d_%H.%M.%S")
-host="${HOSTNAME}"
-seed=1234
-num_workers=0
-
-## Public the Pile dataset, can be downloaded at
-## https://mystic.the-eye.eu/public/AI/pile_neox/ or 
-## https://the-eye.eu/public/AI/pile_neox/ Change data_home to where you
-## store the pile_text_document.bin and pile_text_document.idx.
-data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing"
-data_path="${data_home}/pile_text_document"
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-prescale_grad="true"
-jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
-jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
-jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
-if [[ $zero_stage -gt 0 ]]; then
-    jobname="${jobname}_z${zero_stage}"
-    prescale_grad="false"
-fi
-if [[ $mp_size -gt 1 ]]; then
-    jobname="${jobname}_mp${mp_size}"
-fi
-if [ "${no_pp}" = "false" ]; then
-    jobname="${jobname}_pp${pp_size}"
-fi
-jobname="${jobname}_seed${seed}_rebase_rope0.25"
-
-username=$(whoami)
-output_home="/blob/users/${username}/project/data_efficient_gpt"
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-## Microsoft internal constraint: because tensorboard is logged by last rank,
-## it's better to put the path in NFS instead of Blob.
-tensorboard_dir="/vc_data/users/${username}/project/data_efficient_gpt/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-## If CL is used, make sure to set "--split" the same as what you used during
-## offline data analysis&indexing.
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --tensor-model-parallel-size ${mp_size} \
-    --init-method-std ${init_std} \
-    --lr-decay-tokens ${lr_decay_tokens} \
-    --lr-warmup-tokens ${lr_warmup_tokens} \
-    --micro-batch-size ${batch_size} \
-    --exit-duration-in-mins ${exit_duration} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-tokens ${train_tokens} \
-    --train-samples ${train_samples} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 0.1 \
-    --clip-grad 1.0 \
-    --hysteresis 2 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --seed ${seed} \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --no-async-tensor-model-parallel-allreduce \
-    --use-rotary-position-embeddings \
-    --rotary-percent 0.25 \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --checkpoint-activations"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}.json"
-template_json="ds_config_gpt_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-      > ${config_json}
-
-deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-    --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_1.3B_rope_slw.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_1.3B_rope_slw.sh
deleted file mode 100644
index 209021a39273fcdd2e421da4e694ffed53de5c72..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_1.3B_rope_slw.sh
+++ /dev/null
@@ -1,347 +0,0 @@
-#!/bin/bash
-dir=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-seq_len=2048
-
-## The "GPT-3 XXX" below are configs from GPT-3 paper
-## https://arxiv.org/abs/2005.14165, choose based on
-## your desired model size or build your own configs
-
-## init_std is standard deviation for weight initialization. Usually larger
-## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size)
-## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-
-## We changed min_lr to a lower number (1.0e-6), which we found is able to
-## provide better zero-shot eval results.
-
-## GPT-3 Small 125M
-# model_size=0.125
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=256
-# lr=6.0e-4
-# min_lr=1.0e-6
-# init_std=0.02
-
-## GPT-3 Medium 350M
-# model_size=0.35
-# num_layers=24
-# hidden_size=1024
-# num_attn_heads=16
-# global_batch_size=256
-# lr=3.0e-4
-# min_lr=1.0e-6
-# init_std=0.018
-
-## GPT-3 Large 760M
-# model_size=0.76
-# num_layers=24
-# hidden_size=1536
-# num_attn_heads=16
-# global_batch_size=256
-# lr=2.5e-4
-# min_lr=1.0e-6
-# init_std=0.015
-
-## GPT-3 XL 1.3B
-model_size=1.3
-num_layers=24
-hidden_size=2048
-num_attn_heads=16
-global_batch_size=512
-lr=2.0e-4
-min_lr=1.0e-6
-init_std=0.013
-
-## GPT-3 2.7B
-# model_size=2.7
-# num_layers=32
-# hidden_size=2560
-# num_attn_heads=32
-# global_batch_size=512
-# lr=1.6e-4
-# min_lr=1.0e-6
-# init_std=0.011
-
-## GPT-3 6.7B
-# model_size=6.7
-# num_layers=32
-# hidden_size=4096
-# num_attn_heads=32
-# global_batch_size=1024
-# lr=1.2e-4
-# min_lr=1.0e-6
-# init_std=0.009
-
-## GPT-3 13B
-# model_size=13
-# num_layers=40
-# hidden_size=5120
-# num_attn_heads=40
-# global_batch_size=1024
-# lr=1.0e-4
-# min_lr=1.0e-6
-# init_std=0.008
-
-## GPT-3 175B
-# model_size=175
-# num_layers=96
-# hidden_size=12288
-# num_attn_heads=96
-# global_batch_size=1536
-# lr=0.6e-4
-# min_lr=1.0e-6
-# init_std=0.005
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens.
-train_tokens_in_billion=300
-train_tokens=$((${train_tokens_in_billion} * 1000000000))
-
-## train_samples is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the train_tokens
-## above, and data efficiency techniques may change num tokens in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by train_samples.
-train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
-
-## Another wall-clock time termination condition in minutes. Set it large
-## enough to avoid undesired early termination.
-exit_duration=30000000
-###############################################################################
-### lr configs
-## lr warmup and decay duration.
-## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
-## Here we increase the warmup tokens to 3B since when batch size warmup is not
-## used, there are more tokens per step. Thus we need to increase warmup tokens
-## to make sure there are enough warmup steps, which is important for training
-## stability.
-lr_warmup_tokens_in_million=3000
-lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
-## Here we changed the LR decay tokens to align with total train tokens, since
-## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
-## learning rate schedule to match the number of training tokens results in the
-## best final model quality 
-lr_decay_tokens_in_billion=${train_tokens_in_billion}
-lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
-lr_decay_style="cosine"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=4
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Note that currently both curriculum learning and random-LTD are NOT
-## compatible with pipeline parallelism.
-pp_size=8
-no_pp="false"
-
-## ZeRO-based data parallelism, stage=0 will disable ZeRO
-zero_stage=1
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Reduce it manually if GPU OOM
-# batch_size=$(( ${global_batch_size} / ${dp_size} ))
-batch_size=2
-###############################################################################
-### curriculum learning (sequence length warmup) configs
-# The "divided by 3" means we use 1/3 of baseline's total steps for sequence length warmup.
-# This is not always the best config, but usually a reasonable choice to start with.
-cl_step=$(( ${lr_warmup_tokens} / 3 / ${global_batch_size} / ${seq_len} ))
-# Starting sequence length during sequence length warmup. If the train/validation loss is
-# unstable at the beginning of training, need to increase this but also need to keep as multiples
-# of 8 in order to enable Tensor Core acceleration.
-cl_min=64
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=10
-eval_interval=100
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
-# save_interval=$((${estimated_train_iter} / ${num_save}))
-save_interval=100
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-activation_checkpoint="true"
-# activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d_%H.%M.%S")
-host="${HOSTNAME}"
-seed=1234
-num_workers=0
-
-## Public the Pile dataset, can be downloaded at
-## https://mystic.the-eye.eu/public/AI/pile_neox/ or 
-## https://the-eye.eu/public/AI/pile_neox/ Change data_home to where you
-## store the pile_text_document.bin and pile_text_document.idx.
-data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing"
-data_path="${data_home}/pile_text_document"
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-prescale_grad="true"
-jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
-jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
-jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
-if [[ $zero_stage -gt 0 ]]; then
-    jobname="${jobname}_z${zero_stage}"
-    prescale_grad="false"
-fi
-if [[ $mp_size -gt 1 ]]; then
-    jobname="${jobname}_mp${mp_size}"
-fi
-if [ "${no_pp}" = "false" ]; then
-    jobname="${jobname}_pp${pp_size}"
-fi
-jobname="${jobname}_seed${seed}_rebase_rope0.25"
-jobname="${jobname}_cl_step${cl_step}_cl_min${cl_min}"
-
-username=$(whoami)
-output_home="/blob/users/${username}/project/data_efficient_gpt"
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-## Microsoft internal constraint: because tensorboard is logged by last rank,
-## it's better to put the path in NFS instead of Blob.
-tensorboard_dir="/vc_data/users/${username}/project/data_efficient_gpt/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-## If CL is used, make sure to set "--split" the same as what you used during
-## offline data analysis&indexing.
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --tensor-model-parallel-size ${mp_size} \
-    --init-method-std ${init_std} \
-    --lr-decay-tokens ${lr_decay_tokens} \
-    --lr-warmup-tokens ${lr_warmup_tokens} \
-    --micro-batch-size ${batch_size} \
-    --exit-duration-in-mins ${exit_duration} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-tokens ${train_tokens} \
-    --train-samples ${train_samples} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 0.1 \
-    --clip-grad 1.0 \
-    --hysteresis 2 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --seed ${seed} \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --no-async-tensor-model-parallel-allreduce \
-    --use-rotary-position-embeddings \
-    --rotary-percent 0.25 \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --checkpoint-activations"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}_cl_step${cl_step}_cl_min${cl_min}.json"
-template_json="ds_config_gpt_slw_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-    | sed "s/CONFIG_CL_MIN/${cl_min}/" \
-    | sed "s/CONFIG_CL_MAX/${seq_len}/" \
-    | sed "s/CONFIG_CL_DURATION/${cl_step}/" \
-      > ${config_json}
-
-deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-    --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_125M.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_125M.sh
deleted file mode 100644
index 8235b6c1aeeee408f552b5e7e041d85f6e721ac2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_125M.sh
+++ /dev/null
@@ -1,331 +0,0 @@
-#!/bin/bash
-dir=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-seq_len=2048
-
-## The "GPT-3 XXX" below are configs from GPT-3 paper
-## https://arxiv.org/abs/2005.14165, choose based on
-## your desired model size or build your own configs
-
-## init_std is standard deviation for weight initialization. Usually larger
-## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size)
-## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-
-## We changed min_lr to a lower number (1.0e-6), which we found is able to
-## provide better zero-shot eval results.
-
-## GPT-3 Small 125M
-model_size=0.125
-num_layers=12
-hidden_size=768
-num_attn_heads=12
-global_batch_size=256
-lr=6.0e-4
-min_lr=1.0e-6
-init_std=0.02
-
-## GPT-3 Medium 350M
-# model_size=0.35
-# num_layers=24
-# hidden_size=1024
-# num_attn_heads=16
-# global_batch_size=256
-# lr=3.0e-4
-# min_lr=1.0e-6
-# init_std=0.018
-
-## GPT-3 Large 760M
-# model_size=0.76
-# num_layers=24
-# hidden_size=1536
-# num_attn_heads=16
-# global_batch_size=256
-# lr=2.5e-4
-# min_lr=1.0e-6
-# init_std=0.015
-
-## GPT-3 XL 1.3B
-# model_size=1.3
-# num_layers=24
-# hidden_size=2048
-# num_attn_heads=16
-# global_batch_size=512
-# lr=2.0e-4
-# min_lr=1.0e-6
-# init_std=0.013
-
-## GPT-3 2.7B
-# model_size=2.7
-# num_layers=32
-# hidden_size=2560
-# num_attn_heads=32
-# global_batch_size=512
-# lr=1.6e-4
-# min_lr=1.0e-6
-# init_std=0.011
-
-## GPT-3 6.7B
-# model_size=6.7
-# num_layers=32
-# hidden_size=4096
-# num_attn_heads=32
-# global_batch_size=1024
-# lr=1.2e-4
-# min_lr=1.0e-6
-# init_std=0.009
-
-## GPT-3 13B
-# model_size=13
-# num_layers=40
-# hidden_size=5120
-# num_attn_heads=40
-# global_batch_size=1024
-# lr=1.0e-4
-# min_lr=1.0e-6
-# init_std=0.008
-
-## GPT-3 175B
-# model_size=175
-# num_layers=96
-# hidden_size=12288
-# num_attn_heads=96
-# global_batch_size=1536
-# lr=0.6e-4
-# min_lr=1.0e-6
-# init_std=0.005
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens.
-train_tokens_in_billion=300
-train_tokens=$((${train_tokens_in_billion} * 1000000000))
-
-## train_samples is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the train_tokens
-## above, and data efficiency techniques may change num tokens in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by train_samples.
-train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
-
-## Another wall-clock time termination condition in minutes. Set it large
-## enough to avoid undesired early termination.
-exit_duration=30000000
-###############################################################################
-### lr configs
-## lr warmup and decay duration.
-## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
-## Here we increase the warmup tokens to 3B since when batch size warmup is not
-## used, there are more tokens per step. Thus we need to increase warmup tokens
-## to make sure there are enough warmup steps, which is important for training
-## stability.
-lr_warmup_tokens_in_million=3000
-lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
-## Here we changed the LR decay tokens to align with total train tokens, since
-## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
-## learning rate schedule to match the number of training tokens results in the
-## best final model quality 
-lr_decay_tokens_in_billion=${train_tokens_in_billion}
-lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
-lr_decay_style="cosine"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=2
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Note that currently both curriculum learning and random-LTD are NOT
-## compatible with pipeline parallelism.
-pp_size=2
-no_pp="false"
-
-## ZeRO-based data parallelism, stage=0 will disable ZeRO
-zero_stage=1
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Reduce it manually if GPU OOM
-# batch_size=$(( ${global_batch_size} / ${dp_size} ))
-batch_size=2
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=10
-eval_interval=100
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
-# save_interval=$((${estimated_train_iter} / ${num_save}))
-save_interval=100
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-activation_checkpoint="true"
-# activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d_%H.%M.%S")
-host="${HOSTNAME}"
-seed=1234
-num_workers=0
-
-data_path="BookCorpusDataset_text_document"
-if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
-fi
-if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
-fi
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-prescale_grad="true"
-jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
-jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
-jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
-if [[ $zero_stage -gt 0 ]]; then
-    jobname="${jobname}_z${zero_stage}"
-    prescale_grad="false"
-fi
-if [[ $mp_size -gt 1 ]]; then
-    jobname="${jobname}_mp${mp_size}"
-fi
-if [ "${no_pp}" = "false" ]; then
-    jobname="${jobname}_pp${pp_size}"
-fi
-jobname="${jobname}_seed${seed}_rebase"
-
-username=$(whoami)
-output_home="output"
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-tensorboard_dir="${output_home}/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-## If CL is used, make sure to set "--split" the same as what you used during
-## offline data analysis&indexing.
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --tensor-model-parallel-size ${mp_size} \
-    --init-method-std ${init_std} \
-    --lr-decay-tokens ${lr_decay_tokens} \
-    --lr-warmup-tokens ${lr_warmup_tokens} \
-    --micro-batch-size ${batch_size} \
-    --exit-duration-in-mins ${exit_duration} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-tokens ${train_tokens} \
-    --train-samples ${train_samples} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 0.1 \
-    --clip-grad 1.0 \
-    --hysteresis 2 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --seed ${seed} \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --no-async-tensor-model-parallel-allreduce \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --checkpoint-activations"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}.json"
-template_json="ds_config_gpt_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-      > ${config_json}
-
-deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-    --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_125M_flashattn.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_125M_flashattn.sh
deleted file mode 100644
index 3a26aab262f3cf0f348bbb08a1f7fe3ffdd4a4c5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_125M_flashattn.sh
+++ /dev/null
@@ -1,332 +0,0 @@
-#!/bin/bash
-dir=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-seq_len=2048
-
-## The "GPT-3 XXX" below are configs from GPT-3 paper
-## https://arxiv.org/abs/2005.14165, choose based on
-## your desired model size or build your own configs
-
-## init_std is standard deviation for weight initialization. Usually larger
-## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size)
-## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-
-## We changed min_lr to a lower number (1.0e-6), which we found is able to
-## provide better zero-shot eval results.
-
-## GPT-3 Small 125M
-model_size=0.125
-num_layers=12
-hidden_size=768
-num_attn_heads=12
-global_batch_size=256
-lr=6.0e-4
-min_lr=1.0e-6
-init_std=0.02
-
-## GPT-3 Medium 350M
-# model_size=0.35
-# num_layers=24
-# hidden_size=1024
-# num_attn_heads=16
-# global_batch_size=256
-# lr=3.0e-4
-# min_lr=1.0e-6
-# init_std=0.018
-
-## GPT-3 Large 760M
-# model_size=0.76
-# num_layers=24
-# hidden_size=1536
-# num_attn_heads=16
-# global_batch_size=256
-# lr=2.5e-4
-# min_lr=1.0e-6
-# init_std=0.015
-
-## GPT-3 XL 1.3B
-# model_size=1.3
-# num_layers=24
-# hidden_size=2048
-# num_attn_heads=16
-# global_batch_size=512
-# lr=2.0e-4
-# min_lr=1.0e-6
-# init_std=0.013
-
-## GPT-3 2.7B
-# model_size=2.7
-# num_layers=32
-# hidden_size=2560
-# num_attn_heads=32
-# global_batch_size=512
-# lr=1.6e-4
-# min_lr=1.0e-6
-# init_std=0.011
-
-## GPT-3 6.7B
-# model_size=6.7
-# num_layers=32
-# hidden_size=4096
-# num_attn_heads=32
-# global_batch_size=1024
-# lr=1.2e-4
-# min_lr=1.0e-6
-# init_std=0.009
-
-## GPT-3 13B
-# model_size=13
-# num_layers=40
-# hidden_size=5120
-# num_attn_heads=40
-# global_batch_size=1024
-# lr=1.0e-4
-# min_lr=1.0e-6
-# init_std=0.008
-
-## GPT-3 175B
-# model_size=175
-# num_layers=96
-# hidden_size=12288
-# num_attn_heads=96
-# global_batch_size=1536
-# lr=0.6e-4
-# min_lr=1.0e-6
-# init_std=0.005
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens.
-train_tokens_in_billion=300
-train_tokens=$((${train_tokens_in_billion} * 1000000000))
-
-## train_samples is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the train_tokens
-## above, and data efficiency techniques may change num tokens in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by train_samples.
-train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
-
-## Another wall-clock time termination condition in minutes. Set it large
-## enough to avoid undesired early termination.
-exit_duration=30000000
-###############################################################################
-### lr configs
-## lr warmup and decay duration.
-## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
-## Here we increase the warmup tokens to 3B since when batch size warmup is not
-## used, there are more tokens per step. Thus we need to increase warmup tokens
-## to make sure there are enough warmup steps, which is important for training
-## stability.
-lr_warmup_tokens_in_million=3000
-lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
-## Here we changed the LR decay tokens to align with total train tokens, since
-## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
-## learning rate schedule to match the number of training tokens results in the
-## best final model quality 
-lr_decay_tokens_in_billion=${train_tokens_in_billion}
-lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
-lr_decay_style="cosine"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=2
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Note that currently both curriculum learning and random-LTD are NOT
-## compatible with pipeline parallelism.
-pp_size=2
-no_pp="false"
-
-## ZeRO-based data parallelism, stage=0 will disable ZeRO
-zero_stage=1
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Reduce it manually if GPU OOM
-# batch_size=$(( ${global_batch_size} / ${dp_size} ))
-batch_size=2
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=10
-eval_interval=100
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
-# save_interval=$((${estimated_train_iter} / ${num_save}))
-save_interval=100
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-activation_checkpoint="true"
-# activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d_%H.%M.%S")
-host="${HOSTNAME}"
-seed=1234
-num_workers=0
-
-data_path="BookCorpusDataset_text_document"
-if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
-fi
-if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
-fi
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-prescale_grad="true"
-jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
-jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
-jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
-if [[ $zero_stage -gt 0 ]]; then
-    jobname="${jobname}_z${zero_stage}"
-    prescale_grad="false"
-fi
-if [[ $mp_size -gt 1 ]]; then
-    jobname="${jobname}_mp${mp_size}"
-fi
-if [ "${no_pp}" = "false" ]; then
-    jobname="${jobname}_pp${pp_size}"
-fi
-jobname="${jobname}_seed${seed}_rebase"
-
-username=$(whoami)
-output_home="output"
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-tensorboard_dir="${output_home}/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-## If CL is used, make sure to set "--split" the same as what you used during
-## offline data analysis&indexing.
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --tensor-model-parallel-size ${mp_size} \
-    --init-method-std ${init_std} \
-    --lr-decay-tokens ${lr_decay_tokens} \
-    --lr-warmup-tokens ${lr_warmup_tokens} \
-    --micro-batch-size ${batch_size} \
-    --exit-duration-in-mins ${exit_duration} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-tokens ${train_tokens} \
-    --train-samples ${train_samples} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 0.1 \
-    --clip-grad 1.0 \
-    --hysteresis 2 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --seed ${seed} \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --no-async-tensor-model-parallel-allreduce \
-    --use-flash-attn \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --checkpoint-activations"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}.json"
-template_json="ds_config_gpt_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-      > ${config_json}
-
-deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-    --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_13B.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_13B.sh
deleted file mode 100644
index 931886b34d8d2402059e24d3f5f4ddaf99a00fcf..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/rebase/ds_pretrain_gpt_13B.sh
+++ /dev/null
@@ -1,332 +0,0 @@
-#!/bin/bash
-dir=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-seq_len=2048
-
-## The "GPT-3 XXX" below are configs from GPT-3 paper
-## https://arxiv.org/abs/2005.14165, choose based on
-## your desired model size or build your own configs
-
-## init_std is standard deviation for weight initialization. Usually larger
-## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size)
-## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-
-## We changed min_lr to a lower number (1.0e-6), which we found is able to
-## provide better zero-shot eval results.
-
-## GPT-3 Small 125M
-# model_size=0.125
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=256
-# lr=6.0e-4
-# min_lr=1.0e-6
-# init_std=0.02
-
-## GPT-3 Medium 350M
-# model_size=0.35
-# num_layers=24
-# hidden_size=1024
-# num_attn_heads=16
-# global_batch_size=256
-# lr=3.0e-4
-# min_lr=1.0e-6
-# init_std=0.018
-
-## GPT-3 Large 760M
-# model_size=0.76
-# num_layers=24
-# hidden_size=1536
-# num_attn_heads=16
-# global_batch_size=256
-# lr=2.5e-4
-# min_lr=1.0e-6
-# init_std=0.015
-
-## GPT-3 XL 1.3B
-# model_size=1.3
-# num_layers=24
-# hidden_size=2048
-# num_attn_heads=16
-# global_batch_size=512
-# lr=2.0e-4
-# min_lr=1.0e-6
-# init_std=0.013
-
-## GPT-3 2.7B
-# model_size=2.7
-# num_layers=32
-# hidden_size=2560
-# num_attn_heads=32
-# global_batch_size=512
-# lr=1.6e-4
-# min_lr=1.0e-6
-# init_std=0.011
-
-## GPT-3 6.7B
-# model_size=6.7
-# num_layers=32
-# hidden_size=4096
-# num_attn_heads=32
-# global_batch_size=1024
-# lr=1.2e-4
-# min_lr=1.0e-6
-# init_std=0.009
-
-## GPT-3 13B
-model_size=13
-num_layers=40
-hidden_size=5120
-num_attn_heads=40
-global_batch_size=1024
-lr=1.0e-4
-min_lr=1.0e-6
-init_std=0.008
-
-## GPT-3 175B
-# model_size=175
-# num_layers=96
-# hidden_size=12288
-# num_attn_heads=96
-# global_batch_size=1536
-# lr=0.6e-4
-# min_lr=1.0e-6
-# init_std=0.005
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens.
-train_tokens_in_billion=300
-train_tokens=$((${train_tokens_in_billion} * 1000000000))
-
-## train_samples is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the train_tokens
-## above, and data efficiency techniques may change num tokens in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by train_samples.
-train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
-
-## Another wall-clock time termination condition in minutes. Set it large
-## enough to avoid undesired early termination.
-exit_duration=30000000
-###############################################################################
-### lr configs
-## lr warmup and decay duration.
-## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
-## Here we increase the warmup tokens to 3B since when batch size warmup is not
-## used, there are more tokens per step. Thus we need to increase warmup tokens
-## to make sure there are enough warmup steps, which is important for training
-## stability.
-lr_warmup_tokens_in_million=3000
-lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
-## Here we changed the LR decay tokens to align with total train tokens, since
-## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
-## learning rate schedule to match the number of training tokens results in the
-## best final model quality 
-lr_decay_tokens_in_billion=${train_tokens_in_billion}
-lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
-lr_decay_style="cosine"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-mp_size=4
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Note that currently both curriculum learning and random-LTD are NOT
-## compatible with pipeline parallelism.
-pp_size=8
-no_pp="false"
-
-## ZeRO-based data parallelism, stage=0 will disable ZeRO
-zero_stage=1
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Reduce it manually if GPU OOM
-# batch_size=$(( ${global_batch_size} / ${dp_size} ))
-batch_size=2
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=10
-eval_interval=100
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
-# save_interval=$((${estimated_train_iter} / ${num_save}))
-save_interval=100
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-activation_checkpoint="true"
-# activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d_%H.%M.%S")
-host="${HOSTNAME}"
-seed=1234
-num_workers=0
-
-## Public the Pile dataset, can be downloaded at
-## https://mystic.the-eye.eu/public/AI/pile_neox/ or 
-## https://the-eye.eu/public/AI/pile_neox/ Change data_home to where you
-## store the pile_text_document.bin and pile_text_document.idx.
-data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing"
-data_path="${data_home}/pile_text_document"
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-prescale_grad="true"
-jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
-jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
-jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
-if [[ $zero_stage -gt 0 ]]; then
-    jobname="${jobname}_z${zero_stage}"
-    prescale_grad="false"
-fi
-if [[ $mp_size -gt 1 ]]; then
-    jobname="${jobname}_mp${mp_size}"
-fi
-if [ "${no_pp}" = "false" ]; then
-    jobname="${jobname}_pp${pp_size}"
-fi
-jobname="${jobname}_seed${seed}_rebase"
-
-username=$(whoami)
-output_home="/blob/users/${username}/project/data_efficient_gpt"
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-## Microsoft internal constraint: because tensorboard is logged by last rank,
-## it's better to put the path in NFS instead of Blob.
-tensorboard_dir="/vc_data/users/${username}/project/data_efficient_gpt/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-## If CL is used, make sure to set "--split" the same as what you used during
-## offline data analysis&indexing.
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --tensor-model-parallel-size ${mp_size} \
-    --init-method-std ${init_std} \
-    --lr-decay-tokens ${lr_decay_tokens} \
-    --lr-warmup-tokens ${lr_warmup_tokens} \
-    --micro-batch-size ${batch_size} \
-    --exit-duration-in-mins ${exit_duration} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-tokens ${train_tokens} \
-    --train-samples ${train_samples} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 0.1 \
-    --clip-grad 1.0 \
-    --hysteresis 2 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --seed ${seed} \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --no-async-tensor-model-parallel-allreduce \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --checkpoint-activations"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}.json"
-template_json="ds_config_gpt_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-      > ${config_json}
-
-deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-    --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/run_deepspeed_example.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/run_deepspeed_example.sh
deleted file mode 100644
index 909cdf671387090e40097c9ace8b606fc9f5a948..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/run_deepspeed_example.sh
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/bin/bash
-set -ex
-
-BASE_PATH=/vc_data/Megatron-LM/data
-DATA_PATH=${BASE_PATH}/indexed_datasets/megatron
-DS_CONFIG=ds_config.json
-
-TP=1
-PP=1
-NLAYERS=24
-HIDDEN=512
-
-GLOBAL_BATCH=64
-MICRO_BATCH=4
-
-ZERO_STAGE=2
-
-OUTPUT_DIR=ds_z${ZERO_STAGE}_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH}
-#OUTPUT_DIR=baseline_nl${NLAYERS}_hs${HIDDEN}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH}
-mkdir -p $OUTPUT_DIR
-
-cat <<EOT > $DS_CONFIG
-{
-  "train_batch_size" : $GLOBAL_BATCH,
-  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
-  "steps_per_print": 1,
-
-  "zero_optimization": {
-    "stage": $ZERO_STAGE
-  },
-
-  "fp16": {
-    "enabled": true,
-    "initial_scale_power": 12
-  },
-
-  "wall_clock_breakdown" : true
-}
-EOT
-
-export NCCL_DEBUG=warn 
-
-ds_args=""
-ds_args=" --deepspeed ${ds_args}"
-ds_args=" --no-pipeline-parallel ${ds_args}" 
-ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
-ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
-ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
-
-
-deepspeed pretrain_gpt.py \
-    --tensor-model-parallel-size $TP \
-    --pipeline-model-parallel-size $PP \
-    --num-layers $NLAYERS \
-    --hidden-size $HIDDEN \
-    --num-attention-heads 16 \
-    --seq-length 256 \
-    --loss-scale 12 \
-    --max-position-embeddings 1024 \
-    --micro-batch-size 4 \
-    --global-batch-size 1024 \
-    --train-iters 1000 \
-    --lr 6.0e-5 \
-    --min-lr 6.0e-6 \
-    --lr-decay-style cosine \
-    --log-interval 1 \
-    --eval-iters 40 \
-    --eval-interval 1000 \
-    --data-path $DATA_PATH \
-    --vocab-file $BASE_PATH/gpt2-vocab.json \
-    --merge-file $BASE_PATH/gpt2-merges.txt \
-    --save-interval 1000 \
-    --split 98,2,0 \
-    --clip-grad 1.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --init-method-std 0.006 \
-    --fp16 \
-    --checkpoint-activations \
-    --tensorboard-dir $OUTPUT_DIR \
-    $ds_args \
-    --exit-interval 5000 | tee ${OUTPUT_DIR}/output.log
-
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/sequence_parallel/README.md b/toolbox/Megatron-DeepSpeed/examples_deepspeed/sequence_parallel/README.md
deleted file mode 100644
index 96e0ef8a87432a96a91f369a0883069879323dd8..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/sequence_parallel/README.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Sequence Parallelism
-
-This folder contains examples that demonstrate how to use DeepSpeed's sequence parallelism.
-
-## Setting Up the Environment for FlashAttention
-
-DeepSpeed's sequence parallelism can be combined with the following types of attention.
-
-- Classic attention
-- FlashAttention (enabled by `--use-flash-attn`)
-- FlashAttention + Triton (enabled by `--use-flash-attn-triton`)
-
-For the best performance, we recommend using FlashAttention + Triton. Here are the installation steps and the versions we have tested. Note that FlashAttention is compatible only with Turing, Ampere, Ada, or Hopper GPUs.
-
-```shell
-# install triton
-git clone -b legacy-backend https://github.com/openai/triton
-cd triton/python/
-pip install cmake
-pip install .
-
-# install
-cd ${WORK_DIR}
-git clone -b v1.0.4 https://github.com/HazyResearch/flash-attention
-cd flash-attention
-python setup.py install
-```
-
-## Enabling Sequence Parallelism
-
-To enable sequence parallelism, set the degree of parallelism using the `--ds-sequence-parallel-size` argument. Ensure that the number of attention heads is divisible by this value.
-Ensure your model configuration is compliant with FlashAttention's requirements. For instance, to achieve optimal performance, the head size should be divisible by 8. Refer to the document of [FlashAttention](https://github.com/Dao-AILab/flash-attention/tree/v1.0.4) for more details.
-
-Some working examples ([GPT1.3B](ds_pretrain_gpt_1.3B_seq_parallel_32k.sh), [GPT30B](ds_pretrain_gpt_30B_seq_parallel_32k.sh)), that enable sequence parallelism, are available in this foloder.
-
-Please note that our sequence parallelism feature is currently incompatible with Megatron-LM's tensor or pipeline parallelism.
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/sequence_parallel/ds_config_gpt_TEMPLATE.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/sequence_parallel/ds_config_gpt_TEMPLATE.json
deleted file mode 100644
index 3526aae85f0465ff7ec017f70b3e145d651da2f2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/sequence_parallel/ds_config_gpt_TEMPLATE.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "train_batch_size": GBSIZE,
-  "train_micro_batch_size_per_gpu": MBSIZE,
-  "steps_per_print": LOG_INTERVAL,
-
-  "zero_optimization": {
-    "stage": ZERO_STAGE
-  },
-
-  "gradient_clipping": 1.0,
-  "prescale_gradients": PRESCALE_GRAD,
-
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 11
-  },
-
-  "wall_clock_breakdown" : false
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/sequence_parallel/ds_pretrain_gpt_1.3B_seq_parallel_32k.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/sequence_parallel/ds_pretrain_gpt_1.3B_seq_parallel_32k.sh
deleted file mode 100644
index da028dc731433d817a583f37bac64bbb108e81e5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/sequence_parallel/ds_pretrain_gpt_1.3B_seq_parallel_32k.sh
+++ /dev/null
@@ -1,341 +0,0 @@
-#!/bin/bash
-dir=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-seq_len=32768
-
-## The "GPT-3 XXX" below are configs from GPT-3 paper
-## https://arxiv.org/abs/2005.14165, choose based on
-## your desired model size or build your own configs
-
-## init_std is standard deviation for weight initialization. Usually larger
-## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size)
-## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-
-## We changed min_lr to a lower number (1.0e-6), which we found is able to
-## provide better zero-shot eval results.
-
-## GPT-3 Small 125M
-# model_size=0.125
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=256
-# lr=6.0e-4
-# min_lr=1.0e-6
-# init_std=0.02
-
-## GPT-3 Medium 350M
-# model_size=0.35
-# num_layers=24
-# hidden_size=1024
-# num_attn_heads=16
-# global_batch_size=256
-# lr=3.0e-4
-# min_lr=1.0e-6
-# init_std=0.018
-
-## GPT-3 Large 760M
-# model_size=0.76
-# num_layers=24
-# hidden_size=1536
-# num_attn_heads=16
-# global_batch_size=256
-# lr=2.5e-4
-# min_lr=1.0e-6
-# init_std=0.015
-
-## GPT-3 XL 1.3B
-model_size=1.3
-num_layers=24
-hidden_size=2048
-num_attn_heads=16
-global_batch_size=2
-lr=2.0e-4
-min_lr=1.0e-6
-init_std=0.013
-
-## GPT-3 2.7B
-# model_size=2.7
-# num_layers=32
-# hidden_size=2560
-# num_attn_heads=32
-# global_batch_size=512
-# lr=1.6e-4
-# min_lr=1.0e-6
-# init_std=0.011
-
-## GPT-3 6.7B
-# model_size=6.7
-# num_layers=32
-# hidden_size=4096
-# num_attn_heads=32
-# global_batch_size=1024
-# lr=1.2e-4
-# min_lr=1.0e-6
-# init_std=0.009
-
-## GPT-3 13B
-# model_size=13
-# num_layers=40
-# hidden_size=5120
-# num_attn_heads=40
-# global_batch_size=1024
-# lr=1.0e-4
-# min_lr=1.0e-6
-# init_std=0.008
-
-## GPT-3 175B
-# model_size=175
-# num_layers=96
-# hidden_size=12288
-# num_attn_heads=96
-# global_batch_size=1536
-# lr=0.6e-4
-# min_lr=1.0e-6
-# init_std=0.005
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens.
-train_tokens_in_billion=300
-train_tokens=$((${train_tokens_in_billion} * 1000000000))
-
-## train_samples is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the train_tokens
-## above, and data efficiency techniques may change num tokens in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by train_samples.
-train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
-
-## Another wall-clock time termination condition in minutes. Set it large
-## enough to avoid undesired early termination.
-exit_duration=30000000
-###############################################################################
-### lr configs
-## lr warmup and decay duration.
-## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
-## Here we increase the warmup tokens to 3B since when batch size warmup is not
-## used, there are more tokens per step. Thus we need to increase warmup tokens
-## to make sure there are enough warmup steps, which is important for training
-## stability.
-lr_warmup_tokens_in_million=3000
-lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
-## Here we changed the LR decay tokens to align with total train tokens, since
-## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
-## learning rate schedule to match the number of training tokens results in the
-## best final model quality 
-lr_decay_tokens_in_billion=${train_tokens_in_billion}
-lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
-lr_decay_style="cosine"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-## Currently we only support MP=1 with SP>1
-mp_size=1
-
-## Sequence parallelism, 1 is no SP
-sp_size=4
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Note that currently both curriculum learning and random-LTD are NOT
-## compatible with pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO-based data parallelism, stage=0 will disable ZeRO
-zero_stage=1
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} / ${sp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Reduce it manually if GPU OOM
-# batch_size=$(( ${global_batch_size} / ${dp_size} ))
-batch_size=1
-
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=10
-eval_interval=100
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
-# save_interval=$((${estimated_train_iter} / ${num_save}))
-save_interval=100
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-activation_checkpoint="true"
-# activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d_%H.%M.%S")
-host="${HOSTNAME}"
-seed=1234
-num_workers=0
-
-data_path="BookCorpusDataset_text_document"
-if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
-fi
-if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
-fi
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-prescale_grad="true"
-jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
-jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
-jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
-if [[ $zero_stage -gt 0 ]]; then
-    jobname="${jobname}_z${zero_stage}"
-    prescale_grad="false"
-fi
-if [[ $sp_size -gt 1 ]]; then
-    jobname="${jobname}_sp${sp_size}"
-fi
-if [[ $mp_size -gt 1 ]]; then
-    jobname="${jobname}_mp${mp_size}"
-fi
-if [ "${no_pp}" = "false" ]; then
-    jobname="${jobname}_pp${pp_size}"
-fi
-jobname="${jobname}_seed${seed}_rebase"
-
-username=$(whoami)
-output_home="output"
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-tensorboard_dir="${output_home}/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-## If CL is used, make sure to set "--split" the same as what you used during
-## offline data analysis&indexing.
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --tensor-model-parallel-size 1 \
-    --ds-sequence-parallel-size ${sp_size} \
-    --init-method-std ${init_std} \
-    --lr-decay-tokens ${lr_decay_tokens} \
-    --lr-warmup-tokens ${lr_warmup_tokens} \
-    --micro-batch-size ${batch_size} \
-    --exit-duration-in-mins ${exit_duration} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-tokens ${train_tokens} \
-    --train-samples ${train_samples} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 0.1 \
-    --clip-grad 1.0 \
-    --hysteresis 2 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --seed ${seed} \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --no-async-tensor-model-parallel-allreduce \
-    --use-flash-attn-triton \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --checkpoint-activations"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}.json"
-template_json="ds_config_gpt_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-      > ${config_json}
-
-deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-    --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/sequence_parallel/ds_pretrain_gpt_30B_seq_parallel_32k.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/sequence_parallel/ds_pretrain_gpt_30B_seq_parallel_32k.sh
deleted file mode 100644
index f23e6f9585a4a84272263d9159d8645edbd76f53..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/sequence_parallel/ds_pretrain_gpt_30B_seq_parallel_32k.sh
+++ /dev/null
@@ -1,351 +0,0 @@
-#!/bin/bash
-dir=`pwd`
-###############################################################################
-### Main configs
-## GPT-3 models use 2K sequence length/context window
-seq_len=32768
-
-## The "GPT-3 XXX" below are configs from GPT-3 paper
-## https://arxiv.org/abs/2005.14165, choose based on
-## your desired model size or build your own configs
-
-## init_std is standard deviation for weight initialization. Usually larger
-## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size)
-## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)
-
-## We changed min_lr to a lower number (1.0e-6), which we found is able to
-## provide better zero-shot eval results.
-
-## GPT-3 Small 125M
-# model_size=0.125
-# num_layers=12
-# hidden_size=768
-# num_attn_heads=12
-# global_batch_size=256
-# lr=6.0e-4
-# min_lr=1.0e-6
-# init_std=0.02
-
-## GPT-3 Medium 350M
-# model_size=0.35
-# num_layers=24
-# hidden_size=1024
-# num_attn_heads=16
-# global_batch_size=256
-# lr=3.0e-4
-# min_lr=1.0e-6
-# init_std=0.018
-
-## GPT-3 Large 760M
-# model_size=0.76
-# num_layers=24
-# hidden_size=1536
-# num_attn_heads=16
-# global_batch_size=256
-# lr=2.5e-4
-# min_lr=1.0e-6
-# init_std=0.015
-
-## GPT-3 XL 1.3B
-# model_size=1.3
-# num_layers=24
-# hidden_size=2048
-# num_attn_heads=16
-# global_batch_size=32
-# lr=2.0e-4
-# min_lr=1.0e-6
-# init_std=0.013
-
-## GPT-3 2.7B
-# model_size=2.7
-# num_layers=32
-# hidden_size=2560
-# num_attn_heads=32
-# global_batch_size=512
-# lr=1.6e-4
-# min_lr=1.0e-6
-# init_std=0.011
-
-## GPT-3 6.7B
-# model_size=6.7
-# num_layers=32
-# hidden_size=4096
-# num_attn_heads=32
-# global_batch_size=1024
-# lr=1.2e-4
-# min_lr=1.0e-6
-# init_std=0.009
-
-## GPT-3 13B
-# model_size=13
-# num_layers=40
-# hidden_size=5120
-# num_attn_heads=40
-# global_batch_size=1024
-# lr=1.0e-4
-# min_lr=1.0e-6
-# init_std=0.008
-
-# GPT-3 30B
-model_size=30
-num_layers=64
-hidden_size=6144
-num_attn_heads=64
-global_batch_size=2
-lr=1.0e-4
-min_lr=1.0e-6
-init_std=0.008
-
-## GPT-3 175B
-# model_size=175
-# num_layers=96
-# hidden_size=12288
-# num_attn_heads=96
-# global_batch_size=1536
-# lr=0.6e-4
-# min_lr=1.0e-6
-# init_std=0.005
-###############################################################################
-### Training duration configs
-## The main termination condition, original GPT-3 paper trains for 300B tokens.
-train_tokens_in_billion=300
-train_tokens=$((${train_tokens_in_billion} * 1000000000))
-
-## train_samples is another termination condition and also affect the number of 
-## data samples to be indexed. Since we want to reach the train_tokens
-## above, and data efficiency techniques may change num tokens in some samples,
-## so we just set this config large enough to make sure we have enough
-## processed data and don't terminate by train_samples.
-train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} ))
-
-## Another wall-clock time termination condition in minutes. Set it large
-## enough to avoid undesired early termination.
-exit_duration=30000000
-###############################################################################
-### lr configs
-## lr warmup and decay duration.
-## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
-## Here we increase the warmup tokens to 3B since when batch size warmup is not
-## used, there are more tokens per step. Thus we need to increase warmup tokens
-## to make sure there are enough warmup steps, which is important for training
-## stability.
-lr_warmup_tokens_in_million=3000
-lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
-## Here we changed the LR decay tokens to align with total train tokens, since
-## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
-## learning rate schedule to match the number of training tokens results in the
-## best final model quality 
-lr_decay_tokens_in_billion=${train_tokens_in_billion}
-lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
-lr_decay_style="cosine"
-###############################################################################
-### Parallelism configs
-## Model parallelism, 1 is no MP
-## Currently we only support MP=1 with SP>1
-mp_size=1
-
-## Sequence parallelism, 1 is no SP
-sp_size=4
-
-## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
-## Note that currently both curriculum learning and random-LTD are NOT
-## compatible with pipeline parallelism.
-pp_size=1
-no_pp="true"
-
-## ZeRO-based data parallelism, stage=0 will disable ZeRO
-zero_stage=3
-
-## Total number of GPUs. ds_ssh is from DeepSpeed library.
-num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
-num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
-num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))
-
-## Data parallel size.
-dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} / ${sp_size} ))
-
-## Micro batch size per GPU
-## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus
-## Reduce it manually if GPU OOM
-# batch_size=$(( ${global_batch_size} / ${dp_size} ))
-batch_size=1
-
-###############################################################################
-### Misc configs
-log_interval=10
-eval_iters=10
-eval_interval=100
-# num_save controls how frequent to save checkpoint. num_save=20 means that a
-# checkpoint will be saved every 5% of training. For longer training you would
-# want larger num_save to save more frequently, and vice versa.
-num_save=100
-estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
-# save_interval=$((${estimated_train_iter} / ${num_save}))
-save_interval=100
-
-## Activation checkpointing saves GPU memory, but reduces training speed
-activation_checkpoint="true"
-# activation_checkpoint="false"
-
-## Whether or not log optimizer states (norms, max abs values) to tensorboard.
-## This is not required for training and might save GPU memory when turned off.
-log_optimizer_state="true"
-###############################################################################
-### Output and data configs
-current_time=$(date "+%Y.%m.%d_%H.%M.%S")
-host="${HOSTNAME}"
-seed=1234
-num_workers=0
-
-data_path="BookCorpusDataset_text_document"
-if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
-fi
-if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
-    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
-fi
-
-vocab_path="gpt2-vocab.json"
-if [ ! -f "$vocab_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-fi
-merge_path="gpt2-merges.txt"
-if [ ! -f "$merge_path" ]; then
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-fi
-
-prescale_grad="true"
-jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
-jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
-jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
-if [[ $zero_stage -gt 0 ]]; then
-    jobname="${jobname}_z${zero_stage}"
-    prescale_grad="false"
-fi
-if [[ $sp_size -gt 1 ]]; then
-    jobname="${jobname}_sp${sp_size}"
-fi
-if [[ $mp_size -gt 1 ]]; then
-    jobname="${jobname}_mp${mp_size}"
-fi
-if [ "${no_pp}" = "false" ]; then
-    jobname="${jobname}_pp${pp_size}"
-fi
-jobname="${jobname}_seed${seed}_rebase"
-
-username=$(whoami)
-output_home="output"
-log_path="${output_home}/log/"
-checkpoint_path="${output_home}/checkpoint/${jobname}"
-tensorboard_dir="${output_home}/tensorboard/"
-tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
-mkdir -p ${log_path}
-mkdir -p ${checkpoint_path}
-mkdir -p ${tensorboard_path}
-###############################################################################
-data_options=" \
-    --vocab-file ${vocab_path} \
-    --merge-file ${merge_path} \
-    --data-path ${data_path} \
-    --data-impl mmap"
-
-## If CL is used, make sure to set "--split" the same as what you used during
-## offline data analysis&indexing.
-megatron_options=" \
-    --override-opt_param-scheduler \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --tensor-model-parallel-size 1 \
-    --ds-sequence-parallel-size ${sp_size} \
-    --init-method-std ${init_std} \
-    --lr-decay-tokens ${lr_decay_tokens} \
-    --lr-warmup-tokens ${lr_warmup_tokens} \
-    --micro-batch-size ${batch_size} \
-    --exit-duration-in-mins ${exit_duration} \
-    --global-batch-size ${global_batch_size} \
-    --num-layers ${num_layers} \
-    --hidden-size ${hidden_size} \
-    --num-attention-heads ${num_attn_heads} \
-    --seq-length ${seq_len} \
-    --max-position-embeddings ${seq_len} \
-    --train-tokens ${train_tokens} \
-    --train-samples ${train_samples} \
-    --lr ${lr} \
-    --min-lr ${min_lr} \
-    --lr-decay-style ${lr_decay_style} \
-    --split 949,50,1 \
-    --log-interval ${log_interval} \
-    --eval-interval ${eval_interval} \
-    --eval-iters ${eval_iters} \
-    --save-interval ${save_interval} \
-    --weight-decay 0.1 \
-    --clip-grad 1.0 \
-    --hysteresis 2 \
-    --num-workers ${num_workers} \
-    --fp16 \
-    --seed ${seed} \
-    --load ${checkpoint_path} \
-    --save ${checkpoint_path} \
-    --no-async-tensor-model-parallel-allreduce \
-    --use-flash-attn-triton \
-    --tensorboard-queue-size 1 \
-    --log-timers-to-tensorboard \
-    --log-batch-size-to-tensorboard \
-    --log-validation-ppl-to-tensorboard \
-    --tensorboard-dir ${tensorboard_path}"
-
-if [ "${activation_checkpoint}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --checkpoint-activations"
-fi
-
-if [ "${log_optimizer_state}" = "true" ]; then
-megatron_options="${megatron_options} \
-    --log-optimizer-states-to-tensorboard"
-fi
-
-config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}.json"
-template_json="ds_config_gpt_TEMPLATE.json"
-sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
-    | sed "s/MBSIZE/${batch_size}/" \
-    | sed "s/LOG_INTERVAL/${log_interval}/" \
-    | sed "s/ZERO_STAGE/${zero_stage}/" \
-    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
-      > ${config_json}
-
-deepspeed_options=" \
-    --deepspeed \
-    --deepspeed_config ${config_json} \
-    --zero-stage ${zero_stage} \
-    --pipeline-model-parallel-size ${pp_size}"
-
-if [[ "${no_pp}" = "true" ]]; then
-deepspeed_options="${deepspeed_options} \
-    --no-pipeline-parallel"
-fi
-
-if [ "${activation_checkpoint}" = "true" ]; then
-deepspeed_options="${deepspeed_options} \
-    --deepspeed-activation-checkpointing"
-fi
-
-## When saving checkpoint to a storage with cache, their could be consistency
-## issue of the pointer to latest checkpoint. Here we find the correct pointer
-## and broadcast it to all nodes.
-iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
-iteration_file_2="$checkpoint_path/latest"
-iteration=0
-for (( node = 0; node <= num_node-1; node++ ))
-do
-    if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then
-        local_iteration=$(ssh -q worker-"$node" cat $iteration_file)
-        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} :  ${iteration} ))
-    fi
-done
-if [[ $iteration -gt 0 ]]; then
-    iteration_2="global_step${iteration}"
-    ds_ssh "echo $iteration > $iteration_file"
-    ds_ssh "echo $iteration_2 > $iteration_file_2"
-fi
-
-deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/README.md b/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/README.md
deleted file mode 100644
index 341b0d113f599f1641f5af62b363f6a56c76d903..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/README.md
+++ /dev/null
@@ -1,119 +0,0 @@
-# Universal Checkpoint examples
-
-This folder contains example scripts that demonstrate how to use Universal Checkpoints to change the number of GPUs when training with ZeRO. With Universal Checkpoints, training can be resumed with a different parallelism degree on any of tensor slicing (TP), pipeline parallelism (PP), sequence parallelism (SP) and data parallelism (DP). Using universal checkpoints involves the following three steps:
-
-1. ZeRO-based training run, optionally combining TP and PP or SP, that creates normal ZeRO checkpoints.  
-2. Converting ZeRO checkpoint into the universal format using `ds_to_universal.py` utility of DeepSpeed.
-3. Resuming training with the universal checkpoint, on a different number of GPUs.
-
-## ZeRO stage 1 training
-For ZeRO stage 1, we provide bash scripts for bf16 and fp16 training examples corresponding to the steps 1 and 3 above. The step 1 scripts launch a training run of TP=PP=DP=2 of 200 iterations that creates a checkpoint every 100 iterations. The step 3 scripts load a universal checkpoint of iteration 100 and resume training with TP=PP=2 and DP=1 for an additional 100 iterations. Users can modify these scripts to try out other save and resume 3D combinations (e.g., save TP=PP=DP=1 and resume TP=PP=DP=2). Tensorboard logs are created by both step 1 and 3 scripts to enable visual inspection of how well the loss curves of the initial and resumed training runs match, especially at iteration 101.  
-
-1.  bf16:
-    * run_bf16.sh: step 1
-    * run_universal_bf16.sh: step 3
-
-2. fp16:
-    * run_fp16.sh: step 1 
-    * run_universal_fp16.sh: step 3
-
-Please note that these scripts should be run from the root folder of the repo (i.e., two levels above this README). For illustration, here are the commands for running the bf16 example. 
-
-### Download and Pre-process Training Dataset
-Before executing the steps below, you can download and pre-process the training set using the following commands (see [here](https://github.com/bigscience-workshop/Megatron-DeepSpeed?tab=readme-ov-file#quick-pre-processing-to-start-training-with) for more details):
-```bash
-wget https://huggingface.co/bigscience/misc-test-data/resolve/main/stas/oscar-1GB.jsonl.xz
-wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-xz -d oscar-1GB.jsonl.xz
-python tools/preprocess_data.py \
-    --input oscar-1GB.jsonl \
-    --output-prefix my-gpt2 \
-    --vocab-file gpt2-vocab.json \
-    --dataset-impl mmap \
-    --tokenizer-type GPT2BPETokenizer \
-    --merge-file gpt2-merges.txt \
-    --append-eod \
-    --workers 8
-```
-
-NOTE: Make sure to update your `BASE_DATA_PATH` path in the `run_[bf16/fp16].sh` and `run_universal_[bf16/fp16].sh` scripts to point to the pre-processed data.
-
-### Step 1: Create ZeRO checkpoint
-```bash 
-  bash examples_deepspeed/universal_checkpointing/run_bf16.sh 
-```
-By default the script will create the checkpoints in folder `z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_toy`
-
-### Step 2: Convert ZeRO checkpoint of iteration 100 to Universal format
-Assuming the DeepSpeed source code is cloned into the home folder, the following command will generate universal checkpoint for iteration 100. 
-
-```bash
-python ${HOME}/DeepSpeed/deepspeed/checkpoint/ds_to_universal.py \
-    --input_folder z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_toy/global_step100 \
-    --output_folder z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_toy/global_step100_universal
-```
-Note that we chose to create the universal checkpoint in the same checkpoint folder as the ZeRO checkpoint. This maintains the normal checkpoint folder structure expected by the Megatron-DeepSpeed code, which makes it easy to load universal checkpoints with little/no script or code changes. For clarity, we show below the contents of the checkpoint folder after creation of the universal checkpoint. Note that the conversion script creates `global_step100_universal` folder and `latest_universal` file.   
-
-```bash
-ls -l z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_toy/
-total 48
-drwxr-xr-x 2 user group  4096 Oct 21 08:51 global_step100
-drwxr-xr-x 3 user group  4096 Oct 21 09:28 global_step100_universal
-drwxr-xr-x 2 user group  4096 Oct 21 09:01 global_step200
--rw-r--r-- 1 user group    14 Oct 21 09:50 latest
--rw-r--r-- 1 user group     3 Oct 21 09:50 latest_checkpointed_iteration.txt
--rw-r--r-- 1 user group    24 Oct 21 09:28 latest_universal
--rwxr--r-- 1 user group 24177 Oct 21 09:50 zero_to_fp32.py
-```
-
-### Step 3: Resume training with Universal checkpoint of iteration 100
-```bash 
-bash examples_deepspeed/universal_checkpointing/run_universal_bf16.sh
-```
-This resumption script effects the loading of universal checkpoint rather than the ZeRO checkpoint in the folder by passing `--universal-checkpoint` command line flag to the main training script (i.e., `pretrain_gpt.py`). 
-
-Please see the corresponding [pull request](https://github.com/microsoft/Megatron-DeepSpeed/pull/276) for visualizations of matching loss values between original and universal checkpoint runs for bf16 and fp16 examples.
-
-Combining sequence parallelism with data parallelism is another good use case for universal checkpointing, see [sp pull request](https://github.com/microsoft/DeepSpeed/pull/4752) for example and visualization of matching loss values.
-
-### TensorBoard Log Analysis
-
-The Universal Checkpointing example includes a TensorBoard analysis script that will generate `csv` files and `png` plots across the unviersal checkpointing training steps for comparison of training and validation loss curves.
-
-After Step 3 is completed, the script may be executed as follows:
-```bash
-bash examples_deepspeed/universal_checkpointing/run_tb_analysis.sh z1_uni_ckpt
-```
-
-The script will output the following `csv` files:
-  - uc_out_tp_2_pp_2_dp_2_sp_1.csv
-  - uc_out_tp_2_pp_2_dp_1_sp_1.csv
-  - val_uc_out_tp_2_pp_2_dp_2_sp_1.csv
-  - val_uc_out_tp_2_pp_2_dp_1_sp_1.csv
-
-The script will also output the following `png` files:
-  - uc_char_training_loss.png
-  - uc_char_validation_loss.png
-
-Below is the visualization of the `png` files generated from this example.
-
-<div align="center">
-  <img src="assets/image/uc_char_training_loss.png" alt="" width="600"/>
-
-  *Figure 1: Training LM loss curve for first 200 training steps of Step 1 (TP=2, PP=2, DP=2) and training steps 101 to 200 of Step 3 (TP=2, PP=2, DP=1), which was loaded using the Universal Checkpoint.*
-</div>
-
-<div align="center">
-  <img src="assets/image/uc_char_validation_loss.png" alt="" width="600"/>
-
-  *Figure 2: Validation LM loss curve for first 200 training steps of Step 1 (TP=2, PP=2, DP=2) and training steps 101 to 200 of Step 3 (TP=2, PP=2, DP=1), which was loaded using the Universal Checkpoint.*
-</div>
-
-
-## ZeRO stage 2 training 
-Repeat steps in ZeRO stage 1 training above with the following modifications to your job batch scripts:
-* Set ZERO_STAGE=2 
-* Add `--no-pipeline-parallel` flag to deepspeed options  
-
-## ZeRO stage 3 training (**Coming soon**)
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/assets/image/uc_char_training_loss.png b/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/assets/image/uc_char_training_loss.png
deleted file mode 100644
index 4df1ff1fc83ca2284f826369bb43185fa7a1e3da..0000000000000000000000000000000000000000
Binary files a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/assets/image/uc_char_training_loss.png and /dev/null differ
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/assets/image/uc_char_validation_loss.png b/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/assets/image/uc_char_validation_loss.png
deleted file mode 100644
index 5a65f6bd12977042bdc3690d8fa51a69cbdf570a..0000000000000000000000000000000000000000
Binary files a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/assets/image/uc_char_validation_loss.png and /dev/null differ
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/ds_config.json b/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/ds_config.json
deleted file mode 100644
index 329bd9b8ad164f97f2b00fe447b40bfdb2b5dc0c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/ds_config.json
+++ /dev/null
@@ -1,19 +0,0 @@
-{
-  "train_batch_size" : 16,
-  "train_micro_batch_size_per_gpu": 16,
-  "steps_per_print": 1,
-
-  "zero_optimization": {
-    "stage": 1
-  },
-
-  "bf16": {
-    "enabled": true
-  },
-
-  "data_types": {
-        "grad_accum_dtype": "fp32" 
-  },
-
-  "wall_clock_breakdown" : false
-}
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_bf16.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_bf16.sh
deleted file mode 100644
index 0953954222692bdeabef4623a88de070758a6e1f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_bf16.sh
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/bin/bash
-
-
-DIR=`pwd`
-DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
-BASE_DATA_PATH=datasets
-DATASET=${BASE_DATA_PATH}/my-gpt2_text_document
-VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-
-
-script_path=$(realpath $0)
-script_dir=$(dirname $script_path)
-CONFIG_JSON="$script_dir/ds_config.json"
-
-ZERO_STAGE=1
-DTYPE="bf16"
-
-# Debug
-DEBUG_MODE=1
-if [[ $DEBUG_MODE == 1 ]]; then
-        LAYERS=4
-        HIDDEN=512
-        SEQ=512
-        EXIT_INTERVAL=200
-        SIZE_TAG="toy"
-else
-        HIDDEN=1024
-        LAYERS=24
-        SEQ=1024
-        EXIT_INTERVAL=100
-        SIZE_TAG="big"
-fi  
-
-# 3D parallelism of training 
-TP=2
-PP=2
-DP=2
-SP=1
-WORLD_SIZE=$((TP*PP*DP*SP))
-GLOBAL_BATCH=16
-MICRO_BATCH=$((GLOBAL_BATCH/WORLD_SIZE))
-TRAIN_ITERS=100000
-LR=6.0e-3
-MIN_LR=6.0e-4
-
-# 3D parallelism of checkpoint to load
-LOAD_TP=$TP
-LOAD_PP=$PP
-LOAD_DP=$DP
-LOAD_SP=$SP
-RUN_TAG="save"
-# RUN_TAG="ref_load${LOAD_TP}_${LOAD_PP}_${LOAD_DP}"
-
-EXP_DIR="z${ZERO_STAGE}_uni_ckpt" 
-CHECKPOINT_PATH=${EXP_DIR}/checkpoints/gpt2/z${ZERO_STAGE}/$DTYPE/tp${TP}_pp${PP}_dp${DP}_sp${SP}_${SIZE_TAG}
-LOAD_CHECKPOINT_PATH=${EXP_DIR}/checkpoints/gpt2/z${ZERO_STAGE}/$DTYPE/tp${LOAD_TP}_pp${LOAD_PP}_dp${LOAD_DP}_sp${LOAD_SP}_${SIZE_TAG}
-LOG_DIR="${EXP_DIR}/tensorboard/$DTYPE/tp${TP}_pp${PP}_dp${DP}_sp${SP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_${SIZE_TAG}_${RUN_TAG}"
-mkdir -p $LOG_DIR
-
-while [[ $# -gt 0 ]]
-do
-key="$1"
-case $key in
-    -z|--zero-stage)
-    ZERO_STAGE=$2;
-    shift
-    ;;
-    *)
-    echo "Unknown argument(s)"
-    usage
-    exit 1
-    shift
-    ;;
-esac
-done
-
-
-options=" \
-	--tensor-model-parallel-size $TP \
-	--pipeline-model-parallel-size $PP \
-	--ds-sequence-parallel-size $SP \
-        --num-layers $LAYERS \
-        --hidden-size $HIDDEN \
-        --num-attention-heads 32 \
-        --seq-length $SEQ \
-        --loss-scale 12 \
-        --max-position-embeddings $SEQ \
-	--micro-batch-size $MICRO_BATCH \
-	--global-batch-size $GLOBAL_BATCH \
-	--train-iters $TRAIN_ITERS \
-        --lr $LR \
-	--min-lr $MIN_LR \
-        --lr-decay-style cosine \
-        --log-interval 1 \
-        --eval-iters 40 \
-        --eval-interval 10 \
-	--data-path ${DATASET} \
-	--vocab-file ${VOCAB_PATH} \
-	--merge-file ${MERGE_PATH} \
-	--save-interval 100 \
-        --split 98,2,0 \
-        --clip-grad 1.0 \
-	--weight-decay 0.1 \
-	--adam-beta1 0.9 \
-	--adam-beta2 0.95 \
-	--init-method-std 0.006 \
-        --${DTYPE} \
-	--checkpoint-activations \
-	--exit-interval ${EXIT_INTERVAL} \
-        --save ${CHECKPOINT_PATH} \
-        --load ${LOAD_CHECKPOINT_PATH} \
-        --make-vocab-size-divisible-by 256 \
-	--tensorboard-dir $LOG_DIR
-        "
-
-options="${options} \
-        --deepspeed \
-        --deepspeed_config=${CONFIG_JSON} \
-        --zero-stage=${ZERO_STAGE} \
-        --deepspeed-activation-checkpointing \
-"
-if [[ ${ZERO_STAGE} -gt 1 ]]; then
-options="${options} \
-    --no-pipeline-parallel"
-fi
-
-cat <<EOT > $CONFIG_JSON
-{
-  "train_batch_size" : $GLOBAL_BATCH,
-  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
-  "steps_per_print": 1,
-
-  "zero_optimization": {
-    "stage": $ZERO_STAGE
-  },
-
-  "bf16": {
-    "enabled": true
-  },
-
-  "data_types": {
-        "grad_accum_dtype": "fp32" 
-  },
-
-  "wall_clock_breakdown" : false
-}
-EOT
-
-WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
-run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
-
-echo ${options}
-echo ${run_cmd}
-eval ${run_cmd}
-
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_fp16.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_fp16.sh
deleted file mode 100644
index 691fa8a8e6e0b7a4d878a6061af9513340b6699a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_fp16.sh
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/bin/bash
-
-
-DIR=`pwd`
-DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
-BASE_DATA_PATH=datasets
-DATASET=${BASE_DATA_PATH}/my-gpt2_text_document
-VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-
-
-script_path=$(realpath $0)
-script_dir=$(dirname $script_path)
-CONFIG_JSON="$script_dir/ds_config.json"
-
-ZERO_STAGE=1
-DTYPE="fp16"
-
-# Debug
-DEBUG_MODE=1
-if [[ $DEBUG_MODE == 1 ]]; then
-        LAYERS=4
-        HIDDEN=512
-        SEQ=512
-        EXIT_INTERVAL=200
-        SIZE_TAG="toy"
-else
-        HIDDEN=1024
-        LAYERS=24
-        SEQ=1024
-        EXIT_INTERVAL=100
-        SIZE_TAG="big"
-fi  
-
-# 3D parallelism of training 
-TP=2
-PP=2
-DP=2
-SP=1
-WORLD_SIZE=$((TP*PP*DP*SP))
-GLOBAL_BATCH=16
-MICRO_BATCH=$((GLOBAL_BATCH/WORLD_SIZE))
-TRAIN_ITERS=100000
-LR=6.0e-3
-MIN_LR=6.0e-4
-
-# 3D parallelism of checkpoint to load
-LOAD_TP=$TP
-LOAD_PP=$PP
-LOAD_DP=$DP
-LOAD_SP=$SP
-RUN_TAG="save"
-# RUN_TAG="ref_load${LOAD_TP}_${LOAD_PP}_${LOAD_DP}"
-
-EXP_DIR="z${ZERO_STAGE}_uni_ckpt" 
-CHECKPOINT_PATH=${EXP_DIR}/checkpoints/gpt2/z${ZERO_STAGE}/$DTYPE/tp${TP}_pp${PP}_dp${DP}_sp${SP}_${SIZE_TAG}
-LOAD_CHECKPOINT_PATH=${EXP_DIR}/checkpoints/gpt2/z${ZERO_STAGE}/$DTYPE/tp${LOAD_TP}_pp${LOAD_PP}_dp${LOAD_DP}_sp${LOAD_SP}_${SIZE_TAG}
-LOG_DIR="${EXP_DIR}/tensorboard/$DTYPE/tp${TP}_pp${PP}_dp${DP}_sp${SP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_${SIZE_TAG}_${RUN_TAG}"
-mkdir -p $LOG_DIR
-
-while [[ $# -gt 0 ]]
-do
-key="$1"
-case $key in
-    -z|--zero-stage)
-    ZERO_STAGE=$2;
-    shift
-    ;;
-    *)
-    echo "Unknown argument(s)"
-    usage
-    exit 1
-    shift
-    ;;
-esac
-done
-
-
-options=" \
-	--tensor-model-parallel-size $TP \
-	--pipeline-model-parallel-size $PP \
-    --ds-sequence-parallel-size $SP \
-        --num-layers $LAYERS \
-        --hidden-size $HIDDEN \
-        --num-attention-heads 32 \
-        --seq-length $SEQ \
-        --loss-scale 12 \
-        --max-position-embeddings $SEQ \
-	--micro-batch-size $MICRO_BATCH \
-	--global-batch-size $GLOBAL_BATCH \
-	--train-iters $TRAIN_ITERS \
-        --lr $LR \
-	--min-lr $MIN_LR \
-        --lr-decay-style cosine \
-        --log-interval 1 \
-        --eval-iters 40 \
-        --eval-interval 10 \
-	--data-path ${DATASET} \
-	--vocab-file ${VOCAB_PATH} \
-	--merge-file ${MERGE_PATH} \
-	--save-interval 100 \
-        --split 98,2,0 \
-        --clip-grad 1.0 \
-	--weight-decay 0.1 \
-	--adam-beta1 0.9 \
-	--adam-beta2 0.95 \
-	--init-method-std 0.006 \
-        --${DTYPE} \
-	--checkpoint-activations \
-	--exit-interval ${EXIT_INTERVAL} \
-        --save ${CHECKPOINT_PATH} \
-        --load ${LOAD_CHECKPOINT_PATH} \
-        --make-vocab-size-divisible-by 256 \
-	--tensorboard-dir $LOG_DIR
-        "
-
-options="${options} \
-        --deepspeed \
-        --deepspeed_config=${CONFIG_JSON} \
-        --zero-stage=${ZERO_STAGE} \
-        --deepspeed-activation-checkpointing \
-"
-if [[ ${ZERO_STAGE} -gt 1 ]]; then
-options="${options} \
-    --no-pipeline-parallel"
-fi
-
-cat <<EOT > $CONFIG_JSON
-{
-  "train_batch_size" : $GLOBAL_BATCH,
-  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
-  "steps_per_print": 1,
-
-  "zero_optimization": {
-    "stage": $ZERO_STAGE
-  },
-
-  "bf16": {
-    "enabled": false
-  },
-
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 50,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 12
-  },
-
-  "wall_clock_breakdown" : false
-}
-EOT
-
-WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
-run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
-
-
-echo ${options}
-echo ${run_cmd}
-eval ${run_cmd}
-
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_tb_analysis.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_tb_analysis.sh
deleted file mode 100644
index 7aa988a0a03827adbc1316a2bce46c20a2ffcd06..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_tb_analysis.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-
-OUTPUT_PATH=$1
-
-if [ "$OUTPUT_PATH" == "" ]; then
-    OUTPUT_PATH="z1_uni_ckpt"
-fi
-
-# Training Loss
-python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \
-    --tb_dir $OUTPUT_PATH \
-    --tb_event_key "lm-loss-training/lm loss" \
-    --plot_name "uc_char_training_loss.png" \
-    --plot_title "Megatron-GPT Universal Checkpointing - Training Loss" \
-    --use_sns
-
-# Validation Loss
-python3 examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py \
-    --tb_dir $OUTPUT_PATH \
-    --tb_event_key "lm-loss-validation/lm loss validation" \
-    --csv_name "val_" \
-    --plot_name "uc_char_validation_loss.png" \
-    --plot_title "Megatron-GPT Universal Checkpointing - Validation Loss" \
-    --plot_y_label "Validation LM Loss" \
-    --use_sns
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_universal_bf16.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_universal_bf16.sh
deleted file mode 100644
index ef0e134cfc99d5ea3f93426e7c885e1c47e6e297..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_universal_bf16.sh
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/bin/bash
-
-
-DIR=`pwd`
-DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
-BASE_DATA_PATH=datasets
-DATASET=${BASE_DATA_PATH}/my-gpt2_text_document
-VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-
-
-script_path=$(realpath $0)
-script_dir=$(dirname $script_path)
-CONFIG_JSON="$script_dir/ds_config.json"
-
-ZERO_STAGE=1
-DTYPE="bf16"
-
-# Debug
-DEBUG_MODE=1
-if [[ $DEBUG_MODE == 1 ]]; then
-        LAYERS=4
-        HIDDEN=512
-        SEQ=512
-        EXIT_INTERVAL=200
-        SIZE_TAG="toy"
-else
-        HIDDEN=1024
-        LAYERS=24
-        SEQ=1024
-        EXIT_INTERVAL=100
-        SIZE_TAG="big"
-fi  
-
-# 3D parallelism of training 
-TP=2
-PP=2
-DP=1
-SP=1
-WORLD_SIZE=$((TP*PP*DP*SP))
-GLOBAL_BATCH=4
-MICRO_BATCH=$((GLOBAL_BATCH/WORLD_SIZE))
-TRAIN_ITERS=100000
-LR=6.0e-3
-MIN_LR=6.0e-4
-
-# 3D parallelism of checkpoint to load
-LOAD_TP=2
-LOAD_PP=2
-LOAD_DP=2
-LOAD_SP=1
-RUN_TAG="uni_load${LOAD_TP}_${LOAD_PP}_${LOAD_DP}_${LOAD_SP}"
-
-EXP_DIR="z${ZERO_STAGE}_uni_ckpt" 
-CHECKPOINT_PATH=${EXP_DIR}/checkpoints/gpt2/z${ZERO_STAGE}/$DTYPE/tp${TP}_pp${PP}_dp${DP}_sp${SP}_${SIZE_TAG}
-LOAD_CHECKPOINT_PATH=${EXP_DIR}/checkpoints/gpt2/z${ZERO_STAGE}/$DTYPE/tp${LOAD_TP}_pp${LOAD_PP}_dp${LOAD_DP}_sp${LOAD_SP}_${SIZE_TAG}
-LOG_DIR="${EXP_DIR}/tensorboard/$DTYPE/tp${TP}_pp${PP}_dp${DP}_sp${SP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_${SIZE_TAG}_${RUN_TAG}"
-mkdir -p $LOG_DIR
-
-while [[ $# -gt 0 ]]
-do
-key="$1"
-case $key in
-    -z|--zero-stage)
-    ZERO_STAGE=$2;
-    shift
-    ;;
-    *)
-    echo "Unknown argument(s)"
-    usage
-    exit 1
-    shift
-    ;;
-esac
-done
-
-
-options=" \
-	--tensor-model-parallel-size $TP \
-	--pipeline-model-parallel-size $PP \
-	--ds-sequence-parallel-size $SP \
-        --num-layers $LAYERS \
-        --hidden-size $HIDDEN \
-        --num-attention-heads 32 \
-        --seq-length $SEQ \
-        --loss-scale 12 \
-        --max-position-embeddings $SEQ \
-	--micro-batch-size $MICRO_BATCH \
-	--global-batch-size $GLOBAL_BATCH \
-	--train-iters $TRAIN_ITERS \
-        --lr $LR \
-	--min-lr $MIN_LR \
-        --lr-decay-style cosine \
-        --log-interval 1 \
-        --eval-iters 40 \
-        --eval-interval 10 \
-	--data-path ${DATASET} \
-	--vocab-file ${VOCAB_PATH} \
-	--merge-file ${MERGE_PATH} \
-	--save-interval 100 \
-        --split 98,2,0 \
-        --clip-grad 1.0 \
-	--weight-decay 0.1 \
-	--adam-beta1 0.9 \
-	--adam-beta2 0.95 \
-	--init-method-std 0.006 \
-        --${DTYPE} \
-	--checkpoint-activations \
-	--exit-interval ${EXIT_INTERVAL} \
-        --save ${CHECKPOINT_PATH} \
-        --load ${LOAD_CHECKPOINT_PATH} \
-        --make-vocab-size-divisible-by 256 \
-        --universal-checkpoint \
-	--tensorboard-dir $LOG_DIR
-        "
-
-options="${options} \
-        --deepspeed \
-        --deepspeed_config=${CONFIG_JSON} \
-        --zero-stage=${ZERO_STAGE} \
-        --deepspeed-activation-checkpointing \
-"
-if [[ ${ZERO_STAGE} -gt 1 ]]; then
-options="${options} \
-    --no-pipeline-parallel"
-fi
-
-cat <<EOT > $CONFIG_JSON
-{
-  "train_batch_size" : $GLOBAL_BATCH,
-  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
-  "steps_per_print": 1,
-
-  "zero_optimization": {
-    "stage": $ZERO_STAGE
-  },
-
-  "bf16": {
-    "enabled": true
-  },
-
-  "data_types": {
-        "grad_accum_dtype": "fp32" 
-  },
-
-  "wall_clock_breakdown" : false
-}
-EOT
-
-WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
-run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
-
-echo ${options}
-echo ${run_cmd}
-eval ${run_cmd}
-
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_universal_fp16.sh b/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_universal_fp16.sh
deleted file mode 100644
index 1e207e422bacf7f91e8e56929945a8e5fa006a65..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/run_universal_fp16.sh
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/bin/bash
-
-
-DIR=`pwd`
-DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
-BASE_DATA_PATH=datasets
-DATASET=${BASE_DATA_PATH}/my-gpt2_text_document
-VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
-MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
-
-
-script_path=$(realpath $0)
-script_dir=$(dirname $script_path)
-CONFIG_JSON="$script_dir/ds_config.json"
-
-ZERO_STAGE=1
-DTYPE="fp16"
-
-# Debug
-DEBUG_MODE=1
-if [[ $DEBUG_MODE == 1 ]]; then
-        LAYERS=4
-        HIDDEN=512
-        SEQ=512
-        EXIT_INTERVAL=200
-        SIZE_TAG="toy"
-else
-        HIDDEN=1024
-        LAYERS=24
-        SEQ=1024
-        EXIT_INTERVAL=100
-        SIZE_TAG="big"
-fi  
-
-# 3D parallelism of training 
-TP=2
-PP=2
-DP=1
-SP=1
-WORLD_SIZE=$((TP*PP*DP*SP))
-GLOBAL_BATCH=16
-MICRO_BATCH=$((GLOBAL_BATCH/WORLD_SIZE))
-TRAIN_ITERS=100000
-LR=6.0e-3
-MIN_LR=6.0e-4
-
-# 3D parallelism of checkpoint to load
-LOAD_TP=2
-LOAD_PP=2
-LOAD_DP=2
-LOAD_SP=1
-RUN_TAG="uni_load${LOAD_TP}_${LOAD_PP}_${LOAD_DP}_${LOAD_SP}"
-
-EXP_DIR="z${ZERO_STAGE}_uni_ckpt" 
-CHECKPOINT_PATH=${EXP_DIR}/checkpoints/gpt2/z${ZERO_STAGE}/$DTYPE/tp${TP}_pp${PP}_dp${DP}_sp${SP}_${SIZE_TAG}
-LOAD_CHECKPOINT_PATH=${EXP_DIR}/checkpoints/gpt2/z${ZERO_STAGE}/$DTYPE/tp${LOAD_TP}_pp${LOAD_PP}_dp${LOAD_DP}_sp${LOAD_SP}_${SIZE_TAG}
-LOG_DIR="${EXP_DIR}/tensorboard/$DTYPE/tp${TP}_pp${PP}_dp${DP}_sp${SP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_${SIZE_TAG}_${RUN_TAG}"
-mkdir -p $LOG_DIR
-
-while [[ $# -gt 0 ]]
-do
-key="$1"
-case $key in
-    -z|--zero-stage)
-    ZERO_STAGE=$2;
-    shift
-    ;;
-    *)
-    echo "Unknown argument(s)"
-    usage
-    exit 1
-    shift
-    ;;
-esac
-done
-
-
-options=" \
-	--tensor-model-parallel-size $TP \
-	--pipeline-model-parallel-size $PP \
-    --ds-sequence-parallel-size $SP \
-        --num-layers $LAYERS \
-        --hidden-size $HIDDEN \
-        --num-attention-heads 32 \
-        --seq-length $SEQ \
-        --loss-scale 12 \
-        --max-position-embeddings $SEQ \
-	--micro-batch-size $MICRO_BATCH \
-	--global-batch-size $GLOBAL_BATCH \
-	--train-iters $TRAIN_ITERS \
-        --lr $LR \
-	--min-lr $MIN_LR \
-        --lr-decay-style cosine \
-        --log-interval 1 \
-        --eval-iters 40 \
-        --eval-interval 10 \
-	--data-path ${DATASET} \
-	--vocab-file ${VOCAB_PATH} \
-	--merge-file ${MERGE_PATH} \
-	--save-interval 100 \
-        --split 98,2,0 \
-        --clip-grad 1.0 \
-	--weight-decay 0.1 \
-	--adam-beta1 0.9 \
-	--adam-beta2 0.95 \
-	--init-method-std 0.006 \
-        --${DTYPE} \
-	--checkpoint-activations \
-	--exit-interval ${EXIT_INTERVAL} \
-        --save ${CHECKPOINT_PATH} \
-        --load ${LOAD_CHECKPOINT_PATH} \
-        --make-vocab-size-divisible-by 256 \
-        --universal-checkpoint \
-	--tensorboard-dir $LOG_DIR
-        "
-
-options="${options} \
-        --deepspeed \
-        --deepspeed_config=${CONFIG_JSON} \
-        --zero-stage=${ZERO_STAGE} \
-        --deepspeed-activation-checkpointing \
-"
-if [[ ${ZERO_STAGE} -gt 1 ]]; then
-options="${options} \
-    --no-pipeline-parallel"
-fi
-
-cat <<EOT > $CONFIG_JSON
-{
-  "train_batch_size" : $GLOBAL_BATCH,
-  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
-  "steps_per_print": 1,
-
-  "zero_optimization": {
-    "stage": $ZERO_STAGE
-  },
-
-  "bf16": {
-    "enabled": false
-  },
-
-  "fp16": {
-    "enabled": true,
-    "loss_scale": 0,
-    "loss_scale_window": 50,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 12
-  },
-
-  "wall_clock_breakdown" : false
-}
-EOT
-
-WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
-run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
-
-
-echo ${options}
-echo ${run_cmd}
-eval ${run_cmd}
-
-set +x
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/abstract_analysis.py b/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/abstract_analysis.py
deleted file mode 100644
index a079b95230e09cf16003d197f0dc6a70c94c3292..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/abstract_analysis.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-
-import abc
-from abc import ABC
-
-
-class TensorBoardAnalysis(ABC):
-
-    def __init__(self):
-        self._name = None
-        self._label_name = None
-        self._csv_name = None
-
-    @abc.abstractmethod
-    def set_names(self, path_name):
-        ...
-
-    @abc.abstractmethod
-    def get_label_name(self):
-        ...
-
-    @abc.abstractmethod
-    def get_csv_filename(self):
-        ...
-
-    @abc.abstractmethod
-    def path_regex(self):
-        ...
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/arguments.py b/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/arguments.py
deleted file mode 100644
index 3dacb45d4eea20b39530488d7da5e50c51c888fb..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/arguments.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-
-from argparse import ArgumentParser
-
-parser = ArgumentParser()
-parser.add_argument("--tb_dir", required=True, type=str, help="Directory for tensorboard output")
-parser.add_argument("--analyzer", default="universal_checkpointing", type=str, choices=["universal_checkpointing"], help="Specify the analyzer to use")
-parser.add_argument("--tb_event_key", required=False, default="lm-loss-training/lm loss", type=str, help="Optional override of the TensorBoard event key")
-parser.add_argument("--plot_title", required=False, default="Megatron-GPT Universal Checkpointing", type=str, help="Optional override of the plot title")
-parser.add_argument("--plot_x_label", required=False, default="Training Step", type=str, help="Optional override of the plot x-label")
-parser.add_argument("--plot_y_label", required=False, default="LM Loss", type=str, help="Optional override of the plot y-label")
-parser.add_argument("--plot_name", required=False, default="uni_ckpt_char.png", type=str, help="Optional override of the plot file name")
-parser.add_argument("--skip_plot", action='store_true', help="Skip generation of plot file")
-parser.add_argument("--skip_csv", action='store_true', help="Skip generation of csv files")
-parser.add_argument("--use_sns", action='store_true', help="Use the SNS library to format plot")
-parser.add_argument("--csv_name", required=False, default="", type=str, help="Unique name for CSV files")
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py b/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py
deleted file mode 100644
index 337f6540ab53a37aedd1fb5e2a4fb0aafa119ef5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/tb_analysis_script.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-
-import os
-import re
-import pandas as pd
-import matplotlib.pyplot as plt
-from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
-from utils import get_analyzer, find_files
-from arguments import parser
-
-args = parser.parse_args()
-
-if args.use_sns:
-    import seaborn as sns
-    sns.set()
-
-def main():
-    target_affix = 'events.out.tfevents'
-    tb_log_paths = find_files(args.tb_dir, target_affix)
-
-    analyzer = get_analyzer(args.analyzer)
-
-    for tb_path in tb_log_paths:
-        print(f"Processing: {tb_path}")
-        analyzer.set_names(tb_path)
-
-        event_accumulator = EventAccumulator(tb_path)
-        event_accumulator.Reload()
-
-        events = event_accumulator.Scalars(args.tb_event_key)
-
-        x = [x.step for x in events]
-        y = [x.value for x in events]
-
-        plt.plot(x, y, label=f'{analyzer.get_label_name()}')
-
-        if not args.skip_csv:
-            df = pd.DataFrame({"step": x, "value": y})
-            df.to_csv(f"{args.csv_name}{analyzer.get_csv_filename()}.csv")
-
-    if not args.skip_plot:
-        plt.legend()
-        plt.title(args.plot_title)
-        plt.xlabel(args.plot_x_label)
-        plt.ylabel(args.plot_y_label)
-        plt.savefig(args.plot_name)
-
-if __name__ == "__main__":
-    main()
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/uc_analysis.py b/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/uc_analysis.py
deleted file mode 100644
index f5809c3dc1dc135bd627259efeb6c8efcc53859f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/uc_analysis.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-
-import re
-from abstract_analysis import TensorBoardAnalysis
-
-
-class UniversalCheckpointingAnalysis(TensorBoardAnalysis):
-
-    def __init__(self):
-        self._name = "universal_checkpointing"
-
-    def set_names(self, path_name):
-        match = re.match(self.path_regex(), path_name)
-        if not match:
-            raise ValueError(f"Path ({path_name}) did not match regex ({self.path_regex()})")
-        tp, pp, dp, sp = match.groups()
-
-        self._label_name = f"Training Run: TP: {tp}, PP: {pp}, DP: {dp}"
-        self._csv_name = f"uc_out_tp_{tp}_pp_{pp}_dp_{dp}_sp_{sp}"
-
-    def get_label_name(self):
-        return self._label_name
-
-    def get_csv_filename(self):
-        return self._csv_name
-
-    def path_regex(self):
-        return '.*tp(\d+).*pp(\d+).*dp(\d+).*sp(\d+)'
diff --git a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/utils.py b/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/utils.py
deleted file mode 100644
index 4bbbb3f2f04f7d138b35163912d2bd7c9def7f37..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/examples_deepspeed/universal_checkpointing/tb_analysis/utils.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-
-import os
-from uc_analysis import UniversalCheckpointingAnalysis
-
-
-def find_files(directory, file_affix):
-    """
-    Searches for files with a specific affix in a directory using os.walk().
-
-    Args:
-        directory (str): The path to the directory to search.
-        file_affix (str): The desired file affix.
-
-    Returns:
-        list: A list of paths to matching files.
-    """
-    matching_paths = []
-    for root, _, files in os.walk(directory):
-        for filename in files:
-            if root not in matching_paths and filename.lower().startswith(file_affix.lower()):
-                matching_paths.append(os.path.join(root))
-    return matching_paths
-
-def get_analyzer(analyzer_name):
-    if analyzer_name == 'universal_checkpointing':
-        return UniversalCheckpointingAnalysis()
-    else:
-        raise ValueError(f"Unsupported analyzer {analyzer_name}")
diff --git a/toolbox/Megatron-DeepSpeed/finetune_llama.py b/toolbox/Megatron-DeepSpeed/finetune_llama.py
deleted file mode 100644
index a9417eb9b8fb4193d061df183046b4f99b1c5c21..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/finetune_llama.py
+++ /dev/null
@@ -1,351 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-"""Finetune LLAMA, Modified from pretrain_gpt.py"""
-
-import torch
-import math
-from functools import partial
-import megatronspeed.megatron_adaptor
-from megatron.training import get_args
-from megatron.training import print_rank_0
-from megatron.training import get_timers
-from megatron.training import get_tokenizer
-from megatron.core import mpu, tensor_parallel
-from megatron.core.enums import ModelType
-from megatron.legacy.data.gpt_dataset import build_train_valid_test_datasets
-from megatron.legacy.data.prompt_dataset import SupervisedDataset
-from megatron.legacy.model import GPTModel, GPTModelPipe
-from megatron.training.training import pretrain
-from megatron.training.utils import get_ltor_masks_and_position_ids
-from megatron.training.utils import average_losses_across_data_parallel_group, update_rotary_pos_emb
-from megatron.training.arguments import core_transformer_config_from_args
-
-import deepspeed
-from deepspeed.runtime.utils import see_memory_usage
-from deepspeed.accelerator.real_accelerator import get_accelerator
-import os
-import subprocess
-
-from torch import nn
-import torch.nn.functional as F
-from transformers import AutoTokenizer
-
-
-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
-
-    print_rank_0('building GPT model ...')
-    see_memory_usage(f"Before Building Model", force=True)
-
-    args = get_args()
-    config = core_transformer_config_from_args(args)
-    with deepspeed.zero.Init(sequence_data_parallel_group=mpu.get_sequence_data_parallel_group(),
-                             remote_device=None if args.remote_device == 'none' else args.remote_device,
-                             config_dict_or_path=args.deepspeed_config,
-                             enabled=args.zero_stage == 3,
-                             mpu=mpu):
-        if args.deepspeed and not args.no_pipeline_parallel:
-            model = GPTModelPipe(
-                config=config,
-                num_tokentypes=0,
-                parallel_output=True
-            )
-            # This is a hack to give us a reference to get_batch_pipe from within training.py
-            # We need to call model.set_batch_fn after deepspeed.initialize
-            model._megatron_batch_fn = get_batch_pipe
-
-            # Predompute the attention mask and store it in args. This avoids having to
-            # pipeline it as an activation during training. The mask is constant, and thus
-            # we can reuse it.
-            attention_mask = torch.tril(torch.ones(
-                (1, args.seq_length, args.seq_length), device=get_accelerator().current_device_name())).view(
-                    1, 1, args.seq_length, args.seq_length)
-
-            # Convert attention mask to binary:
-            attention_mask = (attention_mask < 0.5)
-            if args.fp16:
-                attention_mask = attention_mask.half()
-            elif args.bf16:
-                attention_mask = attention_mask.bfloat16()
-
-            # Attention mask must be bool.
-            args.attn_mask = attention_mask.to(torch.bool)
-
-            # For prertaining, since sequence length is fixed, cache rotary embedding in args, to avoid communicating around
-            if args.use_rotary_position_embeddings:
-                update_rotary_pos_emb(args.seq_length)
-
-        else:
-            model = GPTModel(
-                config=config,
-                num_tokentypes=0,
-                parallel_output=True,
-                pre_process=pre_process,
-                post_process=post_process
-            )
-    see_memory_usage(f"After Building Model", force=True)
-    return model
-
-
-def get_batch(data_iterator):
-    """Generate a batch"""
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    # Items and their type.
-    keys = ['text']
-    datatype = torch.int64
-
-    # Broadcast data.
-    if data_iterator is not None:
-        data = next(data_iterator)
-    else:
-        data = None
-    data_b = tensor_parallel.broadcast_data(keys, data, datatype)
-
-    # Unpack.
-    tokens_ = data_b['text'].long()
-    labels = tokens_[:, 1:].contiguous()
-    tokens = tokens_[:, :-1].contiguous()
-    # Get the masks and postition ids.
-    skip_mask = args.use_flash_attn or args.use_flash_attn_triton
-    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
-        tokens,
-        tokenizer.eod,
-        args.reset_position_ids,
-        args.reset_attention_mask,
-        args.eod_mask_loss,
-        skip_mask)
-
-    # For DS's sequence parallel
-    seq_parallel_world_size = mpu.get_sequence_parallel_world_size()
-    seq_parallel_world_rank = mpu.get_sequence_parallel_rank()
-
-    # For Megatron's sequence parallel
-    if args.sequence_parallel:
-        seq_parallel_world_size = mpu.get_tensor_model_parallel_world_size()
-        seq_parallel_world_rank = mpu.get_tensor_model_parallel_rank()
-    seq_length = tokens.size(1)
-
-    assert seq_length % seq_parallel_world_size == 0
-    sub_seq_length = seq_length // seq_parallel_world_size
-    sub_seq_start = seq_parallel_world_rank * sub_seq_length
-    sub_seq_end = (seq_parallel_world_rank + 1) * sub_seq_length
-
-    tokens = tokens[:, sub_seq_start:sub_seq_end]
-    position_ids = position_ids[:, sub_seq_start:sub_seq_end]
-    # For DS's sequence parallel
-    if mpu.get_sequence_parallel_world_size() > 1:
-        labels = labels[:, sub_seq_start:sub_seq_end]
-
-    return tokens, labels, loss_mask, attention_mask, position_ids
-
-def data_post_process(data, data_sampler_state_dict):
-    args = get_args()
-    if args.data_efficiency_curriculum_learning:
-        if 'seqlen_truncate' in data_sampler_state_dict['current_difficulties']:
-            args.data_efficiency_curriculum_learning_seqlen_type = 'seqlen_truncate'
-            current_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_truncate']
-            if current_seqlen < args.seq_length:
-                data['text'] = data['text'][:, :(current_seqlen+1)].contiguous()
-        elif 'seqlen_reshape' in data_sampler_state_dict['current_difficulties']:
-            args.data_efficiency_curriculum_learning_seqlen_type = 'seqlen_reshape'
-            current_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_reshape']
-            if current_seqlen < args.seq_length:
-                orig_num_token = torch.numel(data['text'])
-                reshape_len = (data['text'].size()[1] // (current_seqlen+1)) * (current_seqlen+1)
-                data['text'] = torch.cat((data['text'][:, :reshape_len].contiguous().view(-1, current_seqlen+1),
-                    data['text'][:, -(current_seqlen+1):]), 0).contiguous()
-                num_row = math.ceil(orig_num_token / (current_seqlen+1))
-                num_row = min(num_row, data['text'].size()[0])
-                if num_row > 1 and num_row % 2 != 0:
-                    num_row -= 1
-                data['text'] = data['text'][:num_row, :].contiguous()
-        else:
-            args.data_efficiency_curriculum_learning_seqlen_type = None
-    return data
-
-def get_batch_pipe(data):
-    """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`"""
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    # Items and their type.
-    keys = ['input_ids','labels']
-    datatype = torch.int64
-
-    # Broadcast data.
-    data_b = tensor_parallel.broadcast_data(keys, data, datatype)
-
-    # Unpack.
-    # HF will automatically handle tokens alignment for labels, while in Megatron, we need to manually adjust it.
-    labels = data_b['labels'].long()[:,1:].contiguous()
-    tokens = data_b['input_ids'].long()[:,:-1].contiguous()
-
-    # Get the masks and postition ids.
-    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
-        tokens,
-        tokenizer.eod,
-        args.reset_position_ids,
-        args.reset_attention_mask,
-        args.eod_mask_loss)
-    
-    # mask loss for SFT training
-    # we use padding to fill the prompt in the labels
-    loss_mask = labels.ne(tokenizer.pad)
-
-    if args.curriculum_learning_legacy and args.curriculum_seqlen < tokens.size()[1]:
-        # seqlen-based curriculum learning
-        # tokens, position_ids, labels, loss_mask have size [batch size, seqlen]
-        tokens = tokens[:, :args.curriculum_seqlen].contiguous()
-        position_ids = position_ids[:, :args.curriculum_seqlen].contiguous()
-        if labels is not None:
-            labels = labels[:, :args.curriculum_seqlen].contiguous()
-        loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous()
-    
-    return (tokens, position_ids, attention_mask), (labels, loss_mask)
-
-
-def loss_func(loss_mask, moe_loss, mos_loss, output_tensor):
-    args = get_args()
-    losses = output_tensor.float()
-    loss_mask = loss_mask.view(-1).float()
-    loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
-
-    # Reduce loss for logging.
-    averaged_loss = average_losses_across_data_parallel_group([loss])
-    if args.mos or args.kd:
-        # assert max(args.num_experts) >= 1
-        loss = loss + moe_loss + mos_loss
-        if args.mos:
-            return loss, {'total loss': loss, 'lm loss': averaged_loss[0], 'moe loss': moe_loss, 'mos loss': mos_loss}
-        elif args.kd:
-            return loss, {'total loss': loss, 'lm loss': averaged_loss[0], 'moe loss': moe_loss, 'kd loss': mos_loss}
-        print_rank_0('>>> total loss: {}, lm loss {}, kd loss {}'.format(loss, averaged_loss[0], mos_loss))
-    else:
-        if max(args.num_experts) <= 1:
-            return loss, {'lm loss': averaged_loss[0]}
-        else:
-            loss = loss + moe_loss
-            return loss, {'lm loss': averaged_loss[0], 'moe loss': moe_loss}
-
-def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, attention_mask):
-    mos_loss = 0
-    alpha = args.kd_alpha_ce
-    beta = args.kd_beta_ce
-    kd_temp = args.kd_temp
-
-    if teacher_model:
-        with torch.no_grad():
-            if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length:
-                assert args.curriculum_seqlen is not None
-                curriculum_seqlen = args.curriculum_seqlen
-                tokens = tokens[:, :curriculum_seqlen].contiguous()
-                position_ids = position_ids[:, :curriculum_seqlen].contiguous()
-                attention_mask = attention_mask[:, :, :curriculum_seqlen, :curriculum_seqlen].contiguous()
-                # No need to truncate labels as we do not need it for the teacher logits
-            tea_output, tea_other_losses = teacher_model(tokens, position_ids, attention_mask)
-            assert stu_output.size() == tea_output.size(), 'teacher and student output should match in size. Student: {}, Teacher: {}, CL seq length {}'.format(stu_output.size(), tea_output.size(), args.curriculum_seqlen)
-
-        student_logits = F.log_softmax(stu_output / kd_temp, dim=2)
-        tea_logits = F.softmax(tea_output / kd_temp, dim=2) # The target logits is expected to be probabilities. If we use log_softmax, then we need to set target_log to true when initializing the KLDivLoss.
-
-        mos_loss = kd_temp * kd_temp * nn.KLDivLoss(reduction='batchmean')(student_logits, tea_logits)
-
-        mos_loss = mos_loss.div(args.seq_length) * beta
-    return mos_loss
-
-def forward_step(data_iterator, model):
-    """Forward step."""
-    args = get_args()
-    timers = get_timers()
-
-    # Get the batch.
-    timers('batch-generator', log_level=2).start()
-    tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
-        data_iterator)
-    timers('batch-generator').stop()
-
-    if args.data_efficiency_curriculum_learning:
-        args.curriculum_seqlen = tokens.size()[1]
-        if hasattr(args, 'data_efficiency_curriculum_learning_seqlen_type') and \
-            args.data_efficiency_curriculum_learning_seqlen_type == 'seqlen_reshape':
-            args.data_efficiency_curriculum_learning_numel = torch.numel(tokens)
-
-    if args.mos or args.kd:
-        # The forward func can return either the loss or the logits, depending on whether passing in the labels or not.
-        stu_output, other_losses = model(tokens, position_ids, attention_mask)
-        if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length:
-            assert args.curriculum_seqlen is not None
-            labels = labels[:, :args.curriculum_seqlen].contiguous()
-        output_tensor = tensor_parallel.vocab_parallel_cross_entropy(stu_output.contiguous().float(), labels)
-    else:
-        output_tensor, other_losses = model(tokens, position_ids, attention_mask,
-                                            labels=labels)
-    if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length:
-        loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous()
-
-    moe_losses = []
-    for moe_loss in other_losses:
-        if moe_loss is not None:
-            moe_losses.append(moe_loss)
-    moe_loss = sum(moe_losses) * args.moe_loss_coeff
-
-    mos_loss = 0
-    if args.mos or args.kd:
-        assert model.training
-        if args.teacher_forward and args.teacher_model is not None:
-            mos_loss = calculate_mos_loss(args, stu_output,
-                args.teacher_model[0], tokens, position_ids, attention_mask)
-
-    # Output_tensor stores the standard loss, loos_func calculates the total loss.
-    return output_tensor, partial(loss_func, loss_mask, moe_loss, mos_loss)
-
-
-def prompt_train_valid_test_datasets_provider(train_val_test_num_samples):
-    """Build train, valid, and test datasets."""
-    args = get_args()
-
-    print_rank_0('> building finetune prompt datasets '
-                 'for llama ...')
-
-    tokenizer = get_tokenizer()
-    
-   # The finetune dataset is not large and defaults to using one file
-    train_ds = SupervisedDataset(args.data_path[0],tokenizer)
-    return train_ds, None ,None
-    
-def command_exists(cmd):
-    result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
-    return result.wait() == 0
-
-
-def git_ds_info():
-    from deepspeed.env_report import main as ds_report
-    ds_report()
-
-    # Write out version/git info
-    git_hash_cmd = "git rev-parse --short HEAD"
-    git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
-    if command_exists('git'):
-        try:
-            result = subprocess.check_output(git_hash_cmd, shell=True)
-            git_hash = result.decode('utf-8').strip()
-            result = subprocess.check_output(git_branch_cmd, shell=True)
-            git_branch = result.decode('utf-8').strip()
-        except subprocess.CalledProcessError:
-            git_hash = "unknown"
-            git_branch = "unknown"
-    else:
-        git_hash = "unknown"
-        git_branch = "unknown"
-    print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****')
-
-
-if __name__ == "__main__":
-    git_ds_info()
-    pretrain(prompt_train_valid_test_datasets_provider,
-             model_provider,
-             ModelType.encoder_or_decoder,
-             forward_step,
-             data_post_process=data_post_process)
diff --git a/toolbox/Megatron-DeepSpeed/images/Achieved_petaFLOPs.png b/toolbox/Megatron-DeepSpeed/images/Achieved_petaFLOPs.png
deleted file mode 100644
index 3431099f3f4b1e1421d1024f12051bec0ccc4f9c..0000000000000000000000000000000000000000
Binary files a/toolbox/Megatron-DeepSpeed/images/Achieved_petaFLOPs.png and /dev/null differ
diff --git a/toolbox/Megatron-DeepSpeed/images/cases_april2021.png b/toolbox/Megatron-DeepSpeed/images/cases_april2021.png
deleted file mode 100644
index 8a6d9e9f8b649900162efc942f2e2e448c15777c..0000000000000000000000000000000000000000
Binary files a/toolbox/Megatron-DeepSpeed/images/cases_april2021.png and /dev/null differ
diff --git a/toolbox/Megatron-DeepSpeed/install_megatron-deepspeed.sh b/toolbox/Megatron-DeepSpeed/install_megatron-deepspeed.sh
deleted file mode 100644
index e08e9f8270da5377c92256efcc204af0576f8802..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/install_megatron-deepspeed.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-TARGET_DIR=${TARGET_DIR:-}
-
-PYTHON_PATH=$(which python3)
-
-PYTHON_DIST_PATH=${TARGET_DIR}/lib/python3/dist-packages
-
-PKG_DIR="build_pip"
-PKG_NAME="megatron_deepspeed"
-
-if [[ ! -d ${PKG_DIR} ]]; then
-  echo "ERROR: Package directory ${PKG_DIR} doesn't exist"
-  exit 1
-fi
-
-latest_pkg="$(ls -t ${PKG_DIR} | grep ${PKG_NAME} | head -1)"
-if [[ "${latest_pkg}" == "" ]]; then
-  echo "ERROR: Cannot find latest ${PKG_NAME} package"
-  exit 1
-else
-  echo "INFO: Found latest package ${latest_pkg} in directory ${PKG_DIR}"
-fi
-
-if [[ "${TARGET_DIR}" != "" ]]; then
-  mkdir tmp
-  cp -R ${PYTHON_DIST_PATH}/bin ./tmp/
-  ${PYTHON_PATH} -m pip install --upgrade --no-deps -t ${PYTHON_DIST_PATH} ${PKG_DIR}/${latest_pkg} || exit
-  cp -n ./tmp/bin/* ${PYTHON_DIST_PATH}/bin
-  rm -rf ./tmp
-  echo "pycuda installed in ${PYTHON_DIST_PATH}; please add it to your PYTHONPATH."
-else
-  ${PYTHON_PATH} -m pip uninstall ${PKG_NAME} -y
-  ${PYTHON_PATH} -m pip install ${PKG_DIR}/${latest_pkg} || exit
-fi
-
-# Return 0 status if all finished
-exit 0
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/__init__.py
deleted file mode 100644
index 85fc8fa64d91e9c3818ce3bdff0ff49c98b81548..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import torch
-
-from .global_vars import get_args, get_retro_args, set_args
-from .global_vars import get_rlhf_args, set_rlhf_args
-from .global_vars import get_current_global_batch_size
-from .global_vars import get_num_microbatches
-from .global_vars import get_signal_handler
-from .global_vars import update_num_microbatches
-from .global_vars import get_tokenizer
-from .global_vars import get_tensorboard_writer
-from .global_vars import get_wandb_writer
-from .global_vars import get_adlr_autoresume
-from .global_vars import get_timers
-from .initialize  import initialize_megatron
-
-from .utils import (print_rank_0,
-                    is_last_rank,
-                    print_rank_last,
-                    is_rank_0,
-                    is_aml)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/checkpointing.py b/toolbox/Megatron-DeepSpeed/megatron_ds/checkpointing.py
deleted file mode 100644
index fc74a80b0d1b4ee8158f7e7b5b4ac5859df0d74c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/checkpointing.py
+++ /dev/null
@@ -1,759 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-"""Input/output checkpointing."""
-
-import os
-import random
-import sys
-import numpy as np
-
-import torch
-
-from megatron_ds import update_num_microbatches,get_tokenizer
-from megatron_ds.core import mpu, tensor_parallel
-from .global_vars import get_args
-from .utils import (unwrap_model,
-                    print_rank_0)
-
-from deepspeed.checkpoint import (
-    ORIGINAL_VOCAB_SIZE,
-    PADDED_VOCAB_SIZE,
-    UNIVERSAL_CHECKPOINT_INFO,
-    UNIVERSAL_CHECKPOINT_VERSION_KEY,
-    UNIVERSAL_CHECKPOINT_VERSION_VALUE,
-)
-
-_CHECKPOINT_VERSION = None
-
-
-def set_checkpoint_version(value):
-    global _CHECKPOINT_VERSION
-    if _CHECKPOINT_VERSION is not None:
-        assert _CHECKPOINT_VERSION == value, \
-            "checkpoint versions do not match"
-    _CHECKPOINT_VERSION = value
-
-
-def get_checkpoint_version():
-    global _CHECKPOINT_VERSION
-    return _CHECKPOINT_VERSION
-
-
-def check_checkpoint_args(checkpoint_args):
-    """Ensure fixed arguments for a model are the same for the input
-    arguments and the one retrieved from checkpoint."""
-    args = get_args()
-
-    def _compare(arg_name, old_arg_name=None, default=None):
-        if old_arg_name is not None:
-            ckpt_arg_name = old_arg_name
-        else:
-            ckpt_arg_name = arg_name
-        if default is not None:
-            checkpoint_value = getattr(checkpoint_args, ckpt_arg_name, default)
-        else:
-            checkpoint_value = getattr(checkpoint_args, ckpt_arg_name)
-        args_value = getattr(args, arg_name)
-        error_message = '{} value from checkpoint ({}) is not equal to the ' \
-                        'input argument value ({}).'.format(
-                            arg_name, checkpoint_value, args_value)
-        assert checkpoint_value == args_value, error_message
-
-    _compare('num_layers')
-    _compare('hidden_size')
-    _compare('num_attention_heads')
-    _compare('add_position_embedding', default=True)
-    if args.vocab_file:
-        _compare('max_position_embeddings')
-        if not args.universal_checkpoint:
-            _compare('make_vocab_size_divisible_by')
-            _compare('padded_vocab_size')
-        _compare('tokenizer_type')
-    if args.data_parallel_random_init:
-        _compare('data_parallel_random_init')
-    if get_checkpoint_version() < 3.0 and not args.universal_checkpoint:
-        _compare('tensor_model_parallel_size',      
-                 old_arg_name='model_parallel_size')
-    if get_checkpoint_version() >= 3.0 and not args.universal_checkpoint:
-        _compare('tensor_model_parallel_size')
-        _compare('pipeline_model_parallel_size')
-
-
-def ensure_directory_exists(filename):
-    """Build filename's path if it does not already exists."""
-    dirname = os.path.dirname(filename)
-    os.makedirs(dirname, exist_ok = True)
-
-
-def get_checkpoint_name(checkpoints_path, iteration, release=False,
-                        pipeline_parallel=None,
-                        tensor_rank=None, pipeline_rank=None,
-                        expert_parallel=None, expert_rank=None):
-    """Determine the directory name for this rank's checkpoint."""
-    if release:
-        directory = 'release'
-    else:
-        directory = 'iter_{:07d}'.format(iteration)
-
-    # Use both the tensor and pipeline MP rank.
-    if pipeline_parallel is None:
-        pipeline_parallel = (mpu.get_pipeline_model_parallel_world_size() > 1)
-    if tensor_rank is None:
-        tensor_rank = mpu.get_tensor_model_parallel_rank()
-    if pipeline_rank is None:
-        pipeline_rank = mpu.get_pipeline_model_parallel_rank()
-    if expert_parallel is None:
-        expert_parallel = (mpu.get_expert_model_parallel_world_size() > 1)
-    if expert_rank is None:
-        expert_rank = mpu.get_expert_model_parallel_rank()
-
-    # Use both the tensor and pipeline MP rank. If using the distributed
-    # optimizer, then the optimizer's path must additionally include the
-    # data parallel rank.
-    if not pipeline_parallel:
-        common_path = os.path.join(checkpoints_path, directory,
-                            f'mp_rank_{tensor_rank:02d}')
-    else:
-        common_path = os.path.join(checkpoints_path, directory,
-                f'mp_rank_{tensor_rank:02d}_{pipeline_rank:03d}')
-
-    if expert_parallel:
-        common_path = common_path + f'_{expert_rank:03d}'
-
-    return os.path.join(common_path, "model_optim_rng.pt")
-
-
-def get_distributed_optimizer_checkpoint_name(model_checkpoint_name):
-    return os.path.join(os.path.dirname(model_checkpoint_name),
-                        "distrib_optim.pt")
-
-
-def find_checkpoint_rank_0(checkpoints_path, iteration, release=False):
-    """Finds the checkpoint for rank 0 without knowing if we are using
-    pipeline parallelism/expert parallelism or not.
-
-    Since the checkpoint naming scheme changes if pipeline or expert
-    parallelism is present, we need to look for both naming schemes if
-    we don't know if the checkpoint has pipeline or expert parallelism.
-    """
-
-    # Look for checkpoint with no pipelining and no expert parallelism
-    filename = get_checkpoint_name(checkpoints_path, iteration, release,
-                                   pipeline_parallel=False,
-                                   tensor_rank=0, pipeline_rank=0,
-                                   expert_parallel=False, expert_rank=0)
-    if os.path.isfile(filename):
-        return filename
-
-    # Look for checkpoint with no pipelining and expert parallelism
-    filename = get_checkpoint_name(checkpoints_path, iteration, release,
-                                   pipeline_parallel=False,
-                                   tensor_rank=0, pipeline_rank=0,
-                                   expert_parallel=True, expert_rank=0)
-    if os.path.isfile(filename):
-        return filename
-
-    # Look for checkpoint with pipelining and no expert parallelism
-    filename = get_checkpoint_name(checkpoints_path, iteration, release,
-                                   pipeline_parallel=True,
-                                   tensor_rank=0, pipeline_rank=0,
-                                   expert_parallel=False, expert_rank=0)
-    if os.path.isfile(filename):
-        return filename
-
-    # Look for checkpoint with pipelining and expert parallelism
-    filename = get_checkpoint_name(checkpoints_path, iteration, release,
-                                   pipeline_parallel=True,
-                                   tensor_rank=0, pipeline_rank=0,
-                                   expert_parallel=True, expert_rank=0)
-    if os.path.isfile(filename):
-        return filename
-
-    return None, None
-
-
-def get_checkpoint_tracker_filename(checkpoints_path):
-
-    """Tracker file rescords the latest chckpoint during
-    training to restart from."""
-    return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt')
-
-
-def read_metadata(tracker_filename):
-    # Read the tracker file and either set the iteration or
-    # mark it as a release checkpoint.
-    iteration = 0
-    release = False
-    with open(tracker_filename, 'r') as f:
-        metastring = f.read().strip()
-        try:
-            iteration = int(metastring)
-        except ValueError:
-            release = metastring == 'release'
-            if not release:
-                print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format(
-                    tracker_filename))
-                sys.exit()
-    assert iteration > 0 or release, 'error parsing metadata file {}'.format(
-        tracker_filename)
-
-    # Get the max iteration retrieved across the ranks.
-    if torch.distributed.is_initialized():
-        iters_cuda = torch.cuda.LongTensor([iteration])
-        torch.distributed.all_reduce(iters_cuda, op=torch.distributed.ReduceOp.MAX)
-        max_iter = iters_cuda[0].item()
-
-        # We should now have all the same iteration.
-        # If not, print a warning and chose the maximum
-        # iteration across all ranks.
-        if iteration != max_iter:
-            rank = torch.distributed.get_rank()
-            print('WARNING: on rank {} found iteration {} in the '
-                  'metadata while max iteration across the ranks '
-                  'is {}, replacing it with max iteration.'.format(
-                      rank, iteration, max_iter), flush=True)
-    else:
-        # When loading a checkpoint outside of training (for example,
-        # when editing it), we might not have torch distributed
-        # initialized, in this case, just assume we have the latest
-        max_iter = iteration
-    return max_iter, release
-
-
-def get_rng_state():
-    """ collect rng state across data parallel ranks """
-    args = get_args()
-    rng_state = {
-        'random_rng_state': random.getstate(),
-        'np_rng_state': np.random.get_state(),
-        'torch_rng_state': torch.get_rng_state(),
-        'cuda_rng_state': torch.cuda.get_rng_state(),
-        'rng_tracker_states': tensor_parallel.get_cuda_rng_tracker().get_states()}
-
-    rng_state_list = None
-    if torch.distributed.is_initialized() and \
-            mpu.get_data_parallel_world_size() > 1 and \
-            args.data_parallel_random_init:
-        rng_state_list = \
-            [None for i in range(mpu.get_data_parallel_world_size())]
-        torch.distributed.all_gather_object(
-            rng_state_list,
-            rng_state,
-            group=mpu.get_data_parallel_group())
-    else:
-        rng_state_list = [rng_state]
-
-    return rng_state_list
-
-
-def save_checkpoint(iteration, model, optimizer, opt_param_scheduler):
-    """Save a model checkpoint."""
-    args = get_args()
-
-    # Only rank zero of the data parallel writes to the disk.
-    model = unwrap_model(model)
-
-    print_rank_0('saving checkpoint at iteration {:7d} to {}'.format(
-        iteration, args.save))
-
-    # Collect rng state across data parallel ranks.
-    rng_state = get_rng_state()
-
-    # Checkpoint name.
-    checkpoint_name = get_checkpoint_name(args.save, iteration)
-
-    # Save distributed optimizer's custom parameter state.
-    if args.use_distributed_optimizer and not args.no_save_optim and optimizer is not None:
-        optim_checkpoint_name = \
-            get_distributed_optimizer_checkpoint_name(checkpoint_name)
-        ensure_directory_exists(optim_checkpoint_name)
-        optimizer.save_parameter_state(optim_checkpoint_name)
-
-    # Collect args, model, RNG.
-    if not torch.distributed.is_initialized() \
-            or mpu.get_data_modulo_expert_parallel_rank() == 0:
-
-        # Arguments, iteration, and model.
-        state_dict = {}
-        state_dict['args'] = args
-        state_dict['checkpoint_version'] = 3.0
-        state_dict['iteration'] = iteration
-        if len(model) == 1:
-            state_dict['model'] = model[0].state_dict_for_save_checkpoint()
-        else:
-            for i in range(len(model)):
-                mpu.set_virtual_pipeline_model_parallel_rank(i)
-                state_dict['model%d' % i] = \
-                    model[i].state_dict_for_save_checkpoint()
-
-        # Optimizer stuff.
-        if not args.no_save_optim:
-            if optimizer is not None:
-                state_dict['optimizer'] = optimizer.state_dict()
-            if opt_param_scheduler is not None:
-                state_dict['opt_param_scheduler'] = \
-                    opt_param_scheduler.state_dict()
-
-        # RNG states.
-        if not args.no_save_rng:
-            state_dict["rng_state"] = rng_state
-
-        # Save.
-        ensure_directory_exists(checkpoint_name)
-        torch.save(state_dict, checkpoint_name)
-
-    # Wait so everyone is done (necessary)
-    if torch.distributed.is_initialized():
-        torch.distributed.barrier()
-
-    print_rank_0('  successfully saved checkpoint at iteration {:7d} to {}' \
-                 .format(iteration, args.save))
-
-    # And update the latest iteration
-    if not torch.distributed.is_initialized() \
-       or(torch.distributed.get_rank() % 8) == 0: ## 确保多机每个节点都会保存此文件
-        tracker_filename = get_checkpoint_tracker_filename(args.save)
-        with open(tracker_filename, 'w') as f:
-            f.write(str(iteration))
-
-    # Wait so everyone is done (not necessary)
-    if torch.distributed.is_initialized():
-        torch.distributed.barrier()
-
-
-def _transpose_first_dim(t, num_splits, num_splits_first, model):
-    input_shape = t.size()
-    # We use a self_attention module but the values extracted aren't
-    # specific to self attention so should work for cross attention as well
-    while hasattr(model, 'module'):
-        model = model.module
-    attention_module = model.language_model.encoder.layers[0].self_attention
-    hidden_size_per_attention_head = attention_module.hidden_size_per_attention_head
-    num_attention_heads_per_partition = attention_module.num_attention_heads_per_partition
-    if num_splits_first:
-        """[num_splits * np * hn, h]
-        -->(view) [num_splits, np, hn, h]
-        -->(tranpose) [np, num_splits, hn, h]
-        -->(view) [np * num_splits * hn, h] """
-
-        intermediate_shape = \
-            (num_splits, num_attention_heads_per_partition,
-             hidden_size_per_attention_head) + input_shape[1:]
-
-        t = t.view(*intermediate_shape)
-        t = t.transpose(0, 1).contiguous()
-    else:
-        """[np * hn * num_splits, h]
-        -->(view) [np, hn, num_splits, h]
-        -->(tranpose) [np, num_splits, hn, h]
-        -->(view) [np * num_splits * hn, h] """
-
-        intermediate_shape = \
-            (num_attention_heads_per_partition,
-             hidden_size_per_attention_head, num_splits) +\
-             input_shape[1:]
-
-        t = t.view(*intermediate_shape)
-        t = t.transpose(1, 2).contiguous()
-    t = t.view(*input_shape)
-
-    return t
-
-
-def fix_query_key_value_ordering(model, checkpoint_version):
-    """Fix up query/key/value matrix ordering if checkpoint
-    version is smaller than 2.0
-    """
-    if checkpoint_version < 2.0:
-        if isinstance(model, list):
-            assert len(model)==1
-            model = model[0]
-        for name, param in model.named_parameters():
-            if name.endswith(('.query_key_value.weight', '.query_key_value.bias')):
-                if checkpoint_version == 0:
-                    fixed_param = _transpose_first_dim(param.data, 3, True, model)
-                elif checkpoint_version == 1.0:
-                    fixed_param = _transpose_first_dim(param.data, 3, False, model)
-                else:
-                    print_rank_0(f"Invalid checkpoint version {checkpoint_version}.")
-                    sys.exit()
-                param.data.copy_(fixed_param)
-            if name.endswith(('.key_value.weight', '.key_value.bias')):
-                if checkpoint_version == 0:
-                    fixed_param = _transpose_first_dim(param.data, 2, True, model)
-                elif checkpoint_version == 1.0:
-                    fixed_param = _transpose_first_dim(param.data, 2, False, model)
-                else:
-                    print_rank_0(f"Invalid checkpoint version {checkpoint_version}.")
-                    sys.exit()
-                param.data.copy_(fixed_param)
-        print_rank_0(" succesfully fixed query-key-values ordering for"
-                     " checkpoint version {}".format(checkpoint_version))
-
-
-def _load_base_checkpoint(load_dir, rank0=False):
-    """ Load the base state_dict from the given directory
-
-    If rank0 is true, just loads rank 0 checkpoint, ignoring arguments.
-    """
-
-    # Read the tracker file and set the iteration.
-    tracker_filename = get_checkpoint_tracker_filename(load_dir)
-
-    # If no tracker file, return nothing
-    if not os.path.isfile(tracker_filename):
-        if not rank0:
-            print_rank_0('WARNING: could not find the metadata file {} '.format(
-                tracker_filename))
-            print_rank_0('    will not load any checkpoints and will start from '
-                         'random')
-        return None, "", False
-
-    # Otherwise, read the tracker file and either set the iteration or
-    # mark it as a release checkpoint.
-    iteration, release = read_metadata(tracker_filename)
-
-    # Checkpoint.
-    if rank0:
-        checkpoint_name = find_checkpoint_rank_0(load_dir, iteration, release)
-    else:
-        checkpoint_name = get_checkpoint_name(load_dir, iteration, release)
-        if release:
-            print_rank_0(f' loading release checkpoint from {load_dir}')
-        else:
-            print_rank_0(f' loading checkpoint from {load_dir} at iteration {iteration}')
-
-    # Load the checkpoint.
-    try:
-        state_dict = torch.load(checkpoint_name, map_location='cpu')
-    except ModuleNotFoundError:
-        from megatron_ds.fp16_deprecated import loss_scaler
-        # For backward compatibility.
-        if not rank0:
-            print_rank_0(' > deserializing using the old code structure ...')
-        sys.modules['fp16.loss_scaler'] = sys.modules[
-            'megatron_ds.fp16_deprecated.loss_scaler']
-        sys.modules['megatron_ds.fp16.loss_scaler'] = sys.modules[
-            'megatron_ds.fp16_deprecated.loss_scaler']
-        state_dict = torch.load(checkpoint_name, map_location='cpu')
-        sys.modules.pop('fp16.loss_scaler', None)
-        sys.modules.pop('megatron_ds.fp16.loss_scaler', None)
-    except BaseException as e:
-        print_rank_0('could not load the checkpoint')
-        print_rank_0(e)
-        sys.exit()
-
-    return state_dict, checkpoint_name, release
-
-
-def load_args_from_checkpoint(args, load_arg='load'):
-    """Set required arguments from the checkpoint specified in the
-    arguments.
-
-    Will overwrite arguments that have a non-None default value, but
-    will leave any arguments that default to None as set.
-
-    Returns the same args NameSpace with the new values added/updated.
-
-    If no checkpoint is specified in args, or if the checkpoint is
-    there but invalid, the arguments will not be modified
-
-    """
-    load_dir = getattr(args, load_arg)
-
-    if load_dir is None:
-        print_rank_0('No load directory specified, using provided arguments.')
-        return args
-
-    state_dict, checkpoint_name, release = _load_base_checkpoint(load_dir, rank0=True)
-
-    # Args.
-    if not state_dict:
-        print_rank_0('Checkpoint not found to provide arguments, using provided arguments.')
-        return args
-
-    if 'args' not in state_dict:
-        print_rank_0('Checkpoint provided does not have arguments saved, using provided arguments.')
-        return args
-
-    checkpoint_args = state_dict['args']
-    checkpoint_version = state_dict.get('checkpoint_version', 0)
-    args.iteration = state_dict['iteration']
-
-    # One-off conversion for foundation models
-    if hasattr(checkpoint_args, 'disable_bias_linear'):
-        setattr(checkpoint_args, 'add_bias_linear', not getattr(checkpoint_args, 'disable_bias_linear'))
-
-    def _set_arg(arg_name, old_arg_name=None, force=False):
-        if not force and getattr(args, arg_name, None) is not None:
-            return
-
-        if old_arg_name is not None:
-            checkpoint_value = getattr(checkpoint_args, old_arg_name, None)
-        else:
-            checkpoint_value = getattr(checkpoint_args, arg_name, None)
-
-        if checkpoint_value is not None:
-            print_rank_0(f"Setting {arg_name} to {checkpoint_value} from checkpoint")
-            setattr(args, arg_name, checkpoint_value)
-        else:
-            print_rank_0(f"Checkpoint did not provide arguments {arg_name}")
-
-    _set_arg('num_layers')
-    _set_arg('hidden_size')
-    _set_arg('ffn_hidden_size')
-    _set_arg('seq_length')
-    _set_arg('num_attention_heads')
-    _set_arg('num_query_groups', force=True)
-    _set_arg('group_query_attention', force=True)
-    _set_arg('kv_channels')
-    _set_arg('max_position_embeddings')
-    _set_arg('position_embedding_type', force=True)
-    _set_arg('add_position_embedding', force=True)
-    _set_arg('use_rotary_position_embeddings', force=True)
-    _set_arg('rotary_percent', force=True)
-    _set_arg('add_bias_linear', force=True)
-    _set_arg('swiglu', force=True)
-    _set_arg('untie_embeddings_and_output_weights', force=True)
-    _set_arg('apply_layernorm_1p', force=True)
-    _set_arg('normalization', force=True)
-    _set_arg('tokenizer_type')
-    _set_arg('padded_vocab_size')
-    if checkpoint_version < 3.0:
-        _set_arg('tensor_model_parallel_size',
-                 'model_parallel_size')
-    else:
-        _set_arg('tensor_model_parallel_size', force=True)
-        _set_arg('pipeline_model_parallel_size', force=True)
-        _set_arg('virtual_pipeline_model_parallel_size', force=True)
-        _set_arg('num_layers_per_virtual_pipeline_stage')
-    return args, checkpoint_args
-
-
-def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', strict=True):
-    """Load a model checkpoint and return the iteration.
-    strict (bool): whether to strictly enforce that the keys in
-        :attr:`state_dict` of the checkpoint match the names of
-        parameters and buffers in model.
-    """
-    args = get_args()
-    load_dir = getattr(args, load_arg)
-
-    model = unwrap_model(model)
-
-    state_dict, checkpoint_name, release = _load_base_checkpoint(load_dir, rank0=False)
-
-    # Checkpoint not loaded.
-    if state_dict is None:
-
-        # Conditionally exit at this point.
-        if args.exit_on_missing_checkpoint:
-            print_rank_0(">> '--exit-on-missing-checkpoint' set ... exiting. <<")
-            torch.distributed.barrier()
-            sys.exit()
-
-        # Iteration defaults to 0.
-        return 0
-
-    # Set checkpoint version.
-    set_checkpoint_version(state_dict.get('checkpoint_version', 0))
-
-    # Set iteration.
-    if args.finetune or release:
-        iteration = 0
-    else:
-        try:
-            iteration = state_dict['iteration']
-        except KeyError:
-            try:  # Backward compatible with older checkpoints
-                iteration = state_dict['total_iters']
-            except KeyError:
-                print_rank_0('A metadata file exists but unable to load '
-                             'iteration from checkpoint {}, exiting'.format(
-                                 checkpoint_name))
-                sys.exit()
-
-    # Check arguments.
-    assert args.consumed_train_samples == 0
-    assert args.consumed_valid_samples == 0
-    if 'args' in state_dict and not args.finetune:
-        checkpoint_args = state_dict['args']
-        check_checkpoint_args(checkpoint_args)
-        args.consumed_train_samples = getattr(checkpoint_args,
-                                              'consumed_train_samples', 0)
-        update_num_microbatches(consumed_samples=args.consumed_train_samples)
-        args.consumed_valid_samples = getattr(checkpoint_args,
-                                              'consumed_valid_samples', 0)
-    else:
-        print_rank_0('could not find arguments in the checkpoint ...')
-
-    # Model.
-    if len(model) == 1:
-        model[0].load_state_dict(state_dict['model'], strict=strict)
-    else:
-        for i in range(len(model)):
-            mpu.set_virtual_pipeline_model_parallel_rank(i)
-            model[i].load_state_dict(state_dict['model%d' % i], strict=strict)
-
-    # Fix up query/key/value matrix ordering if needed.
-    checkpoint_version = get_checkpoint_version()
-    print_rank_0(f' checkpoint version {checkpoint_version}')
-    fix_query_key_value_ordering(model, checkpoint_version)
-
-    # Optimizer.
-    if not release and not args.finetune and not args.no_load_optim:
-        try:
-            # Load state dict.
-            if optimizer is not None:
-                optimizer.load_state_dict(state_dict['optimizer'])
-
-            # Load distributed optimizer's custom parameter state.
-            if args.use_distributed_optimizer:
-                tracker_filename = get_checkpoint_tracker_filename(load_dir)
-                iteration, release = read_metadata(tracker_filename)
-                model_checkpoint_name = \
-                    get_checkpoint_name(load_dir, iteration, release)
-                optim_checkpoint_name = \
-                    get_distributed_optimizer_checkpoint_name(
-                        model_checkpoint_name)
-                optimizer.load_parameter_state(optim_checkpoint_name)
-
-            # Load scheduler.
-            if opt_param_scheduler is not None:
-                if 'lr_scheduler' in state_dict: # backward compatbility
-                    opt_param_scheduler.load_state_dict(state_dict['lr_scheduler'])
-                else:
-                    opt_param_scheduler.load_state_dict(state_dict['opt_param_scheduler'])
-        except KeyError:
-            print_rank_0('Unable to load optimizer from checkpoint {}. '
-                         'Specify --no-load-optim or --finetune to prevent '
-                         'attempting to load the optimizer state, '
-                         'exiting ...'.format(checkpoint_name))
-            sys.exit()
-    else:
-        if (args.fp16 or args.bf16) and optimizer is not None:
-            optimizer.reload_model_params()
-
-    # rng states.
-    if not release and not args.finetune and not args.no_load_rng:
-        try:
-            if 'rng_state' in state_dict:
-                # access rng_state for data parallel rank
-                if args.data_parallel_random_init:
-                    rng_state = state_dict['rng_state'][mpu.get_data_parallel_rank()]
-                else:
-                    rng_state = state_dict['rng_state'][0]
-                random.setstate(rng_state['random_rng_state'])
-                np.random.set_state(rng_state['np_rng_state'])
-                torch.set_rng_state(rng_state['torch_rng_state'])
-                torch.cuda.set_rng_state(rng_state['cuda_rng_state'])
-                # Check for empty states array
-                if not rng_state['rng_tracker_states']:
-                    raise KeyError
-                tensor_parallel.get_cuda_rng_tracker().set_states(
-                    rng_state['rng_tracker_states'])
-            else:  # backward compatability
-                random.setstate(state_dict['random_rng_state'])
-                np.random.set_state(state_dict['np_rng_state'])
-                torch.set_rng_state(state_dict['torch_rng_state'])
-                torch.cuda.set_rng_state(state_dict['cuda_rng_state'])
-                # Check for empty states array
-                if not state_dict['rng_tracker_states']:
-                    raise KeyError
-                tensor_parallel.get_cuda_rng_tracker().set_states(
-                    state_dict['rng_tracker_states'])
-        except KeyError:
-            print_rank_0('Unable to load rng state from checkpoint {}. '
-                         'Specify --no-load-rng or --finetune to prevent '
-                         'attempting to load the rng state, '
-                         'exiting ...'.format(checkpoint_name))
-            sys.exit()
-
-        if args.universal_checkpoint:
-            # TLDR: unique rng is needed for dropout to be really random on TP ranks
-            #
-            # Each tp-rank stores its model-parallel-rng states info.
-            # This is required to e.g. have different dropout patterns on different tp ranks that operate on
-            # slices of attention_probs tensor.
-            #
-            # When loading from universal checkpoint, we use mp_rank_<mp>_model_states.pt checkpoint files
-            # to restore the model-parallel-rng (<mp> is {tp-rank, pp-rank} combination).
-            # However, if the loaded checkpoint mp configuration does not match the current mp configuration,
-            # we can not use it to restore model-parallel-rng info.
-            #
-            # In the case of mp configuration change, we reconfigure the model-parallel-rng states s.t. each
-            # tp-rank will have a unique state. In order to ensure that subsequent loads from universal will
-            # not cause the model-parallel-rng states to be repeated, we add the iteration number to the base seed.
-            ckp_args = state_dict['args']
-            if ((args.tensor_model_parallel_size != ckp_args.tensor_model_parallel_size)
-                    or (args.pipeline_model_parallel_size != ckp_args.pipeline_model_parallel_size)):
-                print_rank_0(' loading universal checkpoint with modified mp configuration '
-                             '-> reconfigure tp seed')
-                tensor_parallel.model_parallel_reconfigure_tp_seed(args.seed + iteration)
-
-    # Some utilities want to load a checkpoint without distributed being initialized
-    if torch.distributed.is_initialized():
-        torch.distributed.barrier()
-
-    print_rank_0(f'  successfully loaded checkpoint from {args.load} '
-                 f'at iteration {iteration}')
-
-    # from .utils import dump_weights, dump_position_embed_weights
-    # dump_weights(f'{args.universal_checkpoint=}', iteration, model, optimizer)
-    # dump_position_embed_weights("init", 0, model)
-
-    return iteration
-
-
-def load_biencoder_checkpoint(model, only_query_model=False,
-                              only_context_model=False, custom_load_path=None):
-    """
-    selectively load retrieval models for indexing/retrieving
-    from saved checkpoints
-    """
-
-    args = get_args()
-
-    model = unwrap_model(model)
-
-    load_path = custom_load_path if custom_load_path is not None else args.load
-
-    tracker_filename = get_checkpoint_tracker_filename(load_path)
-    with open(tracker_filename, 'r') as f:
-        iteration = int(f.read().strip())
-
-    checkpoint_name = get_checkpoint_name(load_path, iteration,
-                                          args.use_distributed_optimizer,
-                                          release=False)
-
-    if mpu.get_data_parallel_rank() == 0:
-        print('global rank {} is loading checkpoint {}'.format(
-            torch.distributed.get_rank(), checkpoint_name))
-
-    state_dict = torch.load(checkpoint_name, map_location='cpu')
-    ret_state_dict = state_dict['model']
-
-    if only_query_model:
-        ret_state_dict.pop('context_model')
-    if only_context_model:
-        ret_state_dict.pop('query_model')
-
-    assert len(model) == 1
-    model[0].load_state_dict(ret_state_dict)
-    torch.distributed.barrier()
-
-    if mpu.get_data_parallel_rank() == 0:
-        print(' successfully loaded {}'.format(checkpoint_name))
-
-    return model
-
-
-def _universal_checkpoint_info(model):
-    args = get_args()
-    tokenizer = get_tokenizer()
-    info = dict()
-    info[UNIVERSAL_CHECKPOINT_VERSION_KEY] = UNIVERSAL_CHECKPOINT_VERSION_VALUE
-    info[ORIGINAL_VOCAB_SIZE] = tokenizer.vocab_size
-    info[PADDED_VOCAB_SIZE] = args.padded_vocab_size
-    info.update(model[0].universal_checkpoint_info())
-    return info
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/__init__.py
deleted file mode 100644
index fca659f9811962459d358dbcdae72231c117d134..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import megatron_ds.core.tensor_parallel
-import megatron_ds.core.utils
-from megatron_ds.core import parallel_state
-from megatron_ds.core.distributed import DistributedDataParallel
-from megatron_ds.core.inference_params import InferenceParams
-from megatron_ds.core.model_parallel_config import ModelParallelConfig
-
-# Alias parallel_state as mpu, its legacy name
-mpu = parallel_state
-
-__all__ = [
-    "parallel_state",
-    "tensor_parallel",
-    "utils",
-    "DistributedDataParallel",
-    "InferenceParams",
-    "ModelParallelConfig",
-]
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/Makefile b/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/Makefile
deleted file mode 100644
index 30f6f1231e39469fdd43c18e51bceae434b2de48..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
-CPPFLAGS += $(shell python3 -m pybind11 --includes)
-LIBNAME = helpers
-LIBEXT = $(shell $$(which python3) -c "from sysconfig import get_config_vars as gv; print(gv()['EXT_SUFFIX'])")
-
-default: $(LIBNAME)$(LIBEXT)
-
-%$(LIBEXT): %.cpp
-	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/blended_dataset.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/blended_dataset.py
deleted file mode 100644
index e1aa76e3794b1bfd72774a77e5c12b17acdf72d1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/blended_dataset.py
+++ /dev/null
@@ -1,190 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import hashlib
-import json
-import logging
-import os
-import time
-from collections import OrderedDict
-from typing import Dict, List, Tuple, Union
-
-import numpy
-import torch
-
-from megatron_ds.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
-from megatron_ds.core.datasets.megatron_dataset import MegatronDataset
-from megatron_ds.core.datasets.utils import log_single_rank, normalize
-
-logger = logging.getLogger(__name__)
-
-_VERBOSE = False
-
-
-class BlendedDataset(torch.utils.data.Dataset):
-    """Conjugating class for a set of MegatronDataset instances
-
-    Args:
-        datasets (List[MegatronDataset]): The MegatronDataset instances to blend
-
-        weights (List[float]): The weights which determines the dataset blend ratios
-
-        size (int): The number of samples to draw from the blend
-
-        config (BlendedMegatronDatasetConfig): The config object which informs dataset creation
-
-    Raises:
-        RuntimeError: When the dataset has fewer or more samples than 'size' post-initialization
-    """
-
-    def __init__(
-        self,
-        datasets: List[MegatronDataset],
-        weights: List[float],
-        size: int,
-        config: BlendedMegatronDatasetConfig,
-    ) -> None:
-        assert len(datasets) < 32767
-        assert len(datasets) == len(weights)
-        assert numpy.isclose(sum(weights), 1.0)
-        assert all(map(lambda _: type(_) == type(datasets[0]), datasets))
-
-        # Alert user to unnecessary blending
-        if len(datasets) == 1:
-            log_single_rank(
-                logger, logging.WARNING, f"Building a BlendedDataset for a single MegatronDataset"
-            )
-
-        # Redundant normalization for bitwise identical comparison with Megatron-LM
-        weights = normalize(weights)
-
-        self.datasets = datasets
-        self.weights = weights
-        self.size = size
-        self.config = config
-
-        unique_identifiers = OrderedDict()
-        unique_identifiers["class"] = type(self).__name__
-        unique_identifiers["datasets"] = [dataset.unique_identifiers for dataset in self.datasets]
-        unique_identifiers["weights"] = self.weights
-        unique_identifiers["size"] = self.size
-
-        self.unique_description = json.dumps(unique_identifiers, indent=4)
-        self.unique_description_hash = hashlib.md5(
-            self.unique_description.encode("utf-8")
-        ).hexdigest()
-
-        self.dataset_index, self.dataset_sample_index = self._build_indices()
-
-        # Check size
-        _ = self[self.size - 1]
-        try:
-            _ = self[self.size]
-            raise RuntimeError(f"{type(self).__name__} size is improperly bounded")
-        except IndexError:
-            log_single_rank(logger, logging.INFO, f"> {type(self).__name__} length: {len(self)}")
-
-    def __len__(self) -> int:
-        return self.size
-
-    def __getitem__(self, idx: int) -> Dict[str, Union[int, numpy.ndarray]]:
-        dataset_id = self.dataset_index[idx]
-        dataset_sample_id = self.dataset_sample_index[idx]
-        return {
-            "dataset_id": dataset_id,
-            **self.datasets[dataset_id][dataset_sample_id],
-        }
-
-    def _build_indices(self) -> Tuple[numpy.ndarray, numpy.ndarray]:
-        """Build and optionally cache the dataset index and the dataset sample index
-
-        The dataset index is a 1-D mapping which determines the dataset to query. The dataset
-        sample index is a 1-D mapping which determines the sample to request from the queried
-        dataset.
-
-        Returns:
-            Tuple[numpy.ndarray, numpy.ndarray]: The dataset index and the dataset sample index
-        """
-        path_to_cache = getattr(self.config, "path_to_cache")
-
-        if path_to_cache:
-            get_path_to = lambda suffix: os.path.join(
-                path_to_cache, f"{self.unique_description_hash}-{type(self).__name__}-{suffix}"
-            )
-            path_to_description = get_path_to("description.txt")
-            path_to_dataset_index = get_path_to("dataset_index.npy")
-            path_to_dataset_sample_index = get_path_to("dataset_sample_index.npy")
-            cache_hit = all(
-                map(
-                    os.path.isfile,
-                    [path_to_description, path_to_dataset_index, path_to_dataset_sample_index],
-                )
-            )
-        else:
-            cache_hit = False
-
-        if not path_to_cache or (not cache_hit and torch.distributed.get_rank() == 0):
-            log_single_rank(
-                logger, logging.INFO, f"Build and save the {type(self).__name__} indices",
-            )
-
-            # Build the dataset and dataset sample indexes
-            log_single_rank(
-                logger, logging.INFO, f"\tBuild and save the dataset and dataset sample indexes"
-            )
-            t_beg = time.time()
-            from megatron_ds.core.datasets import helpers
-
-            dataset_index = numpy.zeros(self.size, dtype=numpy.int16)
-            dataset_sample_index = numpy.zeros(self.size, dtype=numpy.int64)
-            helpers.build_blending_indices(
-                dataset_index,
-                dataset_sample_index,
-                self.weights,
-                len(self.datasets),
-                self.size,
-                _VERBOSE,
-            )
-
-            if path_to_cache:
-                os.makedirs(path_to_cache, exist_ok=True)
-                # Write the description
-                with open(path_to_description, "wt") as writer:
-                    writer.write(self.unique_description)
-                # Save the indexes
-                numpy.save(path_to_dataset_index, dataset_index, allow_pickle=True)
-                numpy.save(path_to_dataset_sample_index, dataset_sample_index, allow_pickle=True)
-            else:
-                log_single_rank(
-                    logger,
-                    logging.WARNING,
-                    "Unable to save the indexes because path_to_cache is None",
-                )
-
-            t_end = time.time()
-            log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
-
-            return dataset_index, dataset_sample_index
-
-        log_single_rank(logger, logging.INFO, f"Load the {type(self).__name__} indices")
-
-        log_single_rank(
-            logger, logging.INFO, f"\tLoad the dataset index from {path_to_dataset_index}"
-        )
-        t_beg = time.time()
-        dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode='r')
-        t_end = time.time()
-        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
-
-        log_single_rank(
-            logger,
-            logging.INFO,
-            f"\tLoad the dataset sample index from {path_to_dataset_sample_index}",
-        )
-        t_beg = time.time()
-        dataset_sample_index = numpy.load(
-            path_to_dataset_sample_index, allow_pickle=True, mmap_mode='r'
-        )
-        t_end = time.time()
-        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
-
-        return dataset_index, dataset_sample_index
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/blended_megatron_dataset_builder.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/blended_megatron_dataset_builder.py
deleted file mode 100644
index 624f90cff0d00e719819ed7fa059ffbe0b7e69ad..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/blended_megatron_dataset_builder.py
+++ /dev/null
@@ -1,335 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import logging
-import math
-from typing import Any, List, Optional, Tuple, Type, Union
-
-import numpy
-import torch
-
-from deepspeed.accelerator import get_accelerator
-from megatron_ds.core.datasets.blended_dataset import BlendedDataset
-from megatron_ds.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
-from megatron_ds.core.datasets.indexed_dataset import MMapIndexedDataset
-from megatron_ds.core.datasets.megatron_dataset import MegatronDataset
-from megatron_ds.core.datasets.utils import Split, normalize
-
-logger = logging.getLogger(__name__)
-
-DistributedDataset = Union[BlendedDataset, MegatronDataset, MMapIndexedDataset]
-
-
-class BlendedMegatronDatasetBuilder(object):
-    """Builder class for the BlendedDataset and MegatronDataset classes
-
-    Args:
-        cls (Type[MegatronDataset]): The class to instantiate, must inherit from MegatronDataset
-
-        sizes (List[int]): The minimum number of total samples to draw from each split, varies
-        with blend
-
-        config (BlendedMegatronDatasetConfig): The config object which informs dataset creation
-    """
-
-    def __init__(
-        self, cls: Type[MegatronDataset], sizes: List[int], config: BlendedMegatronDatasetConfig,
-    ):
-        self.cls = cls
-        self.sizes = sizes
-        self.config = config
-
-    def build(self) -> List[Optional[Union[BlendedDataset, MegatronDataset]]]:
-        """Build all dataset splits according to the provided blend(s)
-        
-        This method is distributed-aware and must be called on all ranks.
-        
-        The dataset splits returned can vary according to the config. Supply config.blend and
-        config.split to build BlendedDataset and/or MegatronDataset splits from the same
-        distribution. Supply config.blend_per_split to build BlendedDataset and/or MegatronDataset
-        splits from separate distributions.
-
-        Returns:
-            List[Optional[Union[BlendedDataset, MegatronDataset]]]: A list of either
-            MegatronDataset or BlendedDataset (or None) per split
-        """
-        return self._build_blended_dataset_splits()
-
-    def _build_blended_dataset_splits(
-        self,
-    ) -> List[Optional[Union[BlendedDataset, MegatronDataset]]]:
-        """Build all dataset splits according to the provided blend(s)
-        
-        See the BlendedMegatronDatasetBuilder.build alias for more information.
-
-        Returns:
-            List[Optional[Union[BlendedDataset, MegatronDataset]]]: A list of either
-            MegatronDataset or BlendedDataset (or None) per split
-        """
-
-        if getattr(self.config, "blend"):
-            blend = getattr(self.config, "blend")
-            split = getattr(self.config, "split_vector")
-
-            # Blend consists of a single prefix
-            if len(blend) == 1:
-                return self._build_megatron_dataset_splits(blend[0], split, self.sizes)
-
-            # Blend consists of multiple weights and prefixes
-            (
-                prefix_per_dataset,
-                weight_per_dataset,
-                sizes_per_dataset,
-            ) = _get_prefixes_weights_and_sizes_for_blend(blend, self.sizes)
-
-            megatron_datasets = [[] for _ in range(len(Split))]
-
-            for i in range(len(prefix_per_dataset)):
-                megatron_datasets_split = self._build_megatron_dataset_splits(
-                    prefix_per_dataset[i], split, sizes_per_dataset[i]
-                )
-                for j in range(len(megatron_datasets_split)):
-                    megatron_datasets[j].append(megatron_datasets_split[j])
-
-            # Sum over all contributing datasets, per split
-            size_per_split = list(map(sum, zip(*sizes_per_dataset)))
-
-            blended_datasets = []
-
-            for i in range(len(megatron_datasets)):
-                is_none = map(lambda _: _ is None, megatron_datasets[i])
-
-                if split[i] == 0.0:
-                    assert all(is_none)
-                    blended_datasets.append(None)
-                else:
-                    assert all(is_none) or not any(is_none)
-                    blended_datasets.append(
-                        self._build_generic_dataset(
-                            BlendedDataset,
-                            megatron_datasets[i],
-                            weight_per_dataset,
-                            size_per_split[i],
-                            self.config,
-                        )
-                    )
-
-            return blended_datasets
-
-        else:
-            blended_datasets = []
-            for i in range(len(Split)):
-                blend = getattr(self.config, "blend_per_split")[i]
-
-                # Blend is not provided
-                if not blend:
-                    blended_datasets.append(None)
-                    continue
-
-                split_spoof = [0.0] * len(Split)
-                split_spoof[i] = 1.0
-                sizes_spoof = [0] * len(Split)
-                sizes_spoof[i] = self.sizes[i]
-
-                # Blend consists of a sigle prefix
-                if len(blend) == 1:
-                    blended_datasets.append(
-                        self._build_megatron_dataset_splits(blend[0], split_spoof, sizes_spoof)[i]
-                    )
-
-                # Blend consists of multiple weights and prefixes
-                else:
-                    (
-                        prefix_per_dataset,
-                        weight_per_dataset,
-                        sizes_per_dataset,
-                    ) = _get_prefixes_weights_and_sizes_for_blend(blend, sizes_spoof)
-
-                    megatron_datasets = []
-                    for j in range(len(prefix_per_dataset)):
-                        megatron_datasets.append(
-                            self._build_megatron_dataset_splits(
-                                prefix_per_dataset[j], split_spoof, sizes_per_dataset[j],
-                            )[i]
-                        )
-
-                    size_per_split = list(map(sum, zip(*sizes_per_dataset)))
-
-                    blended_datasets.append(
-                        self._build_generic_dataset(
-                            BlendedDataset,
-                            megatron_datasets,
-                            weight_per_dataset,
-                            size_per_split[i],
-                            self.config,
-                        )
-                    )
-
-            return blended_datasets
-
-    def _build_megatron_dataset_splits(
-        self, path_prefix: str, split: List[float], sizes: List[int],
-    ) -> List[Optional[MegatronDataset]]:
-        """Build each MegatronDataset split from a single MMapIndexedDataset
-
-        Args:
-            path_prefix (str): The MMapIndexedDataset .bin and .idx file prefix
-
-            split (List[float]): The dataset split ratios (must sum to 1.00)
-
-            sizes (List[int]): The number of total samples to draw from each split
-
-        Returns:
-            List[Optional[MegatronDataset]]: The MegatronDatset (or None) per split
-        """
-        indexed_dataset = self._build_generic_dataset(
-            MMapIndexedDataset, path_prefix, self.cls.is_multimodal()
-        )
-
-        if indexed_dataset is not None:
-            if self.cls.is_split_by_sequence():
-                split_idx_bounds = _get_split_indices(
-                    split, indexed_dataset.sequence_lengths.shape[0]
-                )
-            else:
-                split_idx_bounds = _get_split_indices(
-                    split, indexed_dataset.document_indices.shape[0] - 1
-                )
-            split_indices = [
-                numpy.arange(
-                    start=split_idx_bounds[i],
-                    stop=split_idx_bounds[i + 1],
-                    step=1,
-                    dtype=numpy.int32,
-                )
-                for i, _ in enumerate(Split)
-            ]
-        else:
-            split_indices = [None for _ in Split]
-
-        megatron_datasets = []
-        for i, _split in enumerate(Split):
-            if split[i] == 0.0:
-                megatron_datasets.append(None)
-            else:
-                megatron_datasets.append(
-                    self._build_generic_dataset(
-                        self.cls, indexed_dataset, split_indices[i], sizes[i], _split, self.config
-                    )
-                )
-
-        return megatron_datasets
-
-    def _build_generic_dataset(
-        self, cls: Type[DistributedDataset], *args: Any,
-    ) -> Optional[DistributedDataset]:
-        """Build the DistributedDataset
-
-        Return None if and only if the underlying MegatronDataset class is not built on the current
-        rank and torch.distributed is initialized.
-
-        Args:
-            cls (Type[DistributedDataset]): The DistributedDataset class to be built
-
-            args (Tuple[Any]): The positional arguments used to build the provided
-            DistributedDataset class
-
-        Raises:
-            Exception: When the dataset constructor raises an OSError
-
-        Returns:
-            Optional[DistributedDataset]: The DistributedDataset instantion or None
-        """
-        if torch.distributed.is_initialized():
-            rank = torch.distributed.get_rank()
-
-            dataset = None
-
-            # First, build on rank 0
-            # WA: each node's first rank build the dataset cache (some node could not need to do this, but this can work on no shared storage, only given a litte overhead)
-            if torch.distributed.get_rank() % get_accelerator().device_count() == 0:
-            #if rank == 0and getattr(self.config, "is_built_on_rank")():
-                try:
-                    # @todo: if data_parallel_group has been created, we can use this group to avoid overhead
-                    #vote = get_accelerator().LongTensor([1]).fill_(mpu.get_data_parallel_rank(with_context_parallel=True))
-                    #torch.distributed.all_reduce(vote, group=mpu.get_data_parallel_group(), op=torch.distributed.ReduceOp.MIN)
-                    #if vote.item() == mpu.get_data_parallel_rank(with_context_parallel=True):
-                    dataset = cls(*args)
-                except OSError as err:
-                    log = (
-                        f"Failed to write dataset materials to the data cache directory. "
-                        + f"Please supply a directory to which you have write access via "
-                        + f"the path_to_cache attribute in BlendedMegatronDatasetConfig and "
-                        + f"retry. Refer to the preserved traceback above for more information."
-                    )
-                    raise Exception(log) from err
-
-            torch.distributed.barrier()
-
-            # After, build on other ranks
-            if rank != 0 and getattr(self.config, "is_built_on_rank")():
-                dataset = cls(*args)
-
-            return dataset
-
-        return cls(*args)
-
-
-def _get_split_indices(split: List[float], num_elements: int) -> List[int]:
-    """Determine the document index bounds per split
-
-    Args:
-        split (List[float]): The dataset split ratios (must sum to 1.00)
-
-        num_elements (int): The number of elements, e.g. sequences or documents, available for
-        the split
-
-    Returns:
-        List[int]: The indices for all three splits e.g. [0, 900, 990, 1000] for a 1000-document
-        set and a [90.0, 9.0, 1.0] split
-    """
-    split_indices = [0]
-    for split_pct in split:
-        split_indices.append(split_indices[-1] + int(round(split_pct * float(num_elements))))
-    split_indices[1:] = list(
-        map(lambda _: _ - (split_indices[-1] - num_elements), split_indices[1:])
-    )
-
-    assert len(split_indices) == len(split) + 1
-    assert split_indices[-1] == num_elements
-
-    return split_indices
-
-
-def _get_prefixes_weights_and_sizes_for_blend(
-    blend: List[str], target_num_samples_per_split: List[int]
-) -> Tuple[List[str], List[float], List[List[int]]]:
-    """Determine the contribution of the MegatronDataset splits to the BlendedDataset splits
-    
-    Args:
-        blend (List[str]): e.g. ["30", "path/to/dataset_1_prefix", "70", 
-        "path/to/dataset_2_prefix"]
-
-        target_num_samples_per_split (List[int]): The number of samples to target for each
-        BlendedDataset split
-
-    Returns:
-        Tuple[List[str], List[float], List[List[int]]]: The prefix strings e.g.
-        ["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], the normalized weights e.g.
-        [0.3, 0.7], and the number of samples to request per MegatronDataset per split
-    """
-    weights, prefixes = zip(
-        *[(float(blend[i]), blend[i + 1].strip()) for i in range(0, len(blend), 2)]
-    )
-
-    weights = normalize(weights)
-
-    # Use 0.5% target margin to ensure we satiate the network
-    sizes_per_dataset = [
-        [
-            int(math.ceil(target_num_samples * weight * 1.005))
-            for target_num_samples in target_num_samples_per_split
-        ]
-        for weight in weights
-    ]
-
-    return prefixes, weights, sizes_per_dataset
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/blended_megatron_dataset_config.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/blended_megatron_dataset_config.py
deleted file mode 100644
index 41add1ccc11d48f084e2cf5ebd9310d064f95a3b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/blended_megatron_dataset_config.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import logging
-import re
-from dataclasses import dataclass, field
-from typing import Callable, List, Optional
-
-import torch
-
-from megatron_ds.core.datasets.utils import Split, log_single_rank, normalize
-from megatron_ds.core.parallel_state import get_virtual_pipeline_model_parallel_rank
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class BlendedMegatronDatasetConfig:
-    """Configuration object for megatron-core blended and megatron datasets
-    
-    Attributes:
-        is_built_on_rank (Callable): A callable which returns True if the dataset should be built
-        on the current rank. It should be Megatron Core parallelism aware i.e. global rank, group
-        rank, and virtual rank may inform its return value.
-
-        random_seed (int): The seed for all RNG during dataset creation.
-
-        sequence_length (int): The sequence length.
-
-        blend (Optional[List[str]]): The blend string, consisting of either a single dataset or a
-        flattened sequential sequence of weight-dataset pairs. For exampe, ["dataset-path1"] and
-        ["50", "dataset-path1", "50", "dataset-path2"] are both valid. Not to be used with
-        'blend_per_split'. Defaults to None.
-
-        blend_per_split (blend_per_split: Optional[List[Optional[List[str]]]]): A set of blend
-        strings, as defined above, one for each split distribution. Not to be used with 'blend'.
-        Defauls to None.
-
-        split (Optional[str]): The split string, a comma separated weighting for the dataset splits
-        when drawing samples from a single distribution. Not to be used with 'blend_per_split'.
-        Defaults to None.
-
-        split_vector: (Optional[List[float]]): The split string, parsed and normalized post-
-        initialization. Not to be passed to the constructor.
-
-        path_to_cache (str): Where all re-useable dataset indices are to be cached.
-    """
-
-    is_built_on_rank: Callable
-
-    random_seed: int
-
-    sequence_length: int
-
-    blend: Optional[List[str]] = None
-
-    blend_per_split: Optional[List[Optional[List[str]]]] = None
-
-    split: Optional[str] = None
-
-    split_vector: Optional[List[float]] = field(init=False, default=None)
-
-    path_to_cache: str = None
-
-    def __post_init__(self):
-        """Python dataclass method that is used to modify attributes after initialization. See
-        https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details.
-        """
-        if torch.distributed.is_initialized():
-            gb_rank = torch.distributed.get_rank()
-            vp_rank = get_virtual_pipeline_model_parallel_rank()
-            if gb_rank == 0 and (vp_rank == 0 or vp_rank is None):
-                assert (
-                    self.is_built_on_rank()
-                ), "is_built_on_rank must return True when global rank = 0 and vp rank = 0"
-
-        if self.blend_per_split is not None and any(self.blend_per_split):
-            assert self.blend is None, "blend and blend_per_split are incompatible"
-            assert len(self.blend_per_split) == len(
-                Split
-            ), f"blend_per_split must contain {len(Split)} blends"
-            if self.split is not None:
-                self.split = None
-                log_single_rank(logger, logging.WARNING, f"Let split = {self.split}")
-        else:
-            assert self.blend is not None, "one of either blend or blend_per_split must be provided"
-            assert self.split is not None, "both blend and split must be provided"
-            self.split_vector = _parse_and_normalize_split(self.split)
-            log_single_rank(logger, logging.INFO, f"Let split_vector = {self.split_vector}")
-
-
-@dataclass
-class GPTDatasetConfig(BlendedMegatronDatasetConfig):
-    """Configuration object for megatron-core blended and megatron GPT datasets
-
-    Attributes:
-        return_document_ids (bool): Whether to return the document ids when querying the dataset.
-    """
-
-    return_document_ids: bool = False
-
-
-def _parse_and_normalize_split(split: str) -> List[float]:
-    """Parse the dataset split ratios from a string
-
-    Args:
-        split (str): The train valid test split string e.g. "99,1,0"
-
-    Returns:
-        List[float]: The trian valid test split ratios e.g. [99.0, 1.0, 0.0]
-    """
-    split = list(map(float, re.findall(r"[.0-9]+", split)))
-    split = split + [0.0 for _ in range(len(Split) - len(split))]
-
-    assert len(split) == len(Split)
-    assert all(map(lambda _: _ >= 0.0, split))
-
-    split = normalize(split)
-
-    return split
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/gpt_dataset.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/gpt_dataset.py
deleted file mode 100644
index 13622a91eb09c6d8cc5605bfa5ccd6698453c998..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/gpt_dataset.py
+++ /dev/null
@@ -1,460 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import logging
-import os
-import time
-from typing import Dict, Tuple
-
-import numpy
-import torch
-from deepspeed.accelerator import get_accelerator
-from megatron_ds.core.datasets.blended_megatron_dataset_config import GPTDatasetConfig
-from megatron_ds.core.datasets.indexed_dataset import MMapIndexedDataset
-from megatron_ds.core.datasets.megatron_dataset import MegatronDataset
-from megatron_ds.core.datasets.utils import Split, log_single_rank
-
-logger = logging.getLogger(__name__)
-
-
-class GPTDataset(MegatronDataset):
-    """The base GPT dataset
-
-    Args:
-        indexed_dataset (MMapIndexedDataset): The MMapIndexedDataset around which to build the
-        MegatronDataset
-
-        indexed_indices (numpy.ndarray): The set of the documents indices to expose
-
-        num_samples (int): The number of samples to draw from the indexed dataset
-
-        index_split (Split): The indexed_indices Split
-
-        config (GPTDatasetConfig): The GPT-specific container for all config sourced parameters
-    """
-
-    def __init__(
-        self,
-        indexed_dataset: MMapIndexedDataset,
-        indexed_indices: numpy.ndarray,
-        num_samples: int,
-        index_split: Split,
-        config: GPTDatasetConfig,
-    ) -> None:
-        super().__init__(indexed_dataset, indexed_indices, num_samples, index_split, config)
-
-    def _finalize(self) -> None:
-        """Abstract method implementation
-        
-        Load or build/cache the document, sample, and shuffle indices
-        """
-        assert isinstance(self.config, GPTDatasetConfig)
-
-        (
-            self.document_index,
-            self.sample_index,
-            self.shuffle_index,
-        ) = self._build_document_sample_shuffle_indices()
-
-    def __len__(self) -> int:
-        """Abstract method implementation
-
-        Returns:
-            int: The length of the dataset
-        """
-        return self.sample_index.shape[0] - 1
-
-    def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]:
-        """Abstract method implementation
-
-        Args:
-            idx (int): The index into the dataset
-
-        Returns:
-            Dict[str, numpy.ndarray]: The text ids and (optionally) the document ids wrapped in a
-            dictionary
-        """
-        text, document_ids = self._query_document_sample_shuffle_indices(idx)
-        if getattr(self.config, "return_document_ids"):
-            return {"text": text, "document_ids": document_ids}
-        else:
-            return {"text": text}
-
-    @staticmethod
-    def is_multimodal() -> bool:
-        """Abstract method implementation
-
-        Returns:
-            bool: False
-        """
-        return False
-
-    @staticmethod
-    def is_split_by_sequence() -> bool:
-        """Abstract method implementation
-
-        Returns:
-            bool: True
-        """
-        return True
-
-    def _query_document_sample_shuffle_indices(
-        self, idx: int
-    ) -> Tuple[numpy.ndarray, numpy.ndarray]:
-        """Get the text (token ids) and document ids for a given index
-
-        Args:
-            idx (int): The index into the dataset
-
-        Returns:
-            Tuple[numpy.ndarray, numpy.ndarray]: The text ids and document ids
-        """
-        # Do the shuffle mapping
-        idx = self.shuffle_index[idx]
-
-        # Get the beginning and end documents and offsets
-        doc_index_beg, doc_index_beg_offset = self.sample_index[idx]
-        doc_index_end, doc_index_end_offset = self.sample_index[idx + 1]
-
-        document_ids = []
-        sample_parts = []
-
-        # Sample spans a single document
-        if doc_index_beg == doc_index_end:
-            # Add the document id
-            document_ids.append(self.document_index[doc_index_beg])
-
-            # Add the entire sample
-            sample_parts.append(
-                self.indexed_dataset.get(
-                    self.document_index[doc_index_beg],
-                    offset=doc_index_beg_offset,
-                    length=doc_index_end_offset - doc_index_beg_offset + 1,
-                )
-            )
-
-        # Sample spans multiple documents
-        else:
-            for i in range(doc_index_beg, doc_index_end + 1):
-                # Add the document id
-                document_ids.append(self.document_index[i])
-
-                # Add the sample part
-                offset = 0 if i > doc_index_beg else doc_index_beg_offset
-                length = None if i < doc_index_end else doc_index_end_offset + 1
-                sample_parts.append(
-                    self.indexed_dataset.get(self.document_index[i], offset=offset, length=length)
-                )
-
-        return (
-            numpy.array(numpy.concatenate(sample_parts), dtype=numpy.int64),
-            numpy.array(document_ids, dtype=numpy.int64),
-        )
-
-    def _build_document_sample_shuffle_indices(
-        self,
-    ) -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]:
-        """Build the document index, the sample index, and the shuffle index
-        
-        The document index:
-            -- 1-D
-            -- An ordered array of document ids
-
-        The sample index:
-            -- 2-D
-            -- The document indices and offsets which mark the start of every sample
-
-        The shuffle index:
-            -- 1-D
-            -- A random permutation of index range of the sample index
-
-        Returns:
-            Tuple[numpy.ndarray, numpy.ndarray]: The document index, the sample index, and the
-            shuffle index
-
-        TODO: Explain the 80% threshold
-        """
-        path_to_cache = getattr(self.config, "path_to_cache")
-        if path_to_cache is None:
-            path_to_cache = os.path.join(
-                self.indexed_dataset.path_prefix, "cache", f"{type(self).__name__}_indices"
-            )
-
-        get_path_to = lambda suffix: os.path.join(
-            path_to_cache, f"{self.unique_description_hash}-{type(self).__name__}-{suffix}"
-        )
-        path_to_description = get_path_to("description.txt")
-        path_to_document_index = get_path_to("document_index.npy")
-        path_to_sample_index = get_path_to("sample_index.npy")
-        path_to_shuffle_index = get_path_to("shuffle_index.npy")
-        cache_hit = all(
-            map(
-                os.path.isfile,
-                [
-                    path_to_description,
-                    path_to_document_index,
-                    path_to_sample_index,
-                    path_to_shuffle_index,
-                ],
-            )
-        )
-
-        num_tokens_per_epoch = _get_num_tokens_per_epoch(self.indexed_dataset, self.indexed_indices)
-
-        sequence_length = getattr(self.config, "sequence_length")
-
-        num_epochs = _get_num_epochs(num_tokens_per_epoch, sequence_length, self.num_samples)
-
-        if not cache_hit:
-            log_single_rank(
-                logger,
-                logging.INFO,
-                f"Build and save the {type(self).__name__} {self.index_split.name} indices",
-            )
-
-            if num_epochs == 1:
-                separate_final_epoch = False
-            else:
-                # Get the number of samples for the last epoch
-                num_samples_sans_final_epoch = (
-                    (num_epochs - 1) * num_tokens_per_epoch - 1
-                ) // sequence_length
-                num_samples_from_final_epoch = self.num_samples - num_samples_sans_final_epoch
-                num_samples_per_epoch = (num_tokens_per_epoch - 1) // sequence_length
-
-                # num_samples_from_final_epoch should be non-negative
-                assert num_samples_from_final_epoch >= 0
-
-                # num_samples_from_final_epoch should not exceed max value
-                assert num_samples_from_final_epoch <= num_samples_per_epoch + 1
-
-                # Separate the final epoch if it falls below the threshold
-                threshold = 0.80
-                separate_final_epoch = num_samples_from_final_epoch < int(
-                    threshold * num_samples_per_epoch
-                )
-
-                log_single_rank(
-                    logger,
-                    logging.DEBUG,
-                    f"> num_samples_from_final_epoch: {num_samples_from_final_epoch}",
-                )
-                log_single_rank(logger, logging.DEBUG, f"> threshold: {threshold}")
-                log_single_rank(
-                    logger, logging.DEBUG, f"> num_samples_per_epoch: {num_samples_per_epoch}"
-                )
-
-            log_single_rank(
-                logger, logging.DEBUG, f"> separate_final_epoch: {separate_final_epoch}"
-            )
-
-            numpy_random_state = numpy.random.RandomState(getattr(self.config, "random_seed"))
-
-            os.makedirs(path_to_cache, exist_ok=True)
-
-            # Write the description
-            with open(path_to_description, "wt") as writer:
-                writer.write(self.unique_description)
-
-            # Build the document index
-            log_single_rank(
-                logger,
-                logging.INFO,
-                f"\tBuild and save the document index to {os.path.basename(path_to_document_index)}",
-            )
-            t_beg = time.time()
-            document_index = _build_document_index(
-                self.indexed_indices, num_epochs, numpy_random_state, separate_final_epoch
-            )
-            numpy.save(path_to_document_index, document_index, allow_pickle=True)
-            t_end = time.time()
-            log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
-
-            # Build the sample index
-            log_single_rank(
-                logger,
-                logging.INFO,
-                f"\tBuild and save the sample index to {os.path.basename(path_to_sample_index)}",
-            )
-            t_beg = time.time()
-            from megatron_ds.core.datasets import helpers
-
-            assert document_index.dtype == numpy.int32
-            assert self.indexed_dataset.sequence_lengths.dtype == numpy.int32
-            sample_index = helpers.build_sample_idx(
-                self.indexed_dataset.sequence_lengths,
-                document_index,
-                sequence_length,
-                num_epochs,
-                num_tokens_per_epoch,
-            )
-            numpy.save(path_to_sample_index, sample_index, allow_pickle=True)
-            t_end = time.time()
-            log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
-
-            # Build the shuffle index
-            log_single_rank(
-                logger,
-                logging.INFO,
-                f"\tBuild and save the shuffle index to {os.path.basename(path_to_shuffle_index)}",
-            )
-            t_beg = time.time()
-            if separate_final_epoch:
-                shuffle_index = _build_shuffle_index(
-                    num_samples_sans_final_epoch, sample_index.shape[0] - 1, numpy_random_state
-                )
-            else:
-                shuffle_index = _build_shuffle_index(
-                    sample_index.shape[0] - 1, sample_index.shape[0] - 1, numpy_random_state
-                )
-            numpy.save(path_to_shuffle_index, shuffle_index, allow_pickle=True)
-            t_end = time.time()
-            log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
-
-        log_single_rank(
-            logger, logging.INFO, f"Load the {type(self).__name__} {self.index_split.name} indices"
-        )
-
-        log_single_rank(
-            logger,
-            logging.INFO,
-            f"\tLoad the document index from {os.path.basename(path_to_document_index)}",
-        )
-        t_beg = time.time()
-        document_index = numpy.load(path_to_document_index, allow_pickle=True, mmap_mode='r')
-        t_end = time.time()
-        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
-
-        log_single_rank(
-            logger,
-            logging.INFO,
-            f"\tLoad the sample index from {os.path.basename(path_to_sample_index)}",
-        )
-        t_beg = time.time()
-        sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode='r')
-        t_end = time.time()
-        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
-
-        log_single_rank(
-            logger,
-            logging.INFO,
-            f"\tLoad the shuffle index from {os.path.basename(path_to_shuffle_index)}",
-        )
-        t_beg = time.time()
-        shuffle_index = numpy.load(path_to_shuffle_index, allow_pickle=True, mmap_mode='r')
-        t_end = time.time()
-        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
-
-        log_single_rank(
-            logger, logging.INFO, f"> total number of samples: {sample_index.shape[0] - 1}"
-        )
-        log_single_rank(logger, logging.INFO, f"> total number of epochs: {num_epochs}")
-
-        return document_index, sample_index, shuffle_index
-
-
-def _get_num_tokens_per_epoch(indexed_dataset: MMapIndexedDataset, indices: numpy.ndarray) -> int:
-    """Calculate the number of tokens in a single epoch
-
-    Args:
-        indexed_dataset (MMapIndexedDataset): The underlying MMapIndexedDataset
-
-        indices (numpy.ndarray): The subset of indices into the underlying MMapIndexedDataset
-
-    Returns:
-        int: The number of tokens in a single epoch
-    """
-    return numpy.sum(indexed_dataset.sequence_lengths[indices])
-
-
-def _get_num_epochs(num_tokens_per_epoch: int, seq_length: int, num_samples: int) -> int:
-    """Calculate the number of epochs
-
-    Args:
-        num_tokens_per_epoch (int): The number of tokens in a single epoch
-
-        seq_length (int): The sequence length in tokens
-
-        num_samples (int): The total number of samples
-
-    Returns:
-        int: The number of epochs
-    """
-    num_epochs = 0
-    num_tokens = 0
-    while True:
-        num_epochs += 1
-        num_tokens += num_tokens_per_epoch
-        # -1 is because we need to retrieve seq_length + 1 token each time
-        # but the last token will overlap with the first token of the next
-        # sample except for the last sample.
-        if ((num_tokens - 1) // seq_length) >= num_samples:
-            return num_epochs
-
-
-def _build_document_index(
-    documents: numpy.ndarray,
-    num_epochs: int,
-    numpy_random_state: numpy.random.RandomState,
-    separate_final_epoch: bool,
-) -> numpy.ndarray:
-    """Build an array with length = num epochs * num documents
-
-    Args:
-        documents (numpy.ndarray): the subset of exposed document indices
-
-        num_epochs (int): The number of epochs
-
-        numpy_random_state (numpy.random.RandomState): The NumPy random state
-
-        separate_final_epoch (bool): Whether to exclude the last epoch from the global shuffle
-
-    Returns:
-        numpy.ndarray: The document index
-
-    TODO: Explain separate_final_epoch
-    """
-    if not separate_final_epoch or num_epochs == 1:
-        document_index = numpy.mgrid[0:num_epochs, 0 : len(documents)][1]
-        document_index[:] = documents
-        document_index = document_index.reshape(-1)
-        document_index = document_index.astype(numpy.int32)
-        numpy_random_state.shuffle(document_index)
-        return document_index
-
-    doc_idx_first = _build_document_index(documents, num_epochs - 1, numpy_random_state, False)
-    doc_idx_last = _build_document_index(documents, 1, numpy_random_state, False)
-    return numpy.concatenate((doc_idx_first, doc_idx_last))
-
-
-def _build_shuffle_index(
-    num_samples: int, total_size: int, numpy_random_state: numpy.random.RandomState
-) -> numpy.ndarray:
-    """Build the range [0, size) and shuffle
-
-    Args:
-        num_samples (int): The size of the first shuffle range [0, num_samples)
-
-        total_size (int): The size of the entire index. If larger than 'num_samples', it defines
-
-        the second shuffle range [num_samples, total_size)
-
-        numpy_random_state (numpy.random.RandomState): The NumPy random state
-
-    Returns:
-        numpy.ndarray: The shuffle index
-
-    TODO: Explain [0, num_samples) [num_samples, total_size) split
-    """
-    dtype_ = numpy.uint32
-    if total_size >= (numpy.iinfo(numpy.uint32).max - 1):
-        dtype_ = numpy.int64
-
-    shuffle_idx_first = numpy.arange(start=0, stop=num_samples, step=1, dtype=dtype_)
-    numpy_random_state.shuffle(shuffle_idx_first)
-    if num_samples == total_size:
-        return shuffle_idx_first
-
-    shuffle_idx_last = numpy.arange(start=num_samples, stop=total_size, step=1, dtype=dtype_)
-    numpy_random_state.shuffle(shuffle_idx_last)
-
-    return numpy.concatenate((shuffle_idx_first, shuffle_idx_last))
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/helpers.cpp b/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/helpers.cpp
deleted file mode 100644
index 2a12451279699873e3cb3b5c18c638252f468399..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/helpers.cpp
+++ /dev/null
@@ -1,765 +0,0 @@
-/* Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved. */
-
-/* Helper methods for fast index mapping builds */
-
-#include <algorithm>
-#include <iostream>
-#include <limits>
-#include <math.h>
-#include <stdexcept>
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-#include <random>
-
-namespace py = pybind11;
-using namespace std;
-
-const int32_t LONG_SENTENCE_LEN = 512;
-
-void build_blending_indices(py::array_t<int16_t> &dataset_index,
-                            py::array_t<int64_t> &dataset_sample_index,
-                            const py::array_t<double> &weights,
-                            const int32_t num_datasets,
-                            const int64_t size, const bool verbose)
-{
-  /* Given multiple datasets and a weighting array, build samples
-   such that it follows those wieghts.*/
-
-  if (verbose)
-  {
-    std::cout << "> building indices for blended datasets ..." << std::endl;
-  }
-
-  // Get the pointer access without the checks.
-  auto dataset_index_ptr = dataset_index.mutable_unchecked<1>();
-  auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>();
-  auto weights_ptr = weights.unchecked<1>();
-
-  // Initialize buffer for number of samples used for each dataset.
-  int64_t current_samples[num_datasets];
-  for (int64_t i = 0; i < num_datasets; ++i)
-  {
-    current_samples[i] = 0;
-  }
-
-  // For each sample:
-  for (int64_t sample_idx = 0; sample_idx < size; ++sample_idx)
-  {
-
-    // Determine where the max error in sampling is happening.
-    auto sample_idx_double = std::max(static_cast<double>(sample_idx), 1.0);
-    int64_t max_error_index = 0;
-    double max_error = weights_ptr[0] * sample_idx_double -
-                       static_cast<double>(current_samples[0]);
-    for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx)
-    {
-      double error = weights_ptr[dataset_idx] * sample_idx_double -
-                     static_cast<double>(current_samples[dataset_idx]);
-      if (error > max_error)
-      {
-        max_error = error;
-        max_error_index = dataset_idx;
-      }
-    }
-
-    // Populate the indices.
-    dataset_index_ptr[sample_idx] = static_cast<int16_t>(max_error_index);
-    dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index];
-
-    // Update the total samples.
-    current_samples[max_error_index] += 1;
-  }
-
-  // print info
-  if (verbose)
-  {
-    std::cout << " > sample ratios:" << std::endl;
-    for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx)
-    {
-      auto ratio = static_cast<double>(current_samples[dataset_idx]) /
-                   static_cast<double>(size);
-      std::cout << "   dataset " << dataset_idx << ", input: " << weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl;
-    }
-  }
-}
-
-py::array build_sample_idx(const py::array_t<int64_t> &sizes_,
-                           const py::array_t<int64_t> &doc_idx_,
-                           const int32_t seq_length,
-                           const int32_t num_epochs,
-                           const int64_t tokens_per_epoch)
-{
-  /* Sample index (sample_idx) is used for gpt2 like dataset for which
-     the documents are flattened and the samples are built based on this
-     1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2]
-     where [..., 0] contains the index into `doc_idx` and [..., 1] is the
-     starting offset in that document.*/
-
-  // Consistency checks.
-  assert(seq_length > 1);
-  assert(num_epochs > 0);
-  assert(tokens_per_epoch > 1);
-
-  // Remove bound checks.
-  auto sizes = sizes_.unchecked<1>();
-  auto doc_idx = doc_idx_.unchecked<1>();
-
-  // Mapping and it's length (1D).
-  int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length;
-  int64_t *sample_idx = new int64_t[2 * (num_samples + 1)];
-
-  // Index into sample_idx.
-  int64_t sample_index = 0;
-  // Index into doc_idx.
-  int64_t doc_idx_index = 0;
-  // Begining offset for each document.
-  int32_t doc_offset = 0;
-  // Start with first document and no offset.
-  sample_idx[2 * sample_index] = doc_idx_index;
-  sample_idx[2 * sample_index + 1] = doc_offset;
-  ++sample_index;
-
-  while (sample_index <= num_samples)
-  {
-    // Start with a fresh sequence.
-    int32_t remaining_seq_length = seq_length + 1;
-    while (remaining_seq_length != 0)
-    {
-      // Get the document length.
-      auto doc_id = doc_idx[doc_idx_index];
-      auto doc_length = sizes[doc_id] - doc_offset;
-      // And add it to the current sequence.
-      remaining_seq_length -= doc_length;
-      // If we have more than a full sequence, adjust offset and set
-      // remaining length to zero so we return from the while loop.
-      // Note that -1 here is for the same reason we have -1 in
-      // `_num_epochs` calculations.
-      if (remaining_seq_length <= 0)
-      {
-        doc_offset += (remaining_seq_length + doc_length - 1);
-        remaining_seq_length = 0;
-      }
-      else
-      {
-        // Otherwise, start from the begining of the next document.
-        ++doc_idx_index;
-        doc_offset = 0;
-      }
-    }
-    // Record the sequence.
-    sample_idx[2 * sample_index] = doc_idx_index;
-    sample_idx[2 * sample_index + 1] = doc_offset;
-    ++sample_index;
-  }
-
-  // Method to deallocate memory.
-  py::capsule free_when_done(sample_idx, [](void *mem_)
-                             {
-	int64_t *mem = reinterpret_cast<int64_t*>(mem_);
-	delete[] mem; });
-
-  // Return the numpy array.
-  const auto byte_size = sizeof(int64_t);
-  return py::array(std::vector<int64_t>{num_samples + 1, 2}, // shape
-                   {2 * byte_size, byte_size},               // C-style contiguous strides
-                   sample_idx,                               // the data pointer
-                   free_when_done);                          // numpy array references
-}
-
-inline int32_t get_target_sample_len(const int32_t short_seq_ratio,
-                                     const int32_t max_length,
-                                     std::mt19937 &rand32_gen)
-{
-  /* Training sample length. */
-  if (short_seq_ratio == 0)
-  {
-    return max_length;
-  }
-  const auto random_number = rand32_gen();
-  if ((random_number % short_seq_ratio) == 0)
-  {
-    return 2 + random_number % (max_length - 1);
-  }
-  return max_length;
-}
-
-template <typename DocIdx>
-py::array build_mapping_impl(const py::array_t<int64_t> &docs_,
-                             const py::array_t<int32_t> &sizes_,
-                             const int32_t num_epochs,
-                             const uint64_t max_num_samples,
-                             const int32_t max_seq_length,
-                             const double short_seq_prob,
-                             const int32_t seed,
-                             const bool verbose,
-                             const int32_t min_num_sent)
-{
-  /* Build a mapping of (start-index, end-index, sequence-length) where
-     start and end index are the indices of the sentences in the sample
-     and sequence-length is the target sequence length.
-  */
-
-  // Consistency checks.
-  assert(num_epochs > 0);
-  assert(max_seq_length > 1);
-  assert(short_seq_prob >= 0.0);
-  assert(short_seq_prob <= 1.0);
-  assert(seed > 0);
-
-  // Remove bound checks.
-  auto docs = docs_.unchecked<1>();
-  auto sizes = sizes_.unchecked<1>();
-
-  // For efficiency, convert probability to ratio. Note: rand() generates int.
-  int32_t short_seq_ratio = 0;
-  if (short_seq_prob > 0)
-  {
-    short_seq_ratio = static_cast<int32_t>(round(1.0 / short_seq_prob));
-  }
-
-  if (verbose)
-  {
-    const auto sent_start_index = docs[0];
-    const auto sent_end_index = docs[docs_.shape(0) - 1];
-    const auto num_sentences = sent_end_index - sent_start_index;
-    cout << "    using:" << endl
-         << std::flush;
-    cout << "     number of documents:            " << docs_.shape(0) - 1 << endl
-         << std::flush;
-    cout << "     sentences range:                [" << sent_start_index << ", " << sent_end_index << ")" << endl
-         << std::flush;
-    cout << "     total number of sentences:      " << num_sentences << endl
-         << std::flush;
-    cout << "     number of epochs:               " << num_epochs << endl
-         << std::flush;
-    cout << "     maximum number of samples:      " << max_num_samples << endl
-         << std::flush;
-    cout << "     maximum sequence length:        " << max_seq_length << endl
-         << std::flush;
-    cout << "     short sequence probability:     " << short_seq_prob << endl
-         << std::flush;
-    cout << "     short sequence ration (1/prob): " << short_seq_ratio << endl
-         << std::flush;
-    cout << "     seed:                           " << seed << endl
-         << std::flush;
-  }
-
-  // Mapping and it's length (1D).
-  int64_t num_samples = -1;
-  DocIdx *maps = NULL;
-
-  // Perform two iterations, in the first iteration get the size
-  // and allocate memory and in the second iteration populate the map.
-  bool second = false;
-  for (int32_t iteration = 0; iteration < 2; ++iteration)
-  {
-
-    // Set the seed so both iterations produce the same results.
-    std::mt19937 rand32_gen(seed);
-
-    // Set the flag on second iteration.
-    second = (iteration == 1);
-
-    // Counters:
-    uint64_t empty_docs = 0;
-    uint64_t one_sent_docs = 0;
-    uint64_t long_sent_docs = 0;
-
-    // Current map index.
-    uint64_t map_index = 0;
-
-    // For each epoch:
-    for (int32_t epoch = 0; epoch < num_epochs; ++epoch)
-    {
-      if (map_index >= max_num_samples)
-      {
-        if (verbose && (!second))
-        {
-          cout << "    reached " << max_num_samples << " samples after "
-               << epoch << " epochs ..." << endl
-               << std::flush;
-        }
-        break;
-      }
-      // For each document:
-      for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc)
-      {
-
-        // Document sentences are in [sent_index_first, sent_index_last)
-        const auto sent_index_first = docs[doc];
-        const auto sent_index_last = docs[doc + 1];
-
-        // At the begining of the document previous index is the
-        // start index.
-        auto prev_start_index = sent_index_first;
-
-        // Remaining documents.
-        auto num_remain_sent = sent_index_last - sent_index_first;
-
-        // Some bookkeeping
-        if ((epoch == 0) && (!second))
-        {
-          if (num_remain_sent == 0)
-          {
-            ++empty_docs;
-          }
-          if (num_remain_sent == 1)
-          {
-            ++one_sent_docs;
-          }
-        }
-
-        // Detect documents with long sentences.
-        bool contains_long_sentence = false;
-        if (num_remain_sent > 1)
-        {
-          for (auto sent_index = sent_index_first;
-               sent_index < sent_index_last; ++sent_index)
-          {
-            if (sizes[sent_index] > LONG_SENTENCE_LEN)
-            {
-              if ((epoch == 0) && (!second))
-              {
-                ++long_sent_docs;
-              }
-              contains_long_sentence = true;
-              break;
-            }
-          }
-        }
-
-        // If we have more than two sentences.
-        if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence))
-        {
-
-          // Set values.
-          auto seq_len = int32_t{0};
-          auto num_sent = int32_t{0};
-          auto target_seq_len = get_target_sample_len(short_seq_ratio,
-                                                      max_seq_length,
-                                                      rand32_gen);
-
-          // Loop through sentences.
-          for (auto sent_index = sent_index_first;
-               sent_index < sent_index_last; ++sent_index)
-          {
-
-            // Add the size and number of sentences.
-            seq_len += sizes[sent_index];
-            ++num_sent;
-            --num_remain_sent;
-
-            // If we have reached the target length.
-            // and if not only one sentence is left in the document.
-            // and if we have at least two sentneces.
-            // and if we have reached end of the document.
-            if (((seq_len >= target_seq_len) &&
-                 (num_remain_sent > 1) &&
-                 (num_sent >= min_num_sent)) ||
-                (num_remain_sent == 0))
-            {
-
-              // Check for overflow.
-              if ((3 * map_index + 2) >
-                  std::numeric_limits<int64_t>::max())
-              {
-                cout << "number of samples exceeded maximum "
-                     << "allowed by type int64: "
-                     << std::numeric_limits<int64_t>::max()
-                     << endl;
-                throw std::overflow_error("Number of samples");
-              }
-
-              // Populate the map.
-              if (second)
-              {
-                const auto map_index_0 = 3 * map_index;
-                maps[map_index_0] = static_cast<DocIdx>(prev_start_index);
-                maps[map_index_0 + 1] = static_cast<DocIdx>(sent_index + 1);
-                maps[map_index_0 + 2] = static_cast<DocIdx>(target_seq_len);
-              }
-
-              // Update indices / counters.
-              ++map_index;
-              prev_start_index = sent_index + 1;
-              target_seq_len = get_target_sample_len(short_seq_ratio,
-                                                     max_seq_length,
-                                                     rand32_gen);
-              seq_len = 0;
-              num_sent = 0;
-            }
-
-          } // for (auto sent_index=sent_index_first; ...
-        }   // if (num_remain_sent > 1) {
-      }     // for (int doc=0; doc < num_docs; ++doc) {
-    }       // for (int epoch=0; epoch < num_epochs; ++epoch) {
-
-    if (!second)
-    {
-      if (verbose)
-      {
-        cout << "   number of empty documents: " << empty_docs << endl
-             << std::flush;
-        cout << "   number of documents with one sentence: " << one_sent_docs << endl
-             << std::flush;
-        cout << "   number of documents with long sentences: " << long_sent_docs << endl
-             << std::flush;
-        cout << "   will create mapping for " << map_index << " samples" << endl
-             << std::flush;
-      }
-      assert(maps == NULL);
-      assert(num_samples < 0);
-      maps = new DocIdx[3 * map_index];
-      num_samples = static_cast<int64_t>(map_index);
-    }
-
-  } // for (int iteration=0; iteration < 2; ++iteration) {
-
-  // Shuffle.
-  // We need a 64 bit random number generator as we might have more
-  // than 2 billion samples.
-  std::mt19937_64 rand64_gen(seed + 1);
-  for (auto i = (num_samples - 1); i > 0; --i)
-  {
-    const auto j = static_cast<int64_t>(rand64_gen() % (i + 1));
-    const auto i0 = 3 * i;
-    const auto j0 = 3 * j;
-    // Swap values.
-    swap(maps[i0], maps[j0]);
-    swap(maps[i0 + 1], maps[j0 + 1]);
-    swap(maps[i0 + 2], maps[j0 + 2]);
-  }
-
-  // Method to deallocate memory.
-  py::capsule free_when_done(maps, [](void *mem_)
-                             {
-            DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
-	    delete[] mem; });
-
-  // Return the numpy array.
-  const auto byte_size = sizeof(DocIdx);
-  return py::array(std::vector<int64_t>{num_samples, 3}, // shape
-                   {3 * byte_size, byte_size},           // C-style contiguous strides
-                   maps,                                 // the data pointer
-                   free_when_done);                      // numpy array references
-}
-
-py::array build_mapping(const py::array_t<int64_t> &docs_,
-                        const py::array_t<int> &sizes_,
-                        const int num_epochs,
-                        const uint64_t max_num_samples,
-                        const int max_seq_length,
-                        const double short_seq_prob,
-                        const int seed,
-                        const bool verbose,
-                        const int32_t min_num_sent)
-{
-
-  if (sizes_.size() > std::numeric_limits<uint32_t>::max())
-  {
-    if (verbose)
-    {
-      cout << "    using uint64 for data mapping..." << endl
-           << std::flush;
-    }
-    return build_mapping_impl<uint64_t>(docs_, sizes_, num_epochs,
-                                        max_num_samples, max_seq_length,
-                                        short_seq_prob, seed, verbose,
-                                        min_num_sent);
-  }
-  else
-  {
-    if (verbose)
-    {
-      cout << "    using uint32 for data mapping..." << endl
-           << std::flush;
-    }
-    return build_mapping_impl<uint32_t>(docs_, sizes_, num_epochs,
-                                        max_num_samples, max_seq_length,
-                                        short_seq_prob, seed, verbose,
-                                        min_num_sent);
-  }
-}
-
-template <typename DocIdx>
-py::array build_blocks_mapping_impl(const py::array_t<int64_t> &docs_,
-                                    const py::array_t<int32_t> &sizes_,
-                                    const py::array_t<int32_t> &titles_sizes_,
-                                    const int32_t num_epochs,
-                                    const uint64_t max_num_samples,
-                                    const int32_t max_seq_length,
-                                    const int32_t seed,
-                                    const bool verbose,
-                                    const bool use_one_sent_blocks)
-{
-  /* Build a mapping of (start-index, end-index, sequence-length) where
-     start and end index are the indices of the sentences in the sample
-     and sequence-length is the target sequence length.
-  */
-
-  // Consistency checks.
-  assert(num_epochs > 0);
-  assert(max_seq_length > 1);
-  assert(seed > 0);
-
-  // Remove bound checks.
-  auto docs = docs_.unchecked<1>();
-  auto sizes = sizes_.unchecked<1>();
-  auto titles_sizes = titles_sizes_.unchecked<1>();
-
-  if (verbose)
-  {
-    const auto sent_start_index = docs[0];
-    const auto sent_end_index = docs[docs_.shape(0) - 1];
-    const auto num_sentences = sent_end_index - sent_start_index;
-    cout << "    using:" << endl
-         << std::flush;
-    cout << "     number of documents:            " << docs_.shape(0) - 1 << endl
-         << std::flush;
-    cout << "     sentences range:                [" << sent_start_index << ", " << sent_end_index << ")" << endl
-         << std::flush;
-    cout << "     total number of sentences:      " << num_sentences << endl
-         << std::flush;
-    cout << "     number of epochs:               " << num_epochs << endl
-         << std::flush;
-    cout << "     maximum number of samples:      " << max_num_samples << endl
-         << std::flush;
-    cout << "     maximum sequence length:        " << max_seq_length << endl
-         << std::flush;
-    cout << "     seed:                           " << seed << endl
-         << std::flush;
-  }
-
-  // Mapping and its length (1D).
-  int64_t num_samples = -1;
-  DocIdx *maps = NULL;
-
-  // Acceptable number of sentences per block.
-  int min_num_sent = 2;
-  if (use_one_sent_blocks)
-  {
-    min_num_sent = 1;
-  }
-
-  // Perform two iterations, in the first iteration get the size
-  // and allocate memory and in the second iteration populate the map.
-  bool second = false;
-  for (int32_t iteration = 0; iteration < 2; ++iteration)
-  {
-
-    // Set the flag on second iteration.
-    second = (iteration == 1);
-
-    // Current map index.
-    uint64_t map_index = 0;
-
-    uint64_t empty_docs = 0;
-    uint64_t one_sent_docs = 0;
-    uint64_t long_sent_docs = 0;
-    // For each epoch:
-    for (int32_t epoch = 0; epoch < num_epochs; ++epoch)
-    {
-      // assign every block a unique id
-      int32_t block_id = 0;
-
-      if (map_index >= max_num_samples)
-      {
-        if (verbose && (!second))
-        {
-          cout << "    reached " << max_num_samples << " samples after "
-               << epoch << " epochs ..." << endl
-               << std::flush;
-        }
-        break;
-      }
-      // For each document:
-      for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc)
-      {
-
-        // Document sentences are in [sent_index_first, sent_index_last)
-        const auto sent_index_first = docs[doc];
-        const auto sent_index_last = docs[doc + 1];
-        const auto target_seq_len = max_seq_length - titles_sizes[doc];
-
-        // At the begining of the document previous index is the
-        // start index.
-        auto prev_start_index = sent_index_first;
-
-        // Remaining documents.
-        auto num_remain_sent = sent_index_last - sent_index_first;
-
-        // Some bookkeeping
-        if ((epoch == 0) && (!second))
-        {
-          if (num_remain_sent == 0)
-          {
-            ++empty_docs;
-          }
-          if (num_remain_sent == 1)
-          {
-            ++one_sent_docs;
-          }
-        }
-        // Detect documents with long sentences.
-        bool contains_long_sentence = false;
-        if (num_remain_sent >= min_num_sent)
-        {
-          for (auto sent_index = sent_index_first;
-               sent_index < sent_index_last; ++sent_index)
-          {
-            if (sizes[sent_index] > LONG_SENTENCE_LEN)
-            {
-              if ((epoch == 0) && (!second))
-              {
-                ++long_sent_docs;
-              }
-              contains_long_sentence = true;
-              break;
-            }
-          }
-        }
-        // If we have enough sentences and no long sentences.
-        if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence))
-        {
-
-          // Set values.
-          auto seq_len = int32_t{0};
-          auto num_sent = int32_t{0};
-
-          // Loop through sentences.
-          for (auto sent_index = sent_index_first;
-               sent_index < sent_index_last; ++sent_index)
-          {
-
-            // Add the size and number of sentences.
-            seq_len += sizes[sent_index];
-            ++num_sent;
-            --num_remain_sent;
-
-            // If we have reached the target length.
-            // and there are an acceptable number of sentences left
-            // and if we have at least the minimum number of sentences.
-            // or if we have reached end of the document.
-            if (((seq_len >= target_seq_len) &&
-                 (num_remain_sent >= min_num_sent) &&
-                 (num_sent >= min_num_sent)) ||
-                (num_remain_sent == 0))
-            {
-
-              // Populate the map.
-              if (second)
-              {
-                const auto map_index_0 = 4 * map_index;
-                // Each sample has 4 items: the starting sentence index, ending sentence index,
-                // the index of the document from which the block comes (used for fetching titles)
-                // and the unique id of the block (used for creating block indexes)
-
-                maps[map_index_0] = static_cast<DocIdx>(prev_start_index);
-                maps[map_index_0 + 1] = static_cast<DocIdx>(sent_index + 1);
-                maps[map_index_0 + 2] = static_cast<DocIdx>(doc);
-                maps[map_index_0 + 3] = static_cast<DocIdx>(block_id);
-              }
-
-              // Update indices / counters.
-              ++map_index;
-              ++block_id;
-              prev_start_index = sent_index + 1;
-              seq_len = 0;
-              num_sent = 0;
-            }
-          } // for (auto sent_index=sent_index_first; ...
-        }   // if (num_remain_sent > 1) {
-      }     // for (int doc=0; doc < num_docs; ++doc) {
-    }       // for (int epoch=0; epoch < num_epochs; ++epoch) {
-
-    if (!second)
-    {
-      if (verbose)
-      {
-        cout << "   number of empty documents: " << empty_docs << endl
-             << std::flush;
-        cout << "   number of documents with one sentence: " << one_sent_docs << endl
-             << std::flush;
-        cout << "   number of documents with long sentences: " << long_sent_docs << endl
-             << std::flush;
-        cout << "   will create mapping for " << map_index << " samples" << endl
-             << std::flush;
-      }
-      assert(maps == NULL);
-      assert(num_samples < 0);
-      maps = new DocIdx[4 * map_index];
-      num_samples = static_cast<int64_t>(map_index);
-    }
-
-  } // for (int iteration=0; iteration < 2; ++iteration) {
-
-  // Shuffle.
-  // We need a 64 bit random number generator as we might have more
-  // than 2 billion samples.
-  std::mt19937_64 rand64_gen(seed + 1);
-  for (auto i = (num_samples - 1); i > 0; --i)
-  {
-    const auto j = static_cast<int64_t>(rand64_gen() % (i + 1));
-    const auto i0 = 4 * i;
-    const auto j0 = 4 * j;
-    // Swap values.
-    swap(maps[i0], maps[j0]);
-    swap(maps[i0 + 1], maps[j0 + 1]);
-    swap(maps[i0 + 2], maps[j0 + 2]);
-    swap(maps[i0 + 3], maps[j0 + 3]);
-  }
-
-  // Method to deallocate memory.
-  py::capsule free_when_done(maps, [](void *mem_)
-                             {
-            DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
-	    delete[] mem; });
-
-  // Return the numpy array.
-  const auto byte_size = sizeof(DocIdx);
-  return py::array(std::vector<int64_t>{num_samples, 4}, // shape
-                   {4 * byte_size, byte_size},           // C-style contiguous strides
-                   maps,                                 // the data pointer
-                   free_when_done);                      // numpy array references
-}
-
-py::array build_blocks_mapping(const py::array_t<int64_t> &docs_,
-                               const py::array_t<int> &sizes_,
-                               const py::array_t<int> &titles_sizes_,
-                               const int num_epochs,
-                               const uint64_t max_num_samples,
-                               const int max_seq_length,
-                               const int seed,
-                               const bool verbose,
-                               const bool use_one_sent_blocks)
-{
-
-  if (sizes_.size() > std::numeric_limits<uint32_t>::max())
-  {
-    if (verbose)
-    {
-      cout << "    using uint64 for data mapping..." << endl
-           << std::flush;
-    }
-    return build_blocks_mapping_impl<uint64_t>(docs_, sizes_, titles_sizes_,
-                                               num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks);
-  }
-  else
-  {
-    if (verbose)
-    {
-      cout << "    using uint32 for data mapping..." << endl
-           << std::flush;
-    }
-    return build_blocks_mapping_impl<uint32_t>(docs_, sizes_, titles_sizes_,
-                                               num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks);
-  }
-}
-
-PYBIND11_MODULE(helpers, m)
-{
-  m.def("build_mapping", &build_mapping);
-  m.def("build_blocks_mapping", &build_blocks_mapping);
-  m.def("build_sample_idx", &build_sample_idx);
-  m.def("build_blending_indices", &build_blending_indices);
-}
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/indexed_dataset.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/indexed_dataset.py
deleted file mode 100644
index 7dbadf73d4449c741f6fcd825fcfee54c3a34314..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/indexed_dataset.py
+++ /dev/null
@@ -1,639 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-# Essentially re-written in entirety
-
-import logging
-import os
-import shutil
-import struct
-import time
-from enum import Enum
-from functools import lru_cache
-from itertools import accumulate
-from types import TracebackType
-from typing import List, Optional, Tuple, Type, Union
-
-import numpy
-import torch
-
-from megatron_ds.core.datasets.utils import log_single_rank
-
-logger = logging.getLogger(__name__)
-
-_INDEX_HEADER = b"MMIDIDX\x00\x00"
-
-
-class DType(Enum):
-    """The NumPy data type Enum for writing/reading the MMapIndexedDataset indices
-    """
-
-    uint8 = 1
-    int8 = 2
-    int16 = 3
-    int32 = 4
-    int64 = 5
-    float64 = 6
-    float32 = 7
-    uint16 = 8
-
-    @classmethod
-    def code_from_dtype(cls, value: Type[numpy.number]) -> int:
-        """Get the code from the dtype
-
-        Args:
-            value (Type[numpy.number]): The dtype
-
-        Returns:
-            int: The code
-        """
-        return cls[value.__name__].value
-
-    @classmethod
-    def dtype_from_code(cls, value: int) -> Type[numpy.number]:
-        """Get the dtype from the code
-
-        Args:
-            value (int): The code
-
-        Returns:
-            Type[numpy.number]: The dtype
-        """
-        return getattr(numpy, cls(value).name)
-
-    @staticmethod
-    def size(key: Union[int, Type[numpy.number]]) -> int:
-        """Get the size of the dtype/code in bytes
-
-        Args:
-            key (Union[int, Type[numpy.number]]): The dtype or code
-
-        Raises:
-            ValueError: If the key is neither dtype nor integer code
-
-        Returns:
-            int: The size of the dtype/code in in bytes
-        """
-        if isinstance(key, int):
-            return DType.dtype_from_code(key)().itemsize
-        elif numpy.number in key.__mro__:
-            return key().itemsize
-        else:
-            raise ValueError
-
-    @staticmethod
-    def optimal_dtype(cardinality: Optional[int]) -> Type[numpy.number]:
-        """Get the dtype to use for an index of a certain cardinality
-
-        Args:
-            cardinality (Optional[int]): The number of elements to be indexed
-
-        Returns:
-            Type[numpy.number]: The dtype to use for the index
-        """
-        if cardinality is not None and cardinality < 65500:
-            return numpy.uint16
-        else:
-            return numpy.int32
-
-
-class _IndexWriter(object):
-    """Object class to write the index (.idx) file
-
-    Args:
-        idx_path (str): The path to the index file
-
-        dtype (Type[numpy.number]): The dtype of the index file
-    """
-
-    def __init__(self, idx_path: str, dtype: Type[numpy.number]) -> None:
-        self.idx_path = idx_path
-        self.dtype = dtype
-
-    def __enter__(self) -> "_IndexWriter":
-        """Enter the context introduced by the 'with' keyword
-
-        Returns:
-            _IndexWriter: The instance
-        """
-        self.idx_writer = open(self.idx_path, "wb")
-        # fixed, vestigial practice
-        self.idx_writer.write(_INDEX_HEADER)
-        # fixed, vestigial practice
-        self.idx_writer.write(struct.pack("<Q", 1))
-        # the numeric code for the dtype
-        self.idx_writer.write(struct.pack("<B", DType.code_from_dtype(self.dtype)))
-        return self
-
-    def __exit__(
-        self,
-        exc_type: Optional[Type[BaseException]],
-        exc_val: Optional[BaseException],
-        exc_tb: Optional[TracebackType],
-    ) -> Optional[bool]:
-        """Exit the context introduced by the 'with' keyword
-
-        Args:
-            exc_type (Optional[Type[BaseException]]): Exception type
-
-            exc_val (Optional[BaseException]): Exception value
-
-            exc_tb (Optional[TracebackType]): Exception traceback object
-
-        Returns:
-            Optional[bool]: Whether to silence the exception
-        """
-        self.idx_writer.close()
-
-    def write(
-        self,
-        sequence_lengths: List[int],
-        sequence_modes: Optional[List[int]],
-        document_indices: List[int],
-    ) -> None:
-        """Write the index (.idx) file
-
-        Args:
-            sequence_lengths (List[int]): The length of each sequence
-
-            sequence_modes (Optional[List[int]]): The mode of each sequences
-
-            document_indices (List[int]): The seqyebce indices demarcating the end of each document
-        """
-        sequence_pointers = self._sequence_pointers(sequence_lengths)
-
-        # the number of sequences in the dataset
-        sequence_count = len(sequence_lengths)
-        self.idx_writer.write(struct.pack("<Q", sequence_count))
-
-        # the number of documents in the dataset
-        document_count = len(document_indices)
-        self.idx_writer.write(struct.pack("<Q", document_count))
-
-        # the number of tokens per sequence
-        sequence_lengths = numpy.array(sequence_lengths, dtype=numpy.int32)
-        self.idx_writer.write(sequence_lengths.tobytes(order="C"))
-        del sequence_lengths
-
-        # the byte offsets for all sequences
-        sequence_pointers = numpy.array(sequence_pointers, dtype=numpy.int64)
-        self.idx_writer.write(sequence_pointers.tobytes(order="C"))
-        del sequence_pointers
-
-        # the sequence indices marking the end of each document
-        document_indices = numpy.array(document_indices, dtype=numpy.int64)
-        self.idx_writer.write(document_indices.tobytes(order="C"))
-
-        # the mode per sequence
-        if sequence_modes is not None:
-            sequence_modes = numpy.array(sequence_modes, dtype=numpy.int8)
-            self.idx_writer.write(sequence_modes.tobytes(order='C'))
-            del sequence_modes
-
-    def _sequence_pointers(self, sequence_lengths: List[int]) -> List[int]:
-        """Build the sequence pointers per the sequence lengths and dtype size
-
-        Args:
-            sequence_lengths (List[int]): The length of each sequence
-
-        Returns:
-            List[int]: The pointer to the beginning of each sequence
-        """
-        itemsize = DType.size(self.dtype)
-        curr_ptr = 0
-        list_ptr = []
-        for length in sequence_lengths:
-            list_ptr.append(curr_ptr)
-            curr_ptr += length * itemsize
-        return list_ptr
-
-
-class _IndexReader(object):
-    """Object class to read the index (.idx) file
-
-    Args:
-        idx_path (str): The path to the index file
-
-        multimodal (bool): Whether the dataset is multimodal
-    """
-
-    def __init__(self, idx_path: str, multimodal: bool) -> None:
-
-        log_single_rank(logger, logging.INFO, f"Load the {type(self).__name__} from {idx_path}")
-
-        with open(idx_path, "rb") as stream:
-            header = stream.read(9)
-            assert header == _INDEX_HEADER, f"bad header, cannot read: {idx_path}"
-
-            version = struct.unpack("<Q", stream.read(8))[0]
-            assert version == 1, f"bad version, cannot read: {idx_path}"
-
-            code = struct.unpack("<B", stream.read(1))[0]
-            self.dtype = DType.dtype_from_code(code)
-            self.dtype_size = DType.size(self.dtype)
-
-            self.sequence_count = struct.unpack("<Q", stream.read(8))[0]
-            self.document_count = struct.unpack("<Q", stream.read(8))[0]
-
-            offset = stream.tell()
-
-        self.bin_buffer_mmap = numpy.memmap(idx_path, mode="r", order="C")
-        self.bin_buffer = memoryview(self.bin_buffer_mmap)
-
-        log_single_rank(logger, logging.INFO, f"\tExtract the sequence lengths")
-        t_beg = time.time()
-        self.sequence_lengths = numpy.frombuffer(
-            self.bin_buffer, dtype=numpy.int32, count=self.sequence_count, offset=offset
-        )
-        t_end = time.time()
-        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
-
-        log_single_rank(logger, logging.INFO, f"\tExtract the sequence pointers")
-        t_beg = time.time()
-        self.sequence_pointers = numpy.frombuffer(
-            self.bin_buffer,
-            dtype=numpy.int64,
-            count=self.sequence_count,
-            offset=offset + self.sequence_lengths.nbytes,
-        )
-        t_end = time.time()
-        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
-
-        log_single_rank(logger, logging.INFO, f"\tExtract the document indices")
-        t_beg = time.time()
-        self.document_indices = numpy.frombuffer(
-            self.bin_buffer,
-            dtype=numpy.int64,
-            count=self.document_count,
-            offset=offset + self.sequence_lengths.nbytes + self.sequence_pointers.nbytes,
-        )
-        t_end = time.time()
-        log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
-
-        self.sequence_modes = None
-        if multimodal:
-            log_single_rank(logger, logging.INFO, f"\tExtract the sequence modes")
-            t_beg = time.time()
-            self.sequence_modes = numpy.frombuffer(
-                self.bin_buffer,
-                dtype=numpy.int8,
-                count=self.sequence_count,
-                offset=offset
-                + self.sequence_lengths.nbytes
-                + self.sequence_pointers.nbytes
-                + self.document_indices.nbytes,
-            )
-            t_end = time.time()
-            log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
-
-        assert self.sequence_lengths.shape[0] == len(self)
-        assert self.sequence_lengths.shape[0] == self.sequence_count
-        assert self.sequence_lengths.shape[0] == self.document_indices[-1]
-
-        log_single_rank(logger, logging.INFO, f"> total number of sequences: {len(self)}")
-        log_single_rank(
-            logger,
-            logging.INFO,
-            f"> total number of documents: {self.document_indices.shape[0] - 1}",
-        )
-
-    def __del__(self) -> None:
-        """Clean up the object
-        """
-        self.bin_buffer_mmap._mmap.close()
-        del self.bin_buffer_mmap
-
-    def __len__(self) -> int:
-        """Return the length of the dataset
-
-        Returns:
-            int: The length of the dataset
-        """
-        return self.sequence_count
-
-    @lru_cache(maxsize=8)
-    def __getitem__(self, idx: int) -> Tuple[numpy.int32, numpy.int64, Optional[numpy.int8]]:
-        """Return the pointer, length, and mode at the index
-
-        Args:
-            idx (int): The index into the dataset
-
-        Returns:
-            Tuple[numpy.int32, numpy.int64, Optional[numpy.int8]]: The pointer, length and mode at
-            the index
-        """
-        return (
-            self.sequence_pointers[idx],
-            self.sequence_lengths[idx],
-            self.sequence_modes[idx] if self.sequence_modes is not None else None,
-        )
-
-
-class MMapIndexedDataset(torch.utils.data.Dataset):
-    """The low-level interface dataset class
-
-    Args:
-        path_prefix (str): The index (.idx) and data (.bin) prefix
-
-        multimodal (bool, optional): Whether the dataset is multimodal. Defaults to False.
-    """
-
-    def __init__(self, path_prefix: str, multimodal: bool = False) -> None:
-        super().__init__()
-        self.path_prefix = None
-        self.multimodal = None
-
-        self.index = None
-        self.bin_buffer = None
-        self.bin_buffer_mmap = None
-
-        self.initialize(path_prefix, multimodal)
-
-    def initialize(self, path_prefix: str, multimodal: bool) -> None:
-        """Initialize the dataset
-
-        This method is called by MMapIndexedDataset.__init__ during object creation and by
-        MMapIndexedDataset.__setstate__ during un-puckling
-
-        Args:
-            path_prefix (str): The index (.idx) and data (.bin) prefix
-
-            multimodal (bool): Whether the dataset is multimodal
-        """
-        self.path_prefix = path_prefix
-        self.multimodal = multimodal
-        self.index = _IndexReader(get_idx_path(self.path_prefix), self.multimodal)
-        self.bin_buffer_mmap = numpy.memmap(get_bin_path(self.path_prefix), mode="r", order="C")
-        self.bin_buffer = memoryview(self.bin_buffer_mmap)
-
-    def __getstate__(self) -> Tuple[str, bool]:
-        """Get the state during pickling
-
-        Returns:
-            Tuple[str, bool]: The state tuple
-        """
-        return self.path_prefix, self.multimodal
-
-    def __setstate__(self, state: Tuple[str, bool]) -> None:
-        """Set the state during un-pickling
-
-        Args:
-            state (Tuple[str, bool]): The state tuple
-        """
-        path_prefix, multimodal = state
-        self.initialize(path_prefix, multimodal)
-
-    def __del__(self) -> None:
-        """Clean up the object
-        """
-        if self.bin_buffer_mmap is not None:
-            self.bin_buffer_mmap._mmap.close()
-        del self.bin_buffer_mmap
-        del self.index
-
-    def __len__(self) -> int:
-        """Return the length of the dataset i.e. the number of sequences in the index
-
-        Returns:
-            int: The length of the dataset
-        """
-        return len(self.index)
-
-    def __getitem__(
-        self, idx: Union[int, numpy.integer, slice]
-    ) -> Union[numpy.ndarray, Tuple[numpy.ndarray, numpy.ndarray]]:
-        """Return from the dataset
-
-        Args:
-            idx (Union[int, numpy.integer, slice]): The index or index slice into the dataset
-
-        Raises:
-            ValueError: When the index slice is non-contiguous
-
-            TypeError: When the index is of an unexpected type
-
-        Returns:
-            Union[numpy.ndarray, Tuple[numpy.ndarray, numpy.ndarray]]: The sequence tokens and
-            modes at the index or index slice
-        """
-        if isinstance(idx, (int, numpy.integer)):
-            sequence_pointer, sequence_length, sequence_mode = self.index[idx]
-            sequence = numpy.frombuffer(
-                self.bin_buffer,
-                dtype=self.index.dtype,
-                count=sequence_length,
-                offset=sequence_pointer,
-            )
-            return (sequence, sequence_mode) if sequence_mode is not None else sequence
-        elif isinstance(idx, slice):
-            start, stop, step = idx.indices(len(self))
-            if step != 1:
-                raise ValueError("Slices into indexed_dataset must be contiguous")
-            sequence_lengths = self.index.sequence_lengths[idx]
-            sequence_modes = self.index.sequence_modes[idx] if self.multimodal else None
-            sequence_offsets = list(accumulate(sequence_lengths))
-            sequences = numpy.split(
-                numpy.frombuffer(
-                    self.bin_buffer,
-                    dtype=self.index.dtype,
-                    count=sum(sequence_lengths),
-                    offset=self.index.sequence_pointers[start],
-                ),
-                sequence_offsets[:-1],
-            )
-            return (sequences, sequence_modes) if sequence_modes is not None else sequences
-        else:
-            raise TypeError("Unexpected type received for idx: {}".format(type(idx)))
-
-    def get(self, idx: int, offset: int = 0, length: Optional[int] = None) -> numpy.ndarray:
-        """Retrieve a single item from the dataset with the option to only
-        return a portion of the item.
-
-        get(idx) is the same as [idx] but get() does not support slicing.
-        """
-        sequence_pointer, sequence_length, sequence_mode = self.index[idx]
-        if length is None:
-            length = sequence_length - offset
-        sequence_pointer += offset * DType.size(self.index.dtype)
-        sequence = numpy.frombuffer(
-            self.bin_buffer, dtype=self.index.dtype, count=length, offset=sequence_pointer
-        )
-        return (sequence, sequence_mode) if sequence_mode is not None else sequence
-
-    @property
-    def sequence_lengths(self) -> numpy.ndarray:
-        """Get the sequence lengths
-
-        Returns:
-            numpy.ndarray: The sequence lengths
-        """
-        return self.index.sequence_lengths
-
-    @property
-    def document_indices(self) -> numpy.ndarray:
-        """Get the document indices
-
-        Returns:
-            numpy.ndarray: The document indices
-        """
-        return self.index.document_indices
-
-    def get_document_indices(self) -> numpy.ndarray:
-        """Get the document indices
-
-        This method is slated for deprecation.
-
-        Returns:
-            numpy.ndarray: The document indices
-        """
-        return self.index.document_indices
-
-    def set_document_indices(self, document_indices: numpy.ndarray) -> None:
-        """Set the document indices
-
-        This method is slated for deprecation.
-
-        Args:
-            document_indices (numpy.ndarray): The document indices
-        """
-        self.index.document_indices = document_indices
-
-    @property
-    def sequence_modes(self) -> numpy.ndarray:
-        """Get the sequence modes
-
-        Returns:
-            numpy.ndarray: The sequence modes
-        """
-        return self.index.sequence_modes
-
-    @staticmethod
-    def exists(path_prefix: str) -> bool:
-        """Return whether the MMapIndexedDataset exists on disk at the prefix
-
-        Args:
-            path_prefix (str): The prefix to the index (.idx) and data (.bin) files
-
-        Returns:
-            bool: Whether the MMapIndexedDataset exists on disk at the prefix
-        """
-        return os.path.exists(get_idx_path(path_prefix)) and os.path.exists(
-            get_bin_path(path_prefix)
-        )
-
-
-class MMapIndexedDatasetBuilder(object):
-    """Builder class for the MMapIndexedDataset class
-
-    Args:
-        bin_path (str): The path to the data (.bin) file
-
-        dtype (Type[numpy.number], optional): The dtype of the index file. Defaults to numpy.int32.
-
-        multimodal (bool, optional): Whether the dataset is multimodal. Defaults to False.
-    """
-
-    def __init__(
-        self, bin_path: str, dtype: Type[numpy.number] = numpy.int32, multimodal: bool = False
-    ) -> None:
-        self.data_file = open(bin_path, "wb")
-        self.dtype = dtype
-        self.multimodal = multimodal
-
-        self.sequence_lengths = []
-        self.document_indices = [0]
-        self.sequence_modes = [] if self.multimodal else None
-
-    def add_item(self, tensor: torch.Tensor, mode: int = 0) -> None:
-        """Add a single item to the dataset
-
-        Args:
-            tensor (torch.Tensor): The item to add to the data file
-
-            mode (int, optional): The mode for the item. Defaults to 0.
-        """
-        np_array = numpy.array(tensor.numpy(), dtype=self.dtype)
-        self.data_file.write(np_array.tobytes(order="C"))
-        self.sequence_lengths.append(np_array.size)
-        if self.multimodal:
-            self.sequence_modes.append(mode)
-
-    def add_document(
-        self, tensor: torch.Tensor, lengths: List[int], modes: Optional[List[int]] = None
-    ) -> None:
-        """Add an entire document to the dataset
-
-        Args:
-            tensor (torch.Tensor): The document to add
-            lengths (List[int]): The lengths of each item in the document
-            modes (Optional[List[int]], optional): The modes for each item in the document.
-            Defaults to None.
-        """
-        np_array = numpy.array(tensor, dtype=self.dtype)
-        self.data_file.write(np_array.tobytes(order="C"))
-        self.sequence_lengths.extend(lengths)
-        self.document_indices.append(len(self.sequence_lengths))
-        if self.multimodal:
-            self.sequence_modes.extend(modes if modes is not None else [0] * lengths)
-
-    def end_document(self) -> None:
-        """Finalize the document, for use with MMapIndexedDatasetBuilder.add_item
-        """
-        self.document_indices.append(len(self.sequence_lengths))
-
-    def add_index(self, path_prefix: str) -> None:
-        """Add an entire MMapIndexedDataset to the dataset
-
-        Args:
-            path_prefix (str): The index (.idx) and data (.bin) prefix
-        """
-        # Concatenate index
-        index = _IndexReader(get_idx_path(path_prefix), multimodal=self.multimodal)
-        assert index.dtype == self.dtype
-
-        offset = len(self.sequence_lengths)
-        self.sequence_lengths.extend(index.sequence_lengths)
-        self.document_indices.extend((offset + index.document_indices)[1:])
-
-        if self.multimodal:
-            self.sequence_modes.extend(index.sequence_modes)
-
-        # Concatenate data
-        with open(get_bin_path(path_prefix), "rb") as f:
-            shutil.copyfileobj(f, self.data_file)
-
-    def finalize(self, idx_path: str) -> None:
-        """Clean up and write the index (.idx) file
-
-        Args:
-            idx_path (str): The path to the index file
-        """
-        self.data_file.close()
-        with _IndexWriter(idx_path, self.dtype) as writer:
-            writer.write(self.sequence_lengths, self.sequence_modes, self.document_indices)
-
-
-def get_idx_path(path_prefix: str) -> str:
-    """Get the path to the index file from the prefix
-
-    Args:
-        path_prefix (str): The prefix
-
-    Returns:
-        str: The path to the index file
-    """
-    return path_prefix + ".idx"
-
-
-def get_bin_path(path_prefix: str) -> str:
-    """Get the path to the data file from the prefix
-
-    Args:
-        path_prefix (str): The prefix
-
-    Returns:
-        str: The path to the data file
-    """
-    return path_prefix + ".bin"
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/megatron_dataset.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/megatron_dataset.py
deleted file mode 100644
index af0294711ab05a3bb2c4f8ff946780ea181ffdf1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/megatron_dataset.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import hashlib
-import json
-from abc import ABC, abstractmethod, abstractstaticmethod
-from collections import OrderedDict
-from typing import Dict, List
-
-import numpy
-import torch
-
-from megatron_ds.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
-from megatron_ds.core.datasets.indexed_dataset import MMapIndexedDataset
-from megatron_ds.core.datasets.utils import Split
-
-
-class MegatronDataset(ABC, torch.utils.data.Dataset):
-    """The wrapper class from which dataset classes should inherit e.g. GPTDataset
-
-    Args:
-        indexed_dataset (MMapIndexedDataset): The MMapIndexedDataset around which to build the
-        MegatronDataset
-
-        indexed_indices (numpy.ndarray): The set of the documents indices to expose
-
-        num_samples (int): The number of samples to draw from the indexed dataset
-
-        index_split (Split): The indexed_indices Split
-
-        config (BlendedMegatronDatasetConfig): The container for all config sourced parameters
-    """
-
-    def __init__(
-        self,
-        indexed_dataset: MMapIndexedDataset,
-        indexed_indices: numpy.ndarray,
-        num_samples: int,
-        index_split: Split,
-        config: BlendedMegatronDatasetConfig,
-    ) -> None:
-        assert indexed_indices.size > 0
-        assert num_samples > 0
-        assert self.is_multimodal() == indexed_dataset.multimodal
-        assert self.is_split_by_sequence() != self.is_split_by_document()
-
-        self.indexed_dataset = indexed_dataset
-        self.indexed_indices = indexed_indices
-        self.num_samples = num_samples
-        self.index_split = index_split
-        self.config = config
-
-        self.unique_identifiers = OrderedDict()
-        self.unique_identifiers["class"] = type(self).__name__
-        self.unique_identifiers["path_prefix"] = self.indexed_dataset.path_prefix
-        self.unique_identifiers["num_samples"] = self.num_samples
-        self.unique_identifiers["index_split"] = self.index_split.name
-        for attr in self._key_config_attributes():
-            self.unique_identifiers[attr] = getattr(self.config, attr)
-
-        self.unique_description = json.dumps(self.unique_identifiers, indent=4)
-        self.unique_description_hash = hashlib.md5(
-            self.unique_description.encode("utf-8")
-        ).hexdigest()
-
-        self._finalize()
-
-    @abstractmethod
-    def _finalize(self) -> None:
-        """Build the dataset and assert any subclass-specific conditions
-        """
-        pass
-
-    @abstractmethod
-    def __len__(self) -> int:
-        """Return the length of the dataset
-
-        Returns:
-            int: See abstract implementation
-        """
-        pass
-
-    @abstractmethod
-    def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]:
-        """Return from the dataset
-
-        Args:
-            idx (int): The index into the dataset
-
-        Returns:
-            Dict[str, numpy.ndarray]: See abstract implementation
-        """
-        pass
-
-    @abstractstaticmethod
-    def is_multimodal() -> bool:
-        """Return True if the inheritor class and its internal MMapIndexedDataset are multimodal
-
-        Returns:
-            bool: See abstract implementation
-        """
-        pass
-
-    @abstractstaticmethod
-    def is_split_by_sequence() -> bool:
-        """Return whether the dataset is split by sequence
-
-        For example, the GPT train/valid/test split is document agnostic
-
-        Returns:
-            bool: See abstract implementation
-        """
-        pass
-
-    @classmethod
-    def is_split_by_document(cls) -> bool:
-        """Return whether the dataset is split by document
-
-        For example, the BERT train/valid/test split is document aware
-
-        Returns:
-            bool: The negation of cls.is_split_by_sequence
-        """
-        return not cls.is_split_by_sequence()
-
-    @staticmethod
-    def _key_config_attributes() -> List[str]:
-        """Return all config attributes which contribute to uniquely identifying the dataset.
-
-        These attributes will be used to build a uniquely identifying string and MD5 hash which
-        will be used to cache/load the dataset from run to run.
-
-        Returns:
-            List[str]: The key config attributes
-        """
-        return ["split", "random_seed", "sequence_length"]
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/readme.md b/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/readme.md
deleted file mode 100644
index 77d1e5862f54a9c224d1c4f655883e1b877616f5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/readme.md
+++ /dev/null
@@ -1,193 +0,0 @@
-# Data Pipeline
-
-## Data pre-processing
-
-Data preprocessing is built around the following classes:
-
-1. `MMapIndexedDatasetBuilder`
-2. `MMapIndexedDataset`
-
-At the moment, an end-to-end data preprocessing implementation is left to the user. See the class docstring(s) for more details.
-
-#### MMapIndexedDatasetBuilder
-
-The `MMapIndexedDatasetBuilder` is capable of building and merging `MMapIndexedDataset` instances.
-
-#### MMapIndexedDataset
-
-The `MMapIndexedDataset` class is the lowest-level data interface in Megatron Core. Internally, an `MMapIndexedDataset` instance references two binaries: the data file (`.bin`) contains document/sequence data and the index file (`.idx`) contains document/sequence metadata.
-
-The index file stores dataset-level metadata first:
-- The index header, for backward compatibility
-- The index version, for backward compatibility
-- A numeric code corresponding to the data type used to write data to the data file
-- The number of sequences in the dataset
-- The number of documents in the dataset
-
-The index file stores document-level and sequence-level metadata second:
-- In order, the number of elements per sequence
-- In order, the byte offset (pointer) per sequence
-- In order, the consecutive sequence index range `[...)` per document
-- In order, the mode per sequence (in the multimodal case)
-
-## Data loading: construction
-
-Building the data loaders is a distributed-aware process built around the following classes:
-
-1. `BlendedMegatronDatasetConfig`
-2. `BlendedMegatronDatasetBuilder`
-3. `MMapIndexedDataset`
-3. `MegatronDataset`
-4. `BlendedDataset`
-
-See the class docstrings for more details.
-
-#### BlendedMegatronDatasetConfig (extendable)
-
-The `BlendedMegatronDatasetConfig` class parameterizes the `BlendedMegatronDatasetBuilder` and in turn the `MegatronDataset` and `BlendedDataset`.
-
-Different training/inference regimes will require different extensions e.g. the `GPTDatasetConfig`
-
-#### BlendedMegatronDatasetBuilder
-
-The `BlendedMegatronDatasetBuilder` class builds the highest-level data interfaces in Megatron Core.
-
-**NB:** All ranks should attempt to build the dataset via the `BlendedMegatronDatasetBuilder` or the program will hang. Which ranks follow through on their attempts can be controlled via the `BlendedMegatronDatasetConfig`.
-
-#### MMapIndexedDataset
-
-The `MMapIndexedDataset` class is the lowest-level data interface in Megatron Core.
-
-The `MMapIndexedDataset` should already exist on disk before attempting to build any of the high-level data interfaces.
-
-
-#### MegatronDataset (extendable)
-
-The `MegatronDataset` abstract class is a high-level data interface in Megatron Core. It is an abstraction built upon the `MMapIndexedDataset`.
-
-Different training/inference regimes will require different extensions e.g. the `GPTDataset`
-
-#### BlendedDataset
-
-The `BlendedDataset` class is a high-level data interface in Megatron Core. It is an abstraction built upon the `MegatronDataset`.
-
-The `BlendedDataset` is only necessary when a blend multiple data distributions, i.e. multiple `MegatronDataset` instances, should contribute to a certain dataset split. The blend can be controlled via the `BlendedMegatronDatasetConfig`.
-
-## Data loading: implementation
-
-### GPTDataset
-
-The `GPTDataset` is parameterized by the following variables: the underlying `MMapIndexedDataset` instance `indexed_dataset`, the split indices `indexed_indices` (the congituous subset of document or sequence indices used for training, validation, and testing), the number of samples `N`, the sequence length `S`, and the random seed `R`.
-
-The `GPTDataset` creates three index mappings to facilitate lookup: (1) the document index, (2) the sample index, and (3) the shuffle index.
-
-1. The document index _Do_idx_ is a 1-D array mapping from _i_ to document index of length `E * |indexed_indices|` where `E` corresponds to the minimum number of epochs such that `E * |indexed_indices| >= N`. The document index is shuffled according to `R`.
-
-    ```
-    Given:
-
-    N = 15
-    indexed_indices = [5, 6, 7, 8, 9]
-    E = 3
-
-    Then, for example:
-
-    Do_idx = [8, 8, 9, 6, 7, 5, 8, 5, 6, 6, 5, 9, 7, 7, 9]
-    ```
-
-2. The sample index _Sa_idx_ is a 2-D array mapping from _j_ to pairs of (_i_, _Do_idx_[ _i_ ] offset) of shape `[N + 1, 2]`. The rows _j_ and _j_ + 1 serve as the left and right bounds for the _j_-th sample. 
-
-    ```
-    Given:
-
-    S = 1024
-
-    Then, for example:
-
-    Sa_idx[0] = (0, 0)
-    Sa_idx[1] = (0, 1024)       => Do_idx[0] has length greater than S
-    Sa_idx[2] = (1, 512)        => Do_idx[0] has length 1536
-    Sa_idx[3] = (2, 0)          => Do_idx[1] has length 1536
-    Sa_idx[4] = (5, 300)        => Do_idx[2:5] are shorter documents relative to Do_idx[0:2]
-    Sa_idx[5] = (6, 24)         => Do_idx[5] has length 1300
-    ```
-
-3. The shuffle index _Sh_idx_ is a 1-D array mapping from _k_ to _j_ of length `N`. The shuffle index is shuffled according to `R`.
-
-    ```
-    Given
-
-    N = 10
-
-    Then, for example:
-
-    Sh_idx = [4, 0, 2, 6, 1, 9, 5, 8, 7, 3]
-    ```
-
-To query the `GPTDataset` for the _k_-th sample we do the following
-
--  Use the shuffle index to get the index _j_ into the sample index.
-
-    ```
-    j = Sh_idx[k]
-    ```
-- Use the sample index to get the left and right sample-bounding indices into the document index and the starting token offset for each document.
-
-    ```
-    i, offset = Sa_idx[j]
-    i_next, offset_next = Sa_idx[j + 1]
-    ```
-- Use the document index to retrieve `S` tokens from consecutive (in the document index) documents.
-
-    ```
-    sample = []
-    sample += indexed_dataset[Do_idx[i]][offset:]
-    if i != i_next:
-        sample += indexed_dataset[Do_idx[i + 1:i_next]]
-    sample += indexed_dataset[Do_idx[i_next]][:offset_next]
-    ```
-
-To save time during initialization, each index is built/cached sequentially on one process rank and subsequently loaded in parallel on other process ranks. The cached indices are unique to a hash generated in the `MegatronDataset.__init__` function.
-
-### BlendedDataset
-
-The `BlendedDataset` is parameterized by the following variables: the underlying `MegatronDataset` instances `D`, the weights `W` (one per dataset), and the size `S`. The `BlendedDataset` will draw samples from contributing datasets in proportion to the weights until achieving a composite dataset of the desired size. During each sampling step, we draw a single sample from the dataset which has the greatest sampling error.
-
-The `BlendedDataset` creates two "blending" indices to facilitate lookup: (1) the dataset index and (2) the dataset sample index.
-
-1. The dataset index _Da_idx_ is a 1-D array mapping from _i_ to dataset index of length `S`.
-
-    ```
-    Given
-
-    D = [d0, d1, d2]
-    W = [1/2, 1/4, 1/4]
-    S = 4
-
-    Then, for example:
-
-    Da_idx = [0, 1, 2, 0]
-
-    ```
-
-2. The dataset sample index _Sa_idx_ is a 1-D mapping from _i_ to the sample index for dataset _Da_idx[i]_ of length `S`.
-
-    ```
-    Given
-
-    Da_idx = [0, 1, 2, 0]
-
-    Then, for example:
-
-    Sa_idx = [0, 0, 0, 1]
-    ```
-
-To query the `BlendedDataset` for the _k_-th sample we do the following
-
-- Use the dataset index to retrieve the corresponding dataset from `D` and the dataset sample index to retrieve the corresponding sample from that dataset.
-
-    ```
-    sample = D[Da_idx[k]][Sa_idx[k]]
-    ```
-
-To save time during initialization, each index is built/cached sequentially on one process rank and subsequently loaded in parallel on other process ranks. The cached indices are unique to a hash generated in the `BlendedDataset.__init__` function.
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/utils.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/utils.py
deleted file mode 100644
index 8a3279b5f44bf0f31f5850a836f2412771356d07..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/utils.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import logging
-from enum import Enum
-from typing import List
-
-import numpy
-import torch
-
-logger = logging.getLogger(__name__)
-
-
-class Split(Enum):
-    train = 0
-    valid = 1
-    test = 2
-
-
-def compile_helpers():
-    """Compile C++ helper functions at runtime. Make sure this is invoked on a single process.
-    """
-    import os
-    import subprocess
-
-    command = ["make", "-C", os.path.abspath(os.path.dirname(__file__))]
-    if subprocess.run(command).returncode != 0:
-        import sys
-
-        log_single_rank(logger, logging.ERROR, "Failed to compile the C++ dataset helper functions")
-        sys.exit(1)
-
-
-def log_single_rank(logger: logging.Logger, *args, rank=0, **kwargs):
-    """If torch distributed is initialized, log only on rank
-
-    Args:
-        logger (logging.Logger): The logger to write the logs
-
-        rank (int, optional): The rank to write on. Defaults to 0.
-    """
-    if torch.distributed.is_initialized():
-        if torch.distributed.get_rank() == rank:
-            logger.log(*args, **kwargs)
-    else:
-        logger.log(*args, **kwargs)
-
-
-def normalize(weights: List[float]) -> List[float]:
-    """Do non-exponentiated normalization
-
-    Args:
-        weights (List[float]): The weights
-
-    Returns:
-        List[float]: The normalized weights
-    """
-    w = numpy.array(weights, dtype=numpy.float64)
-    w_sum = numpy.sum(w)
-    w = (w / w_sum).tolist()
-    return w
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/__init__.py
deleted file mode 100644
index df08d7eaba8e55e30a2ade39bb8b01ca580854b5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-from .core import check_is_distributed_checkpoint
-from .mapping import LocalNonpersitentObject, ShardedTensor
-from .serialization import (
-    load,
-    load_common_state_dict,
-    load_plain_tensors,
-    load_tensors_metadata,
-    save,
-)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/core.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/core.py
deleted file mode 100644
index f20a0c3a2de730a747b3bf5696b4bdf8b60550ce..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/core.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-import json
-from dataclasses import asdict, dataclass
-from pathlib import Path
-from typing import Optional
-
-CONFIG_FNAME = 'metadata.json'
-
-
-class CheckpointingException(Exception):
-    pass
-
-
-@dataclass
-class CheckpointingConfig:
-    """ Documents backends used in the checkpoint. """
-
-    sharded_backend: str
-    sharded_backend_version: int = 1
-    common_backend: str = 'torch'
-    common_backend_version: int = 1
-
-
-def check_is_distributed_checkpoint(checkpoint_dir):
-    return maybe_load_config(checkpoint_dir) is not None
-
-
-def maybe_load_config(checkpoint_dir: str) -> Optional[CheckpointingConfig]:
-    config_path = Path(checkpoint_dir, CONFIG_FNAME)
-    if not config_path.exists():
-        return None
-    with config_path.open() as f:
-        config_dict = json.load(f)
-    return CheckpointingConfig(**config_dict)
-
-
-def save_config(config: CheckpointingConfig, checkpoint_dir: str):
-    config_path = Path(checkpoint_dir, CONFIG_FNAME)
-    with config_path.open('w') as f:
-        json.dump(asdict(config), f)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/dict_utils.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/dict_utils.py
deleted file mode 100644
index 493a61c91a89a0ee30a7869a8f3a84bef0e1eff4..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/dict_utils.py
+++ /dev/null
@@ -1,219 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-""" Utilities for operating with dicts and lists. """
-
-from collections import defaultdict
-from typing import Any, Callable, Iterable, Optional, Tuple, Union
-
-import torch
-
-
-def extract_matching_values(
-    x: Union[dict, list], predicate: Callable[[Any], bool], return_lists_as_dicts: bool = False
-) -> Tuple[Union[dict, list], Union[dict, list]]:
-    """ Return matching and nonmatching values. Keeps hierarchy.
-
-    Arguments:
-        x (Union[dict, list]) : state dict to process. Top-level argument must be a dict or list
-        predicate (object -> bool): determines matching values
-        return_lists_as_dicts (bool): if True, matching lists will be turned
-            into dicts, with keys indicating the indices of original elements.
-            Useful for reconstructing the original hierarchy.
-    """
-
-    def _set_elem(target, k, v):
-        if return_lists_as_dicts:
-            target[k] = v
-        else:
-            target.append(v)
-
-    if isinstance(x, dict):
-        matching_vals = {}
-        nonmatching_vals = {}
-        for k, v in x.items():
-            if isinstance(v, (list, dict)):
-                match, nonmatch = extract_matching_values(v, predicate, return_lists_as_dicts)
-                if match:
-                    matching_vals[k] = match
-                if nonmatch or not v:
-                    nonmatching_vals[k] = nonmatch
-            elif predicate(v):
-                matching_vals[k] = v
-            else:
-                nonmatching_vals[k] = v
-    elif isinstance(x, list):
-        matching_vals = {} if return_lists_as_dicts else []
-        nonmatching_vals = {} if return_lists_as_dicts else []
-        for ind, v in enumerate(x):
-            if isinstance(v, (list, dict)) and v:
-                match, nonmatch = extract_matching_values(v, predicate, return_lists_as_dicts)
-                if match:
-                    _set_elem(matching_vals, ind, match)
-                if nonmatch or not v:
-                    _set_elem(nonmatching_vals, ind, nonmatch)
-            else:
-                target = matching_vals if predicate(v) else nonmatching_vals
-                _set_elem(target, ind, v)
-    else:
-        raise ValueError(f'Unexpected top-level object type: {type(x)}')
-    return matching_vals, nonmatching_vals
-
-
-def diff(x1: Any, x2: Any, prefix: Tuple = ()) -> Tuple[list, list, list]:
-    mismatch = []
-    if isinstance(x1, dict) and isinstance(x2, dict):
-        only_left = [prefix + (k,) for k in x1.keys() - x2.keys()]
-        only_right = [prefix + (k,) for k in x2.keys() - x1.keys()]
-        for k in x2.keys() & x1.keys():
-            _left, _right, _mismatch = diff(x1[k], x2[k], prefix + (k,))
-            only_left.extend(_left)
-            only_right.extend(_right)
-            mismatch.extend(_mismatch)
-    elif isinstance(x1, list) and isinstance(x2, list):
-        only_left = list(range(len(x1) - 1, len(x2) - 1, -1))
-        only_right = list(range(len(x1) - 1, len(x2) - 1, -1))
-        for i, (v1, v2) in enumerate(zip(x1, x2)):
-            _left, _right, _mismatch = diff(v1, v2, prefix + (i,))
-            only_left.extend(_left)
-            only_right.extend(_right)
-            mismatch.extend(_mismatch)
-    else:
-        only_left = []
-        only_right = []
-        if isinstance(x1, torch.Tensor) and isinstance(x2, torch.Tensor):
-            _is_mismatch = not torch.all(x1 == x2)
-        else:
-            try:
-                _is_mismatch = bool(x1 != x2)
-            except RuntimeError:
-                _is_mismatch = True
-
-        if _is_mismatch:
-            mismatch.append((prefix, type(x1), type(x2)))
-
-    return only_left, only_right, mismatch
-
-
-def inspect_keys_types(d: dict, prefix: Tuple = (), indent: int = 4):
-    print_indent = lambda: print(' ' * indent * len(prefix), end='')
-    for k, v in d.items():
-        if isinstance(v, dict):
-            print_indent()
-            print(f'> {k}:')
-            inspect_keys_types(v, prefix + (k,), indent)
-        else:
-            print_indent()
-            if isinstance(v, torch.Tensor):
-                print(f'> {k}: {type(v)} of shape {v.shape}')
-            else:
-                print(f'> {k}: {type(v)}')
-
-
-def inspect_types(x: Any, prefix: Tuple = (), indent: int = 4):
-    print_indent = lambda: print(' ' * indent * len(prefix), end='')
-    if isinstance(x, dict):
-        print()
-        for k, v in x.items():
-            print_indent()
-            print(f'> {k}: ', end='')
-            inspect_types(v, prefix + (k,), indent)
-    elif isinstance(x, list):
-        print()
-        for i, v in enumerate(x):
-            print_indent()
-            print(f'- {i}: ', end='')
-            inspect_types(v, prefix + (i,), indent)
-    else:
-        if isinstance(x, torch.Tensor):
-            print(f'Tensor of shape {x.shape}')
-        else:
-            try:
-                x_str = str(x)
-            except:
-                x_str = '<no string repr>'
-            if len(x_str) > 30:
-                x_str = x_str[:30] + '... (truncated)'
-            print(f'[{type(x)}]: {x_str}')
-
-
-def nested_values(x: Union[dict, list]):
-    x_iter = x.values() if isinstance(x, dict) else x
-    for v in x_iter:
-        if isinstance(v, (dict, list)):
-            yield from nested_values(v)
-        else:
-            yield v
-
-
-def nested_items_iter(x: Union[dict, list]):
-    x_iter = x.items() if isinstance(x, dict) else enumerate(x)
-    for k, v in x_iter:
-        if isinstance(v, (dict, list)):
-            yield from nested_items_iter(v)
-        else:
-            yield x, k, v
-
-
-def dict_map(f: Callable, d: dict):
-    for sub_d, k, v in nested_items_iter(d):
-        sub_d[k] = f(v)
-
-
-def dict_map_with_key(f: Callable, d: dict):
-    for sub_d, k, v in nested_items_iter(d):
-        sub_d[k] = f(k, v)
-
-
-def dict_list_map_inplace(f: Callable, x: Union[dict, list]):
-    if isinstance(x, dict):
-        for k, v in x.items():
-            x[k] = dict_list_map_inplace(f, v)
-    elif isinstance(x, list):
-        x[:] = (dict_list_map_inplace(f, v) for v in x)
-    else:
-        return f(x)
-    return x
-
-
-def dict_list_map_outplace(f: Callable, x: Union[dict, list]):
-    if isinstance(x, dict):
-        return {k: dict_list_map_outplace(f, v) for k, v in x.items()}
-    elif isinstance(x, list):
-        return [dict_list_map_outplace(f, v) for v in x]
-    else:
-        return f(x)
-
-
-def merge(x1: dict, x2: dict, key: Tuple[str, ...] = ()):
-    if isinstance(x1, dict) and isinstance(x2, dict):
-        for k, v2 in x2.items():
-            if k not in x1:
-                x1[k] = v2
-            else:
-                x1[k] = merge(x1[k], v2, key=key + (k,))
-    elif isinstance(x1, list) and isinstance(x2, list):
-        if len(x1) != len(x2):
-            raise ValueError(
-                f'Cannot merge two lists with different lengths ({len(x1)} and {len(x2)}, encountered at level {key})'
-            )
-        for i, v2 in enumerate(x2):
-            x1[i] = merge(x1[i], v2, key=key + (i,))
-    else:
-        raise ValueError(
-            f'Duplicate non-dict and non-list values encountered: `{x1}` and `{x2}` (at level {key})'
-        )
-    return x1
-
-
-def map_reduce(
-    xs: Iterable,
-    key_fn: Callable = lambda x: x,
-    value_fn: Callable = lambda x: x,
-    reduce_fn: Callable = lambda x: x,
-) -> dict:
-    res = defaultdict(list)
-    for x in xs:
-        res[key_fn(x)].append(value_fn(x))
-    for k in res:
-        res[k] = reduce_fn(res[k])
-    return dict(res)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/mapping.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/mapping.py
deleted file mode 100644
index 2b4d5677d37c01f7196d887f8e7a767d7e0c581b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/mapping.py
+++ /dev/null
@@ -1,308 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-""" Core library classes. """
-import logging
-from dataclasses import dataclass, replace
-from itertools import chain
-from typing import Any, Callable, Dict, Optional, Tuple, Union
-
-import numpy as np
-import torch
-
-from .core import CheckpointingException
-from .dict_utils import dict_list_map_inplace, dict_list_map_outplace
-
-logger = logging.getLogger(__name__)
-
-# These type definitions are just hints to differentiate a plain model state
-#  dict (StateDict) from a state dict with tensors replaced with ShardedTensors
-#  (ShardedStateDict).
-StateDict = Dict[str, Any]
-ShardedStateDict = Dict[str, Any]
-ReplicaId = Union[int, Tuple[int, ...]]
-
-
-@dataclass
-class ShardedTensor:
-    """Represents a mapping between a local tensor and a global tensor.
-
-    Global tensor is assumed to consist of many local tensors distributed
-    between different processes.
-
-    Attributes:
-        key: unique identifier of a global tensor
-        data: local tensor data. Can be None only for consistency validation
-        dtype: tensor dtype
-        local_shape: local tensor shape
-        global_shape: global tensor shape
-        global_offset: offset of a local tensor in a global tensor, specified
-            in number of tensor elements
-        axis_fragmentations: global tensor fragmentation of each axis
-        replica_id: indicates given local tensor's replication wrt. local
-            tensors in different processes
-        prepend_axis_num: number of axes prepended to the local tensor
-            to reflect global tensor shape.
-            The behavior is similar to unsqueezing the local tensor.
-        allow_shape_mismatch: if True, during loading, the global shape of a
-            stored tensor does not have to match the expected global shape.
-            Useful for representing tensors with flexible shape, e.g. padded.
-        flattened_range: specifies a slice that should be applied to a flattened
-            tensor with `local_shape` in order to get the tensor stored as `data`
-    """
-
-    key: str
-    data: Optional[torch.Tensor]
-    dtype: torch.dtype
-    local_shape: Tuple[int, ...]
-    global_shape: Tuple[int, ...]
-    global_offset: Tuple[int, ...]
-    axis_fragmentations: Optional[Tuple[int, ...]]
-    replica_id: ReplicaId = 0
-    prepend_axis_num: int = 0
-    allow_shape_mismatch: bool = False
-    flattened_range: Optional[slice] = None
-
-    def global_slice(self) -> Tuple[Union[int, slice], ...]:
-        assert len(self.global_offset) == len(self.local_shape) + self.prepend_axis_num
-        return tuple(
-            chain(
-                (off for off in self.global_offset[: self.prepend_axis_num]),
-                (
-                    slice(off, off + sh)
-                    for off, sh in zip(
-                        self.global_offset[self.prepend_axis_num :], self.local_shape
-                    )
-                ),
-            )
-        )
-
-    def global_coordinates(self) -> Tuple[np.ndarray, ...]:
-        if self.flattened_range is None:
-            raise CheckpointingException(
-                f'`global_coordinates` is undefined for'
-                f' {self.__class__.__name__} without `flattened_range`'
-            )
-
-        local_coords = self.local_coordinates()
-        assert len(local_coords) + self.prepend_axis_num == len(self.global_offset), (
-            len(local_coords),
-            self,
-        )
-        global_coords = tuple(
-            c + off
-            for c, off in zip((0,) * self.prepend_axis_num + local_coords, self.global_offset)
-        )
-        return global_coords
-
-    def local_coordinates(self) -> Tuple[np.ndarray, ...]:
-        if self.flattened_range is None:
-            raise CheckpointingException(
-                f'`local_coordinates` is undefined for'
-                f' {self.__class__.__name__} without `flattened_range`'
-            )
-
-        # TODO: np.unravel_index?
-        mask = np.zeros(np.product(self.local_shape), dtype=bool)
-        mask[self.flattened_range] = True
-        return np.nonzero(mask.reshape(self.local_shape))
-
-    def max_allowed_chunks(self) -> Tuple[int, ...]:
-        chunks = []
-        for axis_sh, axis_fragm in zip(self.global_shape, self.axis_fragmentations):
-            if not self.allow_shape_mismatch and axis_sh % axis_fragm != 0:
-                raise CheckpointingException(
-                    f'Axis shape ({axis_sh}) not divisible' f' by axis fragmentation ({axis_fragm}'
-                )
-            axis_chunk_size = axis_sh // axis_fragm
-            chunks.append(axis_chunk_size)
-        return tuple(chunks)
-
-    def without_data(self):
-        return replace(self, data=None)
-
-    @classmethod
-    def from_rank_offsets(
-        cls,
-        key: str,
-        data: torch.Tensor,
-        *rank_offsets: Tuple[int, int, int],
-        replica_id: ReplicaId = 0,
-        prepend_axis_num: int = 0,
-        allow_shape_mismatch: bool = False,
-    ):
-        """Allows to construct the ShardedTensor given offset specified in process ranks.
-        Arguments:
-            key: unique key
-            data: local tensor data
-            rank_offsets: each tuple (axis, axis_rank_offset, axis_fragm)
-                says that if global tensor is divided into `axis_fragm`
-                 fragment along `axis` axis, then local tensor data
-                 corresponds to the `axis_rank_offset` chunk.
-            replica_id: see ShardedTensor
-            prepend_axis_num: see ShardedTensor
-            allow_shape_mismatch: see ShardedTensor
-        """
-        global_offset = [0] * (data.ndim + prepend_axis_num)
-        global_shape = ([1] * prepend_axis_num) + list(data.shape)
-        axis_fragmentations = [1] * (data.ndim + prepend_axis_num)
-        _seen_axis = set()
-        for axis, axis_rank_offset, axis_fragm in rank_offsets:
-            assert axis >= 0 and axis_rank_offset >= 0 and axis_fragm >= 0, (
-                axis,
-                axis_rank_offset,
-                axis_fragm,
-            )
-            assert (
-                axis_rank_offset < axis_fragm
-            ), 'Rank offset must be lower than axis fragmentation'
-            if axis in _seen_axis:
-                raise CheckpointingException('Duplicated axis specified')
-            _seen_axis.add(axis)
-
-            local_axis_shape = 1 if axis < prepend_axis_num else data.shape[axis - prepend_axis_num]
-            global_shape[axis] = axis_fragm * local_axis_shape
-            global_offset[axis] = axis_rank_offset * local_axis_shape
-            axis_fragmentations[axis] = axis_fragm
-
-        return cls(
-            key,
-            data,
-            data.dtype,
-            tuple(data.shape),
-            tuple(global_shape),
-            tuple(global_offset),
-            tuple(axis_fragmentations),
-            replica_id,
-            prepend_axis_num,
-            allow_shape_mismatch,
-        )
-
-    def __str__(self):
-        return f'{self.__class__.__name__}(key=\'{self.key}\')'
-
-
-def is_main_replica(replica_id):
-    if isinstance(replica_id, int):
-        return replica_id == 0
-    return all(r == 0 for r in replica_id)
-
-
-class LocalNonpersitentObject:
-    """Object that should not be stored in a checkpoint, but restored locally.
-
-    Wrapping any object inside the state dict with LocalNonpersitentObject
-    will result in:
-    - during saving, this object will *not* be stored in the checkpoint
-    - during loading, a local version of this object will be placed in a state dict
-    """
-
-    def __init__(self, obj):
-        self.obj = obj
-
-    def unwrap(self):
-        return self.obj
-
-
-@dataclass
-class ShardedObject:
-    """Represents a mapping between a local object and a global object.
-
-    Global object is assumed to consist of many local objects distributed
-    between different processes.
-
-    NOTE: Contrary to ShardedTensor, it's impossible to change global object
-    sharding. Conceptually, ShardedObject is a fully-sharded ShardedTensor
-    with atomic arbitrary typed elements.
-
-    Attributes:
-        key: unique identifier of a global tensor
-        data: local object data. Can be None only for consistency validation
-        global_shape: global object shape
-        global_offset: offset of a local object in a global object, specified
-            in number of shards
-        replica_id: indicates local object replication wrt. local
-            objects in different processes
-    """
-
-    key: str
-    data: object
-    global_shape: Tuple[int, ...]
-    global_offset: Tuple[int, ...]
-    replica_id: ReplicaId = 0
-
-    def without_data(self):
-        return replace(self, data=None)
-
-    @property
-    def unique_key(self):
-        return f'{self.key}/shard_{".".join(map(str, self.global_offset))}_{".".join(map(str, self.global_shape))}'
-
-    def __str__(self):
-        return f'{self.__class__.__name__}(key=\'{self.key}\')'
-
-
-@dataclass
-class ShardedTensorFactory:
-    """ Allows to apply transformations to tensors before/after serialization.
-
-    The essence of those transformations is that they can be applied to
-    optimizer states the same way they are applied to the model params.
-
-    Builder creates a sub-state-dict out of a tensor before saving, and merger
-    merges the corresponding state dict after loading.
-    """
-
-    key: str
-    data: torch.Tensor
-    build_fn: Callable[[str, torch.Tensor], ShardedStateDict]
-    merge_fn: Callable[[StateDict], torch.Tensor]
-
-    def build(self):
-        return self.build_fn(self.key, self.data)
-
-
-def apply_factories(sharded_state_dict: ShardedStateDict):
-    def apply(x):
-        if isinstance(x, ShardedTensorFactory):
-            x = x.build()
-        return x
-
-    dict_list_map_inplace(apply, sharded_state_dict)
-
-
-def apply_factory_merges(x1: StateDict, x2: ShardedStateDict, key: Tuple[str, ...] = ()):
-    if isinstance(x2, ShardedTensorFactory):
-        return x2.merge_fn(x1)
-
-    # There rest is almost the same as the `merge` function from `dict_utils`
-    if isinstance(x1, dict) and isinstance(x2, dict):
-        for k, v2 in x2.items():
-            if k not in x1:
-                raise ValueError(
-                    f'Different dict keys encountered in `apply_factory_merges` ({x1.keys()} vs {x2.keys()})'
-                )
-            else:
-                x1[k] = apply_factory_merges(x1[k], v2, key=key + (k,))
-    elif isinstance(x1, list) and isinstance(x2, list):
-        if len(x1) != len(x2):
-            err_msg = f'Cannot merge two lists with different lengths ({len(x1)} and {len(x2)}, encountered at key {key})'
-            logger.error(err_msg + f'\nx1: {x1}\nx2: {x2}')
-            raise ValueError(err_msg)
-        for i, v2 in enumerate(x2):
-            x1[i] = apply_factory_merges(x1[i], v2, key=key + (i,))
-    elif isinstance(x1, list) and isinstance(x2, dict):
-        for k, v2 in x2.items():
-            if not isinstance(k, int):
-                raise ValueError(
-                    f'Invalid dict key {k} non-integer type encountered in a list-dict merge at level {key}'
-                )
-            if k >= len(x1):
-                raise ValueError(
-                    f'Dict key {k} out of bound for list of length {len(x1)} (encountered at level {key})'
-                )
-            x1[k] = apply_factory_merges(x1[k], v2, key=key + (k,))
-    else:
-        raise ValueError(
-            f'Duplicate non-dict and non-list values encountered: `{x1}` and `{x2} (at key {key})`'
-        )
-    return x1
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/optimizer.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/optimizer.py
deleted file mode 100644
index d1c698787c4678009f09b5496fa4c5ddc17574d8..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/optimizer.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-""" Optimizer related helpers. """
-
-import logging
-from copy import deepcopy
-from dataclasses import replace
-from itertools import chain
-from typing import Dict, Iterable, List, Tuple, Union
-
-logger = logging.getLogger(__name__)
-
-import torch
-
-from .dict_utils import nested_values
-from .mapping import (
-    LocalNonpersitentObject,
-    ShardedStateDict,
-    ShardedTensor,
-    ShardedTensorFactory,
-    StateDict,
-)
-from .utils import extract_sharded_tensors, extract_sharded_tensors_and_factories
-
-
-def get_optim_param_to_id_map(optim_params_iter: Iterable[torch.nn.Parameter]) -> Dict[int, int]:
-    param_mappings = {}
-    for i, param in enumerate(optim_params_iter):
-        if id(param) not in param_mappings:
-            param_mappings[id(param)] = i
-    return param_mappings
-
-
-def get_param_id_to_sharded_param_map(
-    model_sharded_state_dict: ShardedStateDict, optim_params_iter: Iterable[torch.nn.Parameter]
-) -> Dict[int, Union[ShardedTensor, ShardedTensorFactory]]:
-    model_sharded_state_dict, _ = extract_sharded_tensors_and_factories(model_sharded_state_dict)
-    id_to_sharded_param_map = {}
-    param_to_id_map = get_optim_param_to_id_map(optim_params_iter)
-    for ten in nested_values(model_sharded_state_dict):
-        if id(ten.data) in param_to_id_map:
-            id_to_sharded_param_map[param_to_id_map[id(ten.data)]] = ten
-        else:
-            logger.debug(f'{ten} is not tracked by the optimizer')
-
-    if not id_to_sharded_param_map:
-        logger.warning(
-            "Sharded parameters mapping is empty. It means tensors in model state dict"
-            " do not correspond to tensors in optimizer parameters map."
-            " Make sure to call state_dict with `keep_vars=True`."
-        )
-    return id_to_sharded_param_map
-
-
-def make_sharded_optimizer_tensor(
-    model_param: Union[ShardedTensor, ShardedTensorFactory], optim_param: torch.Tensor, prefix: str
-) -> Union[ShardedTensor, ShardedTensorFactory]:
-    if isinstance(model_param, ShardedTensorFactory):
-        return replace(model_param, key=f'{prefix}.{model_param.key}', data=optim_param)
-
-    assert (
-        tuple(optim_param.shape) == model_param.local_shape
-    ), f'Optimizer shape ({tuple(optim_param.shape)} does not match model shape ({model_param.local_shape})'
-    return replace(
-        model_param, key=f'{prefix}.{model_param.key}', data=optim_param, dtype=optim_param.dtype
-    )
-
-
-def optim_state_to_sharding_state(
-    optim_state_dict: StateDict,
-    id_to_sharded_param_map: Dict[int, ShardedTensor],
-    exclude_keys: Tuple[str] = (),
-):
-    sharded_state = {}
-    for param_id, param_state in optim_state_dict['state'].items():
-        sharded_state[param_id] = {}
-        for state_key, param in param_state.items():
-            if state_key in exclude_keys:
-                continue
-            if param_id in id_to_sharded_param_map:
-                sharded_state[param_id][state_key] = make_sharded_optimizer_tensor(
-                    id_to_sharded_param_map[param_id], param, prefix=f'optimizer.state.{state_key}'
-                )
-            else:
-                raise ValueError(f'Param id {param_id} does not match any model sharded param')
-
-    optim_state_dict['param_groups'] = deepcopy(optim_state_dict['param_groups'])
-    for group in optim_state_dict['param_groups']:
-        group['params'] = LocalNonpersitentObject(group['params'])
-    optim_state_dict['state'] = sharded_state
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/serialization.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/serialization.py
deleted file mode 100644
index 85baa16c21eb784369fd87adb8239cd0ac68d05f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/serialization.py
+++ /dev/null
@@ -1,385 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-import logging
-import os
-from collections import Counter, defaultdict
-from itertools import chain
-from pathlib import Path
-from typing import Iterable, List, Tuple, Union
-
-import numpy as np
-import torch
-
-from .core import CheckpointingConfig, maybe_load_config, save_config
-from .dict_utils import (
-    dict_list_map_inplace,
-    diff,
-    extract_matching_values,
-    map_reduce,
-    merge,
-    nested_values,
-)
-from .mapping import (
-    CheckpointingException,
-    ShardedObject,
-    ShardedStateDict,
-    ShardedTensor,
-    ShardedTensorFactory,
-    StateDict,
-    apply_factories,
-    apply_factory_merges,
-    is_main_replica,
-)
-from .strategies.base import (
-    LoadCommonStrategy,
-    LoadShardedStrategy,
-    SaveCommonStrategy,
-    SaveShardedStrategy,
-    StrategyAction,
-    get_default_strategy,
-)
-from .utils import extract_sharded_tensors, extract_sharded_tensors_or_nonpersistent
-
-COMMON_STATE_FNAME = 'common.pt'
-
-logger = logging.getLogger(__name__)
-
-
-def load(
-    sharded_state_dict: ShardedStateDict,
-    checkpoint_dir: str,
-    sharded_strategy: Union[LoadShardedStrategy, None] = None,
-    common_strategy: Union[LoadCommonStrategy, None] = None,
-    validate_access_integrity: bool = True,
-) -> StateDict:
-    """Loading entrypoint.
-
-    Arguments:
-        sharded_state_dict (ShardedStateDict): state dict of the existing model
-            populated with ShardedTensors. Used as a mapping to determine which
-            parts of global tensors stored in the checkpoint should be loaded.
-        checkpoint_dir (str): directory with the checkpoint
-        sharded_strategy (LoadShardedStrategy, optional): configures loading behavior for sharded tensors
-        common_strategy (LoadCommonStrategy, optional): configures loading behavior for common data
-        validate_access_integrity (bool default = True): checks if each tensor shard is accessed
-            exactly once (as main replica) by some process
-    """
-    if common_strategy is not None:
-        raise NotImplementedError('The only supported common strategy is torch')
-
-    checkpoint_dir = Path(checkpoint_dir)
-    common_state_dict = load_common_state_dict(checkpoint_dir)
-    if not sharded_state_dict:
-        return common_state_dict
-
-    sharded_objects, sharded_state_dict = load_sharded_objects(sharded_state_dict, checkpoint_dir)
-    merge(common_state_dict, sharded_objects)
-
-    saved_config = maybe_load_config(checkpoint_dir)
-    if saved_config is None:
-        raise CheckpointingException(f'{checkpoint_dir} is not a distributed checkpoint')
-
-    sh_ten_factories, _ = extract_matching_values(
-        sharded_state_dict,
-        lambda x: isinstance(x, ShardedTensorFactory),
-        return_lists_as_dicts=True,
-    )
-    apply_factories(sharded_state_dict)
-    sharded_state_dict, _ = extract_sharded_tensors_or_nonpersistent(sharded_state_dict)
-    sharded_state_dict, nonpersistent_state_dict = extract_sharded_tensors(sharded_state_dict)
-    dict_list_map_inplace(lambda o: o.unwrap(), nonpersistent_state_dict)
-    merge(common_state_dict, nonpersistent_state_dict)
-
-    if validate_access_integrity:
-        validate_sharding_integrity(nested_values(sharded_state_dict))
-
-    if sharded_strategy is None:
-        sharded_strategy = get_default_strategy(
-            StrategyAction.LOAD_SHARDED,
-            saved_config.sharded_backend,
-            saved_config.sharded_backend_version,
-        )
-    else:
-        # TODO: implement consistency checks here
-        pass
-    loaded_state_dict = sharded_strategy.load(sharded_state_dict, checkpoint_dir)
-
-    loaded_state_dict = apply_factory_merges(loaded_state_dict, sh_ten_factories)
-
-    merge(common_state_dict, loaded_state_dict)
-    return common_state_dict
-
-
-# TODO: implement it as common torch strategy
-def load_common_state_dict(checkpoint_dir: Path):
-    return torch.load(Path(checkpoint_dir) / COMMON_STATE_FNAME, map_location='cpu')
-
-
-def load_sharded_objects(sharded_state_dict: ShardedStateDict, checkpoint_dir: Path):
-    sharded_objects, sharded_state_dict = extract_matching_values(
-        sharded_state_dict, lambda v: isinstance(v, ShardedObject)
-    )
-
-    def load_sharded_object(sh_obj: ShardedObject):
-        sh_obj.data = None
-        load_path = (checkpoint_dir / sh_obj.unique_key).with_suffix('.pt')
-        loaded_obj = torch.load(load_path)
-        return loaded_obj
-
-    return dict_list_map_inplace(load_sharded_object, sharded_objects), sharded_state_dict
-
-
-def load_tensors_metadata(
-    checkpoint_dir: str, sharded_strategy: Union[LoadShardedStrategy, None] = None
-) -> ShardedStateDict:
-    """Load tensors metadata from the checkpoint.
-
-    Returns a dictionary similar to a sharded state dict, but note that
-    the dictionary keys are simply ShardedTensor keys (contrary to the
-    actual sharded state dicts where keys correspond to state dict keys).
-
-    Dict values are ShardedTensors without any sharding (so, the only useful
-    information is tensors global shape and dtype).
-
-    Concrete implementation depends on the loading strategy. If no strategy is
-    given, a default for a given backend is used.
-    """
-    saved_config = maybe_load_config(checkpoint_dir)
-    if saved_config is None:
-        raise CheckpointingException(f'{checkpoint_dir} is not a distributed checkpoint')
-
-    if sharded_strategy is None:
-        sharded_strategy = get_default_strategy(
-            StrategyAction.LOAD_SHARDED,
-            saved_config.sharded_backend,
-            saved_config.sharded_backend_version,
-        )
-    else:
-        # TODO: implement consistency checks here
-        pass
-    return sharded_strategy.load_tensors_metadata(Path(checkpoint_dir))
-
-
-def load_plain_tensors(checkpoint_dir: str):
-    """Load checkpoint tensors without any sharding.
-
-    NOTE: common state dict is NOT included."""
-    sharded_state_dict = load_tensors_metadata(checkpoint_dir)
-    # Don't validate integrity because shards will be overlapped
-    # if world_size > 1 (all processes load whole tensors)
-    return load(sharded_state_dict, checkpoint_dir, validate_access_integrity=False)
-
-
-def save(
-    sharded_state_dict: ShardedStateDict,
-    checkpoint_dir: str,
-    sharded_strategy: Union[SaveShardedStrategy, None] = None,
-    common_strategy: Union[SaveCommonStrategy, None] = None,
-    validate_access_integrity: bool = True,
-):
-    """Saving entrypoint.
-
-    Extracts ShardedTensors from the given state dict. Rank 0 saves the
-    "regular" part of the checkpoint to common torch file.
-    The ShardedTensors are saved according to a strategy specified by the
-    config.
-
-    Arguments:
-        sharded_state_dict (ShardedStateDict): state dict of the populated with
-            ShardedTensors. Used as a mapping to determine how local tensors
-            should be saved as global tensors in the checkpoint.
-        checkpoint_dir (str): directory to save the checkpoint to
-        sharded_strategy (SaveShardedStrategy, optional): configures sharded tensors saving behavior and backend
-        common_strategy (SaveCommonStrategy, optional): configures common data saving behavior and backend
-        validate_access_integrity (bool default = True): checks if each tensor shard is accessed
-            exactly once (as main replica) by some process
-    """
-    checkpoint_dir = Path(checkpoint_dir)
-
-    if torch.distributed.get_rank() == 0:
-        if not checkpoint_dir.exists():
-            raise CheckpointingException(
-                f'Checkpoint destination directory does not exist: {checkpoint_dir}'
-            )
-
-        if next(checkpoint_dir.iterdir(), None) is not None:
-            raise CheckpointingException(
-                f'Checkpoint destination directory ({checkpoint_dir}) is not empty'
-            )
-
-    if common_strategy is not None:
-        raise NotImplementedError('The only supported common strategy is torch')
-
-    if sharded_strategy is None:
-        sharded_strategy = get_default_strategy(StrategyAction.SAVE_SHARDED, 'zarr', 1)
-
-    apply_factories(sharded_state_dict)
-    sharded_state_dict, state_dict = extract_sharded_tensors_or_nonpersistent(sharded_state_dict)
-    sharded_state_dict, _ = extract_sharded_tensors(sharded_state_dict)
-    sharded_tensors = list(nested_values(sharded_state_dict))
-    if validate_access_integrity:
-        validate_sharding_integrity(sharded_tensors)
-
-    _save_common_dict(state_dict, checkpoint_dir, True)
-
-    sharded_strategy.save(sharded_tensors, checkpoint_dir)
-    save_config(
-        CheckpointingConfig(sharded_strategy.backend, sharded_strategy.version), checkpoint_dir
-    )
-
-
-# TODO: implement it as common torch strategy
-def _save_common_dict(
-    state_dict: StateDict, checkpoint_dir: Path, validate_consistency: bool = False
-):
-    common_state_dict = _extract_and_save_sharded_objects(
-        state_dict, checkpoint_dir, validate_consistency
-    )
-    if torch.distributed.get_rank() == 0:
-        torch.save(common_state_dict, checkpoint_dir / COMMON_STATE_FNAME)
-    if validate_consistency:
-        # TODO: implement checking consistency with rank 0 common dict on other ranks
-        pass
-        # torch.distributed.barrier()
-        # if not torch.distributed.get_rank() == 0:
-        #     rank_0_state_dict = torch.load(checkpoint_dir / COMMON_STATE_FNAME)
-        #     print(diff(common_state_dict, rank_0_state_dict))
-
-
-def _extract_and_save_sharded_objects(
-    state_dict: StateDict, checkpoint_dir: Path, validate_consistency: bool = False
-):
-    sharded_objects, state_dict = extract_matching_values(
-        state_dict, lambda v: isinstance(v, ShardedObject)
-    )
-    sharded_objects = list(nested_values(sharded_objects))
-    if validate_consistency:
-        validate_objects_sharding_integrity(sharded_objects)
-    for sh_obj in sharded_objects:
-        if is_main_replica(sh_obj.replica_id):
-            save_path = (checkpoint_dir / sh_obj.unique_key).with_suffix('.pt')
-            os.makedirs(save_path.parent, exist_ok=True)
-            torch.save(sh_obj.data, save_path)
-    return state_dict
-
-
-def validate_sharding_integrity(sharded_tensors: Iterable[ShardedTensor]):
-    sharding = [ten.without_data() for ten in sharded_tensors]
-    all_sharding = [None] * torch.distributed.get_world_size()
-    torch.distributed.all_gather_object(all_sharding, sharding)
-    if torch.distributed.get_rank() != 0:
-        return
-
-    key_shardings = defaultdict(list)
-    for rank, rank_shardings in enumerate(all_sharding):
-        for sharding in rank_shardings:
-            key_shardings[sharding.key].append((rank, sharding))
-    for key, shardings in key_shardings.items():
-        _validate_sharding_for_key(shardings)
-
-
-def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]):
-    some_rank_shard = rank_sharding[0][1]
-    global_shape = some_rank_shard.global_shape
-    local_shape = some_rank_shard.local_shape
-    dtype = some_rank_shard.dtype
-    has_flattened_range = some_rank_shard.flattened_range is not None
-    for rank, sharding in rank_sharding:
-        assert sharding.dtype == dtype, (sharding.dtype, dtype, some_rank_shard)
-        assert sharding.global_shape == global_shape, (
-            sharding.global_shape,
-            global_shape,
-            some_rank_shard,
-        )
-        assert sharding.local_shape == local_shape, (
-            sharding.local_shape,
-            local_shape,
-            some_rank_shard,
-        )
-        assert (sharding.flattened_range is not None) == has_flattened_range, (
-            (sharding.flattened_range is not None),
-            has_flattened_range,
-            some_rank_shard,
-        )
-
-    shard_access_cnt = _compute_shards_access(rank_sharding)
-    if has_flattened_range:
-        map_reduce(
-            rank_sharding,
-            lambda x: x[1].global_offset,
-            lambda x: x[1],
-            _validate_sharding_for_key_flattened,
-        )
-    else:
-        if not torch.all(shard_access_cnt == 1):
-            logger.error(f'Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}')
-            raise CheckpointingException(f'Invalid access pattern for {rank_sharding[0][1]}')
-
-
-def _compute_shards_access(rank_sharding):
-    def chunk_offset(sharding):
-        assert len(sharding.global_offset) == len(sharding.local_shape) + sharding.prepend_axis_num
-        return tuple(
-            chain(
-                (off for off in sharding.global_offset[: sharding.prepend_axis_num]),
-                (
-                    off // sh
-                    for off, sh in zip(
-                        sharding.global_offset[sharding.prepend_axis_num :], sharding.local_shape
-                    )
-                ),
-            )
-        )
-
-    shard_access_cnt = torch.zeros(
-        rank_sharding[0][1].axis_fragmentations, dtype=torch.int, device='cpu'
-    )
-    for rank, sharding in rank_sharding:
-        if is_main_replica(sharding.replica_id):
-            shard_access_cnt[chunk_offset(sharding)] += 1
-        # TODO: consider validating different replicas too
-    return shard_access_cnt
-
-
-def _validate_sharding_for_key_flattened(tensors_by_shard):
-    all_slices = []
-    local_shape = tensors_by_shard[0].local_shape
-    for sharding in tensors_by_shard:
-        assert sharding.local_shape == local_shape
-        sharding: ShardedTensor
-        if not is_main_replica(sharding.replica_id):
-            # TODO: this checks only saving (and loading replica_id=0) consistency
-            continue
-
-        all_slices.append((sharding.flattened_range.start, sharding.flattened_range.stop))
-
-    starts, stops = map(np.asarray, zip(*sorted(all_slices)))
-    if (
-        starts[0] != 0
-        or stops[-1] != np.product(local_shape)
-        or not np.all(starts[1:] == stops[:-1])
-    ):
-        logger.error(
-            f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]}. Ranges: {(starts, stops)}'
-        )
-        raise CheckpointingException(
-            f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]}'
-        )
-
-
-def validate_objects_sharding_integrity(sharded_objects: List[ShardedObject]):
-    """ Ensure uniqueness of saved objects. """
-    local_sh_objs = [sh_obj.without_data() for sh_obj in sharded_objects]
-    all_sh_objs = [None] * torch.distributed.get_world_size()
-    torch.distributed.all_gather_object(all_sh_objs, local_sh_objs)
-    if torch.distributed.get_rank() != 0:
-        return
-    unique_keys = [
-        sh_obj.unique_key
-        for sh_obj in chain.from_iterable(all_sh_objs)
-        if is_main_replica(sh_obj.replica_id)
-    ]
-    if len(unique_keys) != len(set(unique_keys)):
-        duplicates = {k: cnt for k, cnt in Counter(unique_keys).items() if cnt > 1}
-        logger.error(f'Duplicate ShardedObject keys and counts: {duplicates}')
-        raise CheckpointingException(f'Duplicate ShardedObject keys: {list(duplicates.keys())}')
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/__init__.py
deleted file mode 100644
index 7177d973cfccdbf52dbc1fd8efc153f99e389eec..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-""" Various loading and saving strategies """
-
-import logging
-
-logger = logging.getLogger(__name__)
-
-try:
-    import tensorstore
-    import zarr
-
-    from .tensorstore import _import_trigger
-    from .zarr import _import_trigger
-except ImportError:
-    logger.warning('Zarr-based strategies will not be registered because of missing packages')
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/base.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/base.py
deleted file mode 100644
index 3989ea74a204349fd8623736c5a2ecaf05b2c06b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/base.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-from abc import ABC, abstractmethod
-from collections import defaultdict
-from enum import Enum
-from pathlib import Path
-from typing import Dict, List, Optional
-
-from ..mapping import CheckpointingException, ShardedStateDict, ShardedTensor, StateDict
-
-
-class StrategyAction(Enum):
-    LOAD_COMMON = 'load_common'
-    LOAD_SHARDED = 'load_sharded'
-    SAVE_COMMON = 'save_common'
-    SAVE_SHARDED = 'save_sharded'
-
-
-default_strategies = defaultdict(dict)
-
-
-def get_default_strategy(action: StrategyAction, backend: str, version: int):
-    try:
-        return default_strategies[action.value][(backend, version)]
-    except KeyError as e:
-        hint = ''
-        if backend == 'zarr':
-            try:
-                import tensorstore
-                import zarr
-            except ImportError:
-                hint = ' Please install `zarr` and `tensorstore<=0.1.45` packages'
-        raise CheckpointingException(
-            f'Cannot find a default strategy for: {(action.value, backend, version)}.{hint}'
-        ) from e
-
-
-class LoadStrategyBase(ABC):
-    @abstractmethod
-    def check_backend_compatibility(self, loaded_version):
-        raise NotImplementedError
-
-    @abstractmethod
-    def check_version_compatibility(self, loaded_version):
-        raise NotImplementedError
-
-
-class SaveStrategyBase(ABC):
-    def __init__(self, backend: str, version: int):
-        self.backend = backend
-        self.version = version
-
-
-class LoadCommonStrategy(LoadStrategyBase):
-    @abstractmethod
-    def load(self, checkpoint_dir: Path):
-        raise NotImplementedError
-
-
-class LoadShardedStrategy(LoadStrategyBase):
-    @abstractmethod
-    def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path):
-        raise NotImplementedError
-
-    @abstractmethod
-    def load_tensors_metadata(self, checkpoint_dir: Path):
-        """Load tensors metadata from the checkpoint.
-
-        Returns a dictionary similar to a sharded state dict, but note that
-        the dictionary keys are simply ShardedTensor keys (contrary to the
-        actual sharded state dicts where keys correspond to state dict keys).
-
-        Dict values are ShardedTensors without any sharding (so, the only useful
-        information is tensors global shape and dtype).
-        """
-        raise NotImplementedError(
-            f'{self.__class__.__name__} doesnt allow loading only sharded metadata'
-        )
-
-
-class SaveCommonStrategy(SaveStrategyBase):
-    @abstractmethod
-    def save(self, common_state_dict: StateDict, checkpoint_dir: Path):
-        raise NotImplementedError
-
-
-class SaveShardedStrategy(SaveStrategyBase):
-    @abstractmethod
-    def save(self, sharded_tensors: List[ShardedTensor], checkpoint_dir: Path):
-        raise NotImplementedError
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/tensorstore.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/tensorstore.py
deleted file mode 100644
index 4a619353a19075f41b68a6cdae519313a7e305b8..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/tensorstore.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-""" Strategies using TensorStore to load and save Zarr arrays. """
-
-from functools import partial
-from itertools import starmap
-from logging import getLogger
-from pathlib import Path
-
-import tensorstore as ts
-import torch
-
-from ..core import CheckpointingException
-from ..dict_utils import dict_list_map_inplace
-from ..mapping import ShardedStateDict, ShardedTensor
-from .base import LoadShardedStrategy, StrategyAction, default_strategies
-from .zarr import (
-    load_zarr_based_sharded_metadata,
-    numpy_to_torch_dtype_dict,
-    postprocess_numpy_array,
-)
-
-_import_trigger = None
-
-logger = getLogger(__name__)
-
-
-class TensorStoreLoadShardedStrategy(LoadShardedStrategy):
-    def __init__(self, load_directly_on_device: bool = False):
-        super().__init__()
-        self.load_directly_on_device = load_directly_on_device
-
-    def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path):
-        if torch.distributed.get_rank() == 0:
-            print(f'Loading distributed checkpoint with {self.__class__.__name__}')
-            if self.load_directly_on_device:
-                print(f'Loading distributed checkpoint directly on the GPU')
-        load_fn = partial(
-            _load_from_array,
-            checkpoint_dir=checkpoint_dir,
-            load_directly_on_device=self.load_directly_on_device,
-        )
-        dict_list_map_inplace(load_fn, sharded_state_dict)
-        return sharded_state_dict
-
-    def load_tensors_metadata(self, checkpoint_dir: Path):
-        def get_ts_shape_dtype(path):
-            arr = open_ts_array(path)
-            return arr.shape, arr.dtype.numpy_dtype
-
-        return load_zarr_based_sharded_metadata(checkpoint_dir, get_ts_shape_dtype)
-
-    def check_backend_compatibility(self, loaded_version):
-        pass  # TODO
-
-    def check_version_compatibility(self, loaded_version):
-        pass  # TODO
-
-
-def merge_global_slice_with_shape(global_slice, actual_shape, key):
-    def _merge_slice(dim_slice, dim_size):
-        if isinstance(dim_slice, slice):
-            assert (
-                dim_slice.start < dim_size
-            ), f'Got empty slice for ShardedTensor {key} ({dim_slice}, {dim_size})'
-            if dim_slice.stop > dim_size:
-                dim_slice = slice(dim_slice.start, dim_size, dim_slice.step)
-        return dim_slice
-
-    assert len(global_slice) == len(actual_shape), (global_slice, actual_shape, key)
-    return tuple(starmap(_merge_slice, zip(global_slice, actual_shape)))
-
-
-def _load_from_array(
-    sharded_tensor: ShardedTensor,
-    checkpoint_dir: Path,
-    load_directly_on_device: bool = False,
-    apply_flattened_range: bool = True,
-):
-    x = _load_regular_chunk(sharded_tensor, checkpoint_dir)
-    ten = postprocess_numpy_array(x, sharded_tensor, apply_flattened_range)
-    if load_directly_on_device:
-        sharded_tensor.data.data.copy_(ten)
-        return sharded_tensor.data
-    else:
-        return ten
-
-
-def _load_regular_chunk(sharded_tensor: ShardedTensor, checkpoint_dir: Path):
-    assert isinstance(sharded_tensor, ShardedTensor), type(sharded_tensor)
-    arr = open_ts_array(checkpoint_dir / sharded_tensor.key)
-    if sharded_tensor.global_shape == arr.shape:
-        x = (
-            arr[sharded_tensor.global_slice()].read().result()
-        )  # flattened tensors loading is delayed
-    elif sharded_tensor.allow_shape_mismatch:
-        global_slice = merge_global_slice_with_shape(
-            sharded_tensor.global_slice(), arr.shape, sharded_tensor.key
-        )
-        x = arr[global_slice].read().result()  # flattened tensors loading is delayed
-    else:
-        _msg = (
-            f'Global shape mismatch for loaded ({arr.shape})'
-            f' and expected ({sharded_tensor.global_shape}) tensor'
-            f' for key {sharded_tensor.key}'
-        )
-        raise CheckpointingException(_msg)
-    return x
-
-
-def open_ts_array(arr_path: Path):
-    """Opens a Zarr file array with Tensorstore with basic setting.
-
-    Arguments:
-        arr_path (Path): path to a Zarr (Tensorstore) array
-    """
-    spec = {'driver': 'zarr', 'metadata_key': '.zarray', 'kvstore': {}}
-    spec['kvstore'] = {
-        'driver': 'file',
-        'path': str(arr_path),
-    }
-    try:
-        arr = ts.open(ts.Spec(spec), open=True).result()
-    except Exception as e:
-        raise CheckpointingException(f'Array {arr_path} could not be loaded. Error: {e}') from e
-    return arr
-
-
-default_strategies[StrategyAction.LOAD_SHARDED.value][
-    ('zarr', 1)
-] = TensorStoreLoadShardedStrategy()
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/two_stage.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/two_stage.py
deleted file mode 100644
index a9844ff6e54a629fbe689771f172239a41687f5e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/two_stage.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-""" 2-stage checkpoint loading. """
-import os
-import time
-from collections import defaultdict
-from dataclasses import dataclass
-from functools import partial, wraps
-from itertools import chain
-from logging import DEBUG, INFO, StreamHandler, getLogger
-from operator import attrgetter, itemgetter
-from pathlib import Path
-from typing import Iterable, List, NamedTuple, Optional, Tuple, Union
-
-import torch
-
-from ..dict_utils import dict_list_map_inplace, map_reduce, nested_values
-from ..mapping import ShardedStateDict, ShardedTensor, StateDict
-from .base import LoadShardedStrategy
-from .tensorstore import TensorStoreLoadShardedStrategy, _load_from_array, open_ts_array
-from .zarr import flatten_range, load_zarr_based_sharded_metadata
-
-_import_trigger = None
-
-
-timers = defaultdict(list)
-
-logger = getLogger(__name__)
-
-
-def timed(verbose=True):
-    def timed_dec(fn):
-        name = fn.__name__
-
-        @wraps(fn)
-        def wrapped(*args, **kwargs):
-            if verbose:
-                logger.debug(f'{name} init')
-            start = time.time()
-            ret = fn(*args, **kwargs)
-            took = time.time() - start
-            if verbose:
-                logger.debug(f'{name} took {took}s')
-            timers[name].append(took)
-            return ret
-
-        return wrapped
-
-    return timed_dec
-
-
-@dataclass
-class _ShardedTensorMetadata:
-    global_rank: int
-    sharded_tensor_no_data: ShardedTensor
-    dist_group_rank: Tuple[int]  # id of distributed group
-    dist_group_ranks: Tuple[int]  # id of distributed group
-    data_size: Optional[int] = None  # bytes
-
-
-def sharded_tensor_chunk_id(sharded_tensor: ShardedTensor):
-    return (
-        sharded_tensor.key,
-        sharded_tensor.global_offset,
-    )
-
-
-class TwoStageDataParallelLoadShardedStrategy(LoadShardedStrategy):
-    """ Loads one checkpoint replica from storage and broadcasts to other nodes.
-
-    This strategy loads checkpoint from storage on minimal set of nodes
-    and distributes the checkpoint to other nodes with torch.distributed.
-    Loading is performed with tensorstore.
-
-    Steps:
-    0. (optional) create Gloo distributed groups
-    1. Exchange ShardedTensors metadata between all nodes
-    2. Align needed tensors within DP groups
-    3. For each globally unique tensor:
-      a) on one of the ranks load it from storage to CPU and move to CUDA
-      b) allocate CUDA tensor on other ranks
-      c) broadcast within DP group
-      d) copy tensor content to the model param location
-      e) free tensor buffers from a) and b)
-
-    Notes:
-    1. Loading and broadcasting is done sequentially to avoid both host and device OOMs
-    2. There is a lot of overlap potential between all three steps done for each tensor:
-      a) loading from storage to numpy
-      b) moving CPU tensors to CUDA
-      c) broadcast
-
-    """
-
-    def __init__(self, data_parallel_group, cpu_transfer=True):
-        super().__init__()
-
-        self.cpu_transfer = cpu_transfer
-        self.data_parallel_group_orig = data_parallel_group
-        self.data_parallel_group = None if cpu_transfer else data_parallel_group
-        self.dp_group_ranks = tuple(
-            sorted(torch.distributed.get_process_group_ranks(data_parallel_group))
-        )
-        self.dp_group_rank = torch.distributed.get_rank(self.data_parallel_group_orig)
-        self.global_rank = torch.distributed.get_rank()
-
-    def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path):
-        self.maybe_init_gloo_group()
-        all_tensors_sorted = self._build_load_plan(sharded_state_dict)
-        self._exchange_loaded_tensors(all_tensors_sorted, sharded_state_dict, checkpoint_dir)
-        self.summarize_load_times()
-        return sharded_state_dict
-
-    def summarize_load_times(self):
-        torch.distributed.barrier()
-        logger.info('Checkpoint loading finished. Summary:')
-        for key, times in sorted(timers.items()):
-            times_sum = sum(times)
-            max_times = torch.tensor([times_sum], device='cuda')
-            avg_times = torch.tensor([times_sum], device='cuda')
-            torch.distributed.all_reduce(max_times, op=torch.distributed.ReduceOp.MAX)
-            torch.distributed.all_reduce(avg_times, op=torch.distributed.ReduceOp.SUM)
-            avg_times /= torch.distributed.get_world_size()
-            if torch.distributed.get_rank() == 0:
-                logger.info(f'{key}: max {max_times[0]}, avg {avg_times[0]}')
-
-    @timed(verbose=False)
-    def load_tensor_from_storage(self, checkpoint_dir, ten_meta: _ShardedTensorMetadata):
-        logger.debug(f'_load_from_array({ten_meta.sharded_tensor_no_data.key}) init')
-        ret = _load_from_array(
-            ten_meta.sharded_tensor_no_data,
-            checkpoint_dir,
-            load_directly_on_device=False,
-            apply_flattened_range=False,
-        )
-        logger.debug(f'_load_from_array({ten_meta.sharded_tensor_no_data.key}) DONE')
-        return ret
-
-    @timed()
-    def maybe_init_gloo_group(self):
-        if not self.cpu_transfer:
-            return
-        all_groups = [None] * torch.distributed.get_world_size()
-        torch.distributed.all_gather_object(all_groups, self.dp_group_ranks)
-        all_groups = set(tuple(sorted(gr)) for gr in all_groups)
-        for group_ranks in sorted(all_groups):
-            gloo_pg = torch.distributed.new_group(ranks=group_ranks, backend='gloo')
-            if self.global_rank in group_ranks:
-                self.data_parallel_group = gloo_pg
-                assert self.dp_group_rank == torch.distributed.get_rank(self.data_parallel_group)
-
-    def check_backend_compatibility(self, loaded_version):
-        pass  # TODO
-
-    def check_version_compatibility(self, loaded_version):
-        pass  # TODO
-
-    @timed()
-    def _build_load_plan(
-        self, sharded_state_dict: ShardedStateDict
-    ) -> List[_ShardedTensorMetadata]:
-        local_meta = [
-            _ShardedTensorMetadata(
-                self.global_rank,
-                sharded_ten.without_data(),
-                self.dp_group_rank,
-                self.dp_group_ranks,
-            )
-            for sharded_ten in nested_values(sharded_state_dict)
-        ]
-        all_meta = [None] * torch.distributed.get_world_size(group=self.data_parallel_group)
-        torch.distributed.all_gather_object(all_meta, local_meta, group=self.data_parallel_group)
-        all_meta = list(chain.from_iterable(all_meta))
-        all_tensors_sorted = self.deduplicate_chunks(all_meta)
-        return all_tensors_sorted
-
-    @timed()
-    def deduplicate_chunks(self, ten_metas: List[_ShardedTensorMetadata]):
-        """ Group tensors by chunk and then pick the tensor with the lowest rank.
-
-        NOTE: with proper loading overlap, loading from randomized ranks
-         (instead of the smallest one) could be beneficial here.
-        """
-        ten_metas = map_reduce(
-            ten_metas,
-            key_fn=lambda meta: sharded_tensor_chunk_id(meta.sharded_tensor_no_data),
-            reduce_fn=partial(min, key=attrgetter('dist_group_rank')),
-        )
-        all_metas_sorted = list(map(itemgetter(1), sorted(ten_metas.items())))
-        return all_metas_sorted
-
-    @timed()
-    def _exchange_loaded_tensors(
-        self, ten_metas: List[_ShardedTensorMetadata], sharded_state_dict, checkpoint_dir
-    ):
-        logger.debug(f'_exchange_loaded_tensors, num ten_metas: {len(ten_metas)}')
-        for ten_meta in ten_metas:
-
-            src_rank = torch.distributed.get_global_rank(
-                self.data_parallel_group, ten_meta.dist_group_rank
-            )
-
-            if self.dp_group_rank == ten_meta.dist_group_rank:
-                exchange_tensor = self.load_tensor_from_storage(checkpoint_dir, ten_meta)
-                if not self.cpu_transfer:
-                    exchange_tensor = exchange_tensor.cuda()
-            else:
-                # TODO: for non-flattened ranges we could reuse the buffer from the start here
-                exchange_tensor = torch.empty(
-                    ten_meta.sharded_tensor_no_data.local_shape,
-                    device='cpu' if self.cpu_transfer else 'cuda',
-                    dtype=ten_meta.sharded_tensor_no_data.dtype,
-                )
-
-            logger.debug(
-                f'exchange {ten_meta.sharded_tensor_no_data.key}, {exchange_tensor.shape}({exchange_tensor.numel()}), broadcast({src_rank} -> {self.dp_group_ranks})'
-            )
-            torch.distributed.broadcast(
-                exchange_tensor, group=self.data_parallel_group, src=src_rank
-            )
-            self._distribute_data_to_state_dict(ten_meta, exchange_tensor, sharded_state_dict)
-            logger.debug(f'exchange {ten_meta.sharded_tensor_no_data.key} done')
-
-            # free buffer memory
-            exchange_tensor = None
-
-    @timed(verbose=False)
-    def _distribute_data_to_state_dict(
-        self,
-        ten_meta: _ShardedTensorMetadata,
-        loaded_ten: torch.Tensor,
-        sharded_state_dict: ShardedStateDict,
-    ):
-        tensor_key = sharded_tensor_chunk_id(ten_meta.sharded_tensor_no_data)
-
-        def _fill_in_data(t: Union[ShardedTensor, torch.Tensor]):
-            if not isinstance(t, ShardedTensor) or sharded_tensor_chunk_id(t) != tensor_key:
-                # already filled-in or key not matching
-                return t
-            sharded_tensor: ShardedTensor = t
-            x = loaded_ten
-            if sharded_tensor.flattened_range is not None:
-                x = flatten_range(sharded_tensor, x)
-
-            # Reuse existing buffer
-            sharded_tensor.data.data.copy_(x)
-            return sharded_tensor.data
-
-        dict_list_map_inplace(_fill_in_data, sharded_state_dict)
-
-    def load_tensors_metadata(self, checkpoint_dir: Path):
-        def get_ts_shape_dtype(path):
-            arr = open_ts_array(path)
-            return arr.shape, arr.dtype.numpy_dtype
-
-        return load_zarr_based_sharded_metadata(checkpoint_dir, get_ts_shape_dtype)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/zarr.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/zarr.py
deleted file mode 100644
index 0ce0cf0e27c8ab2441c6432840bb1c8f368632c3..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/strategies/zarr.py
+++ /dev/null
@@ -1,285 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-""" Strategies using Zarr as an underlying format. """
-import os
-from functools import partial
-from logging import getLogger
-from pathlib import Path
-from typing import Callable, List, Optional, Tuple
-
-import numpy as np
-import torch
-import zarr
-
-from ..core import CheckpointingException
-from ..dict_utils import dict_list_map_inplace
-from ..mapping import ShardedStateDict, ShardedTensor, is_main_replica
-from .base import LoadShardedStrategy, SaveShardedStrategy, StrategyAction, default_strategies
-
-numpy_to_torch_dtype_dict = {
-    np.dtype('bool'): torch.bool,
-    np.dtype('uint8'): torch.uint8,
-    np.dtype('int8'): torch.int8,
-    np.dtype('int16'): torch.int16,
-    np.dtype('int32'): torch.int32,
-    np.dtype('int64'): torch.int64,
-    np.dtype('float16'): torch.float16,
-    np.dtype('float32'): torch.float32,
-    np.dtype('float64'): torch.float64,
-    np.dtype('complex64'): torch.complex64,
-    np.dtype('complex128'): torch.complex128,
-}
-
-torch_to_numpy_dtype_dict = {v: k for k, v in numpy_to_torch_dtype_dict.items()}
-
-
-try:
-    import tensorstore
-
-    HAS_BFLOAT16 = True
-    numpy_to_torch_dtype_dict[np.dtype('bfloat16')] = torch.bfloat16
-    torch_to_numpy_dtype_dict[torch.bfloat16] = np.dtype('bfloat16')
-except ImportError:
-    HAS_BFLOAT16 = False
-
-_import_trigger = None
-
-logger = getLogger(__name__)
-
-
-class ZarrSaveShardedStrategy(SaveShardedStrategy):
-    def save(self, sharded_tensors: List[ShardedTensor], checkpoint_dir: Path):
-        arrays = _create_or_open_zarr_arrays(sharded_tensors, checkpoint_dir)
-        for ten, arr in zip(sharded_tensors, arrays):
-            _save_to_existing_array(ten, arr)
-        torch.distributed.barrier()
-
-
-def _create_or_open_zarr_arrays(
-    sharded_tensors: List[ShardedTensor], checkpoint_dir: Path
-) -> List[Optional[zarr.Array]]:
-    """ Returns list of zarr arrays corresponding to given tensors.
-
-    For a sharded tensors that:
-    a) is main replica and represents the first chunk (all offsets 0), creates the Zarr array
-    b) is main replica but not the first chunk, opens the arrays created in (a) (possibly by other process)
-    c) otherwise, sets the corresponding array to None since it won't be used
-
-    Args:
-        sharded_tensors (List[ShardedTensor]): sharded tensors from a given rank that will be saved to checkpoint
-        checkpoint_dir (Path): checkpoint in which the arrays will be created
-    """
-    arrays = []
-    for ten in sharded_tensors:
-        arr = _create_zarr_array(ten, checkpoint_dir) if _should_create_array(ten) else None
-        arrays.append(arr)
-
-    torch.distributed.barrier()
-    # Open arrays created above by other processes
-    for arr_idx, ten in enumerate(sharded_tensors):
-        if arrays[arr_idx] is not None:
-            # array created by this process
-            assert _should_create_array(ten), ten
-            continue
-        if not is_main_replica(ten.replica_id):
-            # this array won't be needed for saving and can stay None
-            continue
-        open_kwargs = {}
-        if ten.flattened_range is not None:
-            open_kwargs['synchronizer'] = zarr.ProcessSynchronizer(
-                str(checkpoint_dir / f'{ten.key}.sync')
-            )
-        arrays[arr_idx] = zarr.open(checkpoint_dir / ten.key, 'r+', **open_kwargs)
-    return arrays
-
-
-def _should_create_array(ten: ShardedTensor):
-    return (
-        is_main_replica(ten.replica_id)
-        and set(ten.global_offset) == {0}
-        and (ten.flattened_range is None or ten.flattened_range.start == 0)
-    )
-
-
-def _save_to_existing_array(sharded_tensor: ShardedTensor, arr: Optional[zarr.Array]):
-    if not is_main_replica(sharded_tensor.replica_id):
-        return
-    assert arr is not None
-    x = sharded_tensor.data
-    x = x.detach().cpu()
-    torch.cuda.synchronize()
-    if x.dtype == torch.bfloat16:
-        x = x.float()
-        x = x.numpy()
-        x = x.astype('bfloat16')
-    else:
-        x = x.numpy()
-
-    if sharded_tensor.flattened_range is None:
-        arr[sharded_tensor.global_slice()] = x
-    else:
-        arr.set_coordinate_selection(sharded_tensor.global_coordinates(), x)
-
-
-def _create_zarr_array(sharded_tensor: ShardedTensor, checkpoint_dir: Path):
-    np_dtype = torch_to_numpy_dtype_dict[sharded_tensor.dtype]
-    try:
-        arr = zarr.create(
-            sharded_tensor.global_shape,
-            dtype=np_dtype,
-            store=checkpoint_dir / sharded_tensor.key,
-            chunks=sharded_tensor.max_allowed_chunks(),
-            compressor=None,
-            fill_value=None,
-            write_empty_chunks=True,
-        )
-    except zarr.errors.ContainsArrayError as e:
-        raise CheckpointingException(
-            f'Array {checkpoint_dir / sharded_tensor.key} already exists'
-        ) from e
-
-    if HAS_BFLOAT16 and np_dtype == np.dtype('bfloat16'):
-        arr._dtype = np_dtype
-        zarray = arr.store['.zarray']
-        arr.store['.zarray'] = zarray.replace(b'<V2', b'bfloat16')
-    return arr
-
-
-class ZarrLoadShardedStrategy(LoadShardedStrategy):
-    def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path):
-        dict_list_map_inplace(
-            partial(_load_from_array, checkpoint_dir=checkpoint_dir), sharded_state_dict
-        )
-        return sharded_state_dict
-
-    def load_tensors_metadata(self, checkpoint_dir: Path):
-        def get_zarr_shape_dtype(path):
-            arr = zarr.open(path, 'r')
-            return arr.shape, arr.dtype
-
-        return load_zarr_based_sharded_metadata(checkpoint_dir, get_zarr_shape_dtype)
-
-    def check_backend_compatibility(self, loaded_version):
-        pass  # TODO
-
-    def check_version_compatibility(self, loaded_version):
-        pass  # TODO
-
-
-def _load_from_array(sharded_tensor: ShardedTensor, checkpoint_dir: Path):
-    assert isinstance(sharded_tensor, ShardedTensor), type(sharded_tensor)
-    try:
-        arr = zarr.open(checkpoint_dir / sharded_tensor.key, 'r')
-    except zarr.errors.PathNotFoundError as e:
-        raise CheckpointingException(
-            f'Array {checkpoint_dir / sharded_tensor.key} not found'
-        ) from e
-
-    if not sharded_tensor.allow_shape_mismatch and sharded_tensor.global_shape != arr.shape:
-        _msg = (
-            f'Global shape mismatch for loaded ({arr.shape})'
-            f' and expected ({sharded_tensor.global_shape}) tensor'
-            f' for key {sharded_tensor.key}'
-        )
-        raise CheckpointingException(_msg)
-
-    x = arr[sharded_tensor.global_slice()]  # flattened tensors loading is delayed
-    return postprocess_numpy_array(x, sharded_tensor)
-
-
-def postprocess_numpy_array(loaded_array, sharded_tensor, apply_flattened_range=True):
-    x = loaded_array
-    if HAS_BFLOAT16 and x.dtype == np.dtype('bfloat16'):
-        x = x.astype(np.dtype('float32'))
-        x = torch.from_numpy(x)
-        x = x.bfloat16()
-    else:
-        x = torch.from_numpy(x)
-    # TODO: consider some other consistency checks
-    if x.shape != sharded_tensor.local_shape:
-        if sharded_tensor.allow_shape_mismatch:
-            x = pad_to_expected_shape(x, sharded_tensor)
-        else:
-            _msg = (
-                f'Local shape mismatch for loaded ({x.shape})'
-                f' and expected ({sharded_tensor.local_shape}) tensor'
-                f' for key {sharded_tensor.key}'
-            )
-            raise CheckpointingException(_msg)
-
-    if apply_flattened_range and sharded_tensor.flattened_range is not None:
-        x = flatten_range(sharded_tensor, x)
-
-    # TODO: consider cuda() tensors support
-    return x
-
-
-def flatten_range(sharded_tensor, x):
-    return x.flatten()[sharded_tensor.flattened_range]
-
-
-def pad_to_expected_shape(x: torch.Tensor, expected_sharded_ten: ShardedTensor):
-    pad_args = []
-    assert len(x.shape) == len(expected_sharded_ten.local_shape)
-    # Reversed iteration order because F.pad expects so
-    for x_sh, exp_sh, axis_fragm in reversed(
-        list(
-            zip(x.shape, expected_sharded_ten.local_shape, expected_sharded_ten.axis_fragmentations)
-        )
-    ):
-        if x_sh == exp_sh:
-            pad_args.extend((0, 0))
-        elif x_sh > exp_sh:
-            assert (
-                False
-            ), f'Expected shape ({exp_sh}) smaller than actual ({x_sh}) for {repr(expected_sharded_ten)}'
-        else:
-            pad_args.extend((0, exp_sh - x_sh))
-    # TODO: behavior control with envvar is for testing purposes only, remove it
-    if not int(os.environ.get('DIST_CKPT_PAD_REPLICATE', 0)):
-        return torch.nn.functional.pad(x, pad_args)
-
-    # unsqueeze and squeeze to get shapes supported by cudnn
-    print(f'Replicating last row for {expected_sharded_ten.key}')
-    if x.dtype == torch.bfloat16:
-        return (
-            torch.nn.functional.pad(x.float().unsqueeze(0), pad_args, mode='replicate')
-            .squeeze(0)
-            .bfloat16()
-        )
-    return torch.nn.functional.pad(x.unsqueeze(0), pad_args, mode='replicate').squeeze(0)
-
-
-def load_zarr_based_sharded_metadata(
-    checkpoint_dir: Path, get_shape_dtype_fn: Callable[[str], Tuple[Tuple[int], np.dtype]]
-) -> ShardedStateDict:
-    """Load metadata of Zarr arrays.
-
-    Arguments:
-        checkpoint_dir (str): checkpoint root directory
-        get_shape_dtype_fn (str -> ((int, ...), np.dtype)): a function returning
-            an array shape and dtype for a given Zarr array path
-    """
-    sharded_state_dict = {}
-    for subdir in checkpoint_dir.iterdir():
-        if not subdir.is_dir() or not (subdir / '.zarray').exists():
-            continue
-        key = subdir.name
-        arr_shape, arr_dtype = get_shape_dtype_fn(str(subdir))
-
-        sharded_state_dict[key] = ShardedTensor(
-            key,
-            None,
-            numpy_to_torch_dtype_dict[arr_dtype],
-            arr_shape,
-            arr_shape,
-            tuple(0 for _ in arr_shape),
-            tuple(1 for _ in arr_shape),
-        )
-    return sharded_state_dict
-
-
-# default_strategies[StrategyAction.LOAD_SHARDED.value][('zarr', 1)] = ZarrLoadShardedStrategy()
-default_strategies[StrategyAction.SAVE_SHARDED.value][('zarr', 1)] = ZarrSaveShardedStrategy(
-    'zarr', 1
-)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/utils.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/utils.py
deleted file mode 100644
index f7976f007408197338b9f9a96eec85db4d63d087..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/dist_checkpointing/utils.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
-
-from typing import Tuple
-
-from .dict_utils import dict_list_map_inplace, extract_matching_values
-from .mapping import (
-    LocalNonpersitentObject,
-    ShardedStateDict,
-    ShardedTensor,
-    ShardedTensorFactory,
-    StateDict,
-)
-
-
-def extract_sharded_tensors(
-    sharded_state_dict: ShardedStateDict,
-) -> Tuple[ShardedStateDict, StateDict]:
-    return extract_matching_values(sharded_state_dict, lambda v: isinstance(v, ShardedTensor))
-
-
-def extract_sharded_tensors_and_factories(
-    sharded_state_dict: ShardedStateDict,
-) -> Tuple[ShardedStateDict, StateDict]:
-    return extract_matching_values(
-        sharded_state_dict, lambda v: isinstance(v, (ShardedTensor, ShardedTensorFactory))
-    )
-
-
-def extract_sharded_tensors_or_nonpersistent(
-    sharded_state_dict: ShardedStateDict,
-) -> Tuple[ShardedStateDict, StateDict]:
-    return extract_matching_values(
-        sharded_state_dict,
-        lambda v: isinstance(v, (ShardedTensor, LocalNonpersitentObject, ShardedTensorFactory)),
-    )
-
-
-def add_prefix_for_sharding(sharded_state_dict: ShardedStateDict, prefix: str):
-    def add_prefix(t):
-        if isinstance(t, ShardedTensor):
-            t.key = f'{prefix}.{t.key}'
-        return t
-
-    dict_list_map_inplace(add_prefix, sharded_state_dict)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/distributed/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/distributed/__init__.py
deleted file mode 100644
index 34c7209a27fde7c5202f275663d951276caff85d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/distributed/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .distributed_data_parallel import DistributedDataParallel
-from .finalize_model_grads import finalize_model_grads
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/distributed/distributed_data_parallel.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/distributed/distributed_data_parallel.py
deleted file mode 100644
index 63f6e3d65ec2bb3a7d771f3dd6fea61216112d67..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/distributed/distributed_data_parallel.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-from contextlib import contextmanager
-from typing import Dict
-
-import torch
-
-from .. import parallel_state
-from ..transformer.module import MegatronModule
-from ..transformer.transformer_config import TransformerConfig
-from .grad_buffer import GradBuffer
-
-
-class DistributedDataParallel(MegatronModule):
-    """
-    DDP wrapper which stores grads in contiguous buffers. Also has option of overlapping
-    communication with backprop computation by breaking up full model's gradients into smaller
-    buckets and running all-reduce / reduce-scatter on each bucket asynchronously. This class
-    also provides the option to do the gradient accumulation in a type other than the param type
-    (e.g., fp32 for a bf16 model).
-
-    Arguments:
-        config: Transformer config object.
-        module: Underlying model.
-        data_parallel_group: Data-parallel process group.
-        accumulate_allreduce_grads_in_fp32: If true, do the gradient accumulation and
-            communication in fp32.
-        overlap_grad_reduce: If true, overlap communication with backprop computation by
-            breaking up grads into buckets. If false, single synchronous communication call
-            is used instead.
-        use_distributed_optimizer: If true, issue reduce-scatter communication calls as part
-            of distributed optimizer. If false, issue all-reduce communication calls.
-        disable_bucketing: If true, force assign all parameters to a single bucket. If false,
-            use standard bucketing policy: assign parameters to smaller buckets and all-reduce
-            per bucket _if_ overlap_grad_reduce is True and pp_rank is 0.
-
-    """
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        module: torch.nn.Module,
-        data_parallel_group: torch.distributed.ProcessGroup,
-        accumulate_allreduce_grads_in_fp32: bool,
-        overlap_grad_reduce: bool,
-        use_distributed_optimizer: bool,
-        disable_bucketing: bool = False,
-        bucket_size: int = 40000000,
-    ):
-        super().__init__(config=config)
-        self.module = module
-
-        # Set bucket_size to infinity if overlap_grad_reduce is False.
-        self.overlap_grad_reduce = overlap_grad_reduce
-        self.use_distributed_optimizer = use_distributed_optimizer
-
-        # Turn off bucketing if overlap_grad_reduce is False, if we are on a pipeline stage
-        # that is not the first (since data-parallel communication on these stages is not on
-        # the critical path), or if disable_bucketing is True (e.g., we might not want to
-        # break up model parameters into buckets for model chunks after the first
-        # in the interleaved schedule).
-        if not self.overlap_grad_reduce:
-            bucket_size = None
-        if parallel_state.get_pipeline_model_parallel_rank() > 0:
-            bucket_size = None
-        if disable_bucketing:
-            bucket_size = None
-        self.bucket_size = bucket_size
-
-        self.module = module
-        self.grad_buffers = {}
-        self.expert_grads = []
-        self.grad_buffer_param_index_map = {}
-        self.param_to_grad_buffer = {}
-
-        # Group parameters by their gradient type.
-        grad_dtype_to_params = {}
-        param_to_name = {}
-        for name, param in self.module.named_parameters():
-            if param.requires_grad and getattr(param, 'allreduce', True):
-                param.grad_added_to_main_grad = False
-                param_to_name[param] = name
-                dtype = torch.float if accumulate_allreduce_grads_in_fp32 else param.dtype
-
-                params = grad_dtype_to_params.get(dtype, [])
-                params.append(param)
-                grad_dtype_to_params[dtype] = params
-
-        # Allocate the grad buffers and map the grads.
-        # The grad buffer under the hood creates buckets as appropriate based on bucket_size.
-        self.data_parallel_world_size = torch.distributed.get_world_size(group=data_parallel_group)
-        for dtype, params in grad_dtype_to_params.items():
-            self.grad_buffers[dtype] = GradBuffer(
-                dtype,
-                params,
-                data_parallel_group,
-                bucket_size,
-                param_to_name,
-                self.overlap_grad_reduce,
-                self.use_distributed_optimizer,
-            )
-            self.grad_buffer_param_index_map[dtype] = self.grad_buffers[dtype].param_index_map
-            for param in params:
-                self.param_to_grad_buffer[param] = self.grad_buffers[dtype]
-
-        # Allocate separate buffer for MoE params' grads.
-        for param in self.module.parameters():
-            if param.requires_grad and not getattr(param, 'allreduce', True):
-                param.grad_added_to_main_grad = False
-                dtype = torch.float if accumulate_allreduce_grads_in_fp32 else param.dtype
-                param.main_grad = torch.zeros(
-                    param.data.shape,
-                    dtype=dtype,
-                    device=torch.cuda.current_device(),
-                    requires_grad=False,
-                )
-                self.expert_grads.append(param.main_grad)
-
-        # Register backward hook.
-        # Accumulation function for the gradients need to be stored so they
-        # don't go out of scope.
-        self.grad_accs = []
-        for param in self.module.parameters():
-            if param.requires_grad:
-                # Expand so we get access to grad_fn.
-                param_tmp = param.expand_as(param)
-                # Get the gradient accumulator function.
-                grad_acc = param_tmp.grad_fn.next_functions[0][0]
-                grad_acc.register_hook(self._make_param_hook(param, self.param_to_grad_buffer))
-                self.grad_accs.append(grad_acc)
-
-    def forward(self, *inputs, **kwargs):
-        """
-        Calls the wrapped module's forward() method.
-        """
-        return self.module(*inputs, **kwargs)
-
-    def _make_param_hook(
-        self, param: torch.nn.Parameter, param_to_grad_buffer: Dict[torch.nn.Parameter, GradBuffer]
-    ):
-        """
-        Creates the all-reduce / reduce-scatter hook for backprop.
-        """
-
-        def param_hook(*unused):
-            if param.requires_grad:
-                if self.overlap_grad_reduce:
-                    assert (
-                        param.grad is not None
-                    ), 'param.grad being None is not safe when overlap_grad_reduce is True'
-                if param.grad is not None and not param.grad_added_to_main_grad:
-                    param.main_grad.add_(param.grad.data)
-                param.grad = None
-                if self.overlap_grad_reduce:
-                    param_to_grad_buffer[param].register_grad_ready(param)
-
-        return param_hook
-
-    @contextmanager
-    def no_sync(self):
-        """
-        Context manager that turns off gradient synchronization.
-        """
-        for grad_buffer in self.grad_buffers.values():
-            grad_buffer.is_last_microbatch = False
-        try:
-            yield
-        finally:
-            for grad_buffer in self.grad_buffers.values():
-                grad_buffer.is_last_microbatch = True
-
-    def start_grad_sync(self, *unused):
-        """
-        Initiates grad sync (all-reduce or reduce-scatter) communication operations
-        for all model gradients.
-
-        When overlap_grad_reduce is set to True, dispatches asynchronous communication
-        calls. When overlap_grad_reduce is set to False, calls synchronous
-        communication ops.
-        """
-        for grad_buffer in self.grad_buffers.values():
-            grad_buffer.start_grad_sync()
-
-    def finish_grad_sync(self):
-        """
-        Finishes grad sync (all-reduce or reduce-scatter) communication operations
-        for all model gradients.
-
-        When overlap_grad_reduce is set to True, waits for asynchronous communication
-        calls to complete. When overlap_grad_reduce is set to False, calls synchronous
-        communication ops.
-        """
-        for grad_buffer in self.grad_buffers.values():
-            grad_buffer.finish_grad_sync()
-
-        for expert_grad in self.expert_grads:
-            expert_grad /= self.data_parallel_world_size
-
-    def zero_grad_buffer(self, zero_buffer):
-        """
-        Zeros out all grad buffers. Needs to be called at the beginning of each
-        training iteration.
-
-        When zero_buffer is set to True, the underlying grad buffer is zeroed out.
-        """
-        for param in self.module.parameters():
-            if param.requires_grad:
-                param.grad_added_to_main_grad = False
-        for grad_buffer in self.grad_buffers.values():
-            grad_buffer.reset(zero_buffer)
-        for expert_grad in self.expert_grads:
-            expert_grad.zero_()
-
-    def broadcast_params(self):
-        """
-        Syncs parameters across all DP ranks.
-        """
-        for param in self.module.parameters():
-            torch.distributed.broadcast(
-                param.data,
-                src=parallel_state.get_data_parallel_src_rank(with_context_parallel=True),
-                group=parallel_state.get_data_parallel_group(with_context_parallel=True),
-            )
-
-    def state_dict(self, prefix='', keep_vars=False):
-        """
-        Returns a dictionary containing references to the whole state of the
-        wrapped module.
-
-        Both parameters and persistent buffers (e.g. running averages) are included.
-        Keys are corresponding parameter and buffer names. Parameters and buffers
-        set to None are not included.
-        """
-        return self.module.state_dict(prefix=prefix, keep_vars=keep_vars)
-
-    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
-        """
-        Returns wrapped module's state_dict for checkpoint saving.
-        """
-        return self.module.state_dict_for_save_checkpoint(prefix=prefix, keep_vars=keep_vars)
-
-    def load_state_dict(self, state_dict, strict=True):
-        """
-        Copies parameters and buffers from state_dict into the wrapped module and its
-        descendants. If strict is True, then the keys of state_dict must exactly match
-        the keys returned by this module’s state_dict() function.
-        """
-        self.module.load_state_dict(state_dict, strict=strict)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/distributed/finalize_model_grads.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/distributed/finalize_model_grads.py
deleted file mode 100644
index 916e4f3ecbffafca7f97d2b33193bb289e12228d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/distributed/finalize_model_grads.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-from typing import List
-
-import torch
-from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
-
-from .. import parallel_state
-from ..transformer.transformer_config import TransformerConfig
-from ..utils import get_attr_wrapped_model, get_model_config
-
-
-def _allreduce_word_embedding_grads(model: List[torch.nn.Module], config: TransformerConfig):
-    """
-    All-reduce word embedding grads.
-
-    Reduce grads across first and last stages to ensure that word_embeddings parameters stay in
-    sync. This should only run for models that support pipelined model parallelism (BERT and GPT).
-    """
-
-    if (
-        parallel_state.is_rank_in_embedding_group(ignore_virtual=True)
-        and parallel_state.get_pipeline_model_parallel_world_size() > 1
-    ):
-        if parallel_state.is_pipeline_first_stage(ignore_virtual=True):
-            model_module = model[0]
-        elif parallel_state.is_pipeline_last_stage(ignore_virtual=True):
-            model_module = model[-1]
-        else:  # We do not support the interleaved schedule for T5 yet.
-            model_module = model[0]
-
-        # Look for module with 'pre_process' attribute to get around the fact that DDP and
-        # other wrapper classes inherit from non-core MegatronModule that has
-        # 'share_embeddings_and_output_weights' and 'shared_embedding_or_output_weight'
-        # attributes already, causing get_attr_wrapped_model() to not unwrap anything here.
-        # TODO: Clean this up once the wrapper classes inherit from core MegatronModule.
-        model_module = get_attr_wrapped_model(model_module, 'pre_process', return_model_obj=True)
-        if model_module.share_embeddings_and_output_weights:
-            weight = model_module.shared_embedding_or_output_weight()
-            grad = weight.main_grad
-            torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group())
-
-
-def _allreduce_position_embedding_grads(model: List[torch.nn.Module], config: TransformerConfig):
-    """
-    All-reduce position_embeddings grad across first (encoder) and split (decoder) stages to
-    ensure that position embeddings parameters stay in sync. This should only run for T5 models
-    with pipeline parallelism.
-    """
-    if (
-        parallel_state.is_rank_in_position_embedding_group()
-        and parallel_state.get_pipeline_model_parallel_world_size() > 1
-        and config.pipeline_model_parallel_split_rank is not None
-    ):
-        model_module = model[0]
-        grad = get_attr_wrapped_model(
-            model_module, 'language_model.embedding.position_embeddings.weight.main_grad'
-        )
-        torch.distributed.all_reduce(grad, group=parallel_state.get_position_embedding_group())
-
-
-def _allreduce_embedding_grads(model: List[torch.nn.Module], config: TransformerConfig):
-    """
-    All-reduce both word and position embeddings.
-    """
-    _allreduce_word_embedding_grads(model, config)
-    _allreduce_position_embedding_grads(model, config)
-
-
-def _allreduce_layernorm_grads(model: List[torch.nn.Module], config: TransformerConfig):
-    """
-    All-reduce layernorm grads (for sequence parallelism).
-    """
-
-    # All-reduce layernorm parameters across model parallel nodes
-    # when sequence parallelism is used
-    if parallel_state.get_tensor_model_parallel_world_size() > 1 and config.sequence_parallel:
-        grads = []
-        for model_chunk in model:
-            for param in get_attr_wrapped_model(model_chunk, 'parameters')():
-                if getattr(param, 'sequence_parallel', False):
-                    grad = param.main_grad
-                    grads.append(grad.data)
-        coalesced = _flatten_dense_tensors(grads)
-        torch.distributed.all_reduce(
-            coalesced, group=parallel_state.get_tensor_model_parallel_group()
-        )
-        for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
-            buf.copy_(synced)
-
-
-def _allreduce_expert_grads(model: List[torch.nn.Module], config: TransformerConfig):
-    """
-    All-reduce expert grads (for expert parallelism).
-    """
-
-    # All-reduce switchmlp parameters across data modulo expert parallel nodes
-    if (
-        config.expert_model_parallel_size > 1
-        and config.expert_model_parallel_size < parallel_state.get_data_parallel_world_size()
-    ):
-        grads = []
-        for model_chunk in model:
-            for param in get_attr_wrapped_model(model_chunk, 'parameters')():
-                if not getattr(param, 'allreduce', True):
-                    grad = param.main_grad
-                    grads.append(grad.data)
-        coalesced = _flatten_dense_tensors(grads)
-        torch.distributed.all_reduce(
-            coalesced, group=parallel_state.get_data_modulo_expert_parallel_group()
-        )
-        for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
-            buf.copy_(synced)
-
-
-def finalize_model_grads(model: List[torch.nn.Module]):
-    """
-    All-reduce all model grads across DP replicas, layernorm grads for sequence parallelism,
-    embedding grads across first and last pipeline stages (if not tied), and expert grads
-    for expert parallelism.
-    """
-
-    config = get_model_config(model[0])
-
-    # All-reduce / reduce-scatter across DP replicas.
-    if config.timers is not None:
-        config.timers('all-grads-sync', log_level=1).start(barrier=config.barrier_with_L1_time)
-    for model_chunk in model:
-        model_chunk.finish_grad_sync()
-    if config.timers is not None:
-        config.timers('all-grads-sync').stop()
-
-    # All-reduce layer-norm grads (for sequence parallelism).
-    if config.timers is not None:
-        config.timers('layernorm-grads-all-reduce', log_level=1).start(
-            barrier=config.barrier_with_L1_time
-        )
-    _allreduce_layernorm_grads(model, config)
-    if config.timers is not None:
-        config.timers('layernorm-grads-all-reduce').stop()
-
-    # All-reduce embedding grads (for pipeline parallelism).
-    if config.timers is not None:
-        config.timers('embedding-grads-all-reduce', log_level=1).start(
-            barrier=config.barrier_with_L1_time
-        )
-    _allreduce_embedding_grads(model, config)
-    if config.timers is not None:
-        config.timers('embedding-grads-all-reduce').stop()
-
-    # All-reduce expert grads (for expert parallelism).
-    if config.timers is not None:
-        config.timers('expert-grads-all-reduce', log_level=1).start(
-            barrier=config.barrier_with_L1_time
-        )
-    _allreduce_expert_grads(model, config)
-    if config.timers is not None:
-        config.timers('expert-grads-all-reduce').stop()
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/distributed/grad_buffer.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/distributed/grad_buffer.py
deleted file mode 100644
index 8bc88a8e710db31840c80444ae726f0b6bd6c1be..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/distributed/grad_buffer.py
+++ /dev/null
@@ -1,410 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import math
-from logging import getLogger
-from typing import Dict, List
-
-import torch
-
-from .. import parallel_state
-
-logger = getLogger(__name__)
-
-
-def shard_buffer(buffer: torch.Tensor, data_parallel_world_size: int):
-    """
-    Shard buffer into data_parallel_world_size chunks of equal size.
-    """
-    assert buffer.numel() % data_parallel_world_size == 0
-    shard_size = buffer.numel() // data_parallel_world_size
-    sharded_buffer = [
-        buffer[(r * shard_size) : ((r + 1) * shard_size)] for r in range(data_parallel_world_size)
-    ]
-    return sharded_buffer
-
-
-class Bucket:
-    """
-    Bucket to keep track of a subset of the model's gradients. Provides functionality to register
-    when params in the bucket have grads ready to be synced; an asynchronous communication call
-    is automatically launched when _all_ params in the bucket have grads ready.
-
-    Arguments:
-        params: List of parameters whose gradients are collated in this bucket.
-        data: View in larger GradBuffer that this bucket is responsible for.
-        offset: Offset of this bucket's view in the larger GradBuffer.
-        data_parallel_group: Data-parallel process group.
-        data_parallel_world_size: World size using the data-parallel group group.
-        overlap_grad_reduce: If true, overlap communication with backprop computation by
-            breaking up grads into buckets. If false, single synchronous communication call
-            is used instead.
-        use_distributed_optimizer: If true, issue reduce-scatter communication calls as part
-            of distributed optimizer. If false, issue all-reduce communication calls.
-    """
-
-    def __init__(
-        self,
-        params: List[torch.nn.Parameter],
-        data: torch.Tensor,
-        offset: int,
-        data_parallel_group: torch.distributed.ProcessGroup,
-        data_parallel_world_size: int,
-        overlap_grad_reduce: bool,
-        use_distributed_optimizer: bool,
-    ):
-        # State for bookkeeping: params is the set of parameters this bucket is
-        # responsible for, params_with_grad is the set of parameters with grads
-        # available. When overlap_grad_reduce is True, communication (all-reduce
-        # or reduce-scatter) is issued when params_with_grad equals params.
-        self.params_list = params
-        self.params = set(params)
-        self.params_with_grad = set()
-        self.data = data
-        # The distributed optimizer needs to keep track of this bucket's offset
-        # within the full grad_buffer.
-        self.offset = offset
-        self.data_parallel_group = data_parallel_group
-        self.data_parallel_world_size = data_parallel_world_size
-        self.data_parallel_rank = torch.distributed.get_rank(group=data_parallel_group)
-        self.overlap_grad_reduce = overlap_grad_reduce
-        self.use_distributed_optimizer = use_distributed_optimizer
-
-        self.reset()
-
-    def reset(self):
-        """
-        Reset metadata in bucket in preparation for the next iteration of training.
-        """
-        self.params_with_grad = set()
-        self.communication_handle = None
-        self.communication_issued = False
-
-    def start_grad_sync(self):
-        """
-        Initiates grad sync (all-reduce or reduce-scatter) communication operation
-        for this bucket.
-
-        When overlap_grad_reduce is set to True, dispatches an asynchronous
-        communication call. When overlap_grad_reduce is set to False, makes
-        synchronous call.
-        """
-        assert (
-            self.communication_handle is None and not self.communication_issued
-        ), 'Should not have multiple communication calls in flight at once'
-
-        self.data /= self.data_parallel_world_size
-        # Use async_op only when overlap_grad_reduce is True.
-        if self.use_distributed_optimizer:
-            local_data_view = shard_buffer(self.data, self.data_parallel_world_size)[
-                self.data_parallel_rank
-            ]
-            self.communication_handle = torch.distributed._reduce_scatter_base(
-                local_data_view,
-                self.data,
-                group=self.data_parallel_group,
-                async_op=self.overlap_grad_reduce,
-            )
-        else:
-            self.communication_handle = torch.distributed.all_reduce(
-                self.data, group=self.data_parallel_group, async_op=self.overlap_grad_reduce
-            )
-        self.communication_issued = True
-
-    def finish_grad_sync(self):
-        """
-        Finishes grad sync (all-reduce or reduce-scatter) communication operation
-        for this bucket.
-
-        When overlap_grad_reduce is set to True, waits for asynchronous communication
-        call to complete. When overlap_grad_reduce is set to False, makes synchronous call.
-        """
-        # If overlap_grad_reduce is False, start (and finish) synchronous communication call here.
-        if not self.overlap_grad_reduce:
-            self.start_grad_sync()
-            return
-        assert self.communication_handle is not None and self.communication_issued, (
-            f'Communication call has not been issued for this bucket '
-            f'({len(self.params_with_grad)}/{len(self.params)} params have grad available)'
-        )
-        self.communication_handle.wait()
-
-    def register_grad_ready(self, param: torch.nn.Parameter):
-        """
-        Registers grads for the passed-in param to be "ready" for grad sync.
-
-        When the number of microbatches is greater than 1, we only want to register
-        grads as ready when processing the last microbatch and overlap_grad_reduce is True.
-        """
-        assert param in self.params, 'Param is not in the bucket'
-        assert param not in self.params_with_grad, 'Cannot set grad twice'
-        assert (
-            self.overlap_grad_reduce
-        ), 'register_grad_ready() should be called only when overlapping grad reduce'
-        self.params_with_grad.add(param)
-        # If all params in bucket have grads available, issue communication call.
-        if len(self.params_with_grad) == len(self.params):
-            self.start_grad_sync()
-
-
-class GradBuffer:
-    """
-    Groups gradients into a contiguous buffer, and then breaks the buffer into buckets with
-    roughly `bucket_size` parameters each.
-
-    Arguments:
-        dtype: Type of underlying tensor.
-        params: List of parameters whose gradients are collated in the underlying tensor.
-        data_parallel_group: Data-parallel process group.
-        bucket_size: The rough size of each bucket in terms of number of parameters.
-        param_to_name: Mapping from `torch.nn.Parameter` to name (for logging purposes).
-        overlap_grad_reduce: If true, overlap communication with backprop computation by
-            breaking up grads into buckets. If false, single synchronous communication call
-            is used instead.
-        use_distributed_optimizer: If true, issue reduce-scatter communication calls as part
-            of distributed optimizer. If false, issue all-reduce communication calls.
-    """
-
-    def __init__(
-        self,
-        dtype: torch.dtype,
-        params: List[torch.nn.Parameter],
-        data_parallel_group: torch.distributed.ProcessGroup,
-        bucket_size: int,
-        param_to_name: Dict[torch.nn.Parameter, str],
-        overlap_grad_reduce: bool,
-        use_distributed_optimizer: bool,
-    ):
-
-        # Check that params are unique.
-        unique_params = set()
-        for param in params:
-            assert param not in unique_params
-            unique_params.add(param)
-        del unique_params
-
-        # Store attributes that will be needed later.
-        self.dtype = dtype
-        self.data_parallel_group = data_parallel_group
-        self.data_parallel_world_size = torch.distributed.get_world_size(
-            group=self.data_parallel_group
-        )
-        self.overlap_grad_reduce = overlap_grad_reduce
-        self.use_distributed_optimizer = use_distributed_optimizer
-        self.is_last_microbatch = True
-
-        # Data structures to store underlying buckets and relevant indexing data.
-        self.buckets = []
-        self.param_to_bucket = {}  # Param -> bucket mapping.
-        self.param_index_map = {}  # Param -> location in buffer mapping (used in dist. optimizer).
-
-        def _pad_if_needed(data_index: int):
-            """Pads data indices if using distributed optimizer (to ensure uniform sharding)."""
-            if use_distributed_optimizer:
-                return (
-                    int(math.ceil(data_index / self.data_parallel_world_size))
-                    * self.data_parallel_world_size
-                )
-            return data_index
-
-        # First, figure out how many elements should be in the underlying buffer storage.
-        # Note that if we need to split the buffer into smaller buckets, each of these
-        # might need to be padded as well (if using the distributed optimizer).
-        data_start_index = 0
-        bucket_data_start_index = data_start_index
-        bucket_params = set()
-        self.bucket_indices = []
-        bucket_id = 0
-        for param in params[::-1]:
-            # Iterate through parameters in reverse order to roughly follow backprop order,
-            # and skip parameters that don't require gradients.
-            if not param.requires_grad:
-                continue
-            this_numel = param.data.nelement()
-            data_end_index = data_start_index + this_numel
-            self.param_index_map[param] = (
-                data_start_index,
-                data_end_index,
-                bucket_id,
-            )
-            bucket_params.add(param)
-
-            # If we have enough elements already, form a new bucket.
-            # If bucket_size is None, accumulate everything into a single bucket.
-
-            # TODO: Remove len(bucket_params) > 1 when the final head that transforms token
-            # representations from hidden space to vocabulary space is in a PyTorch module
-            # whose forward method is called. If it is not and a bucket contains only this
-            # one parameter, we get incorrect behavior (i.e., higher losses) since we do not
-            # call the wait function on the bucket's all_gather_handle (we use forward pre-
-            # hooks on PyTorch modules to do this when --overlap-param-gather is used).
-            # As a temporary workaround, we make sure that no bucket has only one parameter.
-            if bucket_size is not None:
-                if (data_end_index - bucket_data_start_index) >= bucket_size and len(
-                    bucket_params
-                ) > 1:
-                    data_end_index = _pad_if_needed(data_end_index)
-                    self.bucket_indices.append((bucket_data_start_index, data_end_index))
-                    bucket_data_start_index = data_end_index
-                    bucket_params = set()
-                    bucket_id += 1
-            data_start_index = data_end_index
-
-        # Add remaining params to a new bucket.
-        if len(bucket_params) > 0:
-            data_end_index = _pad_if_needed(data_end_index)
-            self.bucket_indices.append((bucket_data_start_index, data_end_index))
-
-        # Next, create underlying storage for buffer (with numel elements that includes
-        # padding as necessary).
-        self.numel = data_end_index
-        if use_distributed_optimizer:
-            assert self.numel % self.data_parallel_world_size == 0
-        self.data = torch.zeros(
-            self.numel, dtype=self.dtype, device=torch.cuda.current_device(), requires_grad=False,
-        )
-
-        # Finally, map main_grad fields for each parameter with a .grad field.
-        bucket_params = set()
-        bucket_data_start_index = 0
-        cur_bucket_id = 0
-        for param in params[::-1]:
-            if not param.requires_grad:
-                continue
-            data_start_index, data_end_index, bucket_id = self.param_index_map[param]
-            param.main_grad = self._get(param.data.shape, data_start_index)
-            if bucket_id != cur_bucket_id:
-                bucket_data_end_index = _pad_if_needed(data_start_index)
-                self._set_bucket(
-                    bucket_params, bucket_data_start_index, bucket_data_end_index, cur_bucket_id
-                )
-                bucket_data_start_index = bucket_data_end_index
-                bucket_params = set()
-                assert cur_bucket_id + 1 == len(self.buckets)
-                assert bucket_id == cur_bucket_id + 1
-                cur_bucket_id = bucket_id
-            bucket_params.add(param)
-
-        # Add remaining params to a new bucket.
-        if len(bucket_params) > 0:
-            bucket_data_end_index = _pad_if_needed(data_end_index)
-            self._set_bucket(
-                bucket_params, bucket_data_start_index, bucket_data_end_index, cur_bucket_id
-            )
-
-        if not overlap_grad_reduce:
-            assert len(bucket_params) == len(
-                params
-            ), 'All params should be in one bucket when overlap_grad_reduce is False'
-
-        # Log buckets for all PP stages.
-        if (
-            parallel_state.get_data_parallel_rank(with_context_parallel=True) == 0
-            and parallel_state.get_tensor_model_parallel_rank() == 0
-        ):
-            logger.info(
-                f'Number of buckets for gradient all-reduce / reduce-scatter: {len(self.buckets)}'
-            )
-            for index, bucket in enumerate(self.buckets):
-                numel = 0
-                for param in bucket.params:
-                    numel += param.data.nelement()
-                logger.info(f'Params for bucket {index+1} ({numel} elements):')
-                for param in bucket.params:
-                    logger.info(f'    {param_to_name[param]}')
-
-    def _get(self, shape: torch.Size, start_index: int) -> torch.Tensor:
-        """
-        Return a tensor with the input `shape` as a view into the 1-D data starting at
-        `start_index`.
-        """
-        end_index = start_index + shape.numel()
-        assert end_index <= self.numel, 'Requested tensor is out of buffer range'
-        buffer_tensor = self.data[start_index:end_index]
-        buffer_tensor = buffer_tensor.view(shape)
-        return buffer_tensor
-
-    def _set_bucket(
-        self,
-        bucket_params: List[torch.nn.Parameter],
-        start_index: int,
-        end_index: int,
-        bucket_id: int,
-    ):
-        """
-        Helper function to create new bucket, add it to list of buckets, and
-        also update param->bucket mapping.
-        """
-
-        # Assert that indices are correctly padded (if needed), and that bucket
-        # position is same as originally computed.
-        if self.use_distributed_optimizer:
-            assert start_index % self.data_parallel_world_size == 0
-            assert end_index % self.data_parallel_world_size == 0
-        assert (start_index, end_index) == self.bucket_indices[bucket_id]
-
-        # Get appropriate view into global GradBuffer.
-        bucket_data = self._get(torch.Size([end_index - start_index]), start_index)
-        bucket = Bucket(
-            params=bucket_params,
-            data=bucket_data,
-            offset=start_index,
-            data_parallel_group=self.data_parallel_group,
-            data_parallel_world_size=self.data_parallel_world_size,
-            overlap_grad_reduce=self.overlap_grad_reduce,
-            use_distributed_optimizer=self.use_distributed_optimizer,
-        )
-        self.buckets.append(bucket)
-        for bucket_param in bucket_params:
-            assert bucket_param not in self.param_to_bucket
-            self.param_to_bucket[bucket_param] = bucket
-
-    def reset(self, zero_buffer):
-        """
-        Zero out the underlying buffer and reset all buckets in preparation for the next
-        iteration of training.
-
-        When zero_buffer is set to True, the underlying buffer is zeroed out.
-        """
-        if zero_buffer:
-            self.data.zero_()
-        for bucket in self.buckets:
-            bucket.reset()
-        self.is_last_microbatch = True
-
-    def start_grad_sync(self):
-        """
-        Initiates grad sync (all-reduce or reduce-scatter) communication operations
-        for all buckets in the grad buffer.
-
-        When overlap_grad_reduce is set to True, dispatches asynchronous communication
-        calls. When overlap_grad_reduce is set to False, calls synchronous
-        communication ops.
-        """
-        for bucket in self.buckets:
-            bucket.start_grad_sync()
-
-    def finish_grad_sync(self):
-        """
-        Finishes grad sync (all-reduce or reduce-scatter) communication operations
-        for all buckets in the grad buffer.
-
-        When overlap_grad_reduce is set to True, waits for asynchronous communication
-        calls to complete. When overlap_grad_reduce is set to False, calls synchronous
-        communication ops.
-        """
-        for bucket in self.buckets:
-            bucket.finish_grad_sync()
-
-    def register_grad_ready(self, param: torch.nn.Parameter):
-        """
-        Registers grads for the passed-in param to be "ready" for grad sync.
-
-        When the number of microbatches is greater than 1, we only want to register
-        grads as ready when processing the last microbatch and overlap_grad_reduce is True.
-        """
-        assert (
-            self.overlap_grad_reduce
-        ), 'register_grad_ready() should only be called when overlap_grad_reduce is True'
-        if self.is_last_microbatch:
-            bucket = self.param_to_bucket[param]
-            bucket.register_grad_ready(param)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/enums.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/enums.py
deleted file mode 100644
index 46e7d3b766af061cd36363f8486f75f7ad80b08f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/enums.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import enum
-
-
-class ModelType(enum.Enum):
-    encoder_or_decoder = 1
-    encoder_and_decoder = 2
-    retro_encoder = 3
-    retro_decoder = 4
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/fusions/fused_bias_dropout.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/fusions/fused_bias_dropout.py
deleted file mode 100644
index 14c1fe0d718223ba78830cf3099ac02907e65fc2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/fusions/fused_bias_dropout.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-from typing import Optional, Tuple
-
-import torch
-
-
-def _bias_dropout_add_func(x_with_bias, residual, prob, training):
-    # type: (Tuple[Tensor, Optional[Tensor]], Tensor, float, bool) -> Tensor
-    # NOTE: Previously, the argument `bias` used to be passed as
-    # `bias.expand_as(residual)` when the `bias_dropout_func` is called from the
-    # transformer layer but broadcasting should automatically take care of that.
-    # Also, looking at broadcasting semantics, `expand_as` and broadcasting
-    # seem to be identical performance-wise (both just change the view).
-
-    x, bias = x_with_bias  # unpack
-
-    # If we want to train mixed precision, then the output of this function
-    # should be half precision. However, in AMP O1, the input (residual) is
-    # in fp32, and it will up-cast the result to fp32, causing pipeline parallel
-    # GPU communication to hang. Therefore, we need to cast residual to the same
-    # dtype as x.
-    residual = residual if residual.dtype == x.dtype else residual.to(x.dtype)
-
-    # The Dropout operation, Residual Addition and the tensor returning can be
-    # done generically outside the if statement, but that stops fusing of Bias
-    # Addition-Dropout-Residual Addition operation. So doing it together inside
-    # the conditional branch to improve performance
-    if bias is not None:
-        x = x + bias
-        out = torch.nn.functional.dropout(x, p=prob, training=training)
-        out = residual + out
-        return out
-    else:
-        out = torch.nn.functional.dropout(x, p=prob, training=training)
-        out = residual + out
-        return out
-
-
-def bias_dropout_add_unfused(training):
-    def _bias_dropout_add(x_with_bias, residual, prob):
-        return _bias_dropout_add_func(x_with_bias, residual, prob, training)
-
-    return _bias_dropout_add
-
-
-@torch.jit.script
-def bias_dropout_add_fused_train(
-    x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float,
-) -> torch.Tensor:
-    return _bias_dropout_add_func(x_with_bias, residual, prob, True)
-
-
-@torch.jit.script
-def bias_dropout_add_fused_inference(
-    x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float,
-) -> torch.Tensor:
-    return _bias_dropout_add_func(x_with_bias, residual, prob, False)
-
-
-def get_bias_dropout_add(training, fused):
-    if fused:
-        # jit scripting for a nn.module (with dropout) is not
-        # triggering the fusion kernel. For now, we use two
-        # different nn.functional routines to account for varying
-        # dropout semantics during training and inference phases.
-        if training:
-            return bias_dropout_add_fused_train
-        else:
-            return bias_dropout_add_fused_inference
-    else:
-        return bias_dropout_add_unfused(training)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/fusions/fused_bias_gelu.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/fusions/fused_bias_gelu.py
deleted file mode 100644
index 9c791c180765b99c49e78dedf63444b57fed5ec1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/fusions/fused_bias_gelu.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import torch
-
-###### BIAS GELU FUSION/ NO AUTOGRAD ################
-# 1/sqrt(2*pi)-> 0.3989423
-# 1/sqrt(2)   -> 0.70710678
-# sqrt(2/pi)  -> 0.79788456
-# this function is tanh approximation of gelu
-# actual gelu is:
-# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
-
-
-@torch.jit.script
-def bias_gelu(bias, y):
-    x = bias + y
-    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
-
-
-# gradient of tanh approximation of gelu
-# gradient of actual gelu is:
-# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
-@torch.jit.script
-def bias_gelu_back(g, bias, y):
-    x = bias + y
-    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
-    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
-    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (
-        1 + tanh_out
-    )
-    return ff * g
-
-
-class GeLUFunction(torch.autograd.Function):
-    @staticmethod
-    # bias is an optional argument
-    def forward(ctx, input, bias):
-        ctx.save_for_backward(input, bias)
-        return bias_gelu(bias, input)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        input, bias = ctx.saved_tensors
-        tmp = bias_gelu_back(grad_output, bias, input)
-        return tmp, tmp
-
-
-bias_gelu_impl = GeLUFunction.apply
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/fusions/fused_layer_norm.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/fusions/fused_layer_norm.py
deleted file mode 100644
index ebe1f2ffd40ff4bb7394fcad626091341e19f90f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/fusions/fused_layer_norm.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import importlib
-import numbers
-
-import torch
-from torch import Tensor
-from torch.nn import init
-from torch.nn.parameter import Parameter
-
-from megatron_ds.core.transformer import TransformerConfig
-from megatron_ds.core.utils import make_viewless_tensor
-
-try:
-    from apex.contrib.layer_norm.layer_norm import FastLayerNormFN
-
-    HAVE_PERSIST_LAYER_NORM = True
-except:
-    HAVE_PERSIST_LAYER_NORM = False
-
-try:
-    from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction
-
-    HAVE_FUSED_LAYER_NORM = True
-except:
-    HAVE_FUSED_LAYER_NORM = False
-
-
-class FusedLayerNorm(torch.nn.Module):
-
-    """Layer Norm, fused into a single CUDA kernel.
-
-    Arguments:
-      hidden_size (int): Transformer hidden dimension.
-
-      eps (float): Epsilon added to denominator, for numerical stability.
-
-      persist_layer_norm (bool): Use persistent fused layer norm kernel.
-      This kernel supports only a set of hidden sizes. Please
-      check persist_ln_hidden_sizes if your hidden size is supported.
-
-      sequence parallel (bool): Apply sequence parallelism optimization.
-
-      zero_centered_gamma (bool): Adjust LayerNorm weights such that they are
-      centered around zero. This improves numerical stability.
-
-      config (TransformerConfig): Transformer config. Include to match custom
-      layer norm interfaces.
-
-      normalization (str): Normalization type, used for Transformer Engine.
-      Must equal 'LayerNorm' here.
-    """
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        hidden_size: int,
-        eps: float = 1e-5,
-        persist_layer_norm: bool = True,
-        sequence_parallel: bool = False,
-        zero_centered_gamma: bool = False,
-        normalization: str = "LayerNorm",  # included to match TE interface
-    ):
-        super().__init__()
-
-        self.zero_centered_gamma = config.layernorm_zero_centered_gamma
-        assert (
-            config.normalization == "LayerNorm"
-        ), f'({config.normalization}) is not supported in FusedLayerNorm'
-
-        # List of hiddens sizes supported in the persistent layer norm kernel
-        # If the hidden size is not supported, fall back to the non-persistent
-        # kernel.
-        persist_ln_hidden_sizes = [
-            1024,
-            1536,
-            2048,
-            2304,
-            3072,
-            3840,
-            4096,
-            5120,
-            6144,
-            8192,
-            10240,
-            12288,
-            12800,
-            15360,
-            16384,
-            18432,
-            20480,
-            24576,
-            25600,
-            30720,
-            32768,
-            40960,
-            49152,
-            65536,
-        ]
-        persist_layer_norm = config.persist_layer_norm
-        if hidden_size not in persist_ln_hidden_sizes or not HAVE_PERSIST_LAYER_NORM:
-            persist_layer_norm = False
-
-        if not persist_layer_norm and not HAVE_FUSED_LAYER_NORM:
-            # TODO: Add pytorch only layer norm
-            raise ValueError(f'Apex must currently be installed to use megatron core.')
-
-        if isinstance(hidden_size, numbers.Integral):
-            hidden_size = (hidden_size,)
-        self.hidden_size = torch.Size(hidden_size)
-        self.eps = eps
-        self.weight = Parameter(torch.Tensor(*hidden_size))
-        self.bias = Parameter(torch.Tensor(*hidden_size))
-        self.reset_parameters()
-        self.persist_layer_norm = persist_layer_norm
-        self.sequence_parallel = config.sequence_parallel
-
-        # set sequence parallelism flag on weight and bias parameters
-        setattr(self.weight, 'sequence_parallel', self.sequence_parallel)
-        setattr(self.bias, 'sequence_parallel', self.sequence_parallel)
-
-    def reset_parameters(self):
-
-        if self.zero_centered_gamma:
-            init.zeros_(self.weight)
-            init.zeros_(self.bias)
-        else:
-            init.ones_(self.weight)
-            init.zeros_(self.bias)
-
-    def forward(self, input: Tensor) -> Tensor:
-
-        weight = self.weight + 1 if self.zero_centered_gamma else self.weight
-
-        if self.persist_layer_norm:
-            output = FastLayerNormFN.apply(input, weight, self.bias, self.eps)
-
-            # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
-            # a populated '_base' field). This will result in schedule.py's
-            # deallocate_output_tensor() throwing an error, so a viewless tensor is
-            # created to prevent this.
-            output = make_viewless_tensor(
-                inp=output, requires_grad=input.requires_grad, keep_graph=True
-            )
-
-        else:
-            output = FusedLayerNormAffineFunction.apply(
-                input, weight, self.bias, self.hidden_size, self.eps
-            )
-
-        return output
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/fusions/fused_softmax.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/fusions/fused_softmax.py
deleted file mode 100644
index 2b8e5472274a195e25b8246ebf2c362ee0f678e4..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/fusions/fused_softmax.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-
-import torch
-import torch.nn as nn
-
-from megatron_ds.core.transformer.enums import AttnMaskType
-
-
-class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
-    """
-    Fused operation which performs following three operations in sequence
-    1. Scale the tensor.
-    2. Apply upper triangular mask (typically used in gpt models).
-    3. Perform softmax.
-    """
-
-    @staticmethod
-    def forward(ctx, inputs, scale):
-        import scaled_upper_triang_masked_softmax_cuda
-
-        scale_t = torch.tensor([scale])
-        softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(inputs, scale_t[0])
-
-        ctx.save_for_backward(softmax_results, scale_t)
-        return softmax_results
-
-    @staticmethod
-    def backward(ctx, output_grads):
-        import scaled_upper_triang_masked_softmax_cuda
-
-        softmax_results, scale_t = ctx.saved_tensors
-        input_grads = scaled_upper_triang_masked_softmax_cuda.backward(
-            output_grads, softmax_results, scale_t[0]
-        )
-
-        return input_grads, None
-
-
-class ScaledMaskedSoftmax(torch.autograd.Function):
-    """
-    Fused operation which performs following three operations in sequence
-    1. Scale the tensor.
-    2. Apply the mask.
-    3. Perform softmax.
-    """
-
-    @staticmethod
-    def forward(ctx, inputs, mask, scale):
-        import scaled_masked_softmax_cuda
-
-        scale_t = torch.tensor([scale])
-
-        softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0])
-        ctx.save_for_backward(softmax_results, scale_t)
-        return softmax_results
-
-    @staticmethod
-    def backward(ctx, output_grads):
-        import scaled_masked_softmax_cuda
-
-        softmax_results, scale_t = ctx.saved_tensors
-
-        input_grads = scaled_masked_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
-        return input_grads, None, None
-
-
-class ScaledSoftmax(torch.autograd.Function):
-    """
-    Fused operation which performs following two operations in sequence
-    1. Scale the tensor.
-    2. Perform softmax.
-    """
-
-    @staticmethod
-    def forward(ctx, inputs, scale):
-        import scaled_softmax_cuda
-
-        scale_t = torch.tensor([scale])
-
-        softmax_results = scaled_softmax_cuda.forward(inputs, scale_t[0])
-        ctx.save_for_backward(softmax_results, scale_t)
-        return softmax_results
-
-    @staticmethod
-    def backward(ctx, output_grads):
-        import scaled_softmax_cuda
-
-        softmax_results, scale_t = ctx.saved_tensors
-
-        input_grads = scaled_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
-        return input_grads, None, None
-
-
-class FusedScaleMaskSoftmax(nn.Module):
-    """
-    fused operation: scaling + mask + softmax
-
-    Arguments:
-        input_in_fp16: flag to indicate if input in fp16 data format.
-        input_in_bf16: flag to indicate if input in bf16 data format.
-        attn_mask_type: attention mask type (pad or causal)
-        scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion
-        mask_func: mask function to be applied.
-        softmax_in_fp32: if true, softmax in performed at fp32 precision.
-        scale: scaling factor used in input tensor scaling.
-    """
-
-    def __init__(
-        self,
-        input_in_fp16,
-        input_in_bf16,
-        attn_mask_type,
-        scaled_masked_softmax_fusion,
-        mask_func,
-        softmax_in_fp32,
-        scale,
-    ):
-        super(FusedScaleMaskSoftmax, self).__init__()
-        self.input_in_fp16 = input_in_fp16
-        self.input_in_bf16 = input_in_bf16
-        assert not (
-            self.input_in_fp16 and self.input_in_bf16
-        ), "both fp16 and bf16 flags cannot be active at the same time."
-        self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
-        self.attn_mask_type = attn_mask_type
-        self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
-        self.mask_func = mask_func
-        self.softmax_in_fp32 = softmax_in_fp32
-        self.scale = scale
-
-        assert self.scale is None or softmax_in_fp32, "softmax should be in fp32 when scaled"
-
-    def forward(self, input, mask):
-        # [b, np, sq, sk]
-        assert input.dim() == 4
-
-        if self.is_kernel_available(mask, *input.size()):
-            return self.forward_fused_softmax(input, mask)
-        else:
-            return self.forward_torch_softmax(input, mask)
-
-    def is_kernel_available(self, mask, b, np, sq, sk):
-        attn_batches = b * np
-
-        if (
-            self.scaled_masked_softmax_fusion  # user want to fuse
-            and self.input_in_float16  # input must be fp16
-            and 16 < sk <= 4096  # sk must be 16 ~ 2048
-            and sq % 4 == 0  # sq must be divisor of 4
-            and sk % 4 == 0  # sk must be divisor of 4
-            and attn_batches % 4 == 0  # np * b must be divisor of 4
-        ):
-            if 0 <= sk <= 4096:
-                batch_per_block = self.get_batch_per_block(sq, sk, b, np)
-
-                if self.attn_mask_type == AttnMaskType.causal:
-                    if attn_batches % batch_per_block == 0:
-                        return True
-                else:
-                    if sq % batch_per_block == 0:
-                        return True
-        return False
-
-    def forward_fused_softmax(self, input, mask):
-        b, np, sq, sk = input.size()
-        scale = self.scale if self.scale is not None else 1.0
-
-        if self.attn_mask_type == AttnMaskType.causal:
-            assert sq == sk, "causal mask is only for self attention"
-
-            # input is 3D tensor (attn_batches, sq, sk)
-            input = input.view(-1, sq, sk)
-            probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale)
-            return probs.view(b, np, sq, sk)
-        else:
-            # input is 4D tensor (b, np, sq, sk)
-            if mask is not None:
-                return ScaledMaskedSoftmax.apply(input, mask, scale)
-            else:
-                return ScaledSoftmax.apply(input, scale)
-
-    def forward_torch_softmax(self, input, mask):
-        if self.input_in_float16 and self.softmax_in_fp32:
-            input = input.float()
-
-        if self.scale is not None:
-            input = input * self.scale
-        mask_output = self.mask_func(input, mask) if mask is not None else input
-        probs = torch.nn.Softmax(dim=-1)(mask_output)
-
-        if self.input_in_float16 and self.softmax_in_fp32:
-            if self.input_in_fp16:
-                probs = probs.half()
-            else:
-                probs = probs.bfloat16()
-
-        return probs
-
-    @staticmethod
-    def get_batch_per_block(sq, sk, b, np):
-        import scaled_masked_softmax_cuda
-
-        return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/inference_params.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/inference_params.py
deleted file mode 100644
index 287902460fab6d411781fb15c86f0a333b7cf245..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/inference_params.py
+++ /dev/null
@@ -1,27 +0,0 @@
-class InferenceParams:
-    """Inference parameters that are passed to the main model in order
-    to efficienly calculate and store the context during inference."""
-
-    def __init__(self, max_batch_size, max_sequence_length):
-        self.max_sequence_length = max_sequence_length
-        self.max_batch_size = max_batch_size
-        self.sequence_len_offset = 0
-        self.batch_size_offset = 0
-        self.key_value_memory_dict = {}
-
-    def swap_key_value_dict(self, batch_idx):
-        "swap between batches"
-        if len(self.key_value_memory_dict) == 0:
-            raise ValueError("should not swap when dict in empty")
-
-        for layer_number in self.key_value_memory_dict.keys():
-            inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number]
-            assert (
-                len(batch_idx) == inference_key_memory.shape[1]
-            )  # make sure batch size is the same
-            new_inference_key_memory = inference_key_memory[:, batch_idx]
-            new_inference_value_memory = inference_value_memory[:, batch_idx]
-            self.key_value_memory_dict[layer_number] = (
-                new_inference_key_memory,
-                new_inference_value_memory,
-            )
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/model_parallel_config.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/model_parallel_config.py
deleted file mode 100644
index 69cebed4fc7abd3d5ecabb014eeea90258671adc..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/model_parallel_config.py
+++ /dev/null
@@ -1,224 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-from dataclasses import dataclass
-from typing import Callable, Optional
-
-import torch
-
-
-@dataclass
-class ModelParallelConfig:
-    """Base configuration for Megatron Core
-
-    Model Parallelism
-    -----------------
-
-    tensor_model_parallel_size (int): Intra-layer model parallelism. Splits tensors across GPU ranks. Defaults to 1.
-
-    context_parallel_size (int): Splits network input along sequence dimension across GPU ranks. Defaults to 1.
-
-    pipeline_model_parallel_size (int): Inter-layer model parallelism. Splits transformer layers across GPU
-        ranks. Defaults to 1.
-
-    virtual_pipeline_model_parallel_size (int): Interleaved pipeline parallelism is used to improve performance by
-        reducing the pipeline bubble.  Considers a transformer block as a list of smaller transformer (virtual) blocks.
-        The number of virtual blocks per pipeline model parallel rank is the virtual model parallel size.  See Efficient
-        Large-Scale Language Model Training on GPU Clusters Using Megatron-LM: https://arxiv.org/pdf/2104.04473.pdf for
-        more details.  Defaults to None.
-
-    sequence_parallel (bool): Makes tensor parallelism more memory efficient for LLMs (20B+) by
-        parallelizing layer norms and dropout sequentially.  See Reducing Activation Recomputation in Large Transformer
-        Models: https://arxiv.org/abs/2205.05198 for more details. Defaults to False.
-
-    expert_model_parallel_size (int): Distributes Moe Experts across sub data parallel dimension. Defaults to False.
-
-    Initialization
-    --------------
-
-    perform_initialization (bool, default=True): If true, weights are initialized. This option can be useful when you
-        know you are going to load values from a checkpoint.
-
-    use_cpu_initialization: (bool, default=False): When set to False, we initialize the weights directly on the GPU.
-        Transferring weights from CPU to GPU can take a significant amount of time for large models. Defaults to False.
-
-    Training
-    --------
-
-    fp16 (bool): If true, train with fp16 mixed precision training. Defaults to False.
-
-    bf16 (bool): If true, train with bf16 mixed precision training. Defaults to False.
-
-    params_dtype (torch.dtype): dtype used when intializing the weights. Defaults to torch.float32
-
-    timers (optional, default=None): TODO
-
-    Optimizations
-    -------------
-
-    gradient_accumulation_fusion (bool): If true, fuses weight gradient accumulation to GEMMs. Requires the custom CUDA
-        extension fused_weight_gradient_mlp_cuda module. To use gradient_accumulation_fusion you must install APEX with
-        --cpp_ext and --cuda_ext. For example: "pip install --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext\"
-        ". Note that the extension requires CUDA>=11. Otherwise, you must turn off gradient accumulation fusion.
-        Defaults to False.
-
-    async_tensor_model_parallel_allreduce (bool, default=True): If true, enables asynchronous execution of
-        tensor-model-parallel all-reduce with weight gradient compuation of a column-linear layer.  Defaults to False.
-
-    tp_comm_overlap (bool, default=False): If true, allows overlapping of Linear layer execution with tensor parallel
-        communication collectives like AllGather/ReduceScatter. Overlapping is done for the linear layers wherever possible
-        during the forward and the backward pass.  Defaults to False.
-
-    tp_comm_split_ag (bool, default=True): If true, allows All-Gather overlap with Fprop GEMM. Don't care if tp_comm_overlap 
-        is False.
-
-    tp_comm_split_rs (bool, default=True): If true, allows Reduce-Scatter overlap with Fprop GEMM. Don't care if 
-        tp_comm_overlap is False.
-
-    tp_comm_bulk_dgrad (bool, default=True): If true, allows All-Gather overlap with Bprop activation gradient GEMM. Don't 
-        care if tp_comm_overlap is False.
-
-    tp_comm_bulk_wgrad (bool, default=True): If true, allows Reduce-Scatter overlap with Bprop weight gradient GEMM. Don't 
-        care if tp_comm_overlap is False.
-
-    Parallelism
-    -----------
-
-    finalize_model_grads_func (optional): Function that finalizes gradients on all workers. Could include ensuring that
-        grads are all-reduced across data parallelism, pipeline parallelism, and sequence parallelism dimensions.
-
-    Pipeline Parallelism
-    --------------------
-
-    pipeline_dtype (required): dtype used in p2p communication, usually params_dtype
-
-    grad_scale_func (optional, default=None): If using loss scaling, this function should take the loss and return the
-        scaled loss. If None, no function is called on the loss.
-
-    enable_autocast (bool): If true runs the forward step function inside torch.autocast context. Default is False.
-
-    autocast_dtype (torch.dtype): dtype to pass to torch.amp.autocast when enabled. Default is pipeline_dtype.
-    
-    variable_seq_lengths (bool, default=False): Support for variable sequence lengths across microbatches. Setting this
-        communicates the size of tensors during pipeline parallelism communication, because of this extra overhead it
-        should only be set if the sequence length varies by microbatch within a global batch.
-
-    num_microbatches_with_partial_activation_checkpoints (int, default=None): If int, set the number of microbatches
-        where not all of the layers will be checkpointed and recomputed. The rest of the microbatches within the window
-        of maximum outstanding microbatches will recompute all layers (either full recompute or selective recompute). If
-        None, the checkpoint and recompute will be left up to the forward_step function.
-
-    overlap_p2p_comm (bool, optional, default=False): When True some of the peer to peer communication for pipeline
-        parallelism will overlap with computation. Must be False if batch_p2p_comm is true.
-
-    batch_p2p_comm (bool, default=True): Use batch_isend_irecv instead of individual isend/irecv calls. Must be False
-        if overlap_p2p_comm is True.
-
-    batch_p2p_sync (bool, default=True): When using batch_isend_irecv, do a cuda.device.synchronize afterward to work
-        around a bug in older version of PyTorch.
-
-    use_ring_exchange_p2p (bool, default=False): Use custom ring_exchange kernel instead of
-        torch.distributed.batch_isend_irecv(). Requires custom built torch with torch.distributed.ring_exchange.
-
-    deallocate_pipeline_outputs (optional, default=False): If True, output data is deallocated after the tensor is sent
-        to the next pipeline stage.  Helps with saving memory, does nothing when pipeline parallel is not used.
-
-    no_sync_func (optional): Function that creates a context that suppresses asynchronous data-parallel
-        communication. If the model is an instance of core.distributed.DistributedDataParallel, the default is to use
-        core.distributed.DistributedDataParallel.no_sync.
-
-    grad_sync_func (optional): Function that launches asynchronous gradient reductions (e.g. distributed optimizer
-        gradient reduce-scatters). The function should take one argument: an iterable of parameters whose gradients are
-        to be synchronized.
-
-    param_sync_func (optional): Function that launches asynchronous parameter synchronizations (e.g. distributed
-        optimizer parameter all-gathers). The function should take one argument: an iterable of parameters to be
-        synchronized.
-
-    pipeline_model_parallel_split_rank (int, default=None): If int, rank where encoder and decoder should be split in
-        cases where the model has both an encoder and decoder (e.g., T5). Ignored if None.
-
-    barrier_with_L1_time (bool, default=True): If true, use barrier with level 1 time measurements. It is up to the user
-        to make sure calling barrier with their timers will not result in hangs. This can happen if for example the user
-        adds a level 1 timer that is not called by all ranks.
-
-    """
-
-    # Model parallelism
-    tensor_model_parallel_size: int = 1
-    context_parallel_size: int = 1
-    pipeline_model_parallel_size: int = 1
-    virtual_pipeline_model_parallel_size: Optional[int] = None
-    sequence_parallel: bool = False
-    expert_model_parallel_size: int = 1
-
-    # Initialization
-    perform_initialization: bool = True
-    use_cpu_initialization: bool = False
-
-    # Training
-    fp16: bool = False
-    bf16: bool = False
-    params_dtype: torch.dtype = torch.float32
-    timers: Callable = None
-
-    # Optimizations
-    gradient_accumulation_fusion: bool = False
-    async_tensor_model_parallel_allreduce: bool = False
-    tp_comm_overlap: bool = False
-
-    # Debug Options
-    tp_comm_split_ag: bool = True
-    tp_comm_split_rs: bool = True
-    tp_comm_bulk_wgrad: bool = True
-    tp_comm_bulk_dgrad: bool = True
-
-    # Parallelism
-    finalize_model_grads_func: Callable = None
-
-    # Pipeline Parallel
-    pipeline_dtype: torch.dtype = None
-    grad_scale_func: Callable = None
-    enable_autocast: bool = False
-    autocast_dtype: torch.dtype = None
-    variable_seq_lengths: bool = False
-    num_microbatches_with_partial_activation_checkpoints: Optional[int] = None
-    overlap_p2p_comm: bool = False
-    batch_p2p_comm: bool = True
-    batch_p2p_sync: bool = True
-    pp_delay: bool = False
-    pp_split_size: int = 1
-    use_ring_exchange_p2p: bool = False
-    deallocate_pipeline_outputs: bool = False
-    no_sync_func: Callable = None
-    grad_sync_func: Callable = None
-    param_sync_func: Callable = None
-    pipeline_model_parallel_split_rank: Optional[int] = None
-
-    # Timing
-    barrier_with_L1_time: bool = True
-
-    def __post_init__(self):
-        """ Python dataclass method that is used to modify attributes after initialization.
-            See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details.
-        """
-        if self.sequence_parallel:
-            if self.tensor_model_parallel_size <= 1:
-                raise ValueError("Can not use sequence paralllelism without tensor parallelism")
-            if self.async_tensor_model_parallel_allreduce:
-                # sequence_parallelism already does this async
-                self.async_tensor_model_parallel_allreduce = False
-
-        if self.pipeline_model_parallel_size > 1:
-            if self.pipeline_dtype is None:
-                raise ValueError(
-                    "When using pipeline parallelism, pipeline_dtype must be specified"
-                )
-
-        if self.autocast_dtype is None:
-            self.autocast_dtype = self.params_dtype
-
-        if self.expert_model_parallel_size > 1 and self.tensor_model_parallel_size > 1:
-            if self.sequence_parallel is False:
-                raise ValueError(
-                    "When using expert parallelism and tensor parallelism, sequence parallelism must be used"
-                )
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/T5/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/T5/__init__.py
deleted file mode 100644
index f65859a6dafcdfeb650f6b4a0da4fdecfe7f4dcf..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/T5/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .t5_model import T5Model
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/T5/t5_model.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/T5/t5_model.py
deleted file mode 100644
index 28c1c9472b78bf97308ba78e071807a309644395..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/T5/t5_model.py
+++ /dev/null
@@ -1,466 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import logging
-from typing import List, Literal, Optional
-
-import torch
-from torch import Tensor
-
-from megatron_ds.core import InferenceParams, parallel_state, tensor_parallel
-from megatron_ds.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
-from megatron_ds.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
-from megatron_ds.core.models.common.language_module.language_module import LanguageModule
-from megatron_ds.core.transformer.enums import AttnMaskType, ModelType
-from megatron_ds.core.transformer.module import MegatronModule
-from megatron_ds.core.transformer.spec_utils import ModuleSpec
-from megatron_ds.core.transformer.transformer_block import TransformerBlock
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.utils import make_tp_sharded_tensor_for_checkpoint
-
-
-class T5LMHead(MegatronModule):
-    """Masked LM head for T5
-
-    Args:
-        config (TransformerConfig): transformer config
-        parallel_output (bool): wether output logits being distributed or not.
-        vocab_size (int): vocabulary size
-        pre_process (bool): Include embedding layer
-        share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are
-            shared.
-    """
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        parallel_output: bool,
-        vocab_size: int,
-        pre_process: bool = True,
-        share_embeddings_and_output_weights: bool = False,
-    ):
-        super(T5LMHead, self).__init__(config=config)
-
-        self.parallel_output = parallel_output
-
-        self.output_layer = tensor_parallel.ColumnParallelLinear(
-            config.hidden_size,
-            vocab_size,
-            config=config,
-            init_method=config.init_method,
-            bias=share_embeddings_and_output_weights,
-            skip_bias_add=not share_embeddings_and_output_weights,
-            gather_output=not self.parallel_output,
-            skip_weight_param_allocation=pre_process and share_embeddings_and_output_weights,
-        )
-
-    def forward(self, hidden_states: Tensor, word_embeddings_weight: Tensor) -> Tensor:
-        """Forward pass.
-
-        Args:
-            hidden_states (Tensor): output hidden states from decoder
-            word_embeddings_weight (Tensor): word embedding weight
-
-        Returns:
-            Tensor: logits tensor
-        """
-
-        logits, _ = self.output_layer(hidden_states, weight=word_embeddings_weight)
-        return logits
-
-
-class T5Model(LanguageModule):
-    """T5 Language model.
-
-    Args:
-        config (TransformerConfig): transformer config
-
-        transformer_encoder_layer_spec (ModuleSpec): transformer layer customization specs for encoder
-
-        transformer_decoder_layer_spec (ModuleSpec): transformer layer customization specs for decoder
-                
-        vocab_size (int): vocabulary size
-
-        max_sequence_length (int): maximum size of sequence. This is used for positional embedding
-
-        pre_process (bool): Include embedding layer (used with pipeline parallelism)
-        post_process (bool): Include an output layer (used with pipeline parallelism)
-
-        fp16_lm_cross_entropy (bool, optional): Defaults to False
-
-        parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks
-
-        share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are
-            shared. Defaults to False.
-
-        position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope'].
-            Defaults is 'learned_absolute'.
-
-        rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings.
-            Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'.
-
-        seq_len_interpolation_factor (float): scale of linearly interpolating RoPE for longer sequences.
-            The value must be a float larger than 1.0. Defaults to None.
-    """
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        transformer_encoder_layer_spec: ModuleSpec,
-        transformer_decoder_layer_spec: ModuleSpec,
-        vocab_size: int,
-        max_sequence_length: int,
-        pre_process: bool = True,
-        post_process: bool = True,
-        fp16_lm_cross_entropy: bool = False,
-        parallel_output: bool = True,
-        share_embeddings_and_output_weights: bool = False,
-        position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute',
-        rotary_percent: float = 1.0,
-        seq_len_interpolation_factor: Optional[float] = None,
-    ):
-
-        super(T5Model, self).__init__(config=config)
-
-        self.config: TransformerConfig = config
-        self.transformer_encoder_layer_spec: ModuleSpec = transformer_encoder_layer_spec
-        self.transformer_decoder_layer_spec: ModuleSpec = transformer_decoder_layer_spec
-        self.vocab_size = vocab_size
-        self.max_sequence_length = max_sequence_length
-        self.pre_process = pre_process
-        self.post_process = post_process
-        self.add_encoder = True
-        self.add_decoder = True
-        self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
-        self.parallel_output = parallel_output
-        self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
-        self.position_embedding_type = position_embedding_type
-
-        # megatron core pipelining currently depends on model type
-        self.model_type = ModelType.encoder_and_decoder
-
-        # Embeddings.
-        if self.pre_process:
-            self.embedding = LanguageModelEmbedding(
-                config=self.config,
-                vocab_size=self.vocab_size,
-                max_sequence_length=self.max_sequence_length,
-                position_embedding_type=self.position_embedding_type,
-            )
-
-        # Rotary Position Embeddings
-        if self.position_embedding_type == 'rope':
-            self.rotary_pos_emb = RotaryEmbedding(
-                self.config.kv_channels, rotary_percent, seq_len_interpolation_factor
-            )
-
-        # Transformer encoder
-        encoder_spec, decoder_spec = (
-            self.transformer_encoder_layer_spec,
-            self.transformer_decoder_layer_spec,
-        )
-        self.encoder = TransformerBlock(
-            config=self.config,
-            spec=encoder_spec,
-            pre_process=self.pre_process,
-            post_process=self.post_process,
-        )
-        # Transformer decoder
-        self.decoder = TransformerBlock(
-            config=self.config,
-            spec=decoder_spec,
-            pre_process=self.pre_process,
-            post_process=self.post_process,
-        )
-
-        # Output
-        if post_process:
-            self.lm_head = T5LMHead(
-                config,
-                parallel_output,
-                self.vocab_size,
-                self.pre_process,
-                self.share_embeddings_and_output_weights,
-            )
-        self.output_layer = self.lm_head.output_layer
-
-        if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process):
-            self.initialize_last_stage_with_word_embeddings()
-
-    def forward(
-        self,
-        encoder_input_ids: Tensor,
-        decoder_input_ids: Tensor,
-        encoder_attn_mask: Tensor,
-        decoder_attn_mask: Tensor,
-        encoder_decoder_attn_mask: Tensor,
-        lm_labels: Tensor = None,
-        inference_params: InferenceParams = None,
-    ) -> Tensor:
-        """Forward pass.
-
-        Args:
-            encoder_input_ids (Tensor): input ids for encoder
-            decoder_input_ids (Tensor): input ids for decoder
-            encoder_attn_mask (Tensor): self-attention mask for encoder
-            decoder_attn_mask (Tensor): self-attention mask for decoder
-            encoder_decoder_attn_mask (Tensor): cross-attention mask between encoder and decoder
-            lm_labels (Tensor): labels for decoder output
-            inference_params (InferenceParams): relevant arguments for inferencing
-
-        Returns:
-            Tensor: loss tensor
-        """
-
-        (
-            encoder_attn_mask,
-            decoder_attn_mask,
-            encoder_decoder_attn_mask,
-        ) = t5_extended_attention_mask(
-            [encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask]
-        )
-        encoder_position_ids = t5_position_ids(encoder_input_ids)
-        decoder_position_ids = t5_position_ids(decoder_input_ids)
-
-        ## Encoder forward
-        # Encoder embedding.
-        if self.pre_process:
-            encoder_input = self.embedding(
-                input_ids=encoder_input_ids, position_ids=encoder_position_ids
-            )
-        else:
-            # intermediate stage of pipeline
-            encoder_input = None
-
-        # Rotary positional embeddings
-        rotary_pos_emb = None
-        if self.position_embedding_type == 'rope':
-            rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
-                inference_params, self.encoder, encoder_input, self.config
-            )
-            rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len)
-
-        # Run encoder.
-        encoder_hidden_states = self.encoder(
-            hidden_states=encoder_input,
-            attention_mask=encoder_attn_mask,
-            inference_params=inference_params,
-            rotary_pos_emb=rotary_pos_emb,
-        )
-
-        ## Decoder forward
-        # Decoder embedding.
-        if self.pre_process:
-            decoder_input = self.embedding(
-                input_ids=decoder_input_ids, position_ids=decoder_position_ids
-            )
-        else:
-            # intermediate stage of pipeline
-            decoder_input = None  ### should it take encoder_hidden_states
-
-        # Rotary positional embeddings
-        rotary_pos_emb = None
-        if self.position_embedding_type == 'rope':
-            rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
-                inference_params, self.decoder, decoder_input, self.config
-            )
-            rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len)
-
-        # Run decoder.
-        decoder_hidden_states = self.decoder(
-            hidden_states=decoder_input,
-            attention_mask=decoder_attn_mask,
-            context=encoder_hidden_states,
-            context_mask=encoder_decoder_attn_mask,
-            inference_params=inference_params,
-            rotary_pos_emb=rotary_pos_emb,
-        )
-
-        # Return if not post_process
-        if not self.post_process:
-            return decoder_hidden_states
-
-        # logits and loss
-        output_weight = None
-        if self.share_embeddings_and_output_weights:
-            output_weight = self.shared_embedding_or_output_weight()
-        logits = self.lm_head(decoder_hidden_states, word_embeddings_weight=output_weight)
-
-        if lm_labels is None:
-            # [s b h] => [b s h]
-            return logits.transpose(0, 1).contiguous()
-
-        loss = self.compute_language_model_loss(lm_labels, logits)
-
-        return loss
-
-    def set_input_tensor(self, input_tensor):
-        """ See megatron_ds.model.transformer.set_input_tensor()"""
-
-        # This is usually handled in schedules.py but some inference code still
-        # gives us non-lists or None
-        if not isinstance(input_tensor, list):
-            input_tensor = [input_tensor]
-
-        if self.add_encoder and self.add_decoder:
-            assert (
-                len(input_tensor) == 1
-            ), 'input_tensor should only be length 1 for stage with both encoder and decoder'
-            self.encoder.set_input_tensor(input_tensor[0])
-        elif self.add_encoder:
-            assert (
-                len(input_tensor) == 1
-            ), 'input_tensor should only be length 1 for stage with only encoder'
-            self.encoder.set_input_tensor(input_tensor[0])
-        elif self.add_decoder:
-            if len(input_tensor) == 2:
-                self.decoder.set_input_tensor(input_tensor[0])
-                self.encoder_hidden_state = input_tensor[1]
-            elif len(input_tensor) == 1:
-                self.decoder.set_input_tensor(None)
-                self.encoder_hidden_state = input_tensor[0]
-            else:
-                raise Exception('input_tensor must have either length 1 or 2')
-        else:
-            raise Exception('Stage must have at least either encoder or decoder')
-
-    def shared_embedding_or_output_weight(self) -> Tensor:
-        """Function to share the input embeddings and output logit weights."""
-
-        if self.pre_process:
-            return self.embedding.word_embeddings.weight
-        elif self.post_process:
-            return self.lm_head.output_layer.weight
-        return None
-
-    def sharded_state_dict(self, prefix: str = ''):
-        sharded_state_dict = {}
-
-        if self.pre_process:
-            embedding_prefix = f'{prefix}embedding.'
-            embedding_sharded_state_dict = self.embedding.sharded_state_dict(
-                prefix=embedding_prefix
-            )
-            sharded_state_dict.update(embedding_sharded_state_dict)
-
-        encoder_prefix = f'{prefix}encoder.'
-        encoder_sharded_state_dict = self.encoder.sharded_state_dict(prefix=encoder_prefix)
-        sharded_state_dict.update(encoder_sharded_state_dict)
-
-        decoder_prefix = f'{prefix}decoder.'
-        decoder_sharded_state_dict = self.decoder.sharded_state_dict(prefix=decoder_prefix)
-        sharded_state_dict.update(decoder_sharded_state_dict)
-
-        if self.post_process:
-            output_layer_prefix = f'{prefix}output_layer.'
-            output_layer_weight_key = f'{output_layer_prefix}weight'
-            output_layer_bias_key = f'{output_layer_prefix}bias'
-            if self.share_embeddings_and_output_weights:
-                if not self.pre_process:
-                    # when sharing embeddings with last stage, we need to use the weights from the first stage
-                    # on pipeline first rank, word embeddings are saved to {prefix}embedding.word_embeddings.weight
-                    tensor = self.shared_embedding_or_output_weight()
-                    first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight'
-                    dp_rank = parallel_state.get_data_parallel_rank()
-                    dp_size = parallel_state.get_data_parallel_world_size()
-                    last_stage_word_emb_replica_id = (
-                        dp_rank + dp_size
-                    )  # copy of first stage embedding
-
-                    sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint(
-                        tensor=tensor,
-                        key=first_stage_word_emb_key,
-                        replica_id=last_stage_word_emb_replica_id,
-                        allow_shape_mismatch=True,
-                    )
-
-                    sharded_state_dict[output_layer_weight_key] = sharded_output_layer_tensor
-                # output_layer.weight is shared, but we still need to process output_layer.bias
-                sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint(
-                    tensor=self.lm_head.output_layer.bias,
-                    key=output_layer_bias_key,
-                    allow_shape_mismatch=True,
-                )
-                sharded_state_dict[output_layer_bias_key] = sharded_output_layer_tensor
-            else:
-                output_layer_state_dict = self.output_layer.state_dict(
-                    prefix=output_layer_prefix, keep_vars=True
-                )
-                output_layer_tensor = output_layer_state_dict[output_layer_weight_key]
-                # independent output layer
-                sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint(
-                    tensor=output_layer_tensor,
-                    key=output_layer_weight_key,
-                    replica_id=parallel_state.get_data_parallel_rank(),
-                    allow_shape_mismatch=True,
-                )
-
-                sharded_state_dict[output_layer_weight_key] = sharded_output_layer_tensor
-
-        return sharded_state_dict
-
-    def state_dict_for_save_checkpoint(self, prefix: str = '', keep_vars: bool = False):
-        """For easy load when model is combined with other heads,
-        add an extra key."""
-
-        state_dict_ = {}
-        state_dict_["embedding"] = self.embedding.state_dict_for_save_checkpoint(
-            prefix=prefix, keep_vars=keep_vars
-        )
-        state_dict_["encoder"] = self.encoder.state_dict_for_save_checkpoint(
-            prefix=prefix, keep_vars=keep_vars
-        )
-        state_dict_["decoder"] = self.decoder.state_dict_for_save_checkpoint(
-            prefix=prefix, keep_vars=keep_vars
-        )
-
-        if self.post_process and self.add_decoder:
-            state_dict_["lm_head"] = self.lm_head.state_dict_for_save_checkpoint(
-                prefix=prefix, keep_vars=keep_vars
-            )
-        # Save word_embeddings.
-        if self.post_process and not self.pre_process and self.add_decoder:
-            state_dict_["word_embeddings_for_head"] = self.embedding.state_dict(
-                prefix=prefix, keep_vars=keep_vars
-            )
-        return state_dict_
-
-    def load_state_dict(self, state_dict, strict=True):
-        """Customized load."""
-        self.embedding.load_state_dict(state_dict["embedding"], strict=strict)
-
-        self.encoder.load_state_dict(state_dict["encoder"], strict=strict)
-
-        self.decoder.load_state_dict(state_dict["decoder"], strict=strict)
-
-        if self.post_process and self.add_decoder:
-            self.lm_head.load_state_dict(state_dict["lm_head"], strict=strict)
-
-        # Load word embeddings
-        if self.post_process and not self.pre_process and self.add_decoder:
-            self.word_embeddings.load_state_dict(
-                state_dict["word_embeddings_for_head"], strict=strict
-            )
-
-
-def t5_extended_attention_mask(attention_mask_list: List[Tensor]) -> List[Tensor]:
-    def attn_mask_postprocess(attn_mask):
-        # [b, 1, s, s]
-        extended_attention_mask = attn_mask.unsqueeze(1)
-        return extended_attention_mask
-
-    return [attn_mask_postprocess(attn_mask) for attn_mask in attention_mask_list]
-
-
-def t5_position_ids(token_ids: Tensor) -> Tensor:
-    """Calculate position ids from token ids
-    Args:
-        token_ids (Tensor): input tokens
-
-    Returns:
-        Tensor: position ids
-    """
-    seq_length = token_ids.size(1)
-    position_ids = torch.arange(seq_length, dtype=torch.long, device=token_ids.device)
-    position_ids = position_ids.unsqueeze(0).expand_as(token_ids)
-
-    return position_ids
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/T5/t5_spec.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/T5/t5_spec.py
deleted file mode 100644
index 1dfb640e61a5480107dd4d1796601ef92e03215e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/T5/t5_spec.py
+++ /dev/null
@@ -1,212 +0,0 @@
-from megatron_ds.core.fusions.fused_bias_dropout import get_bias_dropout_add
-from megatron_ds.core.fusions.fused_layer_norm import FusedLayerNorm
-from megatron_ds.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
-from megatron_ds.core.transformer.attention import (
-    CrossAttention,
-    CrossAttentionSubmodules,
-    SelfAttention,
-    SelfAttentionSubmodules,
-)
-from megatron_ds.core.transformer.custom_layers.transformer_engine import (
-    TEColumnParallelLinear,
-    TEDotProductAttention,
-    TELayerNormColumnParallelLinear,
-    TENorm,
-    TERowParallelLinear,
-)
-from megatron_ds.core.transformer.dot_product_attention import DotProductAttention
-from megatron_ds.core.transformer.enums import AttnMaskType
-from megatron_ds.core.transformer.mlp import MLP, MLPSubmodules
-from megatron_ds.core.transformer.spec_utils import ModuleSpec
-from megatron_ds.core.transformer.transformer_block import (
-    TransformerBlockSubmodules,
-    get_num_layers_to_build,
-)
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
-
-
-def encoder_model_with_transformer_engine_default_spec() -> ModuleSpec:
-    """T5 encoder TE spec (uses Transformer Engine components)."""
-
-    return ModuleSpec(
-        module=TransformerLayer,
-        submodules=TransformerLayerSubmodules(
-            self_attention=ModuleSpec(
-                module=SelfAttention,
-                params={"attn_mask_type": AttnMaskType.padding},
-                submodules=SelfAttentionSubmodules(
-                    linear_qkv=TELayerNormColumnParallelLinear,
-                    core_attention=TEDotProductAttention,
-                    linear_proj=TERowParallelLinear,
-                ),
-            ),
-            self_attn_bda=get_bias_dropout_add,
-            mlp=ModuleSpec(
-                module=MLP,
-                submodules=MLPSubmodules(
-                    linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear,
-                ),
-            ),
-            mlp_bda=get_bias_dropout_add,
-        ),
-    )
-
-
-def decoder_model_with_transformer_engine_default_spec() -> ModuleSpec:
-    """T5 decoder TE spec (uses Transformer Engine components)."""
-
-    return ModuleSpec(
-        module=TransformerLayer,
-        submodules=TransformerLayerSubmodules(
-            self_attention=ModuleSpec(
-                module=SelfAttention,
-                params={"attn_mask_type": AttnMaskType.causal},
-                submodules=SelfAttentionSubmodules(
-                    linear_qkv=TELayerNormColumnParallelLinear,
-                    core_attention=TEDotProductAttention,
-                    linear_proj=TERowParallelLinear,
-                ),
-            ),
-            self_attn_bda=get_bias_dropout_add,
-            pre_cross_attn_layernorm=TENorm,
-            cross_attention=ModuleSpec(
-                module=CrossAttention,
-                submodules=CrossAttentionSubmodules(
-                    linear_q=TEColumnParallelLinear,
-                    linear_kv=TEColumnParallelLinear,
-                    core_attention=TEDotProductAttention,
-                    linear_proj=TERowParallelLinear,
-                ),
-            ),
-            cross_attn_bda=get_bias_dropout_add,
-            mlp=ModuleSpec(
-                module=MLP,
-                submodules=MLPSubmodules(
-                    linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear,
-                ),
-            ),
-            mlp_bda=get_bias_dropout_add,
-        ),
-    )
-
-
-def encoder_model_with_local_spec() -> ModuleSpec:
-    """T5 encoder local spec (uses Megatron-Core components)."""
-
-    return ModuleSpec(
-        module=TransformerLayer,
-        submodules=TransformerLayerSubmodules(
-            input_layernorm=FusedLayerNorm,
-            self_attention=ModuleSpec(
-                module=SelfAttention,
-                params={"attn_mask_type": AttnMaskType.padding},
-                submodules=SelfAttentionSubmodules(
-                    linear_qkv=ColumnParallelLinear,
-                    core_attention=DotProductAttention,
-                    linear_proj=RowParallelLinear,
-                ),
-            ),
-            self_attn_bda=get_bias_dropout_add,
-            pre_mlp_layernorm=FusedLayerNorm,
-            mlp=ModuleSpec(
-                module=MLP,
-                submodules=MLPSubmodules(
-                    linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,
-                ),
-            ),
-            mlp_bda=get_bias_dropout_add,
-        ),
-    )
-
-
-def decoder_model_with_local_spec() -> ModuleSpec:
-    """T5 decoder local spec (uses Megatron-Core components)."""
-
-    return ModuleSpec(
-        module=TransformerLayer,
-        submodules=TransformerLayerSubmodules(
-            input_layernorm=FusedLayerNorm,
-            self_attention=ModuleSpec(
-                module=SelfAttention,
-                params={"attn_mask_type": AttnMaskType.causal},
-                submodules=SelfAttentionSubmodules(
-                    linear_qkv=ColumnParallelLinear,
-                    core_attention=DotProductAttention,
-                    linear_proj=RowParallelLinear,
-                ),
-            ),
-            self_attn_bda=get_bias_dropout_add,
-            pre_cross_attn_layernorm=FusedLayerNorm,
-            cross_attention=ModuleSpec(
-                module=CrossAttention,
-                submodules=CrossAttentionSubmodules(
-                    linear_q=ColumnParallelLinear,
-                    linear_kv=ColumnParallelLinear,
-                    core_attention=DotProductAttention,
-                    linear_proj=RowParallelLinear,
-                ),
-            ),
-            cross_attn_bda=get_bias_dropout_add,
-            pre_mlp_layernorm=FusedLayerNorm,
-            mlp=ModuleSpec(
-                module=MLP,
-                submodules=MLPSubmodules(
-                    linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,
-                ),
-            ),
-            mlp_bda=get_bias_dropout_add,
-        ),
-    )
-
-
-def get_t5_encoder_with_transformer_engine_block_spec(
-    num_layers: int,
-) -> TransformerBlockSubmodules:
-    """T5 encoder block spec for Transformer Engine
-
-    Args:
-      config (TransformerConfig): config, containing number of layers for encoder
-    """
-
-    layer_spec = encoder_model_with_transformer_engine_default_spec()
-    block_spec = TransformerBlockSubmodules([layer_spec] * num_layers)
-    return block_spec
-
-
-def get_t5_decoder_with_transformer_engine_block_spec(
-    num_layers: int,
-) -> TransformerBlockSubmodules:
-    """T5 decoder block spec for Transformer Engine
-
-    Args:
-      config (TransformerConfig): config, containing number of layers for decoder
-    """
-
-    layer_spec = decoder_model_with_transformer_engine_default_spec()
-    block_spec = TransformerBlockSubmodules([layer_spec] * num_layers)
-    return block_spec
-
-
-def get_t5_encoder_with_local_block_spec(num_layers: int) -> TransformerBlockSubmodules:
-    """T5 encoder block spec for local (uses Megatron-Core components)
-
-    Args:
-      num_layers (int): number of encoder layers
-    """
-
-    layer_spec = encoder_model_with_local_spec()
-    block_spec = TransformerBlockSubmodules([layer_spec] * num_layers)
-    return block_spec
-
-
-def get_t5_decoder_with_local_block_spec(num_layers: int) -> TransformerBlockSubmodules:
-    """T5 decoder block spec for local (uses Megatron-Core components)
-
-    Args:
-      num_layers (int): number of decoder layers
-    """
-
-    layer_spec = decoder_model_with_local_spec()
-    block_spec = TransformerBlockSubmodules([layer_spec] * num_layers)
-    return block_spec
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/bert/bert_layer_specs.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/bert/bert_layer_specs.py
deleted file mode 100644
index a72e3899f94aa43c5292e05d59dba5437c1bc2f2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/bert/bert_layer_specs.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from megatron_ds.core.fusions.fused_bias_dropout import get_bias_dropout_add
-from megatron_ds.core.fusions.fused_layer_norm import FusedLayerNorm
-from megatron_ds.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
-from megatron_ds.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
-from megatron_ds.core.transformer.custom_layers.transformer_engine import (
-    TEDotProductAttention,
-    TELayerNormColumnParallelLinear,
-    TERowParallelLinear,
-)
-from megatron_ds.core.transformer.dot_product_attention import DotProductAttention
-from megatron_ds.core.transformer.enums import AttnMaskType
-from megatron_ds.core.transformer.mlp import MLP, MLPSubmodules
-from megatron_ds.core.transformer.spec_utils import ModuleSpec
-from megatron_ds.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
-
-# Use this spec to use lower level Transformer Engine modules (required for fp8 training)
-bert_layer_with_transformer_engine_spec = ModuleSpec(
-    module=TransformerLayer,
-    submodules=TransformerLayerSubmodules(
-        self_attention=ModuleSpec(
-            module=SelfAttention,
-            params={"attn_mask_type": AttnMaskType.padding},
-            submodules=SelfAttentionSubmodules(
-                linear_qkv=TELayerNormColumnParallelLinear,
-                core_attention=TEDotProductAttention,
-                linear_proj=TERowParallelLinear,
-            ),
-        ),
-        self_attn_bda=get_bias_dropout_add,
-        mlp=ModuleSpec(
-            module=MLP,
-            submodules=MLPSubmodules(
-                linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear,
-            ),
-        ),
-        mlp_bda=get_bias_dropout_add,
-    ),
-)
-
-# Use this spec for an implementation using only modules in megatron core
-bert_layer_local_spec = ModuleSpec(
-    module=TransformerLayer,
-    submodules=TransformerLayerSubmodules(
-        input_layernorm=FusedLayerNorm,
-        self_attention=ModuleSpec(
-            module=SelfAttention,
-            params={"attn_mask_type": AttnMaskType.padding},
-            submodules=SelfAttentionSubmodules(
-                linear_qkv=ColumnParallelLinear,
-                core_attention=DotProductAttention,
-                linear_proj=RowParallelLinear,
-            ),
-        ),
-        self_attn_bda=get_bias_dropout_add,
-        pre_mlp_layernorm=FusedLayerNorm,
-        mlp=ModuleSpec(
-            module=MLP,
-            submodules=MLPSubmodules(
-                linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,
-            ),
-        ),
-        mlp_bda=get_bias_dropout_add,
-    ),
-)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/bert/bert_lm_head.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/bert/bert_lm_head.py
deleted file mode 100644
index cf3d36aadfe656fdf0c1e2fdb459f5ee8a780a65..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/bert/bert_lm_head.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import torch
-from torch import Tensor
-
-from megatron_ds.core import tensor_parallel
-from megatron_ds.core.transformer.module import MegatronModule
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.transformer.utils import erf_gelu, get_linear_layer, openai_gelu
-from megatron_ds.model import LayerNorm
-
-
-class BertLMHead(MegatronModule):
-    """Masked LM head for Bert
-
-    Args:
-        hidden_size: hidden size
-        config (TransformerConfig): TransformerConfig object
-        parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks
-        vocab_size(int): The vocabulary size
-        share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are shared. Defaults to False
-        pre_process (bool): Include embedding layer (used with pipeline parallelism)
-    """
-
-    def __init__(
-        self,
-        hidden_size: int,
-        config: TransformerConfig,
-        parallel_output: bool,
-        vocab_size: int,
-        pre_process: bool,
-        share_embeddings_and_output_weights: bool = False,
-    ):
-        super().__init__(config=config)
-
-        self.vocab_size = vocab_size
-        self.parallel_output = parallel_output
-
-        # TODO: Shoudl switch this to TE ?
-        self.dense = get_linear_layer(
-            hidden_size, hidden_size, config.init_method, config.perform_initialization
-        )
-
-        setattr(self.dense.weight, 'sequence_parallel', config.sequence_parallel)
-        setattr(self.dense.bias, 'sequence_parallel', config.sequence_parallel)
-
-        self.layernorm = LayerNorm(
-            hidden_size, eps=config.layernorm_epsilon, sequence_parallel=config.sequence_parallel
-        )
-
-        self.gelu = torch.nn.functional.gelu
-        # TODO Use activation_func in config to determine what to use
-        # if config.openai_gelu: # Dont have these configs in transfomer config yet
-        #    self.gelu = openai_gelu
-        # elif config.onnx_safe: # Dont have these configs in transfomer config yet
-        #   self.gelu = erf_gelu
-
-        self.output_layer = tensor_parallel.ColumnParallelLinear(
-            config.hidden_size,
-            self.vocab_size,
-            config=config,
-            init_method=config.init_method,
-            bias=True,
-            skip_bias_add=False,
-            gather_output=not self.parallel_output,
-            skip_weight_param_allocation=pre_process and share_embeddings_and_output_weights,
-        )
-
-    def forward(self, hidden_states: Tensor, word_embeddings_weight: Tensor) -> Tensor:
-        hidden_states = self.dense(hidden_states)
-        hidden_states = self.gelu(hidden_states)
-        hidden_states = self.layernorm(hidden_states)
-        logits, _ = self.output_layer(hidden_states, weight=word_embeddings_weight)
-        return logits
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/bert/bert_model.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/bert/bert_model.py
deleted file mode 100644
index ba68b842ecd093321c71e691f8a04459b313186c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/bert/bert_model.py
+++ /dev/null
@@ -1,234 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-from typing import Literal, Optional
-
-import torch
-from torch import Tensor
-
-from megatron_ds.core.models.bert.bert_lm_head import BertLMHead
-from megatron_ds.core.models.bert.pooler import Pooler
-from megatron_ds.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
-from megatron_ds.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
-from megatron_ds.core.models.common.language_module.language_module import LanguageModule
-from megatron_ds.core.transformer.enums import AttnMaskType, ModelType
-from megatron_ds.core.transformer.spec_utils import ModuleSpec
-from megatron_ds.core.transformer.transformer_block import TransformerBlock
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.transformer.utils import get_linear_layer
-from megatron_ds.model.bert_model import bert_extended_attention_mask, bert_position_ids
-
-
-class BertModel(LanguageModule):
-    """Transformer language model.
-
-    Args:
-        config (TransformerConfig): transformer config
-        num_tokentypes (int) : Set to 2 when args.bert_binary_head is True, and 0 otherwise. Defaults to 0.
-        transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers
-        vocab_size (int): vocabulary size
-        max_sequence_length (int): maximum size of sequence. This is used for positional embedding
-        pre_process (bool): Include embedding layer (used with pipeline parallelism)
-        post_process (bool): Include an output layer (used with pipeline parallelism)
-        parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks
-        share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are shared. Defaults to False.
-        position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope'].
-            Defaults is 'learned_absolute'.
-        rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings.
-            Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'.
-    """
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        num_tokentypes: int,
-        transformer_layer_spec: ModuleSpec,
-        vocab_size: int,
-        max_sequence_length: int,
-        pre_process: bool = True,
-        post_process: bool = True,
-        fp16_lm_cross_entropy: bool = False,
-        parallel_output: bool = True,
-        share_embeddings_and_output_weights: bool = False,
-        position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute',
-        rotary_percent: float = 1.0,
-        seq_len_interpolation_factor: Optional[float] = None,
-        add_binary_head=True,
-        return_embeddings=False,
-    ):
-        super(BertModel, self).__init__(config=config)
-
-        if return_embeddings:
-            assert self.post_process and self.add_binary_head
-
-        self.config: TransformerConfig = config
-        self.transformer_layer_spec: ModuleSpec = transformer_layer_spec
-        self.vocab_size = vocab_size
-        self.max_sequence_length = max_sequence_length
-        self.pre_process = pre_process
-        self.post_process = post_process
-        self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
-        self.parallel_output = parallel_output
-        self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
-        self.position_embedding_type = position_embedding_type
-        self.add_binary_head = add_binary_head
-        self.return_embeddings = return_embeddings
-
-        # megatron core pipelining currently depends on model type
-        self.model_type = ModelType.encoder_or_decoder
-
-        # Embeddings.
-        if self.pre_process:
-            self.embedding = LanguageModelEmbedding(
-                config=self.config,
-                vocab_size=self.vocab_size,
-                max_sequence_length=self.max_sequence_length,
-                position_embedding_type=position_embedding_type,
-                num_tokentypes=num_tokentypes,
-            )
-
-        if self.position_embedding_type == 'rope':
-            self.rotary_pos_emb = RotaryEmbedding(
-                self.config.kv_channels, rotary_percent, seq_len_interpolation_factor
-            )
-
-        # Transformer.
-        self.encoder = TransformerBlock(
-            config=self.config,
-            spec=self.transformer_layer_spec,
-            pre_process=self.pre_process,
-            post_process=self.post_process,
-        )
-
-        # Output
-        if post_process:
-            # TODO: Make sure you are passing in the mpu_vocab_size properly
-            self.lm_head = BertLMHead(
-                config.hidden_size,
-                config,
-                parallel_output,
-                self.vocab_size,
-                self.pre_process,
-                self.share_embeddings_and_output_weights,
-            )
-
-            self.output_layer = self.lm_head.output_layer
-
-            self.binary_head = None
-            if self.add_binary_head:
-                # TODO: Shoudl switch this to TE ?
-                self.binary_head = get_linear_layer(
-                    config.hidden_size, 2, config.init_method, config.perform_initialization
-                )
-
-                self.pooler = Pooler(
-                    config.hidden_size, config.init_method, config, config.sequence_parallel
-                )
-
-        if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process):
-            self.initialize_last_stage_with_word_embeddings()
-
-    def set_input_tensor(self, input_tensor: Tensor) -> None:
-        """Sets input tensor to the model.
-
-        See megatron_ds.model.transformer.set_input_tensor()
-
-        Args:
-            input_tensor (Tensor): Sets the input tensor for the model.
-        """
-        # This is usually handled in schedules.py but some inference code still
-        # gives us non-lists or None
-        if not isinstance(input_tensor, list):
-            input_tensor = [input_tensor]
-
-        assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt/bert'
-        self.encoder.set_input_tensor(input_tensor[0])
-
-    def forward(
-        self,
-        input_ids: Tensor,
-        attention_mask: Tensor,
-        tokentype_ids: Tensor = None,
-        lm_labels: Tensor = None,
-        inference_params=None,
-    ):
-        """Forward function of BERT model
-
-        Forward function of the BERT Model This function passes the input tensors
-        through the embedding layer, and then the encoder and finally into the post
-        processing layer (optional).
-
-        It either returns the Loss values if labels are given  or the final hidden units
-        """
-        extended_attention_mask = bert_extended_attention_mask(attention_mask)
-
-        position_ids = bert_position_ids(input_ids)
-
-        # Encoder embedding.
-        if self.pre_process:
-            encoder_input = self.embedding(
-                input_ids=input_ids, position_ids=position_ids, tokentype_ids=tokentype_ids
-            )
-        else:
-            # intermediate stage of pipeline
-            # decoder will get hidden_states from encoder.input_tensor
-            encoder_input = None
-
-        # Rotary positional embeddings (Why not move this into BERT/GPTEmberdding ?)
-        rotary_pos_emb = None
-        if self.position_embedding_type == 'rope':
-            rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
-                inference_params, self.encoder, encoder_input, self.config
-            )
-            rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len)
-
-        # Run decoder.
-        hidden_states = self.encoder(
-            hidden_states=encoder_input,
-            attention_mask=extended_attention_mask,
-            inference_params=inference_params,
-            rotary_pos_emb=rotary_pos_emb,
-        )
-        if not self.post_process:
-            return hidden_states
-
-        if self.add_binary_head:
-            pooled_output = self.pooler(hidden_states, 0)
-
-        if self.return_embeddings:
-            embeddings = torch.transpose(hidden_states, 0, 1)
-            masks = torch.sum(attention_mask, dim=1)
-            # Collect masked embeddings.
-            output = torch.zeros(
-                size=(embeddings.shape[0], embeddings.shape[2]),
-                dtype=torch.float32,
-                device=torch.cuda.current_device(),
-            )
-            for i, (embedding, mask) in enumerate(zip(embeddings, masks)):
-                output[i, :] = torch.mean(embedding[1 : mask - 1], dim=0)
-            return output
-
-        # logits and loss
-        output_weight = None
-        if self.share_embeddings_and_output_weights:
-            output_weight = self.shared_embedding_or_output_weight()
-
-        logits = self.lm_head(hidden_states=hidden_states, word_embeddings_weight=output_weight)
-
-        binary_logits = None
-        if self.binary_head is not None:
-            binary_logits = self.binary_head(pooled_output)
-
-        if lm_labels is None:
-            # [s b h] => [b s h]
-            return logits.transpose(0, 1).contiguous(), binary_logits
-
-        loss = self.compute_language_model_loss(lm_labels, logits)
-
-        return loss, binary_logits
-
-    # TODO: add distributed checkpointing
-    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
-        pass
-
-    # TODO: add distributed checkpointing
-    def load_state_dict(self, state_dict, strict=True):
-        pass
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/bert/pooler.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/bert/pooler.py
deleted file mode 100644
index 9831e8b0bc4a776c3b8f63f57b2149e2dae5b992..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/bert/pooler.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import torch
-from torch import Tensor
-
-from megatron_ds.core import tensor_parallel
-from megatron_ds.core.transformer.module import MegatronModule
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.transformer.utils import get_linear_layer
-
-
-class Pooler(MegatronModule):
-    """Pooler layer.
-
-    Pool hidden states of a specific token (for example start of the
-    sequence) and add a linear transformation followed by a tanh.
-
-    Args:
-        hidden_size (int): The hidden size_
-        init_method (callable): weight initialization method for the linear layer. bias is set to zero.
-        config (TransformerConfig): The transformer configuration
-        sequence_parallel (bool): Using squence parallel ? Defaults to False
-    """
-
-    def __init__(
-        self,
-        hidden_size: int,
-        init_method: callable,
-        config: TransformerConfig,
-        sequence_parallel: bool = False,
-    ):
-        super(Pooler, self).__init__(config)
-        # TODO: Shoudl switch this to TE ?
-        self.dense = get_linear_layer(
-            hidden_size, hidden_size, init_method, config.perform_initialization
-        )
-        self.sequence_parallel = sequence_parallel
-
-    def forward(self, hidden_states: Tensor, sequence_index=0):
-        # hidden_states: [s, b, h]
-        # sequence_index: index of the token to pool.
-
-        # gather data along sequence dimensions
-        # same pooler is run on all tensor parallel nodes
-        if self.sequence_parallel:
-            hidden_states = tensor_parallel.gather_from_sequence_parallel_region(
-                hidden_states, tensor_parallel_output_grad=False
-            )
-
-        pooled = hidden_states[sequence_index, :, :]
-        pooled = self.dense(pooled)
-        pooled = torch.tanh(pooled)
-        return pooled
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/embeddings/language_model_embedding.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/embeddings/language_model_embedding.py
deleted file mode 100644
index d2b49168b0a07f7b346111efa2b0eaf9ab5a1275..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/embeddings/language_model_embedding.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-from typing import Literal, Optional
-
-import torch
-from torch import Tensor
-
-from megatron_ds.core import tensor_parallel
-from megatron_ds.core.transformer.module import MegatronModule
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.utils import (
-    make_sharded_tensor_for_checkpoint,
-    make_tp_sharded_tensor_for_checkpoint,
-)
-
-
-class LanguageModelEmbedding(MegatronModule):
-    """Language model embeddings.
-
-    Arguments:
-        config (TransformerConfig): config object with all necessary configs for TransformerBlock
-        vocab_size (int): vocabulary size
-        max_sequence_length (int): maximum size of sequence. This
-                             is used for positional embedding
-        add_position_embedding (bool): Add a position embedding.
-        embedding_dropout_prob (float): dropout probability for embeddings
-        num_tokentypes (int): Set to 0 without binary head, and 2 with a binary head . Defaults to 0.
-    """
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        vocab_size: int,
-        max_sequence_length: int,
-        position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute',
-        num_tokentypes: int = 0,
-    ):
-        super().__init__(config=config)
-
-        self.config: TransformerConfig = config
-        self.vocab_size: int = vocab_size
-        self.max_sequence_length: int = max_sequence_length
-        self.add_position_embedding: bool = position_embedding_type == 'learned_absolute'
-        self.num_tokentypes = num_tokentypes
-
-        # Word embeddings (parallel).
-        self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
-            num_embeddings=self.vocab_size,
-            embedding_dim=self.config.hidden_size,
-            init_method=self.config.init_method,
-            config=self.config,
-        )
-
-        # Position embedding (serial).
-        if self.add_position_embedding:
-            self.position_embeddings = torch.nn.Embedding(
-                self.max_sequence_length, self.config.hidden_size
-            )
-
-            # Initialize the position embeddings.
-            if self.config.perform_initialization:
-                self.config.init_method(self.position_embeddings.weight)
-
-        if self.num_tokentypes > 0:
-            self.tokentype_embeddings = torch.nn.Embedding(
-                self.num_tokentypes, self.config.hidden_size
-            )
-            # Initialize the token-type embeddings.
-            if self.config.perform_initialization:
-                self.config.init_method(self.tokentype_embeddings.weight)
-        else:
-            self.tokentype_embeddings = None
-
-        # Embeddings dropout
-        self.embedding_dropout = torch.nn.Dropout(self.config.hidden_dropout)
-
-    def zero_parameters(self):
-        """Zero out all parameters in embedding."""
-        self.word_embeddings.weight.data.fill_(0)
-        self.word_embeddings.weight.shared = True
-        self.position_embeddings.weight.data.fill_(0)
-        self.position_embeddings.weight.shared = True
-        if self.num_tokentypes > 0:
-            self.tokentype_embeddings.weight.data.fill_(0)
-            self.tokentype_embeddings.weight.shared = True
-
-    def forward(self, input_ids: Tensor, position_ids: Tensor, tokentype_ids: int = None) -> Tensor:
-        """Forward pass of the embedding module
-        Args:
-            input_ids (Tensor): The input tokens
-            position_ids (Tensor): The position id's used to calculate position embeddings
-            tokentype_ids (int): The token type ids. Used when args.bert_binary_head is set to True. Defaults to None
-
-        Returns:
-            Tensor: The output embeddings
-        """
-        word_embeddings = self.word_embeddings(input_ids)
-        if self.add_position_embedding:
-            position_embeddings = self.position_embeddings(position_ids)
-            embeddings = word_embeddings + position_embeddings
-        else:
-            embeddings = word_embeddings
-
-        # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
-        embeddings = embeddings.transpose(0, 1).contiguous()
-
-        if tokentype_ids is not None:
-            assert self.tokentype_embeddings is not None
-            # [b s h] -> [s b h] (So that it can be added with embeddings)
-            tokentype_embedding = self.tokentype_embeddings(tokentype_ids).permute(1, 0, 2)
-            embeddings = embeddings + tokentype_embedding
-        else:
-            assert self.tokentype_embeddings is None
-
-        # If the input flag for fp32 residual connection is set, convert for float.
-        if self.config.fp32_residual_connection:
-            embeddings = embeddings.float()
-
-        # Dropout.
-        if self.config.sequence_parallel:
-            embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings)
-            # `scatter_to_sequence_parallel_region` returns a view, which prevents
-            # the original tensor from being garbage collected. Clone to facilitate GC.
-            # Has a small runtime cost (~0.5%).
-            if self.config.clone_scatter_output_in_embedding:
-                embeddings = embeddings.clone()
-            with tensor_parallel.get_cuda_rng_tracker().fork():
-                embeddings = self.embedding_dropout(embeddings)
-        else:
-            embeddings = self.embedding_dropout(embeddings)
-
-        return embeddings
-
-    def sharded_state_dict(self, prefix=''):
-
-        sharded_state_dict = {}
-
-        word_embeddings_prefix = f'{prefix}word_embeddings.'
-        word_embeddings_state_dict = self.word_embeddings.state_dict(
-            prefix=word_embeddings_prefix, keep_vars=True
-        )
-
-        sharded_word_embeddings_key = f'{word_embeddings_prefix}weight'
-        sharded_word_embeddings_tensor = make_tp_sharded_tensor_for_checkpoint(
-            tensor=word_embeddings_state_dict[sharded_word_embeddings_key],
-            key=sharded_word_embeddings_key,
-            allow_shape_mismatch=True,
-        )
-        sharded_state_dict[sharded_word_embeddings_key] = sharded_word_embeddings_tensor
-
-        if self.add_position_embedding:
-            position_embeddings_prefix = f'{prefix}position_embeddings.'
-            position_embeddings_state_dict = self.position_embeddings.state_dict(
-                prefix=position_embeddings_prefix, keep_vars=True
-            )
-            sharded_position_embeddings_key = f'{position_embeddings_prefix}weight'
-            sharded_position_embeddings_tensor = make_sharded_tensor_for_checkpoint(
-                tensor=position_embeddings_state_dict[sharded_position_embeddings_key],
-                key=sharded_position_embeddings_key,
-            )
-            sharded_state_dict[sharded_position_embeddings_key] = sharded_position_embeddings_tensor
-
-        return sharded_state_dict
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/embeddings/rotary_pos_embedding.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/embeddings/rotary_pos_embedding.py
deleted file mode 100644
index 5427ae822a3f6fcfd3c9ffc574810523d55062ad..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/embeddings/rotary_pos_embedding.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from megatron_ds.core.transformer.transformer_config import TransformerConfig
-    from megatron_ds.core.transformer.transformer_block import TransformerBlock
-
-import torch
-from torch import Tensor, nn
-
-from megatron_ds.core import parallel_state
-
-__all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb']
-
-
-def get_pos_emb_on_this_cp_rank(pos_emb, seq_dim):
-    cp_size = parallel_state.get_context_parallel_world_size()
-    cp_rank = parallel_state.get_context_parallel_rank()
-    cp_idx = torch.tensor([cp_rank, (2 * cp_size - cp_rank - 1)], device=pos_emb.device)
-    pos_emb = pos_emb.view(
-        *pos_emb.shape[:seq_dim], 2 * cp_size, -1, *pos_emb.shape[(seq_dim + 1) :]
-    )
-    pos_emb = pos_emb.index_select(seq_dim, cp_idx)
-    pos_emb = pos_emb.view(*pos_emb.shape[:seq_dim], -1, *pos_emb.shape[(seq_dim + 2) :])
-    return pos_emb
-
-
-class RotaryEmbedding(nn.Module):
-    """Rotary Embedding for language model.
-
-    Args:
-        kv_channels (int): Projection weights dimension in multi-head attention. Obtained from transformer config
-        rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings.
-        seq_len_interpolation_factor (float, optional): scale of linearly interpolating RoPE for longer sequences. The value must be a float larger than 1.0. Defaults to None
-        rotary_base (int, optional): Base period for rotary position embeddings. Defaults to 10000.
-    """
-
-    def __init__(
-        self,
-        kv_channels: int,
-        rotary_percent: float,
-        seq_len_interpolation_factor: float = None,
-        rotary_base: int = 10000,
-    ) -> None:
-        super().__init__()
-
-        dim = kv_channels
-        if rotary_percent < 1.0:
-            dim = int(dim * rotary_percent)
-
-        self.seq_len_interpolation_factor = seq_len_interpolation_factor
-        self.inv_freq = 1.0 / (
-            rotary_base
-            ** (
-                torch.arange(0, dim, 2, dtype=torch.float32, device=torch.cuda.current_device())
-                / dim
-            )
-        )
-
-    def forward(self, max_seq_len: int, offset: int = 0) -> Tensor:
-        """Forward pass of RoPE embedding.
-
-        Args:
-            max_seq_len (int): Maximum size of sequence
-            offset (int, optional): _description_. Defaults to 0.
-
-        Returns:
-            Tensor: Embeddings after applying RoPE.
-        """
-        seq = (
-            torch.arange(max_seq_len, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
-            + offset
-        )
-
-        if self.seq_len_interpolation_factor is not None:
-            seq *= 1 / self.seq_len_interpolation_factor
-
-        freqs = torch.outer(seq, self.inv_freq)
-        # first part even vector components, second part odd vector components,
-        #  2 * dim in dimension size
-        emb = torch.cat((freqs, freqs), dim=-1)
-        # emb [seq_length, .., dim]
-        emb = emb[:, None, None, :]
-        if parallel_state.get_context_parallel_world_size() > 1:
-            # slice rotary_pos_emb along sequence dimension and select the parition of the current CP rank
-            emb = get_pos_emb_on_this_cp_rank(emb, 0)
-        return emb
-
-    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
-        state_dict.pop(f'{prefix}inv_freq', None)
-        return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
-
-    def get_rotary_seq_len(
-        self,
-        inference_params,
-        transformer: TransformerBlock,
-        transformer_input: Tensor,
-        transformer_config: TransformerConfig,
-    ) -> float:
-        """Function to get the rotary sequence length.
-
-        Args:
-            inference_params : Used during Inference time
-            transformer (TransformerBlock): The transformer block (decoder/encoder) used by the model
-            transformer_input (Tensor): _description_
-            transformer_config (TransformerConfig): Transformer config used by the model
-
-        Returns:
-            float: The rotary sequence length
-        """
-        if inference_params is not None:
-            rotary_seq_len = inference_params.max_sequence_length
-        else:
-            if transformer.input_tensor is not None:
-                rotary_seq_len = transformer.input_tensor.size(0)
-            else:
-                rotary_seq_len = transformer_input.size(0)
-
-            if transformer_config.sequence_parallel:
-                rotary_seq_len *= transformer_config.tensor_model_parallel_size
-
-        rotary_seq_len *= transformer_config.context_parallel_size
-
-        return rotary_seq_len
-
-
-def _rotate_half(x: Tensor) -> Tensor:
-    """Change sign so the last dimension becomes [-odd, +even]
-
-    Args:
-        x (Tensor): Input tensor
-
-    Returns:
-        Tensor: Tensor rotated half
-    """
-
-    x1, x2 = torch.chunk(x, 2, dim=-1)
-    return torch.cat((-x2, x1), dim=-1)
-
-
-def apply_rotary_pos_emb(t: Tensor, freqs: Tensor) -> Tensor:
-    """Apply rotary positional embedding to input tensor T.
-
-    check https://kexue.fm/archives/8265 for detailed formulas
-
-    Args:
-        t (Tensor): Input tensor T is of shape [seq_length, ... , dim]
-        freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim]
-
-    Returns:
-        Tensor: The input tensor after applying RoPE
-    """
-    rot_dim = freqs.shape[-1]
-
-    # ideally t_pass is empty so rotary pos embedding is applied to all tensor t
-    t, t_pass = t[..., :rot_dim], t[..., rot_dim:]
-
-    # first part is cosine component
-    # second part is sine component, need to change signs with _rotate_half method
-    cos_ = torch.cos(freqs).to(t.dtype)
-    sin_ = torch.sin(freqs).to(t.dtype)
-
-    t = (t * cos_) + (_rotate_half(t) * sin_)
-    return torch.cat((t, t_pass), dim=-1)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/language_module/language_module.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/language_module/language_module.py
deleted file mode 100644
index a74c035d906ad61bef0ba4aeb6196c2ffe13f7b9..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/language_module/language_module.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import logging
-
-import torch
-from torch import Tensor
-
-from megatron_ds.core import parallel_state, tensor_parallel
-from megatron_ds.core.transformer.module import MegatronModule
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-
-
-class LanguageModule(MegatronModule):
-    """Base language module that has common helper functions used across GPT, BERT etc.
-
-    Args:
-        config (TransformerConfig): Input transformer config for the model
-    """
-
-    def __init__(self, config: TransformerConfig) -> None:
-        super().__init__(config=config)
-
-    def compute_language_model_loss(self, labels: Tensor, logits: Tensor) -> Tensor:
-        """Computes the language model loss (Cross entropy across vocabulary)
-
-        Args:
-            labels (Tensor): The labels of dimension [batch size, seq length]
-            logits (Tensor): The final logits returned by the output layer of the transformer model
-
-        Returns:
-            Tensor: Loss tensor of dimensions [batch size, sequence_length]
-        """
-        # [b s] => [s b]
-        labels = labels.transpose(0, 1).contiguous()
-        loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels)
-
-        # [s b] => [b, s]
-        loss = loss.transpose(0, 1).contiguous()
-        return loss
-
-    def initialize_last_stage_with_word_embeddings(self) -> None:
-        """Intializes the word embeddings in the final stage.
-
-        This function just initalizes word embeddings in the final stage, when we are
-        using pipeline parallelism and sharind word embeddings. Nothing to do if we
-        arn't sharing weights or aren't using Pipeline parallelism
-        """
-        if not self.share_embeddings_and_output_weights or (self.pre_process and self.post_process):
-            return
-
-        if self.post_process and not self.pre_process:
-            assert not parallel_state.is_pipeline_first_stage()
-            # set word_embeddings weights to 0 here, then copy first
-            # stage's weights using all_reduce below.
-            self.output_layer.weight.data.fill_(0)
-            self.output_layer.weight.shared = True
-
-        # Parameters are shared between the word embeddings layers, and the
-        # heads at the end of the model. In a pipelined setup with more than
-        # one stage, the initial embedding layer and the head are on different
-        # workers, so we do the following:
-        # 1. Create a second copy of word_embeddings on the last stage, with
-        #    initial parameters of 0.0.
-        # 2. Do an all-reduce between the first and last stage to ensure that
-        #    the two copies of word_embeddings start off with the same
-        #    parameter values.
-        # 3. In the training loop, before an all-reduce between the grads of
-        #    the two word_embeddings layers to ensure that every applied weight
-        #    update is the same on both stages.
-
-        # Ensure that first and last stages have the same initial parameter
-        # values.
-        if torch.distributed.is_initialized():
-            if parallel_state.is_rank_in_embedding_group():
-                weight = self.shared_embedding_or_output_weight()
-                torch.distributed.all_reduce(
-                    weight.data, group=parallel_state.get_embedding_group()
-                )
-
-        elif not getattr(LanguageModule, "embedding_warning_printed", False):
-            logging.getLogger(__name__).warning(
-                "Distributed processes aren't initialized, so the output layer "
-                "is not initialized with weights from the word embeddings. "
-                "If you are just manipulating a model this is fine, but "
-                "this needs to be handled manually. If you are training "
-                "something is definitely wrong."
-            )
-            LanguageModule.embedding_warning_printed = True
-
-    def shared_embedding_or_output_weight(self) -> Tensor:
-        """Gets the emedding weight or output logit weights when share embedding and output weights set to True.
-
-        Returns:
-            Tensor: During pre processing it returns the input embeddings weight while during post processing it returns the final output layers weight
-        """
-        if self.pre_process:
-            return self.embedding.word_embeddings.weight
-        elif self.post_process:
-            return self.output_layer.weight
-        return None
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/gpt/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/gpt/__init__.py
deleted file mode 100644
index 2d5eb8674f1d19673664160d5eddf3432a6a5399..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/gpt/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .gpt_model import GPTModel
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/gpt/gpt_embedding.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/gpt/gpt_embedding.py
deleted file mode 100644
index 97f35e7ebb1d45306ff8cdebf2d42bbe6d8d7c80..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/gpt/gpt_embedding.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import torch
-
-from megatron_ds.core import tensor_parallel
-
-from megatron_ds.core.transformer.module import MegatronModule
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-
-
-class GPTEmbedding(MegatronModule):
-    """Language model embeddings.
-
-    Arguments:
-        config (TransformerConfig): config object with all necessary configs for TransformerBlock
-        vocab_size (int): vocabulary size
-        max_sequence_length (int): maximum size of sequence. This
-                             is used for positional embedding
-        embedding_dropout_prob float): dropout probability for embeddings
-    """
-
-    def __init__(self, config: TransformerConfig, vocab_size: int, max_sequence_length: int):
-        super().__init__(config=config)
-
-        self.config: TransformerConfig = config
-        self.vocab_size: int = vocab_size
-        self.max_sequence_length: int = max_sequence_length
-
-        # Word embeddings (parallel).
-        self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
-            num_embeddings=self.vocab_size,
-            embedding_dim=self.config.hidden_size,
-            init_method=self.config.init_method,
-            config=self.config
-        )
-        # @jcasper are these keys needed?
-        self._word_embeddings_key = 'word_embeddings'
-
-        # Position embedding (serial).
-        self.position_embeddings = torch.nn.Embedding(self.max_sequence_length, self.config.hidden_size)
-        self._position_embeddings_key = 'position_embeddings'
-
-        # Initialize the position embeddings.
-        if self.config.perform_initialization:
-            self.config.init_method(self.position_embeddings.weight)
-
-        # Embeddings dropout
-        self.embedding_dropout = torch.nn.Dropout(self.config.hidden_dropout)
-
-    def zero_parameters(self):
-        """Zero out all parameters in embedding."""
-        self.word_embeddings.weight.data.fill_(0)
-        self.word_embeddings.weight.shared = True
-        self.position_embeddings.weight.data.fill_(0)
-        self.position_embeddings.weight.shared = True
-
-    def forward(self, input_ids, position_ids):
-        # Embeddings.
-        words_embeddings = self.word_embeddings(input_ids)
-        position_embeddings = self.position_embeddings(position_ids)
-        embeddings = words_embeddings + position_embeddings
-
-        # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
-        embeddings = embeddings.transpose(0, 1).contiguous()
-
-        # If the input flag for fp32 residual connection is set, convert for float.
-        if self.config.fp32_residual_connection:
-            embeddings = embeddings.float()
-
-        # Dropout.
-        if self.config.sequence_parallel:
-            embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings)
-            with tensor_parallel.get_cuda_rng_tracker().fork():
-                embeddings = self.embedding_dropout(embeddings)
-        else:
-            embeddings = self.embedding_dropout(embeddings)
-
-        return embeddings
-
-    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
-        """For easy load."""
-
-        state_dict_ = {}
-        state_dict_[self._word_embeddings_key] = self.word_embeddings.state_dict(prefix=prefix, keep_vars=keep_vars)
-        state_dict_[self._position_embeddings_key] = self.position_embeddings.state_dict(
-            prefix=prefix, keep_vars=keep_vars
-        )
-
-        return state_dict_
-
-    def load_state_dict(self, state_dict, strict=True):
-        """Customized load."""
-
-        # Word embedding.
-        if self._word_embeddings_key in state_dict:
-            state_dict_ = state_dict[self._word_embeddings_key]
-        else:
-            # for backward compatibility.
-            state_dict_ = {}
-            for key in state_dict.keys():
-                if 'word_embeddings' in key:
-                    state_dict_[key.split('word_embeddings.')[1]] = state_dict[key]
-        self.word_embeddings.load_state_dict(state_dict_, strict=strict)
-
-        # Position embedding.
-        if self._position_embeddings_key in state_dict:
-            state_dict_ = state_dict[self._position_embeddings_key]
-        else:
-            # for backward compatibility.
-            state_dict_ = {}
-            for key in state_dict.keys():
-                if 'position_embeddings' in key:
-                    state_dict_[key.split('position_embeddings.')[1]] = state_dict[key]
-        self.position_embeddings.load_state_dict(state_dict_, strict=strict)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/gpt/gpt_layer_specs.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/gpt/gpt_layer_specs.py
deleted file mode 100644
index e2ba4f66fed3b19e755e15a3e5a2e2fd502934d0..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/gpt/gpt_layer_specs.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-from megatron_ds.core.fusions.fused_bias_dropout import get_bias_dropout_add
-from megatron_ds.core.fusions.fused_layer_norm import FusedLayerNorm
-from megatron_ds.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
-from megatron_ds.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
-from megatron_ds.core.transformer.custom_layers.transformer_engine import (
-    TEDotProductAttention,
-    TELayerNormColumnParallelLinear,
-    TERowParallelLinear,
-)
-from megatron_ds.core.transformer.dot_product_attention import DotProductAttention
-from megatron_ds.core.transformer.enums import AttnMaskType
-from megatron_ds.core.transformer.mlp import MLP, MLPSubmodules
-from megatron_ds.core.transformer.spec_utils import ModuleSpec
-from megatron_ds.core.transformer.switch_mlp import SwitchMLP
-from megatron_ds.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
-
-
-# Use this spec to use lower level Transformer Engine modules (required for fp8 training)
-def get_gpt_layer_with_transformer_engine_spec() -> ModuleSpec:
-    return ModuleSpec(
-        module=TransformerLayer,
-        submodules=TransformerLayerSubmodules(
-            self_attention=ModuleSpec(
-                module=SelfAttention,
-                params={"attn_mask_type": AttnMaskType.causal},
-                submodules=SelfAttentionSubmodules(
-                    linear_qkv=TELayerNormColumnParallelLinear,
-                    core_attention=TEDotProductAttention,
-                    linear_proj=TERowParallelLinear,
-                ),
-            ),
-            self_attn_bda=get_bias_dropout_add,
-            mlp=ModuleSpec(
-                module=MLP,
-                submodules=MLPSubmodules(
-                    linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear,
-                ),
-            ),
-            mlp_bda=get_bias_dropout_add,
-        ),
-    )
-
-
-# Use this spec for an implementation using only modules in megatron core
-def get_gpt_layer_local_spec() -> ModuleSpec:
-    return ModuleSpec(
-        module=TransformerLayer,
-        submodules=TransformerLayerSubmodules(
-            input_layernorm=FusedLayerNorm,
-            self_attention=ModuleSpec(
-                module=SelfAttention,
-                params={"attn_mask_type": AttnMaskType.causal},
-                submodules=SelfAttentionSubmodules(
-                    linear_qkv=ColumnParallelLinear,
-                    core_attention=DotProductAttention,
-                    linear_proj=RowParallelLinear,
-                ),
-            ),
-            self_attn_bda=get_bias_dropout_add,
-            pre_mlp_layernorm=FusedLayerNorm,
-            mlp=ModuleSpec(
-                module=MLP,
-                submodules=MLPSubmodules(
-                    linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,
-                ),
-            ),
-            mlp_bda=get_bias_dropout_add,
-        ),
-    )
-
-
-# Use this spec to use lower level Transformer Engine modules and SwitchMLP based MoE
-gpt_layer_with_transformer_engine_spec_moe = ModuleSpec(
-    module=TransformerLayer,
-    submodules=TransformerLayerSubmodules(
-        self_attention=ModuleSpec(
-            module=SelfAttention,
-            params={"attn_mask_type": AttnMaskType.causal},
-            submodules=SelfAttentionSubmodules(
-                linear_qkv=TELayerNormColumnParallelLinear,
-                core_attention=TEDotProductAttention,
-                linear_proj=TERowParallelLinear,
-            ),
-        ),
-        self_attn_bda=get_bias_dropout_add,
-        pre_mlp_layernorm=FusedLayerNorm,
-        mlp=ModuleSpec(
-            module=SwitchMLP,  # MOE
-            submodules=MLPSubmodules(
-                linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,
-            ),
-        ),
-        mlp_bda=get_bias_dropout_add,
-    ),
-)
-
-# Use this spec for an implementation using only modules in megatron core for MoE models
-gpt_layer_local_spec_moe = ModuleSpec(
-    module=TransformerLayer,
-    submodules=TransformerLayerSubmodules(
-        input_layernorm=FusedLayerNorm,
-        self_attention=ModuleSpec(
-            module=SelfAttention,
-            params={"attn_mask_type": AttnMaskType.causal},
-            submodules=SelfAttentionSubmodules(
-                linear_qkv=ColumnParallelLinear,
-                core_attention=DotProductAttention,
-                linear_proj=RowParallelLinear,
-            ),
-        ),
-        self_attn_bda=get_bias_dropout_add,
-        pre_mlp_layernorm=FusedLayerNorm,
-        mlp=ModuleSpec(
-            module=SwitchMLP,  # MOE
-            submodules=MLPSubmodules(
-                linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,
-            ),
-        ),
-        mlp_bda=get_bias_dropout_add,
-    ),
-)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/gpt/gpt_model.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/gpt/gpt_model.py
deleted file mode 100644
index c21ef1d9fa86e198bd65942f9e8c92fc288fde3a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/gpt/gpt_model.py
+++ /dev/null
@@ -1,241 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import logging
-from typing import Literal, Optional, Union
-
-import torch
-from torch import Tensor
-
-from megatron_ds.core import InferenceParams, parallel_state, tensor_parallel
-from megatron_ds.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
-from megatron_ds.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
-from megatron_ds.core.models.common.language_module.language_module import LanguageModule
-from megatron_ds.core.transformer.enums import AttnMaskType, ModelType
-from megatron_ds.core.transformer.spec_utils import ModuleSpec
-from megatron_ds.core.transformer.transformer_block import TransformerBlock
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.utils import make_tp_sharded_tensor_for_checkpoint
-
-
-class GPTModel(LanguageModule):
-    """GPT Transformer language model.
-
-    Args:
-        config (TransformerConfig): Transformer config
-        transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers
-        vocab_size (int): Vocabulary size
-        max_sequence_length (int): maximum size of sequence. This is used for positional embedding
-        pre_process (bool, optional): Include embedding layer (used with pipeline parallelism). Defaults to True.
-        post_process (bool, optional): Include an output layer (used with pipeline parallelism). Defaults to True.
-        fp16_lm_cross_entropy (bool, optional): Defaults to False.
-        parallel_output (bool, optional): Do not gather the outputs, keep them split across tensor parallel ranks. Defaults to True.
-        share_embeddings_and_output_weights (bool, optional): When True, input embeddings and output logit weights are shared. Defaults to False.
-        position_embedding_type (Literal[learned_absolute,rope], optional):  Position embedding type.. Defaults to 'learned_absolute'.
-        rotary_percent (float, optional): Percent of rotary dimension to use for rotary position embeddings. Ignored unless position_embedding_type is 'rope'. Defaults to 1.0.
-        rotary_base (int, optional): Base period for rotary position embeddings. Ignored unless position_embedding_type is 'rope'. Defaults to 10000.
-        seq_len_interpolation_factor (Optional[float], optional): scale of linearly interpolating RoPE for longer sequences. The value must be a float larger than 1.0. Defaults to None.
-    """
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        transformer_layer_spec: ModuleSpec,
-        vocab_size: int,
-        max_sequence_length: int,
-        pre_process: bool = True,
-        post_process: bool = True,
-        fp16_lm_cross_entropy: bool = False,
-        parallel_output: bool = True,
-        share_embeddings_and_output_weights: bool = False,
-        position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute',
-        rotary_percent: float = 1.0,
-        rotary_base: int = 10000,
-        seq_len_interpolation_factor: Optional[float] = None,
-    ) -> None:
-        super().__init__(config=config)
-
-        self.transformer_layer_spec: ModuleSpec = transformer_layer_spec
-        self.vocab_size = vocab_size
-        self.max_sequence_length = max_sequence_length
-        self.pre_process = pre_process
-        self.post_process = post_process
-        self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
-        self.parallel_output = parallel_output
-        self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
-        self.position_embedding_type = position_embedding_type
-
-        # megatron core pipelining currently depends on model type
-        # TODO: remove this dependency ?
-        self.model_type = ModelType.encoder_or_decoder
-
-        if self.pre_process:
-            self.embedding = LanguageModelEmbedding(
-                config=self.config,
-                vocab_size=self.vocab_size,
-                max_sequence_length=self.max_sequence_length,
-                position_embedding_type=position_embedding_type,
-            )
-
-        if self.position_embedding_type == 'rope':
-            self.rotary_pos_emb = RotaryEmbedding(
-                kv_channels=self.config.kv_channels,
-                rotary_percent=rotary_percent,
-                seq_len_interpolation_factor=seq_len_interpolation_factor,
-                rotary_base=rotary_base,
-            )
-
-        # Transformer.
-        self.decoder = TransformerBlock(
-            config=self.config,
-            spec=transformer_layer_spec,
-            pre_process=self.pre_process,
-            post_process=self.post_process,
-        )
-
-        # Output
-        if post_process:
-            self.output_layer = tensor_parallel.ColumnParallelLinear(
-                config.hidden_size,
-                self.vocab_size,
-                config=config,
-                init_method=config.init_method,
-                bias=False,
-                skip_bias_add=False,
-                gather_output=not self.parallel_output,
-                skip_weight_param_allocation=self.pre_process
-                and self.share_embeddings_and_output_weights,
-            )
-
-        if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process):
-            self.initialize_last_stage_with_word_embeddings()
-
-    def set_input_tensor(self, input_tensor: Tensor) -> None:
-        """Sets input tensor to the model.
-
-        See megatron_ds.model.transformer.set_input_tensor()
-
-        Args:
-            input_tensor (Tensor): Sets the input tensor for the model.
-        """
-        # This is usually handled in schedules.py but some inference code still
-        # gives us non-lists or None
-        if not isinstance(input_tensor, list):
-            input_tensor = [input_tensor]
-
-        assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt/bert'
-        self.decoder.set_input_tensor(input_tensor[0])
-
-    def forward(
-        self,
-        input_ids: Tensor,
-        position_ids: Tensor,
-        attention_mask: Tensor,
-        decoder_input: Tensor = None,
-        labels: Tensor = None,
-        inference_params: InferenceParams = None,
-        extra_block_kwargs: dict = None,
-    ) -> Tensor:
-        """Forward function of the GPT Model This function passes the input tensors
-        through the embedding layer, and then the decoeder and finally into the post
-        processing layer (optional).
-
-        It either returns the Loss values if labels are given  or the final hidden units
-        """
-        # If decoder_input is provided (not None), then input_ids and position_ids are ignored.
-        # Otherwise, apply embedding layer on input_ids and position_ids to get decoder_input.
-
-        # Decoder embedding.
-        if decoder_input is not None:
-            pass
-        elif self.pre_process:
-            decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids)
-        else:
-            # intermediate stage of pipeline
-            # decoder will get hidden_states from encoder.input_tensor
-            decoder_input = None
-
-        # Rotary positional embeddings (embedding is None for PP intermediate devices)
-        rotary_pos_emb = None
-        if self.position_embedding_type == 'rope':
-            rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
-                inference_params, self.decoder, decoder_input, self.config
-            )
-            rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len)
-
-        # Run decoder.
-        hidden_states = self.decoder(
-            hidden_states=decoder_input,
-            attention_mask=attention_mask,
-            inference_params=inference_params,
-            rotary_pos_emb=rotary_pos_emb,
-            **(extra_block_kwargs or {}),
-        )
-
-        if not self.post_process:
-            return hidden_states
-
-        # logits and loss
-        output_weight = None
-        if self.share_embeddings_and_output_weights:
-            output_weight = self.shared_embedding_or_output_weight()
-        logits, _ = self.output_layer(hidden_states, weight=output_weight)
-
-        if labels is None:
-            # [s b h] => [b s h]
-            return logits.transpose(0, 1).contiguous()
-
-        loss = self.compute_language_model_loss(labels, logits)
-
-        return loss
-
-    def sharded_state_dict(self, prefix: str = '') -> dict:
-        sharded_state_dict = {}
-
-        if self.pre_process:
-            embedding_prefix = f'{prefix}embedding.'
-            embedding_sharded_state_dict = self.embedding.sharded_state_dict(
-                prefix=embedding_prefix
-            )
-            sharded_state_dict.update(embedding_sharded_state_dict)
-
-        decoder_prefix = f'{prefix}decoder.'
-        decoder_sharded_state_dict = self.decoder.sharded_state_dict(prefix=decoder_prefix)
-        sharded_state_dict.update(decoder_sharded_state_dict)
-
-        if self.post_process:
-            output_layer_prefix = f'{prefix}output_layer.'
-            output_layer_key = f'{output_layer_prefix}weight'
-            if self.share_embeddings_and_output_weights:
-                if not self.pre_process:
-                    # when sharing embeddings with last stage, we need to use the weights from the first stage
-                    # on pipeline first rank, word embeddings are saved to {prefix}embedding.word_embeddings.weight
-                    tensor = self.shared_embedding_or_output_weight()
-                    first_stage_word_emb_key = f'{prefix}embedding.word_embeddings.weight'
-                    last_stage_word_emb_replica_id = (
-                        1,  # copy of first stage embedding
-                        0,
-                        parallel_state.get_data_parallel_rank(),
-                    )
-
-                    sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint(
-                        tensor=tensor,
-                        key=first_stage_word_emb_key,
-                        replica_id=last_stage_word_emb_replica_id,
-                        allow_shape_mismatch=True,
-                    )
-
-                    sharded_state_dict[output_layer_key] = sharded_output_layer_tensor
-
-            else:
-                output_layer_state_dict = self.output_layer.state_dict(
-                    prefix=output_layer_prefix, keep_vars=True
-                )
-                output_layer_tensor = output_layer_state_dict[output_layer_key]
-                # independent output layer
-                sharded_output_layer_tensor = make_tp_sharded_tensor_for_checkpoint(
-                    tensor=output_layer_tensor, key=output_layer_key, allow_shape_mismatch=True,
-                )
-
-                sharded_state_dict[output_layer_key] = sharded_output_layer_tensor
-
-        return sharded_state_dict
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/__init__.py
deleted file mode 100644
index c101fcb1e4cf51be9b2e2268597ed1b1f11a9319..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-from .config import RetroConfig
-from .decoder_spec import get_retro_decoder_block_spec
-from .model import RetroModel
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/base_attention.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/base_attention.py
deleted file mode 100644
index 77f1bd0634b26f402dd208eb9138f2571a81edea..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/base_attention.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-from megatron_ds.core.models.retro.config import RetroConfig
-from megatron_ds.core.transformer.attention import CrossAttention, CrossAttentionSubmodules
-from megatron_ds.core.transformer.enums import AttnMaskType
-from megatron_ds.core.transformer.module import MegatronModule
-
-
-class BaseRetroCrossAttention(MegatronModule):
-
-    """Base class for Retro cross attention, for both encoder & decoder layers.
-
-    This class collects the retro arguments below (i.e., num neighbors, chunk
-    length, and retrieve length) for use in Retro's custom cross attention
-    operators.
-
-    Arguments:
-      config (RetroConfig): Retro config.
-
-      submodules (CrossAttentionSubmodules): Cross attention submodules.
-
-      layer_number (int): Layer number within transformer block.
-
-      attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding').
-    """
-
-    def __init__(
-        self,
-        config: RetroConfig,
-        submodules: CrossAttentionSubmodules,
-        layer_number: int = 1,
-        attn_mask_type: AttnMaskType = AttnMaskType.padding,
-    ):
-        super().__init__(config=config)
-
-        self.attn = CrossAttention(
-            config=config,
-            submodules=submodules,
-            layer_number=layer_number,
-            attn_mask_type=attn_mask_type,
-        )
-
-        self.retro_num_neighbors = config.retro_num_neighbors
-        self.retro_chunk_length = config.retro_preprocess.retro_gpt_chunk_length
-        self.retro_retrieved_length = config.retro_preprocess.retro_gpt_retrieved_length
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/config.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/config.py
deleted file mode 100644
index 9af1f6967a8bd6713ecf0953523e2a87e34089d2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/config.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import types
-from dataclasses import dataclass
-
-from megatron_ds.core.transformer import TransformerConfig
-
-
-@dataclass
-class RetroConfig(TransformerConfig):
-
-    """Configuration object for Retro models.
-
-    Attributes:
-
-        retro_preprocess (SimpleNamespace): Retro preprocess arguments.
-        retro_workdir (str): Retro working directory, which contains the
-            preprocessed data for for pretraining. This directory is built during
-            preprocessing (see tools/retro/README.md), and contains subdirectories
-            for the chunk database and pretraining neighbors.
-        retro_encoder_layers (int): Number of layers to use for the retrieval
-            encoder.
-        retro_encoder_hidden_dropout (float): Hidden dropout for retrieval
-            encoder.
-        retro_encoder_attention_dropout (float): Attention dropout for retrieval
-            encoder.
-        retro_num_neighbors (int): Number of neighbors to retrieve during
-            pretraining.
-        retro_num_retrieved_chunks (int): Number of chunks to retrieve from the
-            retrieval database.
-        retro_verify_neighbor_count (bool): Verify that len(GPT dataset) ==
-            len(saved neighbors).
-    """
-
-    # Retro.
-    retro_preprocess: types.SimpleNamespace = None
-    retro_workdir: str = None
-    retro_encoder_num_layers: int = 2
-    retro_encoder_hidden_dropout: float = 0.1
-    retro_encoder_attention_dropout: float = 0.1
-    retro_num_neighbors: int = 2
-    retro_num_retrieved_chunks: int = 2
-    retro_verify_neighbor_count: bool = True
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/decoder_attention.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/decoder_attention.py
deleted file mode 100644
index 0111aa4ce33ecdf655541ee9ad0145ac5b6495bb..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/decoder_attention.py
+++ /dev/null
@@ -1,301 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-"""Retro's cross attention modules for the decoder block."""
-
-from functools import partial
-from typing import Callable
-
-import numpy as np
-import torch
-from torch import Tensor
-
-from megatron_ds.core import InferenceParams
-from megatron_ds.core.fusions.fused_bias_dropout import get_bias_dropout_add
-from megatron_ds.core.models.retro.base_attention import BaseRetroCrossAttention
-from megatron_ds.core.models.retro.config import RetroConfig
-from megatron_ds.core.transformer import ModuleSpec
-from megatron_ds.core.transformer.attention import CrossAttentionSubmodules
-from megatron_ds.core.transformer.enums import AttnMaskType
-from megatron_ds.core.transformer.module import MegatronModule
-from megatron_ds.core.transformer.transformer_block import TransformerBlock
-
-
-class RetroDecoderCrossAttention(BaseRetroCrossAttention):
-
-    """Retro decoder's chunked cross attention operator.
-
-    See this paper for more details: https://arxiv.org/abs/2112.04426.
-    Neighboring chunks retrieved from the chunk database are used here for
-    chunked-cross attention.
-
-    Arguments:
-      config (RetroConfig): Retro config.
-
-      submodules (CrossAttentionSubmodules): Cross attention submodules.
-
-      layer_number (int): Layer number within transformer block.
-
-      attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding').
-
-      encoder_block_spec (ModuleSpec): The first Retro decoder
-      layer is provided with a transformer block spec to construct the
-      neighbor encoder.
-    """
-
-    def __init__(
-        self,
-        config: RetroConfig,
-        submodules: CrossAttentionSubmodules,
-        layer_number: int = 1,
-        attn_mask_type: AttnMaskType = AttnMaskType.padding,
-        encoder_block_spec: ModuleSpec = None,
-    ):
-        """
-        ** Note about 'encoder_block_spec' **
-
-        Retro is an encoder-decoder model that uses its encoder for encoding
-        neighboring chunks that are retrieved from a chunk database. These
-        encoded neighbors are then used in the decoder stack for performing
-        chunked-cross attention (see paper link above).
-
-        In contrast to the T5 model, the encoder and decoder are computationally
-        intertwined, since the input to the encoder is the output of the self-
-        attention of the first decoder layer. As such, the encoder block itself
-        is instantiated within the first Retro decoder layer, in order to receive
-        the self-attention's output. (Note, that only the first decoder layer
-        instantiates an encoder block, and the remaining decoder layers use the
-        encoder output from the first decoder layer.)
-        """
-
-        super().__init__(
-            config=config,
-            submodules=submodules,
-            layer_number=layer_number,
-            attn_mask_type=attn_mask_type,
-        )
-
-        if encoder_block_spec:
-            self.encoder = TransformerBlock(
-                config=config, spec=encoder_block_spec, pre_process=True, post_process=False,
-            )
-            # self._encoder_key = 'encoder' # ... necessary?
-        else:
-            self.encoder = None
-
-    def forward(
-        self,
-        hidden_states: Tensor,
-        attention_mask: Tensor,
-        key_value_states: Tensor = None,
-        inference_params: InferenceParams = None,
-        # rotary_pos_emb: Tensor = None, # ... unsupported for retro.
-    ) -> Tensor:
-        """Cross attention for Retro decoder.
-
-        Notation:
-            ns : Sequence length.
-            bs : Batch size.
-            d  : Hidden size.
-            l  : Number of chunks per sample (i.e., seq_length/chunk_length).
-            m  : Number of tokens per chunk.
-            k  : Number of neighbors.
-            r  : Number of retrieved tokens (neighbors + continuation).
-
-        Arguments:
-          hidden_states (Tensor): Transformer layer hidden states.
-
-          attention_mask (Tensor): Attention mask.
-
-          key_value_states (Tensor): Neighbor embeddings if first decoder
-          layer, else encoder output.
-
-          inference_params (InferenceParams): Inference params.
-        """
-
-        # hidden_states: [ ns, bs, d ]
-        # key_value_states: [ r, k*bs*l, d ]
-
-        ns, bs, d = hidden_states.shape
-        l = int(np.ceil(ns / self.retro_chunk_length))
-
-        # Retrieve neighbors.
-        if self.encoder:
-
-            # Sequence length remainder.
-            first_ns = ns % self.retro_chunk_length
-
-            # Case 1: Sequence length not divisible by chunk length.
-            if first_ns > 0:
-
-                # Split sequence into first partial chunk & remaining chunks.
-                first_chunk, rest_chunk = hidden_states[:first_ns], hidden_states[first_ns:]
-
-                # Pad partial chunk with zeros.
-                first_chunk = torch.nn.functional.pad(
-                    first_chunk, (0, 0, 0, 0, 0, self.retro_chunk_length - first_ns), 'constant', 0,
-                )
-
-                # Concatenate padded chunk with remaining chunks.
-                chunked_output = torch.cat((first_chunk, rest_chunk), dim=0)  # [ l*m, bs, d ]
-
-            # Case 2: Sequence length is divisible by chunk length.
-            else:
-                chunked_output = hidden_states  # [ l*m, bs, d ]
-
-            # Chunk & permute hidden states.
-            # - hidden_states:  [ l*m, bs, d ]
-            # - chunked_output: [ m, bs*l, d ]
-            chunked_output = (
-                chunked_output.reshape(l, self.retro_chunk_length, bs, d)
-                .permute(1, 2, 0, 3)
-                .reshape(self.retro_chunk_length, bs * l, d)
-                .contiguous()
-            )
-
-            # Encode neighbors. (Note: 'key_value_states' re-assigned here.)
-            key_value_states = self.encoder(
-                hidden_states=key_value_states,
-                attention_mask=attention_mask,
-                context=chunked_output,
-                context_mask=None,
-                inference_params=inference_params,
-            )  # [ r, k*bs*l, d ]
-            key_value_states = key_value_states.reshape(
-                self.retro_retrieved_length * self.retro_num_neighbors, bs * l, d
-            )  # [ r*k, bs*l, d ]
-
-        # Attend starting at last token of first chunk.
-        pad = (ns - 1) % self.retro_chunk_length
-        attending_chunks = hidden_states[pad:]
-
-        # Pad attending tokens to sequence length.
-        padded_chunks = torch.nn.functional.pad(
-            attending_chunks, (0, 0, 0, 0, 0, self.retro_chunk_length - 1), 'constant', 0,
-        )
-
-        # Permute attending chunks.
-        # - padded_chunks:         [ l*m, bs, d ]
-        # - padded_chunked_output: [ m, bs*l, d ] (matches 'chunked_output' above)
-        padded_chunked_output = padded_chunks.reshape(l, self.retro_chunk_length, bs, d).permute(
-            1, 2, 0, 3
-        )
-        padded_chunked_output = padded_chunked_output.reshape(
-            self.retro_chunk_length, bs * l, d
-        ).contiguous()
-
-        # Attend to encoded neighbors.
-        attention_output, attention_bias = self.attn(
-            padded_chunked_output, None, key_value_states=key_value_states,
-        )
-
-        # Return dimensions for bias-dropout step.
-        return {
-            "ns": ns,
-            "bs": bs,
-            "d": d,
-            "l": l,
-            "pad": pad,
-            "attention_output": attention_output,  # [ m, bs*l, d ]
-            "attention_bias": attention_bias,  # [ d ]
-            "context": key_value_states,  # [ r*k, bs*l, d ]
-        }
-
-
-class RetroDecoderBiasDropoutAdd(MegatronModule):
-
-    """Retro decoder's bias-dropout-add operator.
-
-    This operator takes care of reshaping and permuting the output from the
-    chunk dimension to the sequence dimension.
-
-    Arguments:
-      config (RetroConfig): Retro config.
-    """
-
-    def __init__(
-        self, config: RetroConfig,
-    ):
-        super().__init__(config=config)
-        self.retro_chunk_length = config.retro_preprocess.retro_gpt_chunk_length
-
-    @classmethod
-    def _forward(
-        cls,
-        x_with_bias: dict,
-        residual: Tensor,
-        prob: float,
-        retro_chunk_length: int,
-        bias_dropout_add: Callable,
-    ) -> Tensor:
-        """Per-chunk bias-dropout-add.
-
-        Arguments:
-          x_with_bias (dict): Attention output and bias, along with other Retro
-          relevant parameters.
-
-          residual (Tensor): Transformer layer residual.
-
-          prob (float): Dropout probability.
-
-          retro_chunk_length (int): Retro chunk length (e.g., 64).
-
-          bias_dropout_add (Callable): Bias-dropout-add function.
-        """
-
-        # Extract input dict.
-        ns = x_with_bias["ns"]
-        bs = x_with_bias["bs"]
-        d = x_with_bias["d"]
-        l = x_with_bias["l"]
-        pad = x_with_bias["pad"]
-        attention_output = x_with_bias["attention_output"]  # [ m, bs*l, d ]
-        attention_bias = x_with_bias["attention_bias"]  # [ d ]
-
-        # Re-enable torch grad to enable fused optimization.
-        with torch.enable_grad():
-
-            # Bias-dropout-add.
-            x = bias_dropout_add(
-                (
-                    attention_output,
-                    None if attention_bias is None else attention_bias.expand_as(attention_output),
-                ),
-                torch.zeros_like(attention_output),
-                prob,
-            )
-
-            # Permute chunks back to sequence dimension.
-            # 1. [ m, bs*l, d ]
-            # 2. [ m, bs, l, d ]
-            # 3. [ l, m, bs, d ]
-            # 4. [ m*l, bs, d ] == [ ns, bs, d ]
-            x = (
-                x.reshape(retro_chunk_length, bs, l, d)
-                .permute(2, 0, 1, 3)
-                .reshape(retro_chunk_length * l, bs, d)
-            )
-
-            # Prepend zeros for non-attending tokens.
-            x = torch.nn.functional.pad(x, (0, 0, 0, 0, pad, 0), 'constant', 0,)[
-                :ns
-            ]  # [ ns, bs, d ]
-
-            # Add residual. [ ns, bs, d ]
-            x = x + residual
-
-        # Output. [ ns, bs, d ]
-        return x
-
-    def forward(self, training: bool, fused: bool) -> Tensor:
-        """Retro decoder bias-dropout-add.
-
-        Arguments:
-          training (bool): If training, then apply dropout.
-
-          fused (bool): Fuse bias-dropout-add.
-        """
-        return partial(
-            self._forward,
-            retro_chunk_length=self.retro_chunk_length,
-            bias_dropout_add=get_bias_dropout_add(training, fused),
-        )
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/decoder_spec.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/decoder_spec.py
deleted file mode 100644
index bf0c7636d38de596d0ee1bbaa956250c2a04702c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/decoder_spec.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-from megatron_ds.core import parallel_state
-from megatron_ds.core.fusions.fused_layer_norm import FusedLayerNorm
-from megatron_ds.core.models.gpt.gpt_layer_specs import (
-    get_gpt_layer_local_spec,
-    get_gpt_layer_with_transformer_engine_spec,
-)
-from megatron_ds.core.models.retro.config import RetroConfig
-from megatron_ds.core.models.retro.decoder_attention import (
-    RetroDecoderBiasDropoutAdd,
-    RetroDecoderCrossAttention,
-)
-from megatron_ds.core.models.retro.encoder_spec import get_retro_encoder_block_spec
-from megatron_ds.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
-from megatron_ds.core.transformer import ModuleSpec
-from megatron_ds.core.transformer.attention import CrossAttentionSubmodules
-from megatron_ds.core.transformer.custom_layers.transformer_engine import (
-    TEColumnParallelLinear,
-    TEDotProductAttention,
-    TENorm,
-    TERowParallelLinear,
-)
-from megatron_ds.core.transformer.dot_product_attention import DotProductAttention
-from megatron_ds.core.transformer.transformer_block import (
-    TransformerBlockSubmodules,
-    get_num_layers_to_build,
-)
-
-
-def get_retro_decoder_layer_te_spec(encoder_block_spec: ModuleSpec = None) -> ModuleSpec:
-    """Retro decoder TE spec (uses Transformer Engine components).
-
-    A Retro decoder layer uses custom attention and bias-dropout-add operators
-    to perform chunked-cross attention. Additionally, the first Retro decoder
-    layer instantiates an entire encoder transformer block. As such, the decoder
-    cross attention module takes an optional encoder block spec, which is only
-    provided for the first Retro decoder layer.
-
-    Arguments:
-      encoder_block_spec (ModuleSpec): Retro encoder block spec, to be provided
-      for the first Retro decoder layer.
-    """
-    spec = get_gpt_layer_with_transformer_engine_spec()
-    spec.submodules.pre_cross_attn_layernorm = TENorm
-    spec.submodules.cross_attention = ModuleSpec(
-        module=RetroDecoderCrossAttention,
-        params={"encoder_block_spec": encoder_block_spec,},
-        submodules=CrossAttentionSubmodules(
-            linear_q=TEColumnParallelLinear,
-            linear_kv=TEColumnParallelLinear,
-            core_attention=TEDotProductAttention,
-            linear_proj=TERowParallelLinear,
-        ),
-    )
-    spec.submodules.cross_attn_bda = ModuleSpec(module=RetroDecoderBiasDropoutAdd)
-    return spec
-
-
-def get_retro_decoder_layer_local_spec(encoder_block_spec: ModuleSpec = None) -> ModuleSpec:
-    """Retro decoder local spec (uses Megatron-Core components).
-
-    A Retro decoder layer uses custom attention and bias-dropout-add operators
-    to perform chunked-cross attention. Additionally, the first Retro decoder
-    layer instantiates an entire encoder transformer block. As such, the decoder
-    cross attention module takes an optional encoder block spec, which is only
-    provided for the first Retro decoder layer.
-
-    Arguments:
-      encoder_block_spec (ModuleSpec): Retro encoder block spec, to be provided
-      for the first Retro decoder layer.
-    """
-    spec = get_gpt_layer_local_spec()
-    spec.submodules.pre_cross_attn_layernorm = FusedLayerNorm
-    spec.submodules.cross_attention = ModuleSpec(
-        module=RetroDecoderCrossAttention,
-        params={"encoder_block_spec": encoder_block_spec,},
-        submodules=CrossAttentionSubmodules(
-            linear_q=ColumnParallelLinear,
-            linear_kv=ColumnParallelLinear,
-            core_attention=DotProductAttention,
-            linear_proj=RowParallelLinear,
-        ),
-    )
-    spec.submodules.cross_attn_bda = ModuleSpec(module=RetroDecoderBiasDropoutAdd)
-    return spec
-
-
-def get_retro_decoder_block_spec(
-    config: RetroConfig, use_transformer_engine: bool
-) -> TransformerBlockSubmodules:
-
-    """Retro decoder block spec.
-
-    Retro decoder block implementation details:
-    - The retro decoder block consists of interleaved GPT layers and customized
-      Retro decoder layers.
-    - The Retro decoder layers are spaced three layers apart, and start on layer
-      6 or 9 (depending on the total number of layers).
-    - The first decoder layer instantiates an encoder block, and it therefore
-      passes in an encoder_block_spec.
-
-
-    Arguments:
-      config (RetroConfig): Retro config.
-
-      use_transformer_engine (bool): If True, use Transformer Engine (instead
-      of local modules.
-    """
-
-    # Num layers.
-    assert (
-        parallel_state.get_pipeline_model_parallel_world_size() == 1
-    ), "retro does not currently support pipeline parallelism."
-    assert (
-        parallel_state.get_virtual_pipeline_model_parallel_world_size() is None
-    ), "retro does not currently support virtual pipeline parallelism."
-    num_layers = get_num_layers_to_build(config)
-
-    # Retro layer numbers.
-    retro_layer_start = 6 if num_layers <= 15 else 9
-    retro_layer_numbers = list(range(retro_layer_start, num_layers + 1, 3))
-
-    # Layer specs.
-    gpt_layer_spec = (
-        get_gpt_layer_with_transformer_engine_spec()
-        if use_transformer_engine
-        else get_gpt_layer_local_spec()
-    )
-    get_retro_decoder_layer_spec = (
-        get_retro_decoder_layer_te_spec
-        if use_transformer_engine
-        else get_retro_decoder_layer_local_spec
-    )
-    retro_layer_spec = get_retro_decoder_layer_spec()
-    retro_layer_spec_with_retriever = get_retro_decoder_layer_spec(
-        get_retro_encoder_block_spec(config, use_transformer_engine)
-    )
-
-    layer_specs = []
-    for layer_number in range(1, num_layers + 1):
-        if layer_number == retro_layer_numbers[0]:
-            layer_specs.append(retro_layer_spec_with_retriever)
-        elif layer_number in retro_layer_numbers:
-            layer_specs.append(retro_layer_spec)
-        else:
-            layer_specs.append(gpt_layer_spec)
-
-    # Block spec.
-    block_spec = TransformerBlockSubmodules(layer_specs=layer_specs)
-
-    return block_spec
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/encoder_attention.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/encoder_attention.py
deleted file mode 100644
index 3b3178c7f0092d7d964e26fbb1f015ce295a3339..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/encoder_attention.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-"""Retro's cross attention modules for the encoder block."""
-
-from functools import partial
-from typing import Callable, Optional, Tuple, Type
-
-import torch
-from torch import Tensor
-
-from megatron_ds.core import InferenceParams
-from megatron_ds.core.fusions.fused_bias_dropout import get_bias_dropout_add
-from megatron_ds.core.models.retro.base_attention import BaseRetroCrossAttention
-from megatron_ds.core.models.retro.config import RetroConfig
-from megatron_ds.core.transformer.module import MegatronModule
-
-
-class RetroEncoderCrossAttention(BaseRetroCrossAttention):
-
-    """Retro encoder's cross attention operator.
-
-    See this paper for more details: https://arxiv.org/abs/2112.04426.
-    Neighboring chunks are retrieved from the chunk database, encoded, and
-    used by the decoder layers for chunked cross attention.
-
-    Arguments:
-      config (RetroConfig): Retro config.
-
-      submodules (CrossAttentionSubmodules): Cross attention submodules.
-
-      layer_number (int): Layer number within transformer block.
-
-      attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding').
-    """
-
-    def forward(
-        self,
-        hidden_states: Tensor,
-        attention_mask: Tensor,
-        key_value_states: Tensor = None,
-        inference_params: InferenceParams = None,
-        # rotary_pos_emb: Tensor = None, # unsupported for retro.
-    ) -> Tensor:
-        """Cross attention for Retro encoder.
-
-        Notation:
-            ns : Sequence length.
-            bs : Batch size.
-            d  : Hidden size.
-            l  : Number of chunks per sample (i.e., seq_length/chunk_length).
-            k  : Number of neighbors.
-            r  : Number of retrieved tokens (neighbors + continuation).
-
-        Arguments:
-          hidden_states (Tensor): Transformer layer hidden states.
-
-          attention_mask (Tensor): Attention mask.
-
-          key_value_states (Tensor): Neighbor embeddings.
-
-          inference_params (InferenceParams): Inference params.
-        """
-
-        # Input shape. [ r, bs*l*k, d ]
-        ns, bs, d = hidden_states.shape
-
-        # Reshape sequence into neighboring chunks.
-        # - hidden_states:   [ r, bs*l*k, d ]
-        # - chunked_outputs: [ r, bs*l, k, d ]
-        chunked_outputs = hidden_states.reshape(
-            self.retro_retrieved_length, -1, self.retro_num_neighbors, d
-        )
-
-        # Per-chunk attention.
-        attention_output_tuples = []
-        for k in range(self.retro_num_neighbors):
-
-            # Attend to current neighboring chunks.
-            # - chunked_output:   [ r, bs*l, d ]
-            # - key_value_states: [ m, bs*l, d ]
-            # - attention_output: [ r, bs*l, d ]
-            # - attention_bias:   [ d ]
-            chunked_output = chunked_outputs[:, :, k].contiguous()
-            attention_output, attention_bias = self.attn(
-                hidden_states=chunked_output,  # Q (neighbor embedding)
-                attention_mask=None,
-                key_value_states=key_value_states,  # K, V (hidden act)
-            )
-
-            # Residual connection. [ r, bs*l, d ]
-            residual = chunked_output
-
-            # Collect tensors.
-            attention_output_tuples.append((attention_output, attention_bias, residual,))
-
-        # Output. (List[Tuple[( [ r, bs*l, d ], [ d ] )]])
-        return attention_output_tuples
-
-
-class RetroEncoderBiasDropoutAdd(MegatronModule):
-
-    """Retro encoder's bias-dropout-add operator.
-
-    This operator applies bias-dropout-add individually on each neighboring
-    chunk that is retrieved from the chunk database.
-
-    Arguments:
-      config (RetroConfig): Retro config.
-    """
-
-    def __init__(
-        self, config: RetroConfig,
-    ):
-        super().__init__(config=config)
-        self.retro_num_neighbors = config.retro_num_neighbors
-
-    @classmethod
-    def _forward(
-        cls,
-        x_with_bias: Tuple[Tensor, Optional[Tensor]],
-        residual: Tensor,
-        prob: float,
-        retro_num_neighbors: int,
-        bias_dropout_add: Callable,
-    ) -> Tensor:
-        """Per-chunk bias-dropout-add.
-
-        Arguments:
-          x_with_bias (dict): Attention output and bias tuple.
-
-          residual (Tensor): Transformer layer residual.
-
-          prob (float): Dropout probability.
-
-          retro_num_neighbors (int): Number of retrieved neighbor chunks (e.g., 2).
-
-          bias_dropout_add (Callable): Bias-dropout-add function.
-        """
-
-        # Re-enable torch grad to enable fused optimization.
-        with torch.enable_grad():
-
-            # Per-neighbor bias-dropout-add.
-            # - attention_output: [ r, bs*l, d ]
-            # - attention_bias:   [ d ]
-            # - residual:         [ r, bs*l, d ]
-            # - output:           [ r, bs*l, d ]
-            outputs = [
-                bias_dropout_add(
-                    (
-                        attention_output,
-                        None if attention_bias is None else attention_bias.expand_as(residual),
-                    ),
-                    residual,
-                    prob,
-                )
-                for attention_output, attention_bias, residual in x_with_bias
-            ]
-
-        # Concatenate outputs (to shape [r, k*bs*l, d]; see notation above).
-        r, _, d = outputs[0].shape
-        output = torch.stack(outputs, dim=1).reshape(r, -1, d)
-
-        # Output. [ r, k*bs*l, d ]
-        return output
-
-    def forward(self, training: bool, fused: bool) -> Tensor:
-        """Retro decoder bias-dropout-add.
-
-        Arguments:
-          training (bool): If training, then apply dropout.
-
-          fused (bool): Fuse bias-dropout-add.
-        """
-        return partial(
-            self._forward,
-            retro_num_neighbors=self.retro_num_neighbors,
-            bias_dropout_add=get_bias_dropout_add(training, fused),
-        )
-
-
-class RetroEncoderLayerNorm(MegatronModule):
-
-    """Retro encoder's layernorm operator.
-
-    This operator applies layernorm individually on each neighboring chunk that
-    is retrieved from the chunk database, and then concatenates the chunks into
-    a single tensor.
-
-    Arguments:
-      config (RetroConfig): Retro config.
-    """
-
-    def __init__(
-        self, config: RetroConfig, submodules: Type, **kwargs,
-    ):
-        super().__init__(config=config)
-        norm_class = submodules
-        self.norm = norm_class(config=config, **kwargs)
-        self.retro_num_neighbors = config.retro_num_neighbors
-
-    def forward(self, input: Tensor) -> Tensor:
-        """Per-chunk layer norm.
-
-        Arguments:
-          input (Tensor): Input chunks, concatenated into a single tensor.
-        """
-
-        # Input shape: [ r, k*bs*l, d ]. (see notation above in attention module)
-
-        # Split input into 'num_neighbors' tensors.
-        chunk_size = input.shape[1] // self.retro_num_neighbors
-        inputs = torch.split(input, chunk_size, dim=1)
-
-        # Norm.
-        outputs = [self.norm(inp.contiguous()) for inp in inputs]
-
-        # Concatenate layer norms (to shape [r, k*bs*l, d]; see notation above).
-        r, _, d = inputs[0].shape
-        output = torch.stack(outputs, dim=1).reshape(r, -1, d)
-
-        # Output. [ r, k*bs*l, d ]
-        return output
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/encoder_spec.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/encoder_spec.py
deleted file mode 100644
index 68392752baa6415536e4bee3f06a308e43396c82..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/encoder_spec.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-from megatron_ds.core.fusions.fused_layer_norm import FusedLayerNorm
-from megatron_ds.core.models.gpt.gpt_layer_specs import (
-    get_gpt_layer_local_spec,
-    get_gpt_layer_with_transformer_engine_spec,
-)
-from megatron_ds.core.models.retro.config import RetroConfig
-from megatron_ds.core.models.retro.encoder_attention import (
-    RetroEncoderBiasDropoutAdd,
-    RetroEncoderCrossAttention,
-    RetroEncoderLayerNorm,
-)
-from megatron_ds.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
-from megatron_ds.core.transformer import ModuleSpec
-from megatron_ds.core.transformer.attention import CrossAttentionSubmodules
-from megatron_ds.core.transformer.custom_layers.transformer_engine import (
-    TEColumnParallelLinear,
-    TEDotProductAttention,
-    TENorm,
-    TERowParallelLinear,
-)
-from megatron_ds.core.transformer.dot_product_attention import DotProductAttention
-from megatron_ds.core.transformer.enums import AttnMaskType
-from megatron_ds.core.transformer.mlp import MLP, MLPSubmodules
-from megatron_ds.core.transformer.transformer_block import TransformerBlockSubmodules
-
-
-def get_retro_encoder_layer_te_spec() -> ModuleSpec:
-    """Retro encoder TE spec (uses Transformer Engine components).
-
-    A Retro encoder layer uses custom attention, bias-dropout-add, and layernorm
-    operators to encode neighboring chunks that are retrieved from the chunk
-    database. Each operator is responsible for iterating the retrieved chunks
-    and processing them individually.
-    """
-    spec = get_gpt_layer_with_transformer_engine_spec()
-    spec.submodules.pre_cross_attn_layernorm = TENorm
-    spec.submodules.cross_attention = ModuleSpec(
-        module=RetroEncoderCrossAttention,
-        params={"attn_mask_type": AttnMaskType.padding,},
-        submodules=CrossAttentionSubmodules(
-            linear_q=TEColumnParallelLinear,
-            linear_kv=TEColumnParallelLinear,
-            core_attention=TEDotProductAttention,
-            linear_proj=TERowParallelLinear,
-        ),
-    )
-    spec.submodules.cross_attn_bda = ModuleSpec(module=RetroEncoderBiasDropoutAdd)
-    spec.submodules.pre_mlp_layernorm = ModuleSpec(module=RetroEncoderLayerNorm, submodules=TENorm,)
-    spec.submodules.mlp = ModuleSpec(
-        module=MLP,
-        submodules=MLPSubmodules(
-            linear_fc1=TEColumnParallelLinear, linear_fc2=TERowParallelLinear,
-        ),
-    )
-    return spec
-
-
-def get_retro_encoder_layer_local_spec() -> ModuleSpec:
-    """Retro encoder local spec (uses Megatron-Core components).
-
-    A Retro encoder layer uses custom attention, bias-dropout-add, and layernorm
-    operators to encode neighboring chunks that are retrieved from the chunk
-    database. Each operator is responsible for iterating the retrieved chunks
-    and processing them individually.
-    """
-    spec = get_gpt_layer_local_spec()
-    spec.submodules.pre_cross_attn_layernorm = FusedLayerNorm
-    spec.submodules.cross_attention = ModuleSpec(
-        module=RetroEncoderCrossAttention,
-        params={"attn_mask_type": AttnMaskType.padding,},
-        submodules=CrossAttentionSubmodules(
-            linear_q=ColumnParallelLinear,
-            linear_kv=ColumnParallelLinear,
-            core_attention=DotProductAttention,
-            linear_proj=RowParallelLinear,
-        ),
-    )
-    spec.submodules.cross_attn_bda = ModuleSpec(module=RetroEncoderBiasDropoutAdd)
-    spec.submodules.pre_mlp_layernorm = ModuleSpec(
-        module=RetroEncoderLayerNorm, submodules=FusedLayerNorm,
-    )
-    spec.submodules.mlp = ModuleSpec(
-        module=MLP,
-        submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,),
-    )
-    return spec
-
-
-def get_retro_encoder_block_spec(
-    config: RetroConfig, use_transformer_engine: bool
-) -> TransformerBlockSubmodules:
-
-    """Retro encoder block spec.
-
-    The retro encoder block consists of one customized Retro encoder layer
-    (layer 1), and all of the following layers are standard GPT layers.
-
-    Arguments:
-      config (RetroConfig): Retro config.
-
-      use_transformer_engine (bool): If True, use Transformer Engine (instead
-      of local modules.
-    """
-
-    # Num layers.
-    num_layers = config.retro_encoder_num_layers
-    retro_layer_numbers = [1]
-
-    # Layer specs.
-    gpt_layer_spec = (
-        get_gpt_layer_with_transformer_engine_spec()
-        if use_transformer_engine
-        else get_gpt_layer_local_spec()
-    )
-    get_retro_encoder_layer_spec = (
-        get_retro_encoder_layer_te_spec
-        if use_transformer_engine
-        else get_retro_encoder_layer_local_spec
-    )
-    retro_layer_spec = get_retro_encoder_layer_spec()
-    for spec in (gpt_layer_spec, retro_layer_spec):
-        spec.params["hidden_dropout"] = config.retro_encoder_hidden_dropout
-        spec.submodules.self_attention.params["attn_mask_type"] = AttnMaskType.padding
-        spec.submodules.self_attention.submodules.core_attention = ModuleSpec(
-            module=TEDotProductAttention if use_transformer_engine else DotProductAttention,
-            params={"attention_dropout": config.retro_encoder_attention_dropout,},
-        )
-
-    layer_specs = []
-    for layer_number in range(1, num_layers + 1):
-        if layer_number in retro_layer_numbers:
-            layer_specs.append(retro_layer_spec)
-        else:
-            layer_specs.append(gpt_layer_spec)
-
-    # Block spec.
-    block_spec = TransformerBlockSubmodules(layer_specs=layer_specs)
-
-    return block_spec
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/model.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/model.py
deleted file mode 100644
index 48b5b8fcac6333c04e3e6102dfde162d347b9a08..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/retro/model.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-"""Retro Model."""
-
-from torch import Tensor
-
-from megatron_ds.core import InferenceParams
-from megatron_ds.core.models.gpt import GPTModel
-
-
-class RetroModel(GPTModel):
-
-    """Retro Model.
-
-    A Retro model mostly re-uses the GPTModel interface, with the only difference
-    being the embedding of the 'context' this is used by Retro for processing
-    neighbor tokens. This embedded context is then forwarded to the Transformer
-    Block.
-    """
-
-    def forward(
-        self,
-        input_ids: Tensor,
-        position_ids: Tensor,
-        attention_mask: Tensor,
-        context_input_ids: Tensor = None,
-        context_position_ids: Tensor = None,
-        context_mask: Tensor = None,
-        decoder_input: Tensor = None,
-        labels: Tensor = None,
-        inference_params: InferenceParams = None,
-    ) -> Tensor:
-        """RetroModel forward method.
-
-        Foward input tokens & mask, along with neighbor tokens & mask, through
-        the Retro model..
-
-        Arguments:
-          input_ids (Tensor): Input token IDs.
-
-          position_ids (Tensor): Input position IDs.
-
-          attention_mask (Tensor): Input attention mask.
-
-          context_input_ids (Tensor): Context (i.e., neighbor) token IDs.
-
-          context_position_ids (Tensor): Context (i.e., neighbor) position IDs.
-
-          context_mask (Tensor): Context (i.e., neighbor) attention mask.
-
-          decoder_input (Tensor): When using pipeline parallelism, input_ids and
-          position_ids will only be used on the first stage, and for all other
-          stages decoder_input will be provided via communication from the
-          previous stage.
-
-          labels (Tensor): The labels of dimension [batch size, seq length].
-
-          inference_params (InferenceParams): Parameters for inference.
-        """
-
-        # Argument shapes:
-        #   Notation:
-        #     ns : Sequence length.
-        #     bs : Batch size.
-        #     d  : Hidden size.
-        #     l  : Number of chunks per sample (i.e., seq_length/chunk_length).
-        #     k  : Number of neighbors.
-        #     r  : Number of retrieved tokens (neighbors + continuation).
-        # - input_ids:   [ bs, ns ]
-        # - context_ids: [ k*bs*l, r ]
-        # - context:     [ r, k*bs*l, d ]
-        # - output:      [ ns, bs, d ]
-
-        # Context embedding (e.g., for Retro neighbor tokens).
-        if context_input_ids is not None:
-            context = self.embedding(context_input_ids, context_position_ids)
-        else:
-            context = None
-
-        # Call GPTModel.forward, and pass in embedded context.
-        return super().forward(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            attention_mask=attention_mask,
-            decoder_input=decoder_input,
-            labels=labels,
-            inference_params=inference_params,
-            extra_block_kwargs={"context": context, "context_mask": context_mask,},
-        )
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/parallel_state.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/parallel_state.py
deleted file mode 100644
index b737b4fa4b9878ffe58cf378ce92705452ef599a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/parallel_state.py
+++ /dev/null
@@ -1,1134 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Model and data parallel groups."""
-
-import os
-from typing import Optional
-
-import torch
-
-from .utils import GlobalMemoryBuffer
-
-# Intra-layer model parallel group that the current rank belongs to.
-_TENSOR_MODEL_PARALLEL_GROUP = None
-# Inter-layer model parallel group that the current rank belongs to.
-_PIPELINE_MODEL_PARALLEL_GROUP = None
-# Model parallel group (both intra- and pipeline) that the current rank belongs to.
-_MODEL_PARALLEL_GROUP = None
-# Embedding group.
-_EMBEDDING_GROUP = None
-# Position embedding group.
-_POSITION_EMBEDDING_GROUP = None
-# Data parallel group that the current rank belongs to.
-_DATA_PARALLEL_GROUP = None
-_DATA_PARALLEL_GROUP_GLOO = None
-# tensor model parallel group and data parallel group combined
-# used for fp8 and moe training
-_TENSOR_AND_DATA_PARALLEL_GROUP = None
-# Expert parallel group that the current rank belongs to.
-_TENSOR_AND_EXPERT_PARALLEL_GROUP = None
-_DATA_MODULO_EXPERT_PARALLEL_GROUP = None
-
-
-_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None
-_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
-_PIPELINE_MODEL_PARALLEL_SPLIT_RANK = None
-
-# These values enable us to change the mpu sizes on the fly.
-_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None
-_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
-_MPU_TENSOR_MODEL_PARALLEL_RANK = None
-_MPU_PIPELINE_MODEL_PARALLEL_RANK = None
-
-# A list of ranks that have a copy of the embedding.
-_EMBEDDING_GLOBAL_RANKS = None
-
-# A list of ranks that have a copy of the position embedding.
-_POSITION_EMBEDDING_GLOBAL_RANKS = None
-
-# A list of global ranks for each pipeline group to ease calculation of the source
-# rank when broadcasting from the first or last pipeline stage.
-_PIPELINE_GLOBAL_RANKS = None
-
-# For DeepSpeed's sequence parallel
-_SEQUENCE_PARALLEL_GROUP = None
-_SEQUENCE_PARALLEL_WORLD_SIZE = None
-_SEQUENCE_PARALLEL_RANK = None
-
-# This group includes processes for both data and sequence parallelisms.
-# We use this group to reduce gradients and shard parameters and optimizer stages for ZeRO.
-_SEQUENCE_DATA_PARALLEL_GROUP = None
-_SEQUENCE_DATA_PARALLEL_WORLD_SIZE = None
-_SEQUENCE_DATA_PARALLEL_RANK = None
-
-# A list of global ranks for each data parallel group to ease calculation of the source
-# rank when broadcasting weights from src to all other data parallel ranks
-_DATA_PARALLEL_GLOBAL_RANKS = None
-
-# Context parallel group that the current rank belongs to
-_CONTEXT_PARALLEL_GROUP = None
-# A list of global ranks for each context parallel group to ease calculation of the
-# destination rank when exchanging KV/dKV between context parallel_ranks
-_CONTEXT_PARALLEL_GLOBAL_RANKS = None
-
-# Data parallel group information with context parallel combined.
-_DATA_PARALLEL_GROUP_WITH_CP = None
-_DATA_PARALLEL_GROUP_WITH_CP_GLOO = None
-_DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = None
-
-# combined parallel group of TP, DP, and CP used for fp8
-_TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = None
-
-# Memory buffers to avoid dynamic memory allocation
-_GLOBAL_MEMORY_BUFFER = None
-
-_PP_FWD_HANDLES = None
-_PP_BWD_HANDLES = None
-
-def get_nccl_options(pg_name, nccl_comm_cfgs):
-    """Set the NCCL process group options.
-
-    Arguments:
-        pg_name (str): process group name
-        nccl_comm_cfgs (dict): nccl communicator configurations
-
-    When an option (e.g., max_ctas) is not found in the config, use the NCCL default setting.
-    """
-    if pg_name in nccl_comm_cfgs:
-        nccl_options = torch.distributed.ProcessGroupNCCL.Options()
-        nccl_options.config.cga_cluster_size = nccl_comm_cfgs[pg_name].get('cga_cluster_size', 4)
-        nccl_options.config.max_ctas = nccl_comm_cfgs[pg_name].get('max_ctas', 32)
-        nccl_options.config.min_ctas = nccl_comm_cfgs[pg_name].get('min_ctas', 1)
-        return nccl_options
-    else:
-        return None
-
-
-def initialize_model_parallel(
-    tensor_model_parallel_size: int = 1,
-    pipeline_model_parallel_size: int = 1,
-    sequence_parallel_size: int = 1,
-    virtual_pipeline_model_parallel_size: Optional[int] = None,
-    pipeline_model_parallel_split_rank: Optional[int] = None,
-    use_sharp: bool = False,
-    context_parallel_size: int = 1,
-    expert_model_parallel_size: int = 1,
-    nccl_communicator_config_path: Optional[str] = None,
-) -> None:
-    """Initialize model data parallel groups.
-
-    Arguments:
-        tensor_model_parallel_size (int, default = 1):
-            The number of GPUs to split individual tensors across.
-
-        pipeline_model_parallel_size (int, default = 1):
-            The number of tensor parallel GPU groups to split the
-            Transformer layers across. For example, if
-            tensor_model_parallel_size is 4 and
-            pipeline_model_parallel_size is 2, the model will be split
-            into 2 groups of 4 GPUs.
-
-        virtual_pipeline_model_parallel_size (int, optional):
-            The number of stages that each pipeline group will have,
-            interleaving as necessary. If None, no interleaving is
-            performed. For example, if tensor_model_parallel_size is 1,
-            pipeline_model_parallel_size is 4,
-            virtual_pipeline_model_parallel_size is 2, and there are
-            16 transformer layers in the model, the model will be
-            split into 8 stages with two layers each and each GPU
-            would get 2 stages as such (layer number starting with 1):
-
-            GPU 0: [1, 2] [9, 10]
-            GPU 1: [3, 4] [11, 12]
-            GPU 2: [5, 6] [13, 14]
-            GPU 3: [7, 8] [15, 16]
-
-        pipeline_model_parallel_split_rank (int, optional):
-            For models with both an encoder and decoder, the rank in
-            pipeline to switch between encoder and decoder (i.e. the
-            first rank of the decoder). This allows the user to set
-            the pipeline parallel size of the encoder and decoder
-            independently. For example, if
-            pipeline_model_parallel_size is 8 and
-            pipeline_model_parallel_split_rank is 3, then ranks 0-2
-            will be the encoder and ranks 3-7 will be the decoder.
-
-        use_sharp (bool, default = False):
-            Set the use of SHARP for the collective communications of
-            data-parallel process groups. When `True`, run barrier
-            within each data-parallel process group, which specifies
-            the SHARP application target groups.
-
-        context_parallel_size (int, default = 1):
-            The number of tensor parallel GPU groups to split the
-            network input sequence length across. Compute of attention
-            module requires tokens of full sequence length, so GPUs
-            in a context parallel group need to communicate with each
-            other to exchange information of other sequence chunks.
-            Each GPU and its counterparts in other tensor parallel
-            groups compose a context parallel group.
-
-            For example, assume we have 8 GPUs, if tensor model parallel
-            size is 4 and context parallel size is 2, the network input
-            will be split into two sequence chunks, which are processed
-            by 2 different groups of 4 GPUs. One chunk is processed by
-            GPU0-3, the other chunk is processed by GPU4-7. Four groups
-            are build to do context parallel communications: [GPU0, GPU4],
-            [GPU1, GPU5], [GPU2, GPU6], and [GPU3, GPU7].
-
-            Context parallelism partitions sequence length, so it has no
-            impact on weights, which means weights are duplicated among
-            GPUs in a context parallel group. Hence, weight gradients
-            all-reduce is required in backward. For simplicity, we piggyback
-            GPUs of context parallelism on data parallel group for
-            weight gradient all-reduce.
-
-        nccl_communicator_config_path (str, default = None):
-            Path to the yaml file of NCCL communicator configurations.
-            `min_ctas`, `max_ctas`, and `cga_cluster_size` can be set
-            for each communicator.
-
-    Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
-    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
-    the model pipeline. The present function will
-    create 8 tensor model-parallel groups, 4 pipeline model-parallel groups
-    and 8 data-parallel groups as:
-        8 data_parallel groups:
-            [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15]
-        8 tensor model-parallel groups:
-            [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15]
-        4 pipeline model-parallel groups:
-            [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15]
-    Note that for efficiency, the caller should make sure adjacent ranks
-    are on the same DGX box. For example if we are using 2 DGX-1 boxes
-    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
-    ranks 8 to 15 belong to the second box.
-
-    """
-    # Get world size and rank. Ensure some consistencies.
-    assert torch.distributed.is_initialized()
-    world_size: int = torch.distributed.get_world_size()
-
-    if (
-        world_size
-        % (tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size)
-        != 0
-    ):
-        raise RuntimeError(
-            f"world_size ({world_size}) is not divisible by tensor_model_parallel_size "
-            f"({tensor_model_parallel_size}) x pipeline_model_parallel_size ({pipeline_model_parallel_size}) "
-            f"x context_parallel_size ({context_parallel_size})"
-        )
-
-    enable_ds_sequence_parallel = sequence_parallel_size > 1
-    if enable_ds_sequence_parallel:
-        assert tensor_model_parallel_size == 1 and pipeline_model_parallel_size == 1, \
-        'DeepSpeed\'s sequence parallel does not work with tensor parallel or pipeline parallel'
-
-        if world_size % sequence_parallel_size != 0:
-            raise RuntimeError(
-                f"world_size ({world_size}) is not divisible by sequence_parallel_size {sequence_parallel_size})"
-            )
-        
-    data_parallel_size: int = world_size // (
-        tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size
-    )
-    sequence_data_parallel_size: int = sequence_parallel_size * data_parallel_size
-
-    if data_parallel_size % expert_model_parallel_size != 0:
-        raise RuntimeError(
-            f"data_parallel_size ({data_parallel_size}) is not divisible by expert_model_parallel_size "
-        )
-
-    if expert_model_parallel_size > 1 and context_parallel_size > 1:
-        raise RuntimeError(
-            f"combination of expert model prallellism and context parallelism is not supported"
-        )
-
-    num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
-    num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
-    num_data_parallel_groups: int = world_size // data_parallel_size
-    num_sequence_parallel_groups: int = world_size // sequence_parallel_size
-    num_sequence_data_parallel_groups: int = world_size // sequence_parallel_size // data_parallel_size
-
-    if virtual_pipeline_model_parallel_size is not None:
-        if not pipeline_model_parallel_size > 2:
-            raise RuntimeError(
-                "pipeline-model-parallel size should be greater than 2 with interleaved schedule"
-            )
-        global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
-        global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-        _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
-        _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size
-
-    if pipeline_model_parallel_split_rank is not None:
-        global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
-        _PIPELINE_MODEL_PARALLEL_SPLIT_RANK = pipeline_model_parallel_split_rank
-
-    rank = torch.distributed.get_rank()
-
-    nccl_comm_cfgs = {}
-    if nccl_communicator_config_path is not None:
-        try:
-            import yaml
-        except ImportError:
-            raise RuntimeError(
-                "Cannot import `yaml`. Setting custom nccl communicator configs "
-                "requires the yaml package."
-            )
-
-        with open(nccl_communicator_config_path, "r") as stream:
-            nccl_comm_cfgs = yaml.safe_load(stream)
-
-    # Build the data-parallel groups.
-    global _DATA_PARALLEL_GROUP
-    global _DATA_PARALLEL_GROUP_GLOO
-    global _DATA_PARALLEL_GLOBAL_RANKS
-    global _DATA_PARALLEL_GROUP_WITH_CP
-    global _DATA_PARALLEL_GROUP_WITH_CP_GLOO
-    global _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP
-    assert _DATA_PARALLEL_GROUP is None, 'data parallel group is already initialized'
-    all_data_parallel_group_ranks_with_cp = []
-    for i in range(pipeline_model_parallel_size):
-        start_rank = i * num_pipeline_model_parallel_groups
-        end_rank = (i + 1) * num_pipeline_model_parallel_groups
-        for j in range(context_parallel_size * tensor_model_parallel_size):
-            ranks = range(
-                start_rank + j, end_rank, context_parallel_size * tensor_model_parallel_size
-            )
-            group = torch.distributed.new_group(
-                ranks, pg_options=get_nccl_options('dp', nccl_comm_cfgs)
-            )
-            group_gloo = torch.distributed.new_group(ranks, backend="gloo")
-            if rank in ranks:
-                _DATA_PARALLEL_GROUP = group
-                _DATA_PARALLEL_GROUP_GLOO = group_gloo
-                _DATA_PARALLEL_GLOBAL_RANKS = ranks
-        for j in range(tensor_model_parallel_size):
-            ranks_with_cp = range(start_rank + j, end_rank, tensor_model_parallel_size)
-            all_data_parallel_group_ranks_with_cp.append(list(ranks_with_cp))
-            group_with_cp = torch.distributed.new_group(
-                ranks_with_cp, pg_options=get_nccl_options('dp_cp', nccl_comm_cfgs)
-            )
-            group_with_cp_gloo = torch.distributed.new_group(ranks_with_cp, backend="gloo")
-            if rank in ranks_with_cp:
-                _DATA_PARALLEL_GROUP_WITH_CP = group_with_cp
-                _DATA_PARALLEL_GROUP_WITH_CP_GLOO = group_with_cp_gloo
-                _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = ranks_with_cp
-
-    # Apply SHARP to DP process groups
-    if use_sharp:
-        if rank == 0:
-            print(
-                "The number of process groups to use SHARP with depends on the type "
-                "of the network switch. Nvidia QM1 switch supports SAHRP up to 8 "
-                "process groups and QM2 supports up to 256 process groups. We apply "
-                "SHARP to the communications of the data-parallel domain. If the "
-                "number of data-parallel process groups is larger than the max "
-                "process groups that the network switch supports, the communication "
-                "will fall back to non-SHARP operators. To enable SHARP, "
-                "`#SBATCH_NETWORK=sharp` should be set in the sbatch script."
-            )
-        torch.distributed.barrier(
-            group=get_data_parallel_group(with_context_parallel=context_parallel_size > 1),
-            device_ids=[torch.cuda.current_device()],
-        )
-        # Set `NCCL_SHARP_DISABLE=1` to restrict SHARP application to DP process groups
-        os.environ["NCCL_SHARP_DISABLE"] = "1"
-
-    # Build the context-parallel groups.
-    global _CONTEXT_PARALLEL_GROUP
-    global _CONTEXT_PARALLEL_GLOBAL_RANKS
-    assert _CONTEXT_PARALLEL_GROUP is None, 'context parallel group is already initialized'
-    for i in range(pipeline_model_parallel_size):
-        for j in range(data_parallel_size):
-            start_rank = (
-                i * num_pipeline_model_parallel_groups
-                + j * tensor_model_parallel_size * context_parallel_size
-            )
-            end_rank = (
-                i * num_pipeline_model_parallel_groups
-                + (j + 1) * tensor_model_parallel_size * context_parallel_size
-            )
-            for k in range(tensor_model_parallel_size):
-                ranks = range(start_rank + k, end_rank, tensor_model_parallel_size)
-                group = torch.distributed.new_group(
-                    ranks, pg_options=get_nccl_options('cp', nccl_comm_cfgs)
-                )
-                if rank in ranks:
-                    _CONTEXT_PARALLEL_GROUP = group
-                    _CONTEXT_PARALLEL_GLOBAL_RANKS = ranks
-
-    # Build the sequence parallel groups.
-    global _SEQUENCE_PARALLEL_GROUP
-    assert _SEQUENCE_PARALLEL_GROUP is None, \
-        'sequence parallel group is already initialized'
-    for i in range(num_sequence_parallel_groups):
-        ranks = range(i * sequence_parallel_size,
-                      (i + 1) * sequence_parallel_size)
-        group = torch.distributed.new_group(ranks)
-        if rank in ranks:
-            _SEQUENCE_PARALLEL_GROUP = group
-
-    # Build the sequence data parallel groups.
-    global _SEQUENCE_DATA_PARALLEL_GROUP
-    assert _SEQUENCE_DATA_PARALLEL_GROUP is None, \
-        'sequence data parallel group is already initialized'
-    all_data_sequence_parallel_group_ranks = []
-    if enable_ds_sequence_parallel:
-        for i in range(num_sequence_data_parallel_groups):
-            ranks = range(i * sequence_data_parallel_size,
-                        (i + 1) * sequence_data_parallel_size)
-            group = torch.distributed.new_group(ranks)
-            all_data_sequence_parallel_group_ranks.append(list(ranks))
-            if rank in ranks:
-                _SEQUENCE_DATA_PARALLEL_GROUP = group
-    else:
-        _SEQUENCE_DATA_PARALLEL_GROUP = _DATA_PARALLEL_GROUP
-
-    # Build the model-parallel groups.
-    global _MODEL_PARALLEL_GROUP
-    assert _MODEL_PARALLEL_GROUP is None, 'model parallel group is already initialized'
-    for i in range(data_parallel_size * context_parallel_size):
-        ranks = [
-            data_parallel_group_ranks_with_cp[i]
-            for data_parallel_group_ranks_with_cp in all_data_parallel_group_ranks_with_cp
-        ]
-        group = torch.distributed.new_group(
-            ranks, pg_options=get_nccl_options('mp', nccl_comm_cfgs)
-        )
-        if rank in ranks:
-            _MODEL_PARALLEL_GROUP = group
-
-    # Build the tensor model-parallel groups.
-    global _TENSOR_MODEL_PARALLEL_GROUP
-    assert (
-        _TENSOR_MODEL_PARALLEL_GROUP is None
-    ), 'tensor model parallel group is already initialized'
-    for i in range(num_tensor_model_parallel_groups):
-        ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
-        group = torch.distributed.new_group(
-            ranks, pg_options=get_nccl_options('tp', nccl_comm_cfgs)
-        )
-        if rank in ranks:
-            _TENSOR_MODEL_PARALLEL_GROUP = group
-
-    # Build the pipeline model-parallel groups and embedding groups
-    # (first and last rank in each pipeline model-parallel group).
-    global _PIPELINE_MODEL_PARALLEL_GROUP
-    global _PIPELINE_GLOBAL_RANKS
-    assert (
-        _PIPELINE_MODEL_PARALLEL_GROUP is None
-    ), 'pipeline model parallel group is already initialized'
-    global _EMBEDDING_GROUP
-    global _EMBEDDING_GLOBAL_RANKS
-    assert _EMBEDDING_GROUP is None, 'embedding group is already initialized'
-    global _POSITION_EMBEDDING_GROUP
-    global _POSITION_EMBEDDING_GLOBAL_RANKS
-    assert _POSITION_EMBEDDING_GROUP is None, 'position embedding group is already initialized'
-    for i in range(num_pipeline_model_parallel_groups):
-        ranks = range(i, world_size, num_pipeline_model_parallel_groups)
-        group = torch.distributed.new_group(
-            ranks, pg_options=get_nccl_options('pp', nccl_comm_cfgs)
-        )
-        if rank in ranks:
-            _PIPELINE_MODEL_PARALLEL_GROUP = group
-            _PIPELINE_GLOBAL_RANKS = ranks
-        # Setup embedding group (to exchange gradients between
-        # first and last stages).
-        if len(ranks) > 1:
-            embedding_ranks = [ranks[0], ranks[-1]]
-            position_embedding_ranks = [ranks[0]]
-            if pipeline_model_parallel_split_rank is not None:
-                if ranks[pipeline_model_parallel_split_rank] not in embedding_ranks:
-                    embedding_ranks = [
-                        ranks[0],
-                        ranks[pipeline_model_parallel_split_rank],
-                        ranks[-1],
-                    ]
-                if ranks[pipeline_model_parallel_split_rank] not in position_embedding_ranks:
-                    position_embedding_ranks = [ranks[0], ranks[pipeline_model_parallel_split_rank]]
-        else:
-            embedding_ranks = ranks
-            position_embedding_ranks = ranks
-
-        group = torch.distributed.new_group(
-            embedding_ranks, pg_options=get_nccl_options('embd', nccl_comm_cfgs)
-        )
-        if rank in embedding_ranks:
-            _EMBEDDING_GROUP = group
-        if rank in ranks:
-            _EMBEDDING_GLOBAL_RANKS = embedding_ranks
-
-        group = torch.distributed.new_group(
-            position_embedding_ranks, pg_options=get_nccl_options('embd', nccl_comm_cfgs)
-        )
-        if rank in position_embedding_ranks:
-            _POSITION_EMBEDDING_GROUP = group
-        if rank in ranks:
-            _POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks
-
-    # Build the tensor + data parallel groups.
-    global _TENSOR_AND_DATA_PARALLEL_GROUP
-    global _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP
-    assert (
-        _TENSOR_AND_DATA_PARALLEL_GROUP is None
-    ), 'Tensor + data parallel group is already initialized'
-    tensor_and_data_group_size_with_cp: int = tensor_model_parallel_size * data_parallel_size * context_parallel_size
-    num_tensor_and_data_groups_with_cp: int = world_size // tensor_and_data_group_size_with_cp
-    for i in range(num_tensor_and_data_groups_with_cp):
-        start_rank = i * tensor_and_data_group_size_with_cp
-        end_rank = start_rank + tensor_and_data_group_size_with_cp
-        ranks = range(start_rank, end_rank)
-        group = torch.distributed.new_group(
-            ranks, pg_options=get_nccl_options('tp_dp_cp', nccl_comm_cfgs)
-        )
-        if rank in ranks:
-            _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = group
-
-        for j in range(context_parallel_size):
-            ranks = []
-            for k in range(data_parallel_size):
-                start_rank = (
-                    i * tensor_and_data_group_size_with_cp
-                    + j * tensor_model_parallel_size
-                    + k * tensor_model_parallel_size * context_parallel_size
-                )
-                end_rank = start_rank + tensor_model_parallel_size
-                ranks = ranks + list(range(start_rank, end_rank))
-            group = torch.distributed.new_group(
-                ranks, pg_options=get_nccl_options('tp_dp', nccl_comm_cfgs)
-            )
-            if rank in ranks:
-                _TENSOR_AND_DATA_PARALLEL_GROUP = group
-
-    # Build the tensor + expert parallel groups
-    global _TENSOR_AND_EXPERT_PARALLEL_GROUP
-    assert (
-        _TENSOR_AND_EXPERT_PARALLEL_GROUP is None
-    ), 'Tensor + expert parallel group is already initialized'
-    global _DATA_MODULO_EXPERT_PARALLEL_GROUP
-    assert (
-        _DATA_MODULO_EXPERT_PARALLEL_GROUP is None
-    ), 'Data modulo expert group is already initialized'
-    tensor_and_data_group_size: int = tensor_model_parallel_size * data_parallel_size
-    num_tensor_and_data_groups: int = world_size // tensor_and_data_group_size
-    tensor_and_expert_group_size: int = tensor_model_parallel_size * expert_model_parallel_size
-    num_expert_groups: int = data_parallel_size // expert_model_parallel_size
-    for i in range(num_tensor_and_data_groups):
-        for j in range(num_expert_groups):
-            start_rank = i * tensor_and_data_group_size + j * tensor_and_expert_group_size
-            end_rank = i * tensor_and_data_group_size + (j + 1) * tensor_and_expert_group_size
-            ranks = range(start_rank, end_rank)
-            group = torch.distributed.new_group(
-                ranks, pg_options=get_nccl_options('tp_exp', nccl_comm_cfgs)
-            )
-            if rank in ranks:
-                _TENSOR_AND_EXPERT_PARALLEL_GROUP = group
-
-    for i in range(num_tensor_and_data_groups):
-        start_rank = i * tensor_and_data_group_size
-        end_rank = (i + 1) * tensor_and_data_group_size
-        for j in range(tensor_and_expert_group_size):
-            ranks = range(start_rank + j, end_rank, tensor_and_expert_group_size)
-            group = torch.distributed.new_group(
-                ranks, pg_options=get_nccl_options('dp_modulo_exp', nccl_comm_cfgs)
-            )
-            if rank in ranks:
-                _DATA_MODULO_EXPERT_PARALLEL_GROUP = group
-
-    # Initialize global memory buffer
-    # This isn't really "parallel state" but there isn't another good place to
-    # put this. If we end up with a more generic initialization of megatron-core
-    # we could stick it there
-    _set_global_memory_buffer()
-
-
-def is_unitialized():
-    """Useful for code segments that may be accessed with or without mpu initialization"""
-    return _DATA_PARALLEL_GROUP is None
-
-
-def model_parallel_is_initialized():
-    """Check if model and data parallel groups are initialized."""
-    if (
-        _TENSOR_MODEL_PARALLEL_GROUP is None
-        or _PIPELINE_MODEL_PARALLEL_GROUP is None
-        or _DATA_PARALLEL_GROUP is None
-    ):
-        return False
-    return True
-
-def sequence_parallel_is_initialized():
-    """Check if sequence and data parallel groups are initialized."""
-    if _SEQUENCE_PARALLEL_GROUP is None or \
-        _DATA_PARALLEL_GROUP is None:
-        return False
-    return True
-
-def sequence_data_parallel_is_initialized():
-    """Check if sequence data parallel groups are initialized."""
-    if _SEQUENCE_DATA_PARALLEL_GROUP is None:
-        return False
-    return True
-
-def get_model_parallel_group():
-    """Get the model parallel group the caller rank belongs to."""
-    assert _MODEL_PARALLEL_GROUP is not None, 'model parallel group is not initialized'
-    return _MODEL_PARALLEL_GROUP
-
-
-def get_tensor_model_parallel_group(check_initialized=True):
-    """Get the tensor model parallel group the caller rank belongs to."""
-    if check_initialized:
-        assert (
-            _TENSOR_MODEL_PARALLEL_GROUP is not None
-        ), 'tensor model parallel group is not initialized'
-    return _TENSOR_MODEL_PARALLEL_GROUP
-
-
-def get_pipeline_model_parallel_group():
-    """Get the pipeline model parallel group the caller rank belongs to."""
-    assert (
-        _PIPELINE_MODEL_PARALLEL_GROUP is not None
-    ), 'pipeline_model parallel group is not initialized'
-    return _PIPELINE_MODEL_PARALLEL_GROUP
-
-def get_sequence_parallel_group():
-    """Get the sequence parallel group the caller rank belongs to."""
-    assert _SEQUENCE_PARALLEL_GROUP is not None, \
-        'sequence parallel group is not initialized'
-    return _SEQUENCE_PARALLEL_GROUP
-
-
-def get_sequence_data_parallel_group():
-    """Get the sequence parallel group the caller rank belongs to."""
-    assert _SEQUENCE_DATA_PARALLEL_GROUP is not None, \
-        'sequence data parallel group is not initialized'
-    return _SEQUENCE_DATA_PARALLEL_GROUP
-
-
-def get_data_parallel_group(with_context_parallel=False):
-    """Get the data parallel group the caller rank belongs to."""
-    if with_context_parallel:
-        assert (
-            _DATA_PARALLEL_GROUP_WITH_CP is not None
-        ), 'data parallel group with context parallel combined is not initialized'
-        return _DATA_PARALLEL_GROUP_WITH_CP
-    else:
-        assert _DATA_PARALLEL_GROUP is not None, 'data parallel group is not initialized'
-        return _DATA_PARALLEL_GROUP
-
-
-def get_data_parallel_group_gloo(with_context_parallel=False):
-    """Get the data parallel group-gloo the caller rank belongs to."""
-    if with_context_parallel:
-        assert (
-            _DATA_PARALLEL_GROUP_WITH_CP_GLOO is not None
-        ), 'data parallel group-gloo with context parallel combined is not initialized'
-        return _DATA_PARALLEL_GROUP_WITH_CP_GLOO
-    else:
-        assert _DATA_PARALLEL_GROUP_GLOO is not None, 'data parallel group-gloo is not initialized'
-        return _DATA_PARALLEL_GROUP_GLOO
-
-
-def get_context_parallel_group(check_initialized=True):
-    """Get the context parallel group the caller rank belongs to."""
-    if check_initialized:
-        assert _CONTEXT_PARALLEL_GROUP is not None, 'context parallel group is not initialized'
-    return _CONTEXT_PARALLEL_GROUP
-
-
-def get_context_parallel_global_ranks(check_initialized=True):
-    """Get all global ranks of the context parallel group that the caller rank belongs to."""
-    if check_initialized:
-        assert (
-            _CONTEXT_PARALLEL_GLOBAL_RANKS is not None
-        ), 'context parallel group is not initialized'
-    return _CONTEXT_PARALLEL_GLOBAL_RANKS
-
-
-def get_embedding_group():
-    """Get the embedding group the caller rank belongs to."""
-    assert _EMBEDDING_GROUP is not None, 'embedding group is not initialized'
-    return _EMBEDDING_GROUP
-
-
-def get_position_embedding_group():
-    """Get the position embedding group the caller rank belongs to."""
-    assert _POSITION_EMBEDDING_GROUP is not None, 'position embedding group is not initialized'
-    return _POSITION_EMBEDDING_GROUP
-
-
-def get_amax_reduction_group(with_context_parallel=False):
-    """Get the FP8 amax reduction group the caller rank belongs to."""
-    if with_context_parallel:
-        assert (
-            _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP is not None
-        ), 'FP8 amax reduction group is not initialized'
-        return _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP
-    else:
-        assert (
-            _TENSOR_AND_DATA_PARALLEL_GROUP is not None
-        ), 'FP8 amax reduction group is not initialized'
-        return _TENSOR_AND_DATA_PARALLEL_GROUP
-
-
-def get_tensor_and_data_parallel_group(with_context_parallel=False):
-    """Get the tensor and data parallel group the caller rank belongs to."""
-    if with_context_parallel:
-        assert (
-            _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP is not None
-        ), 'tensor and data parallel group is not initialized'
-        return _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP
-    else:
-        assert (
-            _TENSOR_AND_DATA_PARALLEL_GROUP is not None
-        ), 'tensor and data parallel group is not initialized'
-        return _TENSOR_AND_DATA_PARALLEL_GROUP
-
-
-def get_tensor_and_expert_parallel_group():
-    assert (
-        _TENSOR_AND_EXPERT_PARALLEL_GROUP is not None
-    ), 'tensor and expert parallel group is not initialized'
-    return _TENSOR_AND_EXPERT_PARALLEL_GROUP
-
-
-def get_data_modulo_expert_parallel_group():
-    assert (
-        _DATA_MODULO_EXPERT_PARALLEL_GROUP is not None
-    ), 'data modulo expert parallel group is not initialized'
-    return _DATA_MODULO_EXPERT_PARALLEL_GROUP
-
-
-def set_tensor_model_parallel_world_size(world_size):
-    """Set the tensor model parallel size"""
-    global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
-    _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = world_size
-
-def set_sequence_parallel_world_size(world_size):
-    """Set the sequence  parallel size"""
-    global _SEQUENCE_PARALLEL_WORLD_SIZE
-    _SEQUENCE_PARALLEL_WORLD_SIZE = world_size
-
-def set_sequence_data_parallel_world_size(world_size):
-    """Set the sequence  parallel size"""
-    global _SEQUENCE_DATA_PARALLEL_WORLD_SIZE
-    _SEQUENCE_DATA_PARALLEL_WORLD_SIZE = world_size
-
-def set_pipeline_model_parallel_world_size(world_size):
-    """Set the pipeline model parallel size"""
-    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-    _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size
-
-
-def set_virtual_pipeline_model_parallel_world_size(world_size):
-    """Set the pipeline model parallel size"""
-    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-    _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size
-
-
-def get_tensor_model_parallel_world_size():
-    """Return world size for the tensor model parallel group."""
-    global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
-    if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None:
-        return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
-    return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())
-
-def get_model_parallel_world_size():
-    assert get_pipeline_model_parallel_world_size() == 1, "legacy get_model_parallel_world_size is only supported if PP is disabled"
-    return get_tensor_model_parallel_world_size()
-
-def get_sequence_parallel_world_size():
-    """Return world size for the sequence parallel group."""
-    global _SEQUENCE_PARALLEL_WORLD_SIZE
-    if _SEQUENCE_PARALLEL_WORLD_SIZE is not None:
-        return _SEQUENCE_PARALLEL_WORLD_SIZE
-    return torch.distributed.get_world_size(group=get_sequence_parallel_group())
-
-def get_sequence_data_parallel_world_size():
-    """Return world size for the sequence parallel group."""
-    global _SEQUENCE_DATA_PARALLEL_WORLD_SIZE
-    if _SEQUENCE_DATA_PARALLEL_WORLD_SIZE is not None:
-        return _SEQUENCE_DATA_PARALLEL_WORLD_SIZE
-    return torch.distributed.get_world_size(group=get_sequence_data_parallel_group())
-
-def get_pipeline_model_parallel_world_size():
-    """Return world size for the pipeline model parallel group."""
-    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-    if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None:
-        return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-    return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group())
-
-
-def set_tensor_model_parallel_rank(rank):
-    """Set tensor model parallel rank."""
-    global _MPU_TENSOR_MODEL_PARALLEL_RANK
-    _MPU_TENSOR_MODEL_PARALLEL_RANK = rank
-
-
-def get_model_parallel_rank():
-    assert get_pipeline_model_parallel_world_size() == 1, "legacy get_model_parallel_rank is only supported if PP is disabled"
-    return get_tensor_model_parallel_rank()
-
-
-def set_sequence_parallel_rank(rank):
-    """Set sequence parallel rank."""
-    global _SEQUENCE_PARALLEL_RANK
-    _SEQUENCE_PARALLEL_RANK = rank
-
-
-def set_sequence_data_parallel_rank(rank):
-    """Set sequence parallel rank."""
-    global _SEQUENCE_DATA_PARALLEL_RANK
-    _SEQUENCE_DATA_PARALLEL_RANK = rank
-
-
-def set_pipeline_model_parallel_rank(rank):
-    """Set pipeline model parallel rank."""
-    global _MPU_PIPELINE_MODEL_PARALLEL_RANK
-    _MPU_PIPELINE_MODEL_PARALLEL_RANK = rank
-
-
-def set_pipeline_model_parallel_split_rank(rank):
-    """Set pipeline model parallel split rank."""
-    global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
-    _PIPELINE_MODEL_PARALLEL_SPLIT_RANK = rank
-
-
-def get_tensor_model_parallel_rank():
-    """Return my rank for the tensor model parallel group."""
-    global _MPU_TENSOR_MODEL_PARALLEL_RANK
-    if _MPU_TENSOR_MODEL_PARALLEL_RANK is not None:
-        return _MPU_TENSOR_MODEL_PARALLEL_RANK
-    return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
-
-
-def get_pipeline_model_parallel_rank():
-    """Return my rank for the pipeline model parallel group."""
-    global _MPU_PIPELINE_MODEL_PARALLEL_RANK
-    if _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None:
-        return _MPU_PIPELINE_MODEL_PARALLEL_RANK
-    return torch.distributed.get_rank(group=get_pipeline_model_parallel_group())
-
-
-def get_pipeline_model_parallel_split_rank():
-    """Return pipeline model parallel split rank."""
-    global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
-    return _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
-
-
-def get_sequence_parallel_rank():
-    """Return my rank for the sequence parallel group."""
-    global _SEQUENCE_PARALLEL_RANK
-    if _SEQUENCE_PARALLEL_RANK is not None:
-        return _SEQUENCE_PARALLEL_RANK
-    return torch.distributed.get_rank(group=get_sequence_parallel_group())
-
-
-def get_sequence_data_parallel_rank():
-    """Return my rank for the sequence data parallel group."""
-    global _SEQUENCE_DATA_PARALLEL_RANK
-    if _SEQUENCE_DATA_PARALLEL_RANK is not None:
-        return _SEQUENCE_DATA_PARALLEL_RANK
-    return torch.distributed.get_rank(group=get_sequence_data_parallel_group())
-
-
-def is_pipeline_first_stage(ignore_virtual=False):
-    """Return True if in the first pipeline model-parallel stage, False otherwise."""
-    if not ignore_virtual:
-        if (
-            get_virtual_pipeline_model_parallel_world_size() is not None
-            and get_virtual_pipeline_model_parallel_rank() != 0
-        ):
-            return False
-    return get_pipeline_model_parallel_rank() == 0
-
-
-def is_pipeline_last_stage(ignore_virtual=False):
-    """Return True if in the last pipeline model-parallel stage, False otherwise."""
-    if not ignore_virtual:
-        virtual_pipeline_model_parallel_world_size = (
-            get_virtual_pipeline_model_parallel_world_size()
-        )
-        if virtual_pipeline_model_parallel_world_size is not None and get_virtual_pipeline_model_parallel_rank() != (
-            virtual_pipeline_model_parallel_world_size - 1
-        ):
-            return False
-    return get_pipeline_model_parallel_rank() == (get_pipeline_model_parallel_world_size() - 1)
-
-
-def is_rank_in_embedding_group(ignore_virtual=False):
-    """Return true if current rank is in embedding group, False otherwise."""
-    rank = torch.distributed.get_rank()
-    global _EMBEDDING_GLOBAL_RANKS
-    if ignore_virtual:
-        return rank in _EMBEDDING_GLOBAL_RANKS
-    if rank in _EMBEDDING_GLOBAL_RANKS:
-        if rank == _EMBEDDING_GLOBAL_RANKS[0]:
-            return is_pipeline_first_stage(ignore_virtual=False)
-        elif rank == _EMBEDDING_GLOBAL_RANKS[-1]:
-            return is_pipeline_last_stage(ignore_virtual=False)
-        else:
-            return True
-    return False
-
-
-def is_rank_in_position_embedding_group():
-    """Return true if current rank is in position embedding group, False otherwise."""
-    rank = torch.distributed.get_rank()
-    global _POSITION_EMBEDDING_GLOBAL_RANKS
-    return rank in _POSITION_EMBEDDING_GLOBAL_RANKS
-
-
-def is_pipeline_stage_before_split(rank=None):
-    """Return True if pipeline stage executes encoder block for a model
-    with both encoder and decoder."""
-    if get_pipeline_model_parallel_world_size() == 1:
-        return True
-    if rank is None:
-        rank = get_pipeline_model_parallel_rank()
-    global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
-    if _PIPELINE_MODEL_PARALLEL_SPLIT_RANK is None:
-        return True
-    if rank < _PIPELINE_MODEL_PARALLEL_SPLIT_RANK:
-        return True
-    return False
-
-
-def is_pipeline_stage_after_split(rank=None):
-    """Return True if pipeline stage executes decoder block for a model
-    with both encoder and decoder."""
-    if get_pipeline_model_parallel_world_size() == 1:
-        return True
-    if rank is None:
-        rank = get_pipeline_model_parallel_rank()
-    global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
-    if _PIPELINE_MODEL_PARALLEL_SPLIT_RANK is None:
-        return True
-    if rank >= _PIPELINE_MODEL_PARALLEL_SPLIT_RANK:
-        return True
-    return False
-
-
-def is_pipeline_stage_at_split():
-    """Return true if pipeline stage executes decoder block and next
-    stage executes encoder block for a model with both encoder and
-    decoder."""
-    rank = get_pipeline_model_parallel_rank()
-    return is_pipeline_stage_before_split(rank) and is_pipeline_stage_after_split(rank + 1)
-
-
-def get_virtual_pipeline_model_parallel_rank():
-    """Return the virtual pipeline-parallel rank."""
-    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
-    return _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
-
-
-def set_virtual_pipeline_model_parallel_rank(rank):
-    """Set the virtual pipeline-parallel rank."""
-    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
-    _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = rank
-
-
-def get_virtual_pipeline_model_parallel_world_size():
-    """Return the virtual pipeline-parallel world size."""
-    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-    return _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-
-
-def get_tensor_model_parallel_src_rank():
-    """Calculate the global rank corresponding to the first local rank
-    in the tensor model parallel group."""
-    global_rank = torch.distributed.get_rank()
-    local_world_size = get_tensor_model_parallel_world_size()
-    return (global_rank // local_world_size) * local_world_size
-
-
-def get_sequence_parallel_src_rank():
-    """Calculate the global rank corresponding to the first local rank
-    in the sequence parallel group."""
-    global_rank = torch.distributed.get_rank()
-    local_world_size = get_sequence_parallel_world_size()
-    return (global_rank // local_world_size) * local_world_size
-
-def get_data_parallel_src_rank(with_context_parallel=False):
-    """Calculate the global rank corresponding to the first local rank
-    in the data parallel group."""
-    if with_context_parallel:
-        assert (
-            _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP is not None
-        ), "Data parallel group with context parallel combined is not initialized"
-        return _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP[0]
-    else:
-        assert _DATA_PARALLEL_GLOBAL_RANKS is not None, "Data parallel group is not initialized"
-        return _DATA_PARALLEL_GLOBAL_RANKS[0]
-
-
-def get_pipeline_model_parallel_first_rank():
-    """Return the global rank of the first process in the pipeline for the
-    current tensor parallel group"""
-    assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
-    return _PIPELINE_GLOBAL_RANKS[0]
-
-
-def get_pipeline_model_parallel_last_rank():
-    """Return the global rank of the last process in the pipeline for the
-    current tensor parallel group"""
-    assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
-    last_rank_local = get_pipeline_model_parallel_world_size() - 1
-    return _PIPELINE_GLOBAL_RANKS[last_rank_local]
-
-
-def get_pipeline_model_parallel_next_rank():
-    """Return the global rank that follows the caller in the pipeline"""
-    assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
-    rank_in_pipeline = get_pipeline_model_parallel_rank()
-    world_size = get_pipeline_model_parallel_world_size()
-    return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size]
-
-
-def get_pipeline_model_parallel_prev_rank():
-    """Return the global rank that preceeds the caller in the pipeline"""
-    assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
-    rank_in_pipeline = get_pipeline_model_parallel_rank()
-    world_size = get_pipeline_model_parallel_world_size()
-    return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size]
-
-
-def get_data_parallel_world_size(with_context_parallel=False):
-    """Return world size for the data parallel group."""
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
-        return torch.distributed.get_world_size(
-            group=get_data_parallel_group(with_context_parallel=with_context_parallel)
-        )
-    else:
-        return 0
-
-
-def get_data_parallel_rank(with_context_parallel=False):
-    """Return my rank for the data parallel group."""
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
-        return torch.distributed.get_rank(
-            group=get_data_parallel_group(with_context_parallel=with_context_parallel)
-        )
-    else:
-        return 0
-
-
-def get_context_parallel_world_size():
-    """Return world size for the context parallel group."""
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
-        return torch.distributed.get_world_size(group=get_context_parallel_group())
-    else:
-        return 0
-
-
-def get_context_parallel_rank():
-    """Return my rank for the context parallel group."""
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
-        return torch.distributed.get_rank(group=get_context_parallel_group())
-    else:
-        return 0
-
-
-def get_expert_model_parallel_world_size():
-    """Return my rank for the expert parallel group"""
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
-        tensor_and_expert_parallel_world_size = torch.distributed.get_world_size(
-            group=get_tensor_and_expert_parallel_group()
-        )
-        return tensor_and_expert_parallel_world_size // get_tensor_model_parallel_world_size()
-    else:
-        return 0
-
-
-def get_expert_model_parallel_rank():
-    """Return my rank for the expert parallel group"""
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
-        tensor_and_expert_parallel_rank = torch.distributed.get_rank(
-            group=get_tensor_and_expert_parallel_group()
-        )
-        return tensor_and_expert_parallel_rank // get_tensor_model_parallel_world_size()
-    else:
-        return 0
-
-
-def get_data_modulo_expert_parallel_rank():
-    """Return my rank for the context parallel group."""
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
-        return torch.distributed.get_rank(group=get_data_modulo_expert_parallel_group())
-    else:
-        return 0
-
-
-def _set_global_memory_buffer():
-    """Initialize global buffer"""
-    global _GLOBAL_MEMORY_BUFFER
-    assert _GLOBAL_MEMORY_BUFFER is None, 'global memory buffer is already initialized'
-    _GLOBAL_MEMORY_BUFFER = GlobalMemoryBuffer()
-
-
-def get_global_memory_buffer():
-    """Return the global GlobalMemoryBuffer object"""
-    assert _GLOBAL_MEMORY_BUFFER is not None, 'global memory buffer is not initialized'
-    return _GLOBAL_MEMORY_BUFFER
-
-
-def destroy_global_memory_buffer():
-    """Sets the global memory buffer to None"""
-    global _GLOBAL_MEMORY_BUFFER
-    _GLOBAL_MEMORY_BUFFER = None
-
-
-def destroy_model_parallel():
-    """Set the groups to none."""
-    global _MODEL_PARALLEL_GROUP
-    _MODEL_PARALLEL_GROUP = None
-    global _TENSOR_MODEL_PARALLEL_GROUP
-    _TENSOR_MODEL_PARALLEL_GROUP = None
-    global _PIPELINE_MODEL_PARALLEL_GROUP
-    _PIPELINE_MODEL_PARALLEL_GROUP = None
-    global _DATA_PARALLEL_GROUP
-    _DATA_PARALLEL_GROUP = None
-    global _SEQUENCE_PARALLEL_GROUP
-    _SEQUENCE_PARALLEL_GROUP = None
-    global _SEQUENCE_DATA_PARALLEL_GROUP
-    _SEQUENCE_DATA_PARALLEL_GROUP = None
-    global _DATA_PARALLEL_GROUP_WITH_CP
-    _DATA_PARALLEL_GROUP_WITH_CP = None
-    global _CONTEXT_PARALLEL_GROUP
-    _CONTEXT_PARALLEL_GROUP = None
-    global _CONTEXT_PARALLEL_GLOBAL_RANKS
-    _CONTEXT_PARALLEL_GLOBAL_RANKS = None
-    global _EMBEDDING_GROUP
-    _EMBEDDING_GROUP = None
-    global _POSITION_EMBEDDING_GROUP
-    _POSITION_EMBEDDING_GROUP = None
-    global _TENSOR_AND_DATA_PARALLEL_GROUP
-    _TENSOR_AND_DATA_PARALLEL_GROUP = None
-    global _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP
-    _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = None
-    global _TENSOR_AND_EXPERT_PARALLEL_GROUP
-    _TENSOR_AND_EXPERT_PARALLEL_GROUP = None
-    global _DATA_MODULO_EXPERT_PARALLEL_GROUP
-    _DATA_MODULO_EXPERT_PARALLEL_GROUP = None
-    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
-    _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None
-    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-    _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
-    global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
-    _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None
-    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
-    _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
-    global _MPU_TENSOR_MODEL_PARALLEL_RANK
-    _MPU_TENSOR_MODEL_PARALLEL_RANK = None
-    global _MPU_PIPELINE_MODEL_PARALLEL_RANK
-    _MPU_PIPELINE_MODEL_PARALLEL_RANK = None
-    global _GLOBAL_MEMORY_BUFFER
-    _GLOBAL_MEMORY_BUFFER = None
-    global _PP_FWD_HANDLES
-    _PP_FWD_HANDLES = None
-    global _PP_BWD_HANDLES
-    _PP_BWD_HANDLES = None
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/pipeline_parallel/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/pipeline_parallel/__init__.py
deleted file mode 100644
index 00cd1ff3826564f9eef6cd9b023c0dd331b5d691..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/pipeline_parallel/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .schedules import get_forward_backward_func
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/pipeline_parallel/p2p_communication.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/pipeline_parallel/p2p_communication.py
deleted file mode 100644
index 091799cea4c3d0e10c223c843fc64db28639992d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/pipeline_parallel/p2p_communication.py
+++ /dev/null
@@ -1,598 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import operator
-from functools import reduce
-from typing import Callable, List, Optional, Tuple, Union
-
-import torch
-
-from megatron_ds import core
-from megatron_ds.core import ModelParallelConfig
-from megatron_ds.core.parallel_state import (
-    get_pipeline_model_parallel_group,
-    get_pipeline_model_parallel_next_rank,
-    get_pipeline_model_parallel_prev_rank,
-    get_pipeline_model_parallel_rank,
-)
-
-try:
-    from transformer_engine.pytorch.module._common import DelayP2P
-    TE_delayp2p = True
-except:
-    TE_delayp2p = False
-
-# Types
-Shape = Union[List[int], torch.Size]
-
-
-def _communicate_shapes(tensor_send_next, tensor_send_prev, recv_prev, recv_next, config):
-    """Communicate tensor shapes between stages. Used to communicate
-    tensor shapes before the actual tensor communication happens.
-    This is required when the sequence lengths across micro batches
-    are not uniform.
-
-    Takes the following arguments:
-        tensor_send_next: tensor to send to next rank (no tensor sent if
-                          set to None).
-        tensor_send_prev: tensor to send to prev rank (no tensor sent if
-                          set to None).
-        recv_prev: boolean for whether tensor should be received from
-                   previous rank.
-        recv_next: boolean for whether tensor should be received from
-                   next rank.
-    Returns:
-        (recv_prev_shape, recv_next_shape)
-    """
-
-    recv_prev_shape_tensor = None
-    recv_next_shape_tensor = None
-    send_prev_shape_tensor = None
-    send_next_shape_tensor = None
-    if recv_prev:
-        recv_prev_shape_tensor = torch.empty(
-            (3), device=torch.cuda.current_device(), dtype=torch.int64
-        )
-    if recv_next:
-        recv_next_shape_tensor = torch.empty(
-            (3), device=torch.cuda.current_device(), dtype=torch.int64
-        )
-    if tensor_send_prev is not None:
-        send_prev_shape_tensor = torch.tensor(
-            tensor_send_prev.size(), device=torch.cuda.current_device(), dtype=torch.int64
-        )
-    if tensor_send_next is not None:
-        send_next_shape_tensor = torch.tensor(
-            tensor_send_next.size(), device=torch.cuda.current_device(), dtype=torch.int64
-        )
-
-    if config.use_ring_exchange_p2p:
-        torch.distributed.ring_exchange(
-            tensor_send_prev=send_prev_shape_tensor,
-            tensor_recv_prev=recv_prev_shape_tensor,
-            tensor_send_next=send_next_shape_tensor,
-            tensor_recv_next=recv_next_shape_tensor,
-            group=get_pipeline_model_parallel_group(),
-        )
-    else:
-        ops = []
-        if send_prev_shape_tensor is not None:
-            send_prev_op = torch.distributed.P2POp(
-                torch.distributed.isend,
-                send_prev_shape_tensor,
-                get_pipeline_model_parallel_prev_rank(),
-            )
-            ops.append(send_prev_op)
-        if recv_prev_shape_tensor is not None:
-            recv_prev_op = torch.distributed.P2POp(
-                torch.distributed.irecv,
-                recv_prev_shape_tensor,
-                get_pipeline_model_parallel_prev_rank(),
-            )
-            ops.append(recv_prev_op)
-        if send_next_shape_tensor is not None:
-            send_next_op = torch.distributed.P2POp(
-                torch.distributed.isend,
-                send_next_shape_tensor,
-                get_pipeline_model_parallel_next_rank(),
-            )
-            ops.append(send_next_op)
-        if recv_next_shape_tensor is not None:
-            recv_next_op = torch.distributed.P2POp(
-                torch.distributed.irecv,
-                recv_next_shape_tensor,
-                get_pipeline_model_parallel_next_rank(),
-            )
-            ops.append(recv_next_op)
-        if len(ops) > 0:
-            reqs = torch.distributed.batch_isend_irecv(ops)
-            for req in reqs:
-                req.wait()
-
-        # To protect against race condition when using batch_isend_irecv().
-        # should take this out once the bug with batch_isend_irecv is resolved.
-        torch.cuda.synchronize()
-
-    recv_prev_shape = [0, 0, 0]
-    if recv_prev_shape_tensor is not None:
-        recv_prev_shape = recv_prev_shape_tensor.tolist()
-
-    recv_next_shape = [0, 0, 0]
-    if recv_next_shape_tensor is not None:
-        recv_next_shape = recv_next_shape_tensor.tolist()
-
-    return recv_prev_shape, recv_next_shape
-
-
-def _batched_p2p_ops(
-    *,
-    tensor_send_prev: Optional[torch.Tensor],
-    tensor_recv_prev: Optional[torch.Tensor],
-    tensor_send_next: Optional[torch.Tensor],
-    tensor_recv_next: Optional[torch.Tensor],
-    group: torch.distributed.ProcessGroup
-):
-    ops = []
-    if tensor_send_prev is not None:
-        send_prev_op = torch.distributed.P2POp(
-            torch.distributed.isend,
-            tensor_send_prev,
-            get_pipeline_model_parallel_prev_rank(),
-            group,
-        )
-        ops.append(send_prev_op)
-    if tensor_recv_prev is not None:
-        recv_prev_op = torch.distributed.P2POp(
-            torch.distributed.irecv,
-            tensor_recv_prev,
-            get_pipeline_model_parallel_prev_rank(),
-            group,
-        )
-        ops.append(recv_prev_op)
-    if tensor_send_next is not None:
-        send_next_op = torch.distributed.P2POp(
-            torch.distributed.isend,
-            tensor_send_next,
-            get_pipeline_model_parallel_next_rank(),
-            group,
-        )
-        ops.append(send_next_op)
-    if tensor_recv_next is not None:
-        recv_next_op = torch.distributed.P2POp(
-            torch.distributed.irecv,
-            tensor_recv_next,
-            get_pipeline_model_parallel_next_rank(),
-            group,
-        )
-        ops.append(recv_next_op)
-    if len(ops) > 0:
-        reqs = torch.distributed.batch_isend_irecv(ops)
-    else:
-        reqs = []
-    return reqs
-
-
-def _p2p_ops(
-    *,
-    tensor_send_prev: Optional[torch.Tensor],
-    tensor_recv_prev: Optional[torch.Tensor],
-    tensor_send_next: Optional[torch.Tensor],
-    tensor_recv_next: Optional[torch.Tensor],
-    group: torch.distributed.ProcessGroup,
-    without_sync: bool = False,
-):
-    reqs = []
-    rank = get_pipeline_model_parallel_rank()
-    if get_pipeline_model_parallel_rank() % 2 == 0:
-        if tensor_send_next is not None:
-            # 92 is a magic number defined in torch to disable stream sync before nccl call
-            send_next_req = torch.distributed.isend(
-                tensor=tensor_send_next, dst=get_pipeline_model_parallel_next_rank(), group=group, tag=92 if without_sync else 0,
-            )
-            reqs.append(send_next_req)
-
-        if tensor_recv_prev is not None:
-            recv_prev_req = torch.distributed.irecv(
-                tensor=tensor_recv_prev, src=get_pipeline_model_parallel_prev_rank(), group=group, tag=92 if without_sync else 0,
-            )
-            reqs.append(recv_prev_req)
-
-        if tensor_send_prev is not None:
-            send_prev_req = torch.distributed.isend(
-                tensor=tensor_send_prev, dst=get_pipeline_model_parallel_prev_rank(), group=group, tag=92 if without_sync else 0,
-            )
-            reqs.append(send_prev_req)
-
-        if tensor_recv_next is not None:
-            recv_next_req = torch.distributed.irecv(
-                tensor=tensor_recv_next, src=get_pipeline_model_parallel_next_rank(), group=group, tag=92 if without_sync else 0,
-            )
-            reqs.append(recv_next_req)
-
-    else:
-        if tensor_recv_prev is not None:
-            recv_prev_req = torch.distributed.irecv(
-                tensor=tensor_recv_prev, src=get_pipeline_model_parallel_prev_rank(), group=group, tag=92 if without_sync else 0,
-            )
-            reqs.append(recv_prev_req)
-
-        if tensor_send_next is not None:
-            send_next_req = torch.distributed.isend(
-                tensor=tensor_send_next, dst=get_pipeline_model_parallel_next_rank(), group=group, tag=92 if without_sync else 0,
-            )
-            reqs.append(send_next_req)
-
-        if tensor_recv_next is not None:
-            recv_next_req = torch.distributed.irecv(
-                tensor=tensor_recv_next, src=get_pipeline_model_parallel_next_rank(), group=group, tag=92 if without_sync else 0,
-            )
-            reqs.append(recv_next_req)
-
-        if tensor_send_prev is not None:
-            send_prev_req = torch.distributed.isend(
-                tensor=tensor_send_prev, dst=get_pipeline_model_parallel_prev_rank(), group=group, tag=92 if without_sync else 0,
-            )
-            reqs.append(send_prev_req)
-    return reqs
-
-
-def _communicate(
-    *,
-    tensor_send_next: Optional[torch.Tensor],
-    tensor_send_prev: Optional[torch.Tensor],
-    recv_prev: bool,
-    recv_next: bool,
-    tensor_shape: Shape,
-    config: ModelParallelConfig,
-    wait_on_reqs: bool = True
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    """Communicate tensors between stages. Used as helper method in other
-    communication methods that are used in megatron/schedules.py.
-
-    Arguments:
-        tensor_send_next (torch.Tensor, optional):
-            Tensor to send to next rank (no tensor sent if None)
-
-        tensor_send_prev (torch.Tensor, optional):
-            Tensor to send to prev rank (no tensor sent if None)
-
-        recv_prev (boolean, required):
-            whether tensor should be received from previous rank.
-
-        recv_next (boolean, required):
-            whether tensor should be received from next rank.
-
-        tensor_shape (List[int] or torch.Size, required):
-            shape of tensor to receive (this method assumes that all
-            tensors sent and received in a single function call are
-            the same shape).
-
-        wait_on_reqs (boolean, optional, default=False):
-            For non-batched p2p communication, wait on each request
-            before returning.
-
-    Returns:
-        tuple containing
-
-        - tensor_recv_prev: torch.Tensor if recv_prev is True, None otherwise.
-        - tensor_recv_next: torch.Tensor if recv_next is True, None otherwise.
-
-    """
-
-    # Create placeholder tensors for receive in forward and backward directions
-    # if needed.
-    tensor_recv_prev = None
-    tensor_recv_next = None
-
-    if not config.variable_seq_lengths:
-        recv_prev_shape = tensor_shape
-        recv_next_shape = tensor_shape
-    else:
-        recv_prev_shape, recv_next_shape = _communicate_shapes(
-            tensor_send_next, tensor_send_prev, recv_prev, recv_next, config
-        )
-
-    if recv_prev:
-        if config.pipeline_dtype is None:
-            raise RuntimeError("pipeline_dtype must be provided if recv_prev is True")
-        if tensor_shape is None:
-            raise RuntimeError(
-                "tensor_shape must be specified if recv_prev is True. "
-                "Common tensor_shape is (seq_length, micro_batch_size, hidden_size)"
-            )
-        tensor_recv_prev = torch.empty(
-            recv_prev_shape,
-            requires_grad=True,
-            device=torch.cuda.current_device(),
-            dtype=config.pipeline_dtype,
-        )
-    if recv_next:
-        if config.pipeline_dtype is None:
-            raise RuntimeError("dtype must be provided if recv_next is True")
-        if tensor_shape is None:
-            raise RuntimeError(
-                "tensor_shape must be specified if recv_next is True. "
-                "Common tensor_shape is (seq_length, micro_batch_size, hidden_size)"
-            )
-        tensor_recv_next = torch.empty(
-            recv_next_shape,
-            requires_grad=True,
-            device=torch.cuda.current_device(),
-            dtype=config.pipeline_dtype,
-        )
-
-    # Send tensors in both the forward and backward directions as appropriate.
-    if config.use_ring_exchange_p2p:
-
-        def _ring_exchange_wrapper(**kwargs):
-            torch.distributed.ring_exchange(**kwargs)
-            return []
-
-        p2p_func = _ring_exchange_wrapper
-    elif config.batch_p2p_comm:
-        assert wait_on_reqs
-        p2p_func = _batched_p2p_ops
-    else:
-        p2p_func = _p2p_ops
-
-    if config.pp_delay and TE_delayp2p:
-        # split PP communication into different block, with Order:send, recv, send, recv....
-        if tensor_send_prev is None and tensor_recv_prev is None and tensor_send_next is None and tensor_recv_next is None:
-            reqs = []
-        else:
-            torch.cuda.current_stream().synchronize()
-            reqs = []
-            assert(tensor_shape[0] % config.pp_split_size == 0)
-            seq = tensor_shape[0] // config.pp_split_size
-            for i in range(config.pp_split_size):
-                reqs.append(DelayP2P(_p2p_ops,
-                        tensor_send_prev=None if tensor_send_prev is None else tensor_send_prev[i*seq:(i+1)*seq],
-                        tensor_recv_prev=None if tensor_recv_prev is None else tensor_recv_prev[i*seq:(i+1)*seq],
-                        tensor_send_next=None if tensor_send_next is None else tensor_send_next[i*seq:(i+1)*seq],
-                        tensor_recv_next=None if tensor_recv_next is None else tensor_recv_next[i*seq:(i+1)*seq],
-                        group=get_pipeline_model_parallel_group(),
-                        without_sync=True,
-                        ))
-    else:
-        reqs = p2p_func(
-            tensor_send_prev=tensor_send_prev,
-            tensor_recv_prev=tensor_recv_prev,
-            tensor_send_next=tensor_send_next,
-            tensor_recv_next=tensor_recv_next,
-            group=get_pipeline_model_parallel_group(),
-        )
-
-    if wait_on_reqs and len(reqs) > 0:
-        for req in reqs:
-            req.wait()
-        reqs = None
-
-    if config.batch_p2p_comm and config.batch_p2p_sync:
-        # To protect against race condition when using batch_isend_irecv().
-        # User should assert that we have a modern enough PyTorch to not need this
-        torch.cuda.synchronize()
-
-    return tensor_recv_prev, tensor_recv_next, reqs
-
-
-def recv_forward(tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor:
-    """ Receive tensor from previous rank in pipeline (forward receive).
-
-
-    See _communicate for argument details.
-    """
-
-    if core.parallel_state.is_pipeline_first_stage():
-        input_tensor = None
-    else:
-        if config.timers is not None:
-            config.timers('forward-recv', log_level=2).start()
-        input_tensor, _, _ = _communicate(
-            tensor_send_next=None,
-            tensor_send_prev=None,
-            recv_prev=True,
-            recv_next=False,
-            tensor_shape=tensor_shape,
-            config=config,
-        )
-        if config.timers is not None:
-            config.timers('forward-recv').stop()
-    return input_tensor
-
-
-def recv_backward(tensor_shape: Shape, config: ModelParallelConfig) -> torch.Tensor:
-    """Receive tensor from next rank in pipeline (backward receive).
-
-    See _communicate for argument details.
-    """
-    if core.parallel_state.is_pipeline_last_stage():
-        output_tensor_grad = None
-    else:
-        if config.timers is not None:
-            config.timers('backward-recv', log_level=2).start()
-        _, output_tensor_grad, _ = _communicate(
-            tensor_send_next=None,
-            tensor_send_prev=None,
-            recv_prev=False,
-            recv_next=True,
-            tensor_shape=tensor_shape,
-            config=config,
-        )
-        if config.timers is not None:
-            config.timers('backward-recv').stop()
-    return output_tensor_grad
-
-
-def send_forward(output_tensor: torch.Tensor, config: ModelParallelConfig) -> None:
-    """Send tensor to next rank in pipeline (forward send).
-
-    See _communicate for argument details.
-    """
-
-    if not core.parallel_state.is_pipeline_last_stage():
-        if config.timers is not None:
-            config.timers('forward-send', log_level=2).start()
-        _communicate(
-            tensor_send_next=output_tensor,
-            tensor_send_prev=None,
-            recv_prev=False,
-            recv_next=False,
-            tensor_shape=None,
-            config=config,
-        )
-        if config.timers is not None:
-            config.timers('forward-send').stop()
-
-
-def send_backward(input_tensor_grad: torch.Tensor, config: ModelParallelConfig) -> None:
-    """Send tensor to previous rank in pipeline (backward send).
-
-    See _communicate for argument details.
-    """
-    if not core.parallel_state.is_pipeline_first_stage():
-        if config.timers is not None:
-            config.timers('backward-send', log_level=2).start()
-        _communicate(
-            tensor_send_next=None,
-            tensor_send_prev=input_tensor_grad,
-            recv_prev=False,
-            recv_next=False,
-            tensor_shape=None,
-            config=config,
-        )
-        if config.timers is not None:
-            config.timers('backward-send').stop()
-
-
-def send_forward_recv_backward(
-    output_tensor: torch.Tensor, tensor_shape: Shape, config: ModelParallelConfig
-) -> torch.Tensor:
-    """Batched send and recv with next rank in pipeline.
-
-    See _communicate for argument details.
-    """
-    if core.parallel_state.is_pipeline_last_stage():
-        output_tensor_grad = None
-    else:
-        if config.timers is not None:
-            config.timers('forward-send-backward-recv', log_level=2).start()
-        _, output_tensor_grad, _ = _communicate(
-            tensor_send_next=output_tensor,
-            tensor_send_prev=None,
-            recv_prev=False,
-            recv_next=True,
-            tensor_shape=tensor_shape,
-            config=config,
-        )
-        if config.timers is not None:
-            config.timers('forward-send-backward-recv').stop()
-    return output_tensor_grad
-
-
-def send_backward_recv_forward(
-    input_tensor_grad: torch.Tensor, tensor_shape: Shape, config: ModelParallelConfig
-) -> torch.Tensor:
-    """Batched send and recv with previous rank in pipeline.
-
-    See _communicate for argument details.
-    """
-    if core.parallel_state.is_pipeline_first_stage():
-        input_tensor = None
-    else:
-        if config.timers is not None:
-            config.timers('backward-send-forward-recv', log_level=2).start()
-        input_tensor, _, _ = _communicate(
-            tensor_send_next=None,
-            tensor_send_prev=input_tensor_grad,
-            recv_prev=True,
-            recv_next=False,
-            tensor_shape=tensor_shape,
-            config=config,
-        )
-        if config.timers is not None:
-            config.timers('backward-send-forward-recv').stop()
-    return input_tensor
-
-
-def send_forward_recv_forward(
-    output_tensor: torch.Tensor,
-    recv_prev: bool,
-    tensor_shape: Shape,
-    config: ModelParallelConfig,
-    overlap_p2p_comm: bool = False,
-) -> torch.Tensor:
-    """Batched recv from previous rank and send to next rank in pipeline.
-
-    See _communicate for argument details.
-    """
-    if config.timers is not None:
-        config.timers('forward-send-forward-recv', log_level=2).start()
-    input_tensor, _, wait_handles = _communicate(
-        tensor_send_next=output_tensor,
-        tensor_send_prev=None,
-        recv_prev=recv_prev,
-        recv_next=False,
-        tensor_shape=tensor_shape,
-        wait_on_reqs=(not overlap_p2p_comm),
-        config=config,
-    )
-    if config.timers is not None:
-        config.timers('forward-send-forward-recv').stop()
-    if overlap_p2p_comm:
-        return input_tensor, wait_handles
-    return input_tensor
-
-
-def send_backward_recv_backward(
-    input_tensor_grad: torch.Tensor,
-    recv_next: bool,
-    tensor_shape: Shape,
-    config: ModelParallelConfig,
-    overlap_p2p_comm: bool = False,
-) -> torch.Tensor:
-    """Batched recv from next rank and send to previous rank in pipeline.
-
-    See _communicate for argument details.
-    """
-    if config.timers is not None:
-        config.timers('backward-send-backward-recv', log_level=2).start()
-    _, output_tensor_grad, wait_handles = _communicate(
-        tensor_send_next=None,
-        tensor_send_prev=input_tensor_grad,
-        recv_prev=False,
-        recv_next=recv_next,
-        tensor_shape=tensor_shape,
-        wait_on_reqs=(not overlap_p2p_comm),
-        config=config,
-    )
-    if config.timers is not None:
-        config.timers('backward-send-backward-recv').stop()
-    if overlap_p2p_comm:
-        return output_tensor_grad, wait_handles
-    return output_tensor_grad
-
-
-def send_forward_backward_recv_forward_backward(
-    output_tensor: torch.Tensor,
-    input_tensor_grad: torch.Tensor,
-    recv_prev: bool,
-    recv_next: bool,
-    tensor_shape: Shape,
-    config: ModelParallelConfig,
-) -> torch.Tensor:
-    """Batched send and recv with previous and next ranks in pipeline.
-
-    See _communicate for argument details.
-    """
-    if config.timers is not None:
-        config.timers('forward-backward-send-forward-backward-recv', log_level=2).start()
-    input_tensor, output_tensor_grad, _ = _communicate(
-        tensor_send_next=output_tensor,
-        tensor_send_prev=input_tensor_grad,
-        recv_prev=recv_prev,
-        recv_next=recv_next,
-        tensor_shape=tensor_shape,
-        config=config,
-    )
-    if config.timers is not None:
-        config.timers('forward-backward-send-forward-backward-recv').stop()
-    return input_tensor, output_tensor_grad
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/pipeline_parallel/schedules.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/pipeline_parallel/schedules.py
deleted file mode 100644
index 7d8224d96650399ae0dc5a97d005f139b59b3e63..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/pipeline_parallel/schedules.py
+++ /dev/null
@@ -1,1307 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import contextlib
-from typing import Callable, Iterator, List, Optional, Union
-
-import torch
-from torch.autograd.variable import Variable
-
-from megatron_ds.core import parallel_state
-from megatron_ds.core.enums import ModelType
-from megatron_ds.core.pipeline_parallel import p2p_communication
-from megatron_ds.core.utils import get_attr_wrapped_model, get_model_config, get_model_type
-
-try:
-    from transformer_engine.pytorch.module import _common as TE_common
-    handles = TE_common
-except:
-    handles = parallel_state
-
-# Types
-Shape = Union[List[int], torch.Size]
-
-
-def get_forward_backward_func():
-    """Retrieves the appropriate forward_backward function given the
-    configuration of parallel_state.
-
-    Returns a function that will perform all of the forward and
-    backward passes of the model given the pipeline model parallel
-    world size and virtual pipeline model parallel world size in the
-    global parallel_state.
-
-    Note that if using sequence parallelism, the sequence length component of
-    the tensor shape is updated to original_sequence_length /
-    tensor_model_parallel_world_size.
-
-    The function returned takes the following arguments:
-
-    forward_step_func (required): A function that takes a data
-        iterator and a model as its arguments and return the model's
-        forward output and the loss function. The loss function should
-        take one torch.Tensor and return a torch.Tensor of loss and a
-        dictionary of string -> torch.Tensor.
-
-        A third argument, checkpoint_activations_microbatch, indicates
-        that the activations for this microbatch should be
-        checkpointed. A None value for this argument indicates that
-        the default from the configuration should be used. This is
-        used when the
-        num_microbatches_with_partial_activation_checkpoints is used.
-
-        For example:
-
-        def loss_func(loss_mask, output_tensor):
-            losses = output_tensor.float()
-            loss_mask = loss_mask.view(-1).float()
-            loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
-
-            # Reduce loss for logging.
-            averaged_loss = average_losses_across_data_parallel_group([loss])
-
-            return loss, {'lm loss': averaged_loss[0]}
-
-        def forward_step(data_iterator, model):
-            data, loss_mask = next(data_iterator)
-            output = model(data)
-            return output, partial(loss_func, loss_mask)
-
-
-        forward_backward_func(forward_step_func=forward_step, ...)
-
-
-    data_iterator (required): an iterator over the data, will be
-        passed as is to forward_step_func. Expected to be a list of
-        iterators in the case of interleaved pipeline parallelism.
-
-    model (required): the actual model. Expected to be a list of modules in the case of interleaved
-        pipeline parallelism. Must be a (potentially wrapped) megatron_ds.core.models.MegatronModule.
-
-    num_microbatches (int, required):
-        The number of microbatches to go through
-
-    seq_length (int, required): Sequence length of the current global batch. If this is a dual-stack
-        transformer, this is the encoder's sequence length. This is ignored if variable_seq_lengths
-        in the config is True. Otherwise, each microbatch in the current global batch size must use
-        this sequence length.
-
-    micro_batch_size (int, required): The number of sequences in a microbatch.
-
-    decoder_seq_length (int, optional): The sequence length for the decoder in a dual-stack
-        transformer. This is ignored for a single-stack transformer.
-
-    forward_only (optional, default = False): Perform only the forward step
-
-    collect_non_loss_data (optional, bool, default=False): TODO
-
-    """
-    pipeline_model_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()
-    if pipeline_model_parallel_size > 1:
-        if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None:
-            forward_backward_func = forward_backward_pipelining_with_interleaving
-        else:
-            forward_backward_func = forward_backward_pipelining_without_interleaving
-    else:
-        forward_backward_func = forward_backward_no_pipelining
-    return forward_backward_func
-
-
-def deallocate_output_tensor(out, deallocate_pipeline_outputs=False):
-    '''Pseudo-deallocate (i.e., set to scalar) the output tensor's '.data' field.
-
-    This method should be called right after the output tensor has been
-    sent to the next pipeline stage. At this point, the output tensor is
-    only useful for its '.grad_fn' field, and not its '.data'.
-    '''
-    if (out is None) or (not deallocate_pipeline_outputs):
-        return
-    assert isinstance(out, torch.Tensor), "expected Tensor, found %s." % type(out).__name__
-    assert out._base is None, "counter-productive to free a view of another tensor."
-    out.data = torch.empty((1,), device=out.device, dtype=out.dtype,)
-
-
-def custom_backward(output, grad_output):
-    '''Directly call C++ autograd engine.
-
-    To make the 'deallocate_output_tensor' (above) optimization work, the C++
-    autograd engine must be called directly, bypassing Pytorch's
-    torch.autograd.backward. Pytorch's 'backward' checks that the output and
-    grad have the same shape, while C++'s 'backward' does not.
-    '''
-
-    assert output.numel() == 1, "output should be pseudo-'freed' in schedule, to optimize memory"
-    assert isinstance(output, torch.Tensor), "output == '%s'." % type(output).__name__
-    assert isinstance(grad_output, (torch.Tensor, type(None))), (
-        "grad_output == '%s'." % type(grad_output).__name__
-    )
-
-    # Handle scalar output
-    if grad_output is None:
-        assert output.numel() == 1, "implicit grad requires scalar output."
-        grad_output = torch.ones_like(output, memory_format=torch.preserve_format,)
-
-    # Call c++ engine [ see torch/csrc/autograd/python_engine.cpp ]
-    Variable._execution_engine.run_backward(
-        tensors=(output,),
-        grad_tensors=(grad_output,),
-        keep_graph=False,
-        create_graph=False,
-        inputs=tuple(),
-        allow_unreachable=True,
-        accumulate_grad=True,
-    )
-
-
-def forward_step(
-    forward_step_func,
-    data_iterator,
-    model,
-    num_microbatches,
-    input_tensor,
-    forward_data_store,
-    config,
-    collect_non_loss_data=False,
-    checkpoint_activations_microbatch=None,
-):
-    """Forward step for passed-in model.
-
-    If first stage, input tensor is obtained from data_iterator, otherwise
-    passed-in input_tensor is used.
-
-    Returns output tensor."""
-    if config.timers is not None:
-        config.timers('forward-compute', log_level=2).start()
-
-    unwrap_output_tensor = False
-    if not isinstance(input_tensor, list):
-        input_tensor = [input_tensor]
-        unwrap_output_tensor = True
-
-    set_input_tensor = get_attr_wrapped_model(model, "set_input_tensor")
-    set_input_tensor(input_tensor)
-
-    if config.enable_autocast:
-        context_manager = torch.autocast("cuda", dtype=config.autocast_dtype)
-    else:
-        context_manager = contextlib.nullcontext()
-    with context_manager:
-        if checkpoint_activations_microbatch is None:
-            output_tensor, loss_func = forward_step_func(data_iterator, model)
-        else:
-            output_tensor, loss_func = forward_step_func(
-                data_iterator, model, checkpoint_activations_microbatch
-            )
-
-    if parallel_state.is_pipeline_last_stage():
-        if not collect_non_loss_data:
-            output_tensor = loss_func(output_tensor)
-            loss, loss_reduced = output_tensor
-            output_tensor = loss / num_microbatches
-            forward_data_store.append(loss_reduced)
-        else:
-            data = loss_func(output_tensor, non_loss_data=True)
-            forward_data_store.append(data)
-
-    if config.timers is not None:
-        config.timers('forward-compute').stop()
-
-    # If T5 model (or other model with encoder and decoder)
-    # and in decoder stack, then send encoder_hidden_state
-    # downstream as well.
-    model_type = get_model_type(model)
-    if (
-        parallel_state.is_pipeline_stage_after_split()
-        and model_type == ModelType.encoder_and_decoder
-    ):
-        return [output_tensor, input_tensor[-1]]
-    if unwrap_output_tensor:
-        return output_tensor
-    return [output_tensor]
-
-
-def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config):
-    """Backward step through passed-in output tensor.
-
-    If last stage, output_tensor_grad is None, otherwise gradient of loss
-    with respect to stage's output tensor.
-
-    Returns gradient of loss with respect to input tensor (None if first
-    stage)."""
-
-    # NOTE: This code currently can handle at most one skip connection. It
-    # needs to be modified slightly to support arbitrary numbers of skip
-    # connections.
-
-    if config.timers is not None:
-        config.timers('backward-compute', log_level=2).start()
-
-    # Retain the grad on the input_tensor.
-    unwrap_input_tensor_grad = False
-    if not isinstance(input_tensor, list):
-        input_tensor = [input_tensor]
-        unwrap_input_tensor_grad = True
-    for x in input_tensor:
-        if x is not None:
-            x.retain_grad()
-
-    if not isinstance(output_tensor, list):
-        output_tensor = [output_tensor]
-    if not isinstance(output_tensor_grad, list):
-        output_tensor_grad = [output_tensor_grad]
-
-    # Backward pass.
-    if output_tensor_grad[0] is None and config.grad_scale_func is not None:
-        output_tensor[0] = config.grad_scale_func(output_tensor[0])
-
-    if config.deallocate_pipeline_outputs:
-        custom_backward(output_tensor[0], output_tensor_grad[0])
-    else:
-        torch.autograd.backward(output_tensor[0], grad_tensors=output_tensor_grad[0])
-
-    # Collect the grad of the input_tensor.
-    input_tensor_grad = [None]
-    if input_tensor is not None:
-        input_tensor_grad = []
-        for x in input_tensor:
-            if x is None:
-                input_tensor_grad.append(None)
-            else:
-                input_tensor_grad.append(x.grad)
-
-    # Handle single skip connection if it exists (encoder_hidden_state in
-    # model with encoder and decoder).
-    if (
-        parallel_state.get_pipeline_model_parallel_world_size() > 1
-        and parallel_state.is_pipeline_stage_after_split()
-        and model_type == ModelType.encoder_and_decoder
-    ):
-        if output_tensor_grad[1] is not None:
-            input_tensor_grad[-1].add_(output_tensor_grad[1])
-    if unwrap_input_tensor_grad:
-        input_tensor_grad = input_tensor_grad[0]
-
-    if config.timers is not None:
-        config.timers('backward-compute').stop()
-
-    return input_tensor_grad
-
-
-def forward_backward_no_pipelining(
-    *,
-    forward_step_func,
-    data_iterator: Union[Iterator, List[Iterator]],
-    model: Union[torch.nn.Module, List[torch.nn.Module]],
-    num_microbatches: int,
-    seq_length: int,  # unused
-    micro_batch_size: int,  # unused
-    decoder_seq_length: int = None,  # unused
-    forward_only: bool = False,
-    collect_non_loss_data: bool = False,
-):
-    """Run forward and backward passes with no pipeline parallelism
-    (no inter-stage communication).
-
-    Returns dictionary with losses.
-
-
-    See get_forward_backward_func() for argument details
-    """
-
-    if isinstance(model, list):
-        assert len(model) == 1, "non-pipeline-parallel schedule does not support model chunking"
-        model = model[0]
-    if isinstance(data_iterator, list):
-        assert (
-            len(data_iterator) == 1
-        ), "non-pipeline-parallel schedule does not support model chunking"
-        data_iterator = data_iterator[0]
-
-    config = get_model_config(model)
-    if config.timers is not None:
-        config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time)
-
-    no_sync_func = config.no_sync_func
-    if no_sync_func is None:
-        no_sync_func = contextlib.nullcontext
-
-    model_type = get_model_type(model)
-
-    forward_data_store = []
-    input_tensor, output_tensor_grad = None, None
-    with no_sync_func():
-        for i in range(num_microbatches - 1):
-            output_tensor = forward_step(
-                forward_step_func,
-                data_iterator,
-                model,
-                num_microbatches,
-                input_tensor,
-                forward_data_store,
-                config,
-                collect_non_loss_data,
-            )
-            if not forward_only:
-                backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config)
-
-    # Run computation for last microbatch out of context handler (want to
-    # synchronize gradients).
-    output_tensor = forward_step(
-        forward_step_func,
-        data_iterator,
-        model,
-        num_microbatches,
-        input_tensor,
-        forward_data_store,
-        config,
-        collect_non_loss_data,
-    )
-
-    if not forward_only:
-        backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config)
-
-    if config.timers is not None:
-        config.timers('forward-backward').stop()
-
-    if config.finalize_model_grads_func is not None and not forward_only:
-        # Finalize model grads (perform full grad all-reduce / reduce-scatter for
-        # data parallelism and layernorm all-reduce for sequence parallelism).
-        config.finalize_model_grads_func([model])
-
-    return forward_data_store
-
-
-def forward_backward_pipelining_with_interleaving(
-    *,
-    forward_step_func,
-    data_iterator: Union[Iterator, List[Iterator]],
-    model: Union[torch.nn.Module, List[torch.nn.Module]],
-    num_microbatches: int,
-    seq_length: int,
-    micro_batch_size: int,
-    decoder_seq_length: int = None,
-    forward_only: bool = False,
-    collect_non_loss_data: bool = False,
-):
-    """Run interleaved 1F1B schedule (model split into model chunks), with
-    communication between pipeline stages as needed.
-
-    Returns dictionary with losses if the last stage, empty dict otherwise."""
-    assert isinstance(model, list), "interleaved pipeline parallelism expected model chunking"
-    assert all(isinstance(chunk, torch.nn.Module) for chunk in model), "invalid model chunking"
-    assert isinstance(
-        data_iterator, list
-    ), "interleaved pipeline parallelism expected each model chunk to have a data iterator"
-
-    config = get_model_config(model[0])
-    if config.overlap_p2p_comm and config.batch_p2p_comm:
-        raise ValueError("Can not use both overlap_p2p_comm and batch_p2p_comm")
-
-    if config.timers is not None:
-        config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time)
-
-    # Disable async grad reductions
-    no_sync_func = config.no_sync_func
-    if isinstance(no_sync_func, list):
-
-        def multi_no_sync():
-            stack = contextlib.ExitStack()
-            for model_chunk_no_sync_func in config.no_sync_func:
-                stack.enter_context(model_chunk_no_sync_func())
-            return stack
-
-        no_sync_func = multi_no_sync
-    if no_sync_func is None:
-        no_sync_func = contextlib.nullcontext
-    no_sync_context = None
-
-    if config.grad_sync_func is not None and not isinstance(config.grad_sync_func, list):
-        config.grad_sync_func = [config.grad_sync_func for _ in model]
-
-    if config.param_sync_func is not None and not isinstance(config.param_sync_func, list):
-        config.param_sync_func = [config.param_sync_func for _ in model]
-
-    def disable_grad_sync():
-        """Disable asynchronous grad reductions"""
-        nonlocal no_sync_context
-        if no_sync_context is None:
-            no_sync_context = no_sync_func()
-            no_sync_context.__enter__()
-
-    def enable_grad_sync():
-        """Enable asynchronous grad reductions"""
-        nonlocal no_sync_context
-        if no_sync_context is not None:
-            no_sync_context.__exit__(None, None, None)
-            no_sync_context = None
-
-    disable_grad_sync()
-
-    # Model chunk IDs with synchronized grads
-    synchronized_model_chunks = set()
-
-    input_tensors = [[] for _ in range(len(model))]
-    output_tensors = [[] for _ in range(len(model))]
-    forward_data_store = []
-    if not forward_only:
-        output_tensor_grads = [[] for _ in range(len(model))]
-
-    pipeline_parallel_size = parallel_state.get_pipeline_model_parallel_world_size()
-    pipeline_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()
-
-    if num_microbatches % pipeline_parallel_size != 0:
-        msg = f'number of microbatches ({num_microbatches}) is not divisible by '
-        msg += f'pipeline-model-parallel-size ({pipeline_parallel_size}) '
-        msg += 'when using interleaved schedule'
-        raise RuntimeError(msg)
-
-    model_type = get_model_type(model[0])
-    if model_type == ModelType.encoder_and_decoder:
-        raise RuntimeError("Interleaving is not supported with an encoder and decoder model.")
-
-    if decoder_seq_length is not None and decoder_seq_length != seq_length:
-        raise RuntimeError(
-            "Interleaving is not supported with a different decoder sequence length."
-        )
-
-    tensor_shape = [seq_length, micro_batch_size, config.hidden_size]
-    if config.sequence_parallel:
-        tensor_shape[0] = tensor_shape[0] // parallel_state.get_tensor_model_parallel_world_size()
-
-    # Compute number of warmup and remaining microbatches.
-    num_model_chunks = len(model)
-    total_num_microbatches = num_microbatches * num_model_chunks
-    all_warmup_microbatches = False
-    if forward_only:
-        num_warmup_microbatches = total_num_microbatches
-    else:
-        # Run all forward passes and then all backward passes if number of
-        # microbatches is just the number of pipeline stages.
-        # Otherwise, perform (num_model_chunks-1)*pipeline_parallel_size on
-        # all workers, followed by more microbatches after depending on
-        # stage ID (more forward passes for earlier stages, later stages can
-        # immediately start with 1F1B).
-        if num_microbatches == pipeline_parallel_size:
-            num_warmup_microbatches = total_num_microbatches
-            all_warmup_microbatches = True
-        else:
-            num_warmup_microbatches = (pipeline_parallel_size - pipeline_parallel_rank - 1) * 2
-            num_warmup_microbatches += (num_model_chunks - 1) * pipeline_parallel_size
-            num_warmup_microbatches = min(num_warmup_microbatches, total_num_microbatches)
-    num_microbatches_remaining = total_num_microbatches - num_warmup_microbatches
-
-    # Checkpoint the activations of partial Transformer layers in a number of micro-batches
-    # within the maximum outstanding micro-batch backpropagations.
-    # Micro-batches with the ids less than 'num_microbatches_with_partial_activation_checkpoints'
-    # checkpoint partial Transformer layers (or skip checkpointing) and
-    # the rest of micro-batches within a window of micro-batches checkpoint
-    # all Transformer layers. The window of micro-batches is set by the maximum
-    # outstanding backpropagations and becomes smaller at later pipeline stages.
-    # Please refer the appendix C in https://arxiv.org/pdf/2205.05198.pdf
-    max_outstanding_backprops = None
-    if config.num_microbatches_with_partial_activation_checkpoints is not None:
-        max_outstanding_backprops = num_warmup_microbatches + 1
-
-    # Synchronize params for first two model chunks
-    if config.param_sync_func is not None:
-        config.param_sync_func[0](model[0].parameters())
-        config.param_sync_func[1](model[1].parameters())
-
-    def get_model_chunk_id(microbatch_id, forward):
-        """Helper method to get the model chunk ID given the iteration number."""
-        microbatch_id_in_group = microbatch_id % (pipeline_parallel_size * num_model_chunks)
-        model_chunk_id = microbatch_id_in_group // pipeline_parallel_size
-        if not forward:
-            model_chunk_id = num_model_chunks - model_chunk_id - 1
-        return model_chunk_id
-
-    def is_first_microbatch_for_model_chunk(microbatch_id: int) -> bool:
-        """Check if an iteration is the first for a model chunk."""
-        microbatch_group_size = pipeline_parallel_size * num_model_chunks
-        num_microbatch_groups = total_num_microbatches // microbatch_group_size
-        microbatch_group_id = microbatch_id // microbatch_group_size
-        microbatch_id_in_group = microbatch_id % microbatch_group_size
-        if microbatch_group_id == 0:
-            return microbatch_id_in_group % pipeline_parallel_size == 0
-        else:
-            return False
-
-    def is_last_microbatch_for_model_chunk(microbatch_id: int) -> bool:
-        """Check if an iteration is the last for a model chunk."""
-        microbatch_group_size = pipeline_parallel_size * num_model_chunks
-        num_microbatch_groups = total_num_microbatches // microbatch_group_size
-        microbatch_group_id = microbatch_id // microbatch_group_size
-        microbatch_id_in_group = microbatch_id % microbatch_group_size
-        if microbatch_group_id == num_microbatch_groups - 1:
-            return microbatch_id_in_group % pipeline_parallel_size == pipeline_parallel_size - 1
-        else:
-            return False
-
-    def forward_step_helper(microbatch_id, checkpoint_activations_microbatch):
-        """Helper method to run forward step with model split into chunks
-        (run set_virtual_pipeline_model_parallel_rank() before calling
-        forward_step())."""
-        model_chunk_id = get_model_chunk_id(microbatch_id, forward=True)
-        parallel_state.set_virtual_pipeline_model_parallel_rank(model_chunk_id)
-
-        # launch param synchronization for next model chunk
-        # Note: Asynchronous communication tends to slow down compute.
-        # To reduce idling from mismatched microbatch times, we launch
-        # asynchronous communication at the same time across the
-        # pipeline-parallel group.
-        if config.param_sync_func is not None:
-            param_sync_microbatch_id = microbatch_id + pipeline_parallel_rank
-            if (
-                param_sync_microbatch_id < total_num_microbatches
-                and is_first_microbatch_for_model_chunk(param_sync_microbatch_id)
-            ):
-                param_sync_chunk_id = get_model_chunk_id(param_sync_microbatch_id, forward=True) + 1
-                if 1 < param_sync_chunk_id < num_model_chunks:
-                    config.param_sync_func[param_sync_chunk_id](
-                        model[param_sync_chunk_id].parameters()
-                    )
-
-        # forward step
-        if parallel_state.is_pipeline_first_stage():
-            if len(input_tensors[model_chunk_id]) == len(output_tensors[model_chunk_id]):
-                input_tensors[model_chunk_id].append(None)
-        input_tensor = input_tensors[model_chunk_id][-1]
-        output_tensor = forward_step(
-            forward_step_func,
-            data_iterator[model_chunk_id],
-            model[model_chunk_id],
-            num_microbatches,
-            input_tensor,
-            forward_data_store,
-            config,
-            collect_non_loss_data,
-            checkpoint_activations_microbatch,
-        )
-        output_tensors[model_chunk_id].append(output_tensor)
-
-        # if forward-only, no need to save tensors for a backward pass
-        if forward_only:
-            input_tensors[model_chunk_id].pop()
-            output_tensors[model_chunk_id].pop()
-
-        return output_tensor
-
-    def backward_step_helper(microbatch_id):
-        """Helper method to run backward step with model split into chunks
-        (run set_virtual_pipeline_model_parallel_rank() before calling
-        backward_step())."""
-        model_chunk_id = get_model_chunk_id(microbatch_id, forward=False)
-        parallel_state.set_virtual_pipeline_model_parallel_rank(model_chunk_id)
-
-        # launch grad synchronization (default)
-        if config.grad_sync_func is None and is_last_microbatch_for_model_chunk(microbatch_id):
-            enable_grad_sync()
-            synchronized_model_chunks.add(model_chunk_id)
-
-        if parallel_state.is_pipeline_last_stage():
-            if len(output_tensor_grads[model_chunk_id]) == 0:
-                output_tensor_grads[model_chunk_id].append(None)
-        input_tensor = input_tensors[model_chunk_id].pop(0)
-        output_tensor = output_tensors[model_chunk_id].pop(0)
-        output_tensor_grad = output_tensor_grads[model_chunk_id].pop(0)
-        input_tensor_grad = backward_step(
-            input_tensor, output_tensor, output_tensor_grad, model_type, config
-        )
-
-        # launch grad synchronization (custom grad sync)
-        # Note: Asynchronous communication tends to slow down compute.
-        # To reduce idling from mismatched microbatch times, we launch
-        # asynchronous communication at the same time across the
-        # pipeline-parallel group.
-        if config.grad_sync_func is not None:
-            grad_sync_microbatch_id = microbatch_id - pipeline_parallel_rank
-            if grad_sync_microbatch_id >= 0 and is_last_microbatch_for_model_chunk(
-                grad_sync_microbatch_id
-            ):
-                grad_sync_chunk_id = get_model_chunk_id(grad_sync_microbatch_id, forward=False)
-                enable_grad_sync()
-                config.grad_sync_func[grad_sync_chunk_id](model[grad_sync_chunk_id].parameters())
-                synchronized_model_chunks.add(grad_sync_chunk_id)
-        disable_grad_sync()
-
-        return input_tensor_grad
-
-    handles._PP_FWD_HANDLES = None
-    handles._PP_BWD_HANDLES = None
-    output_tensor = None
-    # Run warmup forward passes.
-    parallel_state.set_virtual_pipeline_model_parallel_rank(0)
-    input_tensors[0].append(p2p_communication.recv_forward(tensor_shape, config))
-
-    for k in range(num_warmup_microbatches):
-
-        if handles._PP_FWD_HANDLES is not None:
-            for req in handles._PP_FWD_HANDLES:
-                req.wait()
-        handles._PP_FWD_HANDLES = None
-        deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs)
-
-        # Decide to checkpoint all layers' activations of the current micro-batch
-        if max_outstanding_backprops is not None:
-            checkpoint_activations_microbatch = (
-                k % max_outstanding_backprops
-                >= config.num_microbatches_with_partial_activation_checkpoints
-            )
-        else:
-            checkpoint_activations_microbatch = None
-
-        output_tensor = forward_step_helper(k, checkpoint_activations_microbatch)
-
-        # Determine if tensor should be received from previous stage.
-        next_forward_model_chunk_id = get_model_chunk_id(k + 1, forward=True)
-        recv_prev = True
-        if parallel_state.is_pipeline_first_stage(ignore_virtual=True):
-            if next_forward_model_chunk_id == 0:
-                recv_prev = False
-        if k == (total_num_microbatches - 1):
-            recv_prev = False
-
-        # Don't send tensor downstream if on last stage.
-        if parallel_state.is_pipeline_last_stage():
-            output_tensor = None
-
-        # Send and receive tensors as appropriate (send tensors computed
-        # in this iteration; receive tensors for next iteration).
-        if not config.overlap_p2p_comm:
-            if (
-                k == (num_warmup_microbatches - 1)
-                and not forward_only
-                and not all_warmup_microbatches
-            ):
-                input_tensor_grad = None
-                recv_next = True
-                if parallel_state.is_pipeline_last_stage(ignore_virtual=True):
-                    recv_next = False
-                (
-                    input_tensor,
-                    output_tensor_grad,
-                ) = p2p_communication.send_forward_backward_recv_forward_backward(
-                    output_tensor,
-                    input_tensor_grad,
-                    recv_prev=recv_prev,
-                    recv_next=recv_next,
-                    tensor_shape=tensor_shape,
-                    config=config,
-                )
-                output_tensor_grads[num_model_chunks - 1].append(output_tensor_grad)
-            else:
-                input_tensor = p2p_communication.send_forward_recv_forward(
-                    output_tensor, recv_prev=recv_prev, tensor_shape=tensor_shape, config=config
-                )
-            input_tensors[next_forward_model_chunk_id].append(input_tensor)
-        else:
-            input_tensor, handles._PP_FWD_HANDLES = p2p_communication.send_forward_recv_forward(
-                output_tensor,
-                recv_prev=recv_prev,
-                tensor_shape=tensor_shape,
-                config=config,
-                overlap_p2p_comm=True,
-            )
-
-            if (
-                k == (num_warmup_microbatches - 1)
-                and not forward_only
-                and not all_warmup_microbatches
-            ):
-                input_tensor_grad = None
-                recv_next = True
-                if parallel_state.is_pipeline_last_stage(ignore_virtual=True):
-                    recv_next = False
-
-                (
-                    output_tensor_grad,
-                    handles._PP_BWD_HANDLES,
-                ) = p2p_communication.send_backward_recv_backward(
-                    input_tensor_grad,
-                    recv_next=recv_next,
-                    tensor_shape=tensor_shape,
-                    config=config,
-                    overlap_p2p_comm=True,
-                )
-
-                output_tensor_grads[num_model_chunks - 1].append(output_tensor_grad)
-            input_tensors[next_forward_model_chunk_id].append(input_tensor)
-
-        # deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs)
-
-    # Run 1F1B in steady state.
-    for k in range(num_microbatches_remaining):
-        # Forward pass.
-        forward_k = k + num_warmup_microbatches
-
-        # Decide to checkpoint all layers' activations of the current micro-batch
-        if max_outstanding_backprops is not None:
-            checkpoint_activations_microbatch = (
-                forward_k % max_outstanding_backprops
-                >= config.num_microbatches_with_partial_activation_checkpoints
-            )
-        else:
-            checkpoint_activations_microbatch = None
-        if handles._PP_FWD_HANDLES is not None:
-            for req in handles._PP_FWD_HANDLES:
-                req.wait()
-        handles._PP_FWD_HANDLES = None
-        deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs)        
-        if config.overlap_p2p_comm:
-
-            output_tensor = forward_step_helper(forward_k, checkpoint_activations_microbatch)
-
-            # Determine if current stage has anything to send in either direction,
-            # otherwise set tensor to None.
-            forward_model_chunk_id = get_model_chunk_id(forward_k, forward=True)
-            parallel_state.set_virtual_pipeline_model_parallel_rank(forward_model_chunk_id)
-
-            # Last virtual stage no activation tensor to send
-            if parallel_state.is_pipeline_last_stage():
-                output_tensor = None
-
-            # Determine if peers are sending, and where in data structure to put
-            # received tensors.
-            recv_prev = True
-            if parallel_state.is_pipeline_first_stage(ignore_virtual=True):
-                # First stage is ahead of last stage by (pipeline_parallel_size - 1).
-                next_forward_model_chunk_id = get_model_chunk_id(
-                    forward_k - (pipeline_parallel_size - 1), forward=True
-                )
-                if next_forward_model_chunk_id == (num_model_chunks - 1):
-                    recv_prev = False
-                next_forward_model_chunk_id += 1
-            else:
-                next_forward_model_chunk_id = get_model_chunk_id(forward_k + 1, forward=True)
-
-            # If last iteration, don't receive; we already received one extra
-            # before the start of the for loop.
-            if k == (num_microbatches_remaining - 1):
-                recv_prev = False
-
-            # Send activation tensor to the next stage and receive activation tensor from the
-            # previous stage
-            input_tensor, handles._PP_FWD_HANDLES = p2p_communication.send_forward_recv_forward(
-                output_tensor,
-                recv_prev=recv_prev,
-                tensor_shape=tensor_shape,
-                config=config,
-                overlap_p2p_comm=True,
-            )
-            # assert fwd_wait_handles is not None
-
-            if handles._PP_BWD_HANDLES is not None:
-                for req in handles._PP_BWD_HANDLES:
-                    req.wait()
-            handles._PP_BWD_HANDLES = None
-
-            # Backward pass.
-            backward_k = k
-            input_tensor_grad = backward_step_helper(backward_k)
-
-            backward_model_chunk_id = get_model_chunk_id(backward_k, forward=False)
-            parallel_state.set_virtual_pipeline_model_parallel_rank(backward_model_chunk_id)
-
-            # First virtual stage no activation gradient tensor to send
-            if parallel_state.is_pipeline_first_stage():
-                input_tensor_grad = None
-
-            # Determine if the current virtual stage has an activation gradient tensor to receive
-            recv_next = True
-            if parallel_state.is_pipeline_last_stage(ignore_virtual=True):
-                # Last stage is ahead of first stage by (pipeline_parallel_size - 1).
-                next_backward_model_chunk_id = get_model_chunk_id(
-                    backward_k - (pipeline_parallel_size - 1), forward=False
-                )
-                if next_backward_model_chunk_id == 0:
-                    recv_next = False
-                next_backward_model_chunk_id -= 1
-            else:
-                next_backward_model_chunk_id = get_model_chunk_id(backward_k + 1, forward=False)
-
-            output_tensor_grad, handles._PP_BWD_HANDLES = p2p_communication.send_backward_recv_backward(
-                input_tensor_grad,
-                recv_next=recv_next,
-                tensor_shape=tensor_shape,
-                config=config,
-                overlap_p2p_comm=True,
-            )
-
-        else:  # no p2p overlap
-            output_tensor = forward_step_helper(forward_k, checkpoint_activations_microbatch)
-
-            # Backward pass.
-            backward_k = k
-            input_tensor_grad = backward_step_helper(backward_k)
-
-            # Send output_tensor and input_tensor_grad, receive input_tensor
-            # and output_tensor_grad.
-
-            # Determine if current stage has anything to send in either direction,
-            # otherwise set tensor to None.
-            forward_model_chunk_id = get_model_chunk_id(forward_k, forward=True)
-            parallel_state.set_virtual_pipeline_model_parallel_rank(forward_model_chunk_id)
-            if parallel_state.is_pipeline_last_stage():
-                output_tensor = None
-
-            backward_model_chunk_id = get_model_chunk_id(backward_k, forward=False)
-            parallel_state.set_virtual_pipeline_model_parallel_rank(backward_model_chunk_id)
-            if parallel_state.is_pipeline_first_stage():
-                input_tensor_grad = None
-
-            # Determine if peers are sending, and where in data structure to put
-            # received tensors.
-            recv_prev = True
-            if parallel_state.is_pipeline_first_stage(ignore_virtual=True):
-                # First stage is ahead of last stage by (pipeline_parallel_size - 1).
-                next_forward_model_chunk_id = get_model_chunk_id(
-                    forward_k - (pipeline_parallel_size - 1), forward=True
-                )
-                if next_forward_model_chunk_id == (num_model_chunks - 1):
-                    recv_prev = False
-                next_forward_model_chunk_id += 1
-            else:
-                next_forward_model_chunk_id = get_model_chunk_id(forward_k + 1, forward=True)
-
-            recv_next = True
-            if parallel_state.is_pipeline_last_stage(ignore_virtual=True):
-                # Last stage is ahead of first stage by (pipeline_parallel_size - 1).
-                next_backward_model_chunk_id = get_model_chunk_id(
-                    backward_k - (pipeline_parallel_size - 1), forward=False
-                )
-                if next_backward_model_chunk_id == 0:
-                    recv_next = False
-                next_backward_model_chunk_id -= 1
-            else:
-                next_backward_model_chunk_id = get_model_chunk_id(backward_k + 1, forward=False)
-
-            # If last iteration, don't receive; we already received one extra
-            # before the start of the for loop.
-            if k == (num_microbatches_remaining - 1):
-                recv_prev = False
-
-            # Communicate tensors.
-            (
-                input_tensor,
-                output_tensor_grad,
-            ) = p2p_communication.send_forward_backward_recv_forward_backward(
-                output_tensor,
-                input_tensor_grad,
-                recv_prev=recv_prev,
-                recv_next=recv_next,
-                tensor_shape=tensor_shape,
-                config=config,
-            )
-            # deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs)
-
-        # Put input_tensor and output_tensor_grad in data structures in the
-        # right location.
-        if recv_prev:
-            input_tensors[next_forward_model_chunk_id].append(input_tensor)
-        if recv_next:
-            output_tensor_grads[next_backward_model_chunk_id].append(output_tensor_grad)
-
-    if handles._PP_FWD_HANDLES is not None:
-        for req in handles._PP_FWD_HANDLES:
-            req.wait()
-    handles._PP_FWD_HANDLES = None
-    deallocate_output_tensor(output_tensor, config.deallocate_pipeline_outputs)
-
-    # Run cooldown backward passes (flush out pipeline).
-    if not forward_only:
-        if config.overlap_p2p_comm and handles._PP_BWD_HANDLES is not None:
-            for wait_handle in handles._PP_BWD_HANDLES:
-                wait_handle.wait()
-            handles._PP_BWD_HANDLES = None
-
-        if all_warmup_microbatches:
-            output_tensor_grads[num_model_chunks - 1].append(
-                p2p_communication.recv_backward(tensor_shape, config=config)
-            )
-        for k in range(num_microbatches_remaining, total_num_microbatches):
-            input_tensor_grad = backward_step_helper(k)
-            next_backward_model_chunk_id = get_model_chunk_id(k + 1, forward=False)
-            recv_next = True
-            if parallel_state.is_pipeline_last_stage(ignore_virtual=True):
-                if next_backward_model_chunk_id == (num_model_chunks - 1):
-                    recv_next = False
-            if k == (total_num_microbatches - 1):
-                recv_next = False
-            output_tensor_grads[next_backward_model_chunk_id].append(
-                p2p_communication.send_backward_recv_backward(
-                    input_tensor_grad, recv_next=recv_next, tensor_shape=tensor_shape, config=config
-                )
-            )
-
-        # Launch any remaining grad reductions.
-        enable_grad_sync()
-        if config.grad_sync_func is not None:
-            for model_chunk_id in range(num_model_chunks):
-                if model_chunk_id not in synchronized_model_chunks:
-                    config.grad_sync_func[model_chunk_id](model[model_chunk_id].parameters())
-                    synchronized_model_chunks.add(model_chunk_id)
-
-    if config.timers is not None:
-        config.timers('forward-backward').stop()
-
-    if config.finalize_model_grads_func is not None and not forward_only:
-        # Finalize model grads (perform full grad all-reduce / reduce-scatter for
-        # data parallelism, layernorm all-reduce for sequence parallelism, and
-        # embedding all-reduce for pipeline parallelism).
-        config.finalize_model_grads_func(model)
-
-    return forward_data_store
-
-
-def get_tensor_shapes(
-    *,
-    rank: int,
-    model_type: ModelType,
-    seq_length: int,
-    micro_batch_size: int,
-    decoder_seq_length: int,
-    config,
-):
-    # Determine right tensor sizes (based on position of rank with respect to split
-    # rank) and model size.
-    # Send two tensors if model is T5 and rank is in decoder stage:
-    #     first tensor is decoder (pre-transpose),
-    #     second tensor is encoder (post-transpose).
-    # If model is T5 and rank is at the boundary:
-    #     send one tensor (post-transpose from encoder).
-    # Otherwise, send one tensor (pre-transpose).
-    tensor_shapes = []
-
-    seq_length = seq_length // parallel_state.get_context_parallel_world_size()
-    if config.sequence_parallel:
-        seq_length = seq_length // parallel_state.get_tensor_model_parallel_world_size()
-        if model_type == ModelType.encoder_and_decoder:
-            decoder_seq_length = (
-                decoder_seq_length // parallel_state.get_tensor_model_parallel_world_size()
-            )
-
-    if model_type == ModelType.encoder_and_decoder:
-        if parallel_state.is_pipeline_stage_before_split(rank):
-            tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size))
-        else:
-            tensor_shapes.append((decoder_seq_length, micro_batch_size, config.hidden_size))
-            tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size))
-    else:
-        tensor_shapes.append((seq_length, micro_batch_size, config.hidden_size))
-    return tensor_shapes
-
-
-def recv_forward(tensor_shapes, config):
-    input_tensors = []
-    for tensor_shape in tensor_shapes:
-        if tensor_shape is None:
-            input_tensors.append(None)
-        else:
-            input_tensors.append(p2p_communication.recv_forward(tensor_shape, config))
-    return input_tensors
-
-
-def recv_backward(tensor_shapes, config):
-    output_tensor_grads = []
-    for tensor_shape in tensor_shapes:
-        if tensor_shape is None:
-            output_tensor_grads.append(None)
-        else:
-            output_tensor_grads.append(p2p_communication.recv_backward(tensor_shape, config))
-    return output_tensor_grads
-
-
-def send_forward(output_tensors, tensor_shapes, config):
-    if not isinstance(output_tensors, list):
-        output_tensors = [output_tensors]
-    for (output_tensor, tensor_shape) in zip(output_tensors, tensor_shapes):
-        if tensor_shape is None:
-            continue
-        p2p_communication.send_forward(output_tensor, config)
-
-
-def send_backward(input_tensor_grads, tensor_shapes, config):
-    if not isinstance(input_tensor_grads, list):
-        input_tensor_grads = [input_tensor_grads]
-    for (input_tensor_grad, tensor_shape) in zip(input_tensor_grads, tensor_shapes):
-        if tensor_shape is None:
-            continue
-        p2p_communication.send_backward(input_tensor_grad, config)
-
-
-def send_forward_recv_backward(output_tensors, tensor_shapes, config):
-    if not isinstance(output_tensors, list):
-        output_tensors = [output_tensors]
-    output_tensor_grads = []
-    for (output_tensor, tensor_shape) in zip(output_tensors, tensor_shapes):
-        if tensor_shape is None:
-            output_tensor_grads.append(None)
-            continue
-        output_tensor_grad = p2p_communication.send_forward_recv_backward(
-            output_tensor, tensor_shape, config
-        )
-        output_tensor_grads.append(output_tensor_grad)
-    return output_tensor_grads
-
-
-def send_backward_recv_forward(input_tensor_grads, tensor_shapes, config):
-    if not isinstance(input_tensor_grads, list):
-        input_tensor_grads = [input_tensor_grads]
-    input_tensors = []
-    for (input_tensor_grad, tensor_shape) in zip(input_tensor_grads, tensor_shapes):
-        if tensor_shape is None:
-            input_tensors.append(None)
-            continue
-        input_tensor = p2p_communication.send_backward_recv_forward(
-            input_tensor_grad, tensor_shape, config
-        )
-        input_tensors.append(input_tensor)
-    return input_tensors
-
-
-def forward_backward_pipelining_without_interleaving(
-    *,
-    forward_step_func,
-    data_iterator: Union[Iterator, List[Iterator]],
-    model: Union[torch.nn.Module, List[torch.nn.Module]],
-    num_microbatches: int,
-    seq_length: int,
-    micro_batch_size: int,
-    decoder_seq_length: int = None,
-    forward_only: bool = False,
-    collect_non_loss_data: bool = False,
-):
-    """Run non-interleaved 1F1B schedule, with communication between pipeline
-    stages.
-
-    Returns dictionary with losses if the last stage, empty dict otherwise."""
-
-    if isinstance(model, list):
-        assert (
-            len(model) == 1
-        ), "non-interleaved pipeline parallelism does not support model chunking"
-        model = model[0]
-    if isinstance(data_iterator, list):
-        assert (
-            len(data_iterator) == 1
-        ), "non-pipeline-parallel schedule does not support model chunking"
-        data_iterator = data_iterator[0]
-
-    config = get_model_config(model)
-    if config.overlap_p2p_comm:
-        raise ValueError(
-            "Non-interleaved pipeline parallelism does not support overlapping p2p communication"
-        )
-
-    if config.timers is not None:
-        config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time)
-
-    # Disable async grad reductions
-    no_sync_func = config.no_sync_func
-    if no_sync_func is None:
-        no_sync_func = contextlib.nullcontext
-    no_sync_context = None
-
-    def disable_grad_sync():
-        """Disable asynchronous grad reductions"""
-        nonlocal no_sync_context
-        if no_sync_context is None:
-            no_sync_context = no_sync_func()
-            no_sync_context.__enter__()
-
-    def enable_grad_sync():
-        """Enable asynchronous grad reductions"""
-        nonlocal no_sync_context
-        if no_sync_context is not None:
-            no_sync_context.__exit__(None, None, None)
-            no_sync_context = None
-
-    disable_grad_sync()
-
-    # Compute number of warmup microbatches.
-    num_warmup_microbatches = (
-        parallel_state.get_pipeline_model_parallel_world_size()
-        - parallel_state.get_pipeline_model_parallel_rank()
-        - 1
-    )
-    num_warmup_microbatches = min(num_warmup_microbatches, num_microbatches)
-    num_microbatches_remaining = num_microbatches - num_warmup_microbatches
-
-    # Checkpoint the activations of partial Transformer layers in a number of micro-batches
-    # within the maximum outstanding micro-batch backpropagations.
-    # Micro-batches with the ids less than 'num_microbatches_with_partial_activation_checkpoints'
-    # checkpoint partial Transformer layers (or skip checkpointing) and
-    # the rest of micro-batches within a window of micro-batches checkpoint
-    # all Transformer layers. The window of micro-batches is set by the maximum
-    # outstanding backpropagations and becomes smaller at later pipeline stages.
-    # Please refer the appendix C in https://arxiv.org/pdf/2205.05198.pdf
-    max_outstanding_backprops = None
-    if config.num_microbatches_with_partial_activation_checkpoints is not None:
-        max_outstanding_backprops = num_warmup_microbatches + 1
-
-    model_type = get_model_type(model)
-
-    rank = parallel_state.get_pipeline_model_parallel_rank()
-    recv_tensor_shapes = get_tensor_shapes(
-        rank=rank - 1,
-        model_type=model_type,
-        seq_length=seq_length,
-        micro_batch_size=micro_batch_size,
-        decoder_seq_length=decoder_seq_length,
-        config=config,
-    )
-    send_tensor_shapes = get_tensor_shapes(
-        rank=rank,
-        model_type=model_type,
-        seq_length=seq_length,
-        micro_batch_size=micro_batch_size,
-        decoder_seq_length=decoder_seq_length,
-        config=config,
-    )
-
-    # Input, output tensors only need to be saved when doing backward passes
-    input_tensors = None
-    output_tensors = None
-    if not forward_only:
-        input_tensors = []
-        output_tensors = []
-    forward_data_store = []
-
-    # Run warmup forward passes.
-    for i in range(num_warmup_microbatches):
-        # Decide to checkpoint all layers' activations of the current micro-batch
-        if max_outstanding_backprops is not None:
-            checkpoint_activations_microbatch = (
-                i % max_outstanding_backprops
-                >= config.num_microbatches_with_partial_activation_checkpoints
-            )
-        else:
-            checkpoint_activations_microbatch = None
-
-        input_tensor = recv_forward(recv_tensor_shapes, config)
-        output_tensor = forward_step(
-            forward_step_func,
-            data_iterator,
-            model,
-            num_microbatches,
-            input_tensor,
-            forward_data_store,
-            config,
-            collect_non_loss_data,
-            checkpoint_activations_microbatch,
-        )
-        send_forward(output_tensor, send_tensor_shapes, config)
-
-        if not forward_only:
-            input_tensors.append(input_tensor)
-            output_tensors.append(output_tensor)
-            deallocate_output_tensor(output_tensor[0], config.deallocate_pipeline_outputs)
-
-    # Before running 1F1B, need to receive first forward tensor.
-    # If all microbatches are run in warmup / cooldown phase, then no need to
-    # receive this tensor here.
-    if num_microbatches_remaining > 0:
-        input_tensor = recv_forward(recv_tensor_shapes, config)
-
-    # Run 1F1B in steady state.
-    for i in range(num_microbatches_remaining):
-        last_iteration = i == (num_microbatches_remaining - 1)
-
-        # Decide to checkpoint all layers' activations of the current micro-batch
-        if max_outstanding_backprops is not None:
-            checkpoint_activations_microbatch = (
-                (i + num_warmup_microbatches) % max_outstanding_backprops
-            ) >= config.num_microbatches_with_partial_activation_checkpoints
-        else:
-            checkpoint_activations_microbatch = None
-
-        output_tensor = forward_step(
-            forward_step_func,
-            data_iterator,
-            model,
-            num_microbatches,
-            input_tensor,
-            forward_data_store,
-            config,
-            collect_non_loss_data,
-            checkpoint_activations_microbatch,
-        )
-
-        if forward_only:
-            send_forward(output_tensor, send_tensor_shapes, config)
-
-            if not last_iteration:
-                input_tensor = recv_forward(recv_tensor_shapes, config)
-
-        else:
-            output_tensor_grad = send_forward_recv_backward(
-                output_tensor, send_tensor_shapes, config
-            )
-
-            # Add input_tensor and output_tensor to end of list.
-            input_tensors.append(input_tensor)
-            output_tensors.append(output_tensor)
-            deallocate_output_tensor(output_tensor[0], config.deallocate_pipeline_outputs)
-
-            # Pop input_tensor and output_tensor from the start of the list for
-            # the backward pass.
-            input_tensor = input_tensors.pop(0)
-            output_tensor = output_tensors.pop(0)
-
-            # Enable grad sync for the last microbatch in the batch if the full
-            # backward pass completes in the 1F1B stage.
-            if num_warmup_microbatches == 0 and last_iteration:
-                if config.grad_sync_func is None or rank == 0:
-                    enable_grad_sync()
-
-            input_tensor_grad = backward_step(
-                input_tensor, output_tensor, output_tensor_grad, model_type, config
-            )
-
-            if last_iteration:
-                input_tensor = None
-                send_backward(input_tensor_grad, recv_tensor_shapes, config)
-            else:
-                input_tensor = send_backward_recv_forward(
-                    input_tensor_grad, recv_tensor_shapes, config
-                )
-
-    # Run cooldown backward passes.
-    if not forward_only:
-        for i in range(num_warmup_microbatches):
-
-            # Enable async grad reduction in the last backward pass
-            # Note: If grad sync function is provided, only enable
-            # async grad reduction in first pipeline stage. Other
-            # pipeline stages do grad reduction during pipeline
-            # bubble.
-            if i == num_warmup_microbatches - 1:
-                if config.grad_sync_func is None or rank == 0:
-                    enable_grad_sync()
-
-            input_tensor = input_tensors.pop(0)
-            output_tensor = output_tensors.pop(0)
-
-            output_tensor_grad = recv_backward(send_tensor_shapes, config)
-
-            input_tensor_grad = backward_step(
-                input_tensor, output_tensor, output_tensor_grad, model_type, config
-            )
-
-            send_backward(input_tensor_grad, recv_tensor_shapes, config)
-
-        # Launch any remaining grad reductions.
-        if no_sync_context is not None:
-            enable_grad_sync()
-            if config.grad_sync_func is not None:
-                config.grad_sync_func(model.parameters())
-
-    if config.timers is not None:
-        config.timers('forward-backward').stop()
-
-    if config.finalize_model_grads_func is not None and not forward_only:
-        # Finalize model grads (perform full grad all-reduce / reduce-scatter for
-        # data parallelism, layernorm all-reduce for sequence parallelism, and
-        # embedding all-reduce for pipeline parallelism).
-        config.finalize_model_grads_func([model])
-
-    return forward_data_store
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/__init__.py
deleted file mode 100644
index f4a384fafaa5eab0cfda48f8ea10f0951b694764..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/__init__.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from .cross_entropy import vocab_parallel_cross_entropy
-from .data import broadcast_data
-from .layers import (
-    ColumnParallelLinear,
-    RowParallelLinear,
-    VocabParallelEmbedding,
-    copy_tensor_model_parallel_attributes,
-    linear_with_grad_accumulation_and_async_allreduce,
-    param_is_not_tensor_parallel_duplicate,
-    set_defaults_if_not_set_tensor_model_parallel_attributes,
-    set_tensor_model_parallel_attributes,
-)
-from .mappings import (
-    copy_to_tensor_model_parallel_region,
-    gather_from_sequence_parallel_region,
-    gather_from_sequence_parallel_region_to_moe,
-    gather_from_tensor_model_parallel_region,
-    reduce_scatter_to_sequence_parallel_region_from_moe,
-    scatter_to_sequence_parallel_region,
-    scatter_to_tensor_model_parallel_region,
-)
-from .random import (
-    checkpoint,
-    get_cuda_rng_tracker,
-    get_data_parallel_rng_tracker_name,
-    model_parallel_cuda_manual_seed,
-    model_parallel_reconfigure_tp_seed
-)
-from .utils import (
-    gather_split_1d_tensor,
-    split_tensor_along_last_dim,
-    split_tensor_into_1d_equal_chunks,
-)
-
-__all__ = [
-    # cross_entropy.py
-    "vocab_parallel_cross_entropy",
-    # data.py
-    "broadcast_data",
-    # layers.py
-    "ColumnParallelLinear",
-    "RowParallelLinear",
-    "VocabParallelEmbedding",
-    "set_tensor_model_parallel_attributes",
-    "set_defaults_if_not_set_tensor_model_parallel_attributes",
-    "copy_tensor_model_parallel_attributes",
-    "param_is_not_tensor_parallel_duplicate",
-    "linear_with_grad_accumulation_and_async_allreduce",
-    # mappings.py
-    "copy_to_tensor_model_parallel_region",
-    "gather_from_tensor_model_parallel_region",
-    "gather_from_sequence_parallel_region",
-    #    "reduce_from_tensor_model_parallel_region",
-    "scatter_to_tensor_model_parallel_region",
-    "scatter_to_sequence_parallel_region",
-    # random.py
-    "checkpoint",
-    "get_cuda_rng_tracker",
-    "model_parallel_cuda_manual_seed",
-    # utils.py
-    "split_tensor_along_last_dim",
-    "split_tensor_into_1d_equal_chunks",
-    "gather_split_1d_tensor",
-    "gather_from_sequence_parallel_region_to_moe",
-    "reduce_scatter_to_sequence_parallel_region_from_moe",
-]
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/cross_entropy.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/cross_entropy.py
deleted file mode 100644
index b42cdbd6ac4374ad04a0371264a549cde26e13c5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/cross_entropy.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import torch
-
-from megatron_ds.core.parallel_state import (
-    get_tensor_model_parallel_group,
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
-
-from .utils import VocabUtility
-
-
-class _VocabParallelCrossEntropy(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0):
-
-        # Maximum value along vocab dimension across all GPUs.
-        logits_max = torch.max(vocab_parallel_logits, dim=-1)[0]
-        torch.distributed.all_reduce(
-            logits_max, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group()
-        )
-        # Subtract the maximum value.
-        vocab_parallel_logits = vocab_parallel_logits - logits_max.unsqueeze(dim=-1)
-
-        # Get the partition's vocab indecies
-        get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
-        partition_vocab_size = vocab_parallel_logits.size()[-1]
-        rank = get_tensor_model_parallel_rank()
-        world_size = get_tensor_model_parallel_world_size()
-        vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size)
-
-        # Create a mask of valid vocab ids (1 means it needs to be masked).
-        target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
-        masked_target = target.clone() - vocab_start_index
-        masked_target[target_mask] = 0
-
-        # Get predicted-logits = logits[target].
-        # For Simplicity, we convert logits to a 2-D tensor with size
-        # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
-        logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size)
-        masked_target_1d = masked_target.view(-1)
-        arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device)
-        predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
-        predicted_logits_1d = predicted_logits_1d.clone().contiguous()
-        predicted_logits = predicted_logits_1d.view_as(target)
-        predicted_logits[target_mask] = 0.0
-        # All reduce is needed to get the chunks from other GPUs.
-        torch.distributed.all_reduce(
-            predicted_logits,
-            op=torch.distributed.ReduceOp.SUM,
-            group=get_tensor_model_parallel_group(),
-        )
-
-        # Sum of exponential of logits along vocab dimension across all GPUs.
-        exp_logits = vocab_parallel_logits
-        torch.exp(vocab_parallel_logits, out=exp_logits)
-        sum_exp_logits = exp_logits.sum(dim=-1)
-        torch.distributed.all_reduce(
-            sum_exp_logits,
-            op=torch.distributed.ReduceOp.SUM,
-            group=get_tensor_model_parallel_group(),
-        )
-
-        # Loss = log(sum(exp(logits))) - predicted-logit.
-        loss = torch.log(sum_exp_logits) - predicted_logits
-
-        # Normalize and optionally smooth logits
-        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
-
-        vocab_size = exp_logits.size(-1)
-        if label_smoothing > 0:
-            """
-            We'd like to assign 1 / (K - 1) probability mass to every index that is not the ground truth.
-            = (1 - alpha) * y_gt + alpha * mean(y_{i for i != gt})
-            = (1 - alpha) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i
-            = ((K - 1) * (1 - alpha) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i
-            = (K * (1 - alpha) - 1) / (K - 1)) * y_gt  + (alpha / (K - 1)) * \sum_{i} y_i
-            = (1 - (alpha * K) / (K - 1)) * y_gt + ( (alpha * K) / (K - 1) ) * \sum_{i} y_i / K
-            From: https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/losses/smoothed_cross_entropy.py
-            """
-            assert 1.0 > label_smoothing > 0.0
-            smoothing = label_smoothing * vocab_size / (vocab_size - 1)
-
-            # Exp logits at this point are normalized probabilities. So we can just take the log to get log-probs.
-            log_probs = torch.log(exp_logits)
-            mean_log_probs = log_probs.mean(dim=-1)
-            loss = (1.0 - smoothing) * loss - smoothing * mean_log_probs
-
-        ctx.label_smoothing, ctx.vocab_size = label_smoothing, vocab_size
-
-        # Store softmax, target-mask and masked-target for backward pass.
-        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
-
-        return loss
-
-    @staticmethod
-    def backward(ctx, grad_output):
-
-        # Retreive tensors from the forward path.
-        softmax, target_mask, masked_target_1d = ctx.saved_tensors
-        label_smoothing, vocab_size = ctx.label_smoothing, ctx.vocab_size
-
-        # All the inputs have softmax as thier gradient.
-        grad_input = softmax
-        # For simplicity, work with the 2D gradient.
-        partition_vocab_size = softmax.size()[-1]
-        grad_2d = grad_input.view(-1, partition_vocab_size)
-
-        # Add the gradient from matching classes.
-        arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device)
-
-        softmax_update = 1.0 - target_mask.view(-1).float()
-
-        if label_smoothing > 0:
-            smoothing = label_smoothing * vocab_size / (vocab_size - 1)
-            grad_2d[arange_1d, masked_target_1d] -= (1.0 - smoothing) * softmax_update
-            average_grad = 1 / vocab_size
-            grad_2d[arange_1d, :] -= smoothing * average_grad
-        else:
-            grad_2d[arange_1d, masked_target_1d] -= softmax_update
-
-        # Finally elementwise multiplication with the output gradients.
-        grad_input.mul_(grad_output.unsqueeze(dim=-1))
-
-        return grad_input, None, None
-
-
-def vocab_parallel_cross_entropy(vocab_parallel_logits, target, label_smoothing=0.0):
-    """
-    Performs cross entropy loss when logits are split across tensor parallel ranks
-
-    Arguments:
-        vocab_parallel_logits: logits split across tensor parallel ranks
-                               dimension is [sequence_length, batch_size, hidden_size]
-
-        target: correct vocab ids of dimseion [sequence_length, micro_batch_size]
-
-        lobal_smoothing: smoothing factor, must be in range [0.0, 1.0)
-                         default is no smoothing (=0.0)
-    """
-    return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target, label_smoothing)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/layers.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/layers.py
deleted file mode 100644
index 7deb1a592a9987dbdc33125cf5eed8ddc0ddef51..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/layers.py
+++ /dev/null
@@ -1,995 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-# Parts of the code here are adapted from PyTorch
-# repo: https://github.com/pytorch/pytorch
-
-import math
-import os
-import warnings
-from packaging import version
-from typing import Callable, Optional
-
-import torch
-import torch.nn.functional as F
-import torch.nn.init as init
-from torch.cuda.amp import custom_bwd, custom_fwd
-from torch.nn.parameter import Parameter
-
-from megatron_ds.core.model_parallel_config import ModelParallelConfig
-from megatron_ds.core.parallel_state import (
-    get_global_memory_buffer,
-    get_tensor_model_parallel_group,
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
-
-from .mappings import (
-    copy_to_tensor_model_parallel_region,
-    gather_from_sequence_parallel_region,
-    gather_from_tensor_model_parallel_region,
-    reduce_from_tensor_model_parallel_region,
-    reduce_scatter_to_sequence_parallel_region,
-    scatter_to_tensor_model_parallel_region,
-)
-from .random import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name
-from .utils import VocabUtility, divide, split_tensor_along_last_dim
-
-_grad_accum_fusion_available = True
-try:
-    import fused_weight_gradient_mlp_cuda
-except ImportError:
-    _grad_accum_fusion_available = False
-
-_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {
-    'tensor_model_parallel': False,
-    'partition_dim': -1,
-    'partition_stride': 1,
-}
-
-
-def param_is_not_tensor_parallel_duplicate(param):
-    return (hasattr(param, 'tensor_model_parallel') and param.tensor_model_parallel) or (
-        get_tensor_model_parallel_rank() == 0
-    )
-
-
-def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
-    # Make sure the attributes are not set.
-    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
-        assert not hasattr(tensor, attribute)
-    # Set the attributes.
-    setattr(tensor, 'tensor_model_parallel', is_parallel)
-    setattr(tensor, 'partition_dim', dim)
-    setattr(tensor, 'partition_stride', stride)
-
-
-def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor):
-    def maybe_set(attribute, value):
-        if not hasattr(tensor, attribute):
-            setattr(tensor, attribute, value)
-
-    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
-        maybe_set(attribute, _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS[attribute])
-
-
-def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor):
-    def maybe_copy(attribute):
-        if hasattr(source_tensor, attribute):
-            setattr(destination_tensor, attribute, getattr(source_tensor, attribute))
-
-    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
-        maybe_copy(attribute)
-
-
-def _initialize_affine_weight_gpu(
-    weight, init_method, partition_dim, stride=1, expert_parallel=False
-):
-    """Initialize affine weight for model parallel on GPU."""
-
-    set_tensor_model_parallel_attributes(
-        tensor=weight, is_parallel=True, dim=partition_dim, stride=stride
-    )
-
-    if not expert_parallel:
-        with get_cuda_rng_tracker().fork():
-            init_method(weight)
-    else:
-        with get_cuda_rng_tracker().fork(get_expert_parallel_rng_tracker_name()):
-            init_method(weight)
-
-
-def _initialize_affine_weight_cpu(
-    weight,
-    output_size,
-    input_size,
-    per_partition_size,
-    partition_dim,
-    init_method,
-    stride=1,
-    return_master_weight=False,
-    *,
-    params_dtype=torch.float32,
-):
-    """Initialize affine weight for model parallel.
-
-    Build the master weight on all processes and scatter
-    the relevant chunk."""
-
-    set_tensor_model_parallel_attributes(
-        tensor=weight, is_parallel=True, dim=partition_dim, stride=stride
-    )
-
-    # Initialize master weight
-    master_weight = torch.empty(output_size, input_size, dtype=torch.float, requires_grad=False)
-    init_method(master_weight)
-    master_weight = master_weight.to(dtype=params_dtype)
-
-    # Split and copy
-    per_partition_per_stride_size = divide(per_partition_size, stride)
-    weight_list = torch.split(master_weight, per_partition_per_stride_size, dim=partition_dim)
-    rank = get_tensor_model_parallel_rank()
-    world_size = get_tensor_model_parallel_world_size()
-    my_weight_list = weight_list[rank::world_size]
-
-    with torch.no_grad():
-        torch.cat(my_weight_list, dim=partition_dim, out=weight)
-    if return_master_weight:
-        return master_weight
-    return None
-
-
-class VocabParallelEmbedding(torch.nn.Module):
-    """Embedding parallelized in the vocabulary dimension.
-
-    This is mainly adapted from torch.nn.Embedding and all the default
-    values are kept.
-    Arguments:
-        num_embeddings: vocabulary size.
-        embedding_dim: size of hidden state.
-
-    Keyword Arguments:
-        config: A megatron_ds.core.ModelParallelConfig object
-    """
-
-    def __init__(
-        self,
-        num_embeddings: int,
-        embedding_dim: int,
-        *,
-        init_method: Callable,
-        config: ModelParallelConfig,
-    ):
-        super(VocabParallelEmbedding, self).__init__()
-        # Keep the input dimensions.
-        self.num_embeddings = num_embeddings
-        self.embedding_dim = embedding_dim
-        self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
-        # Divide the weight matrix along the vocaburaly dimension.
-        (
-            self.vocab_start_index,
-            self.vocab_end_index,
-        ) = VocabUtility.vocab_range_from_global_vocab_size(
-            self.num_embeddings, get_tensor_model_parallel_rank(), self.tensor_model_parallel_size
-        )
-        self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index
-
-        # Allocate weights and initialize.
-        if config.use_cpu_initialization:
-            self.weight = Parameter(
-                torch.empty(
-                    self.num_embeddings_per_partition, self.embedding_dim, dtype=config.params_dtype
-                )
-            )
-            if config.perform_initialization:
-                _initialize_affine_weight_cpu(
-                    self.weight,
-                    self.num_embeddings,
-                    self.embedding_dim,
-                    self.num_embeddings_per_partition,
-                    0,
-                    init_method,
-                    params_dtype=config.params_dtype,
-                )
-        else:
-            self.weight = Parameter(
-                torch.empty(
-                    self.num_embeddings_per_partition,
-                    self.embedding_dim,
-                    device=torch.cuda.current_device(),
-                    dtype=config.params_dtype,
-                )
-            )
-            if config.perform_initialization:
-                _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=1)
-
-    def forward(self, input_):
-        assert not torch.any(
-            (input_ < 0) | (input_ >= self.num_embeddings)
-        ), "An input token is out of bounds of the embedding table"
-        if self.tensor_model_parallel_size > 1:
-            # Build the mask.
-            input_mask = (input_ < self.vocab_start_index) | (input_ >= self.vocab_end_index)
-            # Mask the input.
-            masked_input = input_.clone() - self.vocab_start_index
-            masked_input[input_mask] = 0
-        else:
-            masked_input = input_
-        # Get the embeddings.
-        output_parallel = self.weight[masked_input]
-        # Mask the output embedding.
-        if self.tensor_model_parallel_size > 1:
-            output_parallel[input_mask, :] = 0.0
-        # Reduce across all the model parallel GPUs.
-        output = reduce_from_tensor_model_parallel_region(output_parallel)
-        return output
-
-
-class SequenceParallelPositionEmbedding(torch.nn.Module):
-    """Embedding parallelized in the sequence dimension.
-
-    Arguments:
-        sequence_length: max sequence length.
-        embedding_dim: size of hidden state.
-    """
-
-    def __init__(self, sequence_length, embedding_dim):
-        super(SequenceParallelPositionEmbedding, self).__init__()
-        sequence_parallel_size = get_tensor_model_parallel_world_size()
-        assert sequence_length % sequence_parallel_size == 0
-        local_sequence_length = sequence_length // sequence_parallel_size
-        self.offset = local_sequence_length * get_tensor_model_parallel_rank()
-        self.local_embeddings = torch.nn.Embedding(
-            local_sequence_length, embedding_dim)
-
-    def forward(self, position_ids):
-        return self.local_embeddings(position_ids - self.offset)
-class LinearWithFrozenWeight(torch.autograd.Function):
-    """Linear operator that does not calculate gradient for weight.
-    This op and LinearWithGradAccumulationAndAsyncCommunication performs 
-    mathematically-identical forward and DGRAD. 
-
-    Conceptually this op is the same as torch.nn.functional.linear with
-    weight.requires_grad==False, but in experiments they are not identical 
-    mathematically. """
-
-    @staticmethod
-    @custom_fwd
-    def forward(
-        ctx, input, weight, bias,
-    ):
-        ctx.save_for_backward(weight)
-        output = torch.matmul(input, weight.t())
-        if bias is not None:
-            output = output + bias
-        return output
-
-    @staticmethod
-    @custom_bwd
-    def backward(ctx, grad_output):
-        (weight,) = ctx.saved_tensors
-        grad_input = grad_output.matmul(weight)
-        return grad_input, None, None
-
-
-def linear_with_frozen_weight(
-    input: torch.Tensor,
-    weight: torch.Tensor,
-    bias: Optional[torch.Tensor],
-    gradient_accumulation_fusion: bool,
-    async_grad_allreduce: bool,
-    sequence_parallel: bool,
-) -> torch.Tensor:
-    """Linear layer execution with weight.requires_grad == False.
-
-    This function handles linear layers with weight frozen (untrainable). 
-    In the forward, it only saves weight and does not save input activations.
-    In the backward, it does not perform weight gradient calculation, or 
-    weight gradient allreduce. 
-
-    Arguments:
-
-    input (torch.Tensor required): input like torch.nn.functional.linear
-
-    weight (torch.Tensor required): weight like torch.nn.functional.linear
-
-    bias (torch.Tensor optional): bias like torch.nn.functional.linear
-
-    gradient_accumulation_fusion (bool required): dummy argument, used to 
-    keep the API unified between all forward implementation functions.
-
-    async_grad_allreduce (bool required): dummy argument, used to 
-    keep the API unified between all forward implementation functions.
-
-    sequence_parallel (bool required): Indicates that sequence
-        parallelism is used and thus in the forward pass the input is
-        all gathered, and the backward pass the input gradients are
-        reduce scattered.
-    """
-
-    if sequence_parallel:
-        input = gather_from_sequence_parallel_region(input, tensor_parallel_output_grad=True)
-    else:
-        input = input
-
-    args = [
-        input,
-        weight,
-        bias,
-    ]
-
-    return LinearWithFrozenWeight.apply(*args)
-
-
-class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function):
-    """See linear_with_grad_accumulation_and_async_allreduce"""
-
-    @staticmethod
-    @custom_fwd
-    def forward(
-        ctx,
-        input,
-        weight,
-        bias,
-        gradient_accumulation_fusion,
-        async_grad_allreduce,
-        sequence_parallel,
-        inference_params=None,
-    ):
-        ctx.save_for_backward(input, weight)
-        ctx.use_bias = bias is not None
-        ctx.gradient_accumulation_fusion = gradient_accumulation_fusion
-        ctx.async_grad_allreduce = async_grad_allreduce
-        ctx.sequence_parallel = sequence_parallel
-
-        if sequence_parallel and not inference_params:
-            world_size = get_tensor_model_parallel_world_size()
-            dim_size = list(input.size())
-            dim_size[0] = dim_size[0] * world_size
-
-            all_gather_buffer = \
-                get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu")
-
-            if version.parse(torch.__version__) >= version.parse('1.13'):
-                torch.distributed.all_gather_into_tensor(
-                    all_gather_buffer,
-                    input,
-                    group=get_tensor_model_parallel_group())
-            else:
-                torch.distributed._all_gather_base(
-                    all_gather_buffer,
-                    input,
-                    group=get_tensor_model_parallel_group())
-
-            total_input = all_gather_buffer
-        else:
-            total_input = input
-
-        output = torch.matmul(total_input, weight.t())
-        if bias is not None:
-            output = output + bias
-        return output
-
-    @staticmethod
-    @custom_bwd
-    def backward(ctx, grad_output):
-        input, weight = ctx.saved_tensors
-        use_bias = ctx.use_bias
-
-        if ctx.sequence_parallel:
-            world_size = get_tensor_model_parallel_world_size()
-            dim_size = list(input.size())
-            dim_size[0] = dim_size[0] * world_size
-
-            all_gather_buffer = \
-                get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu")
-
-            if version.parse(torch.__version__) >= version.parse('1.13'):
-                handle = torch.distributed.all_gather_into_tensor(
-                    all_gather_buffer,
-                    input,
-                    group=get_tensor_model_parallel_group(), async_op=True)
-            else:
-                handle = torch.distributed._all_gather_base(
-                    all_gather_buffer,
-                    input,
-                    group=get_tensor_model_parallel_group(), async_op=True)
-
-            # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
-            # gather is scheduled before the input gradient computation
-            total_input = all_gather_buffer
-        else:
-            total_input = input
-        grad_input = grad_output.matmul(weight)
-
-        if ctx.sequence_parallel:
-            handle.wait()
-
-        # Doing gather + slicing during the NeMo forward pass can make this tensor
-        # not be contiguous. PyTorch only checks if the tensor is contiguous, and only
-        # clones it if it's not contiguous:
-        # https://github.com/pytorch/pytorch/blob/c47cf9bc7f9e02f649ab4ed53fe4d35732c92ab6/torch/_refs/__init__.py#L2761
-        grad_output = grad_output.contiguous()
-        # Convert the tensor shapes to 2D for execution compatibility
-        grad_output = grad_output.view(
-            grad_output.shape[0] * grad_output.shape[1], grad_output.shape[2]
-        )
-        total_input = total_input.view(
-            total_input.shape[0] * total_input.shape[1], total_input.shape[2]
-        )
-
-        if ctx.async_grad_allreduce:
-            # Asynchronous all-reduce
-            handle = torch.distributed.all_reduce(
-                grad_input, group=get_tensor_model_parallel_group(), async_op=True
-            )
-            # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
-            # all-reduce is scheduled before the weight gradient computation
-
-        if ctx.sequence_parallel:
-            assert not ctx.async_grad_allreduce
-            dim_size = list(input.size())
-            sub_grad_input = torch.empty(
-                dim_size, dtype=input.dtype, device=torch.cuda.current_device(), requires_grad=False
-            )
-            # reduce_scatter
-            handle = torch.distributed._reduce_scatter_base(
-                sub_grad_input, grad_input, group=get_tensor_model_parallel_group(), async_op=True
-            )
-            # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
-            # reduce scatter is scheduled before the weight gradient computation
-
-        if ctx.gradient_accumulation_fusion:
-            if weight.main_grad.dtype == torch.float32:
-                fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(
-                    total_input, grad_output, weight.main_grad
-                )
-            elif weight.main_grad.dtype in (torch.float16, torch.bfloat16):
-                fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(
-                    total_input, grad_output, weight.main_grad
-                )
-            else:
-                raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
-
-            if hasattr(weight, 'grad_added_to_main_grad'):
-                # When overlap_grad_reduce is True, need to ensure that backward hooks
-                # are all run on the main backprop thread to prevent deadlocks. Setup
-                # dummy grad_weight tensor to prevent backward hooks from being run
-                # in a background thread.
-                grad_weight = torch.empty(
-                    weight.main_grad.shape,
-                    dtype=input.dtype,
-                    device=torch.cuda.current_device(),
-                    requires_grad=False,
-                )
-                weight.grad_added_to_main_grad = True
-            else:
-                grad_weight = None
-        else:
-            grad_weight = grad_output.t().matmul(total_input)
-        grad_bias = grad_output.sum(dim=0) if use_bias else None
-
-        if ctx.sequence_parallel:
-            handle.wait()
-            return sub_grad_input, grad_weight, grad_bias, None, None, None, None
-
-        if ctx.async_grad_allreduce:
-            handle.wait()
-
-        return grad_input, grad_weight, grad_bias, None, None, None, None
-
-
-def linear_with_grad_accumulation_and_async_allreduce(
-    input: torch.Tensor,
-    weight: torch.Tensor,
-    bias: Optional[torch.Tensor],
-    gradient_accumulation_fusion: bool,
-    async_grad_allreduce: bool,
-    sequence_parallel: bool,
-    inference_params=None,
-) -> torch.Tensor:
-    """Linear layer execution with asynchronous communication and
-    gradient accumulation fusion in backprop.
-
-    This has the option to accumulate the result of backprop
-    calculation into an existing gradient buffer, preventing the need
-    to do an additional addition kernel after the gradient
-    calculation.
-
-    Additionally, the tensor parallel all reduce of the input
-    gradients can be done asynchronously with the calculation of
-    the weight gradients.
-
-    In the case of sequence parallelism, the reduce scatter of the
-    input gradients is done asynchronously with the calcluation of the
-    weight gradients.
-
-    Use of this module requires that the environment variable
-    CUDA_DEVICE_MAX_CONNECTIONS=1. There are a few collective
-    operations, noted in the code, that should be scheduled before
-    compute kernels to overlap the communication with the computation,
-    which is necessary for a speedup but not for correctness so that
-    ordering isn't imposed by the scheduler. Setting
-    CUDA_DEVICE_MAX_CONNECTIONS=1 forces the kernels to be scheduled
-    in the order they are called.
-
-    Arguments:
-
-    input (torch.Tensor required): input like torch.nn.functional.linear
-
-    weight (torch.Tensor required): weight like torch.nn.functional.linear
-
-    bias (torch.Tensor optional): bias like torch.nn.functional.linear
-
-    gradient_accumulation_fusion (bool required): Perform the gradient
-        accumulation fusion, requires the custom CUDA extension
-        fused_weight_gradient_mlp_cuda module. To use
-        gradient_accumulation_fusion you must install APEX with
-        --cpp_ext and --cuda_ext. For example: "pip install
-        --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\"
-        " Note that the extension requires CUDA>=11. Otherwise, you
-        must turn off gradient accumulation fusion."
-
-    async_grad_allreduce (bool required): Do the allreduce of input
-        gradients asyncronously with the computation of weight
-        gradients. If sequence_parallel is True, this must be
-        False, as no all reduce is performed.
-
-    sequence_parallel (bool required): Indicates that sequence
-        parallelism is used and thus in the forward pass the input is
-        all gathered, and the backward pass the input gradients are
-        reduce scattered.
-    """
-    args = [
-        input,
-        weight,
-        bias,
-        gradient_accumulation_fusion,
-        async_grad_allreduce,
-        sequence_parallel,
-        inference_params,
-    ]
-
-    if not linear_with_grad_accumulation_and_async_allreduce.warned:
-        if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
-            if sequence_parallel:
-                warnings.warn(
-                    "When using sequence parallelism it is recommended to set the "
-                    "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for "
-                    "maximum speedup"
-                )
-                linear_with_grad_accumulation_and_async_allreduce.warned = True
-
-            if async_grad_allreduce:
-                warnings.warn(
-                    "When using async grad allreduce it is recommended to set the "
-                    "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for "
-                    "maximum speedup"
-                )
-                linear_with_grad_accumulation_and_async_allreduce.warned = True
-
-    return LinearWithGradAccumulationAndAsyncCommunication.apply(*args)
-
-
-linear_with_grad_accumulation_and_async_allreduce.warned = False
-
-
-class ColumnParallelLinear(torch.nn.Module):
-    """Linear layer with column parallelism.
-
-    The linear layer is defined as Y = XA + b. A is parallelized along
-    its second dimension as A = [A_1, ..., A_p].
-
-    Arguments:
-        input_size: first dimension of matrix A.
-        output_size: second dimension of matrix A.
-
-    Keyword Arguments
-        bias: If true, add bias
-        gather_output: If true, call all-gather on output and make Y available
-                       to all GPUs, otherwise, every GPU will have its output
-                       which is Y_i = XA_i
-        init_method: method to initialize weights. Note that bias is always set
-                     to zero.
-        stride: For the strided linear layers.
-        keep_master_weight_for_test: This was added for testing and should be
-                                     set to False. It returns the master weights
-                                     used for initialization.
-        skip_bias_add: If True, do not add the bias term, instead
-                       return it to be added by the caller. This
-                       enables performance optimations where bias can
-                       be fused with other elementwise operations.
-        skip_weight_param_allocation: If True, weight parameter is not allocated and must be passed
-                                      as a keyword argument `weight` during the forward pass. Note
-                                      that this does not affect bias, which will be allocated if
-                                      bias is True. Defaults to False.
-        is_expert: If True, the layer is treated as an MoE expert layer.
-        config: ModelParallelConfig object
-        tp_comm_buffer_name: Communication buffer name is not used in
-                             non-Transformer-Engine modules.
-
-    """
-
-    def __init__(
-        self,
-        input_size,
-        output_size,
-        *,
-        config: ModelParallelConfig,
-        init_method: Callable,
-        bias=True,
-        gather_output=False,
-        stride=1,
-        keep_master_weight_for_test=False,
-        skip_bias_add=False,
-        skip_weight_param_allocation: bool = False,
-        is_expert: bool = False,
-        tp_comm_buffer_name: str = None,  # Not used
-    ):
-        torch.nn.Module.__init__(self)
-        super(ColumnParallelLinear, self).__init__()
-
-        # Keep input parameters
-        self.input_size = input_size
-        self.output_size = output_size
-        self.gather_output = gather_output
-        # Divide the weight matrix along the last dimension.
-        world_size = get_tensor_model_parallel_world_size()
-        self.output_size_per_partition = divide(output_size, world_size)
-        self.skip_bias_add = skip_bias_add
-        self.is_expert = is_expert
-        self.expert_parallel = config.expert_model_parallel_size > 1
-        self.config = config
-
-        # Parameters.
-        # Note: torch.nn.functional.linear performs XA^T + b and as a result
-        # we allocate the transpose.
-        # Initialize weight.
-        if not skip_weight_param_allocation:
-            if config.use_cpu_initialization:
-                self.weight = Parameter(
-                    torch.empty(
-                        self.output_size_per_partition, self.input_size, dtype=config.params_dtype
-                    )
-                )
-                if config.perform_initialization:
-                    self.master_weight = _initialize_affine_weight_cpu(
-                        self.weight,
-                        self.output_size,
-                        self.input_size,
-                        self.output_size_per_partition,
-                        0,
-                        init_method,
-                        stride=stride,
-                        return_master_weight=keep_master_weight_for_test,
-                    )
-            else:
-                self.weight = Parameter(
-                    torch.empty(
-                        self.output_size_per_partition,
-                        self.input_size,
-                        device=torch.cuda.current_device(),
-                        dtype=config.params_dtype,
-                    )
-                )
-                if config.perform_initialization:
-                    _initialize_affine_weight_gpu(
-                        self.weight,
-                        init_method,
-                        partition_dim=0,
-                        stride=stride,
-                        expert_parallel=(self.is_expert and self.expert_parallel),
-                    )
-
-            setattr(self.weight, 'allreduce', not (self.is_expert and self.expert_parallel))
-        else:
-            self.weight = None
-
-        if bias:
-            if config.use_cpu_initialization:
-                self.bias = Parameter(
-                    torch.empty(self.output_size_per_partition, dtype=config.params_dtype)
-                )
-            else:
-                self.bias = Parameter(
-                    torch.empty(
-                        self.output_size_per_partition,
-                        device=torch.cuda.current_device(),
-                        dtype=config.params_dtype,
-                    )
-                )
-            set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
-            if config.perform_initialization:
-                # Always initialize bias to zero.
-                with torch.no_grad():
-                    self.bias.zero_()
-            setattr(self.bias, 'allreduce', not (self.is_expert and self.expert_parallel))
-        else:
-            self.register_parameter('bias', None)
-
-        self.async_tensor_model_parallel_allreduce = (
-            config.async_tensor_model_parallel_allreduce and world_size > 1
-        )
-
-        self.sequence_parallel = config.sequence_parallel
-        if self.sequence_parallel and world_size <= 1:
-            warnings.warn(
-                f"`sequence_parallel` is set to `True`, but tensor model parallel size is {world_size}. "
-                f"Disabling sequence parallel."
-            )
-            self.sequence_parallel = False
-
-        if config.gradient_accumulation_fusion and not _grad_accum_fusion_available:
-            raise RuntimeError(
-                "ColumnParallelLinear was called with gradient_accumulation_fusion set "
-                "to True but the custom CUDA extension fused_weight_gradient_mlp_cuda "
-                "module is not found. To use gradient_accumulation_fusion you must "
-                "install APEX with --cpp_ext and --cuda_ext. For example: "
-                "pip install --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" "
-                "Note that the extension requires CUDA>=11. Otherwise, you must turn off "
-                "gradient accumulation fusion."
-            )
-        self.gradient_accumulation_fusion = config.gradient_accumulation_fusion
-
-        if self.async_tensor_model_parallel_allreduce and self.sequence_parallel:
-            raise RuntimeError(
-                "`async_tensor_model_parallel_allreduce` and `sequence_parallel` "
-                "cannot be enabled at the same time."
-            )
-
-        self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
-        self.explicit_expert_comm = self.is_expert and (
-            self.sequence_parallel or self.expert_parallel
-        )
-
-    def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None, inference_params=None):
-        """Forward of ColumnParallelLinear
-
-        Args:
-            input_: 3D tensor whose order of dimension is [sequence, batch, hidden]
-
-            weight (optional): weight tensor to use, compulsory when
-                skip_weight_param_allocation is True.
-
-        Returns:
-            - output
-            - bias
-
-        """
-        if weight is None:
-            if self.weight is None:
-                raise RuntimeError(
-                    "weight was not supplied to ColumnParallelLinear forward pass "
-                    "and skip_weight_param_allocation is True."
-                )
-            weight = self.weight
-        else:
-            # Check the weight passed in is the correct shape
-            expected_shape = (self.output_size_per_partition, self.input_size)
-            if weight.shape != expected_shape:
-                raise RuntimeError(
-                    f"supplied weight's shape is {tuple(weight.shape)}, "
-                    f"not {expected_shape} as expected"
-                )
-
-        bias = self.bias if not self.skip_bias_add else None
-
-        if (
-            self.async_tensor_model_parallel_allreduce
-            or self.sequence_parallel
-            or self.explicit_expert_comm
-        ):
-            input_parallel = input_
-        else:
-            input_parallel = copy_to_tensor_model_parallel_region(input_)
-
-        # Matrix multiply.
-        if not weight.requires_grad:
-            self._forward_impl = linear_with_frozen_weight
-        else:
-            self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
-        output_parallel = self._forward_impl(
-            input=input_parallel,
-            weight=weight,
-            bias=bias,
-            gradient_accumulation_fusion=self.gradient_accumulation_fusion,
-            async_grad_allreduce=False
-            if self.explicit_expert_comm
-            else self.async_tensor_model_parallel_allreduce,
-            sequence_parallel=False if self.explicit_expert_comm else self.sequence_parallel,
-            inference_params=inference_params,
-        )
-        if self.gather_output:
-            # All-gather across the partitions.
-            assert not self.sequence_parallel
-            output = gather_from_tensor_model_parallel_region(output_parallel)
-        else:
-            output = output_parallel
-        output_bias = self.bias if self.skip_bias_add else None
-        return output, output_bias
-
-
-class RowParallelLinear(torch.nn.Module):
-    """Linear layer with row parallelism.
-
-    The linear layer is defined as Y = XA + b. A is parallelized along
-    its first dimension and X along its second dimension as:
-               -   -
-              | A_1 |
-              | .   |
-          A = | .   |        X = [X_1, ..., X_p]
-              | .   |
-              | A_p |
-               -   -
-    Arguments:
-        input_size: first dimension of matrix A.
-        output_size: second dimension of matrix A.
-
-    Keyword Arguments:
-        bias: If true, add bias. Note that bias is not parallelized.
-        input_is_parallel: If true, we assume that the input is already
-                           split across the GPUs and we do not split
-                           again.
-        init_method: method to initialize weights. Note that bias is always set
-                     to zero.
-        stride: For the strided linear layers.
-        keep_master_weight_for_test: This was added for testing and should be
-                                     set to False. It returns the master weights
-                                     used for initialization.
-        skip_bias_add: If True, do not add the bias term, instead
-                       return it to be added by the caller. This
-                       enables performance optimations where bias can
-                       be fused with other elementwise operations.
-        is_expert: If True, the layer is treated as an MoE expert layer
-        tp_comm_buffer_name: Communication buffer name. Not used in
-                             non-Transformer-Engine modules.
-        config: ModelParallelConfig object
-
-    """
-
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int,
-        *,
-        config: ModelParallelConfig,
-        init_method: Callable,
-        bias: bool,
-        input_is_parallel: bool,
-        skip_bias_add: bool,
-        stride: int = 1,
-        keep_master_weight_for_test: bool = False,
-        is_expert: bool = False,
-        tp_comm_buffer_name: str = None,  # Not used
-    ):
-        torch.nn.Module.__init__(self)
-        super(RowParallelLinear, self).__init__()
-
-        # Keep input parameters
-        self.input_size = input_size
-        self.output_size = output_size
-        self.input_is_parallel = input_is_parallel
-        # Divide the weight matrix along the last dimension.
-        world_size = get_tensor_model_parallel_world_size()
-        self.input_size_per_partition = divide(input_size, world_size)
-        self.skip_bias_add = skip_bias_add
-        self.config = config
-        self.is_expert = is_expert
-        self.expert_parallel = config.expert_model_parallel_size > 1
-        self.gradient_accumulation_fusion = config.gradient_accumulation_fusion
-        self.sequence_parallel = config.sequence_parallel
-        if self.sequence_parallel and not self.input_is_parallel:
-            raise RuntimeError("To enable `sequence_parallel`, `input_is_parallel` must be `True`")
-
-        # Parameters.
-        # Note: torch.nn.functional.linear performs XA^T + b and as a result
-        # we allocate the transpose.
-        # Initialize weight.
-        if config.use_cpu_initialization:
-            self.weight = Parameter(
-                torch.empty(
-                    self.output_size, self.input_size_per_partition, dtype=config.params_dtype
-                )
-            )
-            if config.perform_initialization:
-                self.master_weight = _initialize_affine_weight_cpu(
-                    self.weight,
-                    self.output_size,
-                    self.input_size,
-                    self.input_size_per_partition,
-                    1,
-                    init_method,
-                    stride=stride,
-                    return_master_weight=keep_master_weight_for_test,
-                    params_dtype=config.params_dtype,
-                )
-        else:
-            self.weight = Parameter(
-                torch.empty(
-                    self.output_size,
-                    self.input_size_per_partition,
-                    device=torch.cuda.current_device(),
-                    dtype=config.params_dtype,
-                )
-            )
-            if config.perform_initialization:
-                _initialize_affine_weight_gpu(
-                    self.weight,
-                    init_method,
-                    partition_dim=1,
-                    stride=stride,
-                    expert_parallel=(self.is_expert and self.expert_parallel),
-                )
-        setattr(self.weight, 'allreduce', not (self.is_expert and self.expert_parallel))
-
-        if bias:
-            if config.use_cpu_initialization:
-                self.bias = Parameter(torch.empty(self.output_size, dtype=config.params_dtype))
-            else:
-                self.bias = Parameter(
-                    torch.empty(
-                        self.output_size,
-                        device=torch.cuda.current_device(),
-                        dtype=config.params_dtype,
-                    )
-                )
-
-            if config.perform_initialization:
-                # Always initialize bias to zero.
-                with torch.no_grad():
-                    self.bias.zero_()
-            setattr(self.bias, 'allreduce', not (self.is_expert and self.expert_parallel))
-            setattr(self.bias, 'sequence_parallel', self.sequence_parallel)
-        else:
-            self.register_parameter('bias', None)
-
-        self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
-        self.explicit_expert_comm = self.is_expert and (
-            self.sequence_parallel or self.expert_parallel
-        )
-
-    def forward(self, input_, inference_params=None):
-        """Forward of RowParallelLinear
-
-        Args:
-            input_: 3D tensor whose order of dimension is [sequence, batch, hidden]
-
-        Returns:
-            - output
-            - bias
-        """
-        # Set up backprop all-reduce.
-        if self.input_is_parallel:
-            input_parallel = input_
-        else:
-            assert not self.sequence_parallel
-            input_parallel = scatter_to_tensor_model_parallel_region(input_)
-        # Matrix multiply.
-        if not self.weight.requires_grad:
-            self._forward_impl = linear_with_frozen_weight
-        else:
-            self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
-        output_parallel = self._forward_impl(
-            input=input_parallel,
-            weight=self.weight,
-            bias=None,
-            gradient_accumulation_fusion=self.gradient_accumulation_fusion,
-            async_grad_allreduce=False,
-            sequence_parallel=False,
-            inference_params=inference_params,
-        )
-
-        # All-reduce across all the partitions.
-        if self.explicit_expert_comm:
-            assert self.skip_bias_add
-            output_ = output_parallel
-        elif self.sequence_parallel and not inference_params:
-            output_ = reduce_scatter_to_sequence_parallel_region(output_parallel)
-        else:
-            output_ = reduce_from_tensor_model_parallel_region(output_parallel)
-        if not self.skip_bias_add:
-            output = (output_ + self.bias) if self.bias is not None else output_
-            output_bias = None
-        else:
-            output = output_
-            output_bias = self.bias
-        return output, output_bias
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/mappings.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/mappings.py
deleted file mode 100644
index dea90dcb9de984e03573a5a96fd114ff265e7766..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/mappings.py
+++ /dev/null
@@ -1,359 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import torch
-
-from megatron_ds.core.parallel_state import (
-    get_tensor_and_expert_parallel_group,
-    get_tensor_model_parallel_group,
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
-
-from .utils import split_tensor_along_last_dim
-from deepspeed.accelerator import get_accelerator
-
-
-def _reduce(input_):
-    """All-reduce the input tensor across model parallel group."""
-
-    # Bypass the function if we are using only 1 GPU.
-    if get_tensor_model_parallel_world_size() == 1:
-        return input_
-
-    # All-reduce.
-    torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group())
-
-    return input_
-
-
-def _split_along_last_dim(input_):
-    """Split the tensor along its last dimension and keep the
-    corresponding slice."""
-
-    world_size = get_tensor_model_parallel_world_size()
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-
-    # Split along last dimension.
-    input_list = split_tensor_along_last_dim(input_, world_size)
-
-    # Note: torch.split does not create contiguous tensors by default.
-    rank = get_tensor_model_parallel_rank()
-    output = input_list[rank].contiguous()
-
-    return output
-
-
-def _split_along_first_dim(input_):
-    """Split the tensor along its first dimension and keep the
-    corresponding slice."""
-
-    world_size = get_tensor_model_parallel_world_size()
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-
-    # Split along first dimension.
-    dim_size = input_.size()[0]
-    assert (
-        dim_size % world_size == 0
-    ), "First dimension of the tensor should be divisible by tensor parallel size"
-    local_dim_size = dim_size // world_size
-    rank = get_tensor_model_parallel_rank()
-    dim_offset = rank * local_dim_size
-
-    output = input_[dim_offset : dim_offset + local_dim_size].contiguous()
-
-    return output
-
-
-def _gather_along_last_dim(input_):
-    """Gather tensors and concatinate along the last dimension."""
-
-    world_size = get_tensor_model_parallel_world_size()
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-
-    # Size and dimension.
-    last_dim = input_.dim() - 1
-    rank = get_tensor_model_parallel_rank()
-
-    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
-    tensor_list[rank] = input_
-    torch.distributed.all_gather(tensor_list, input_, group=get_tensor_model_parallel_group())
-
-    # Note: torch.cat already creates a contiguous tensor.
-    output = torch.cat(tensor_list, dim=last_dim).contiguous()
-
-    return output
-
-
-def _gather_along_first_dim(input_):
-    """Gather tensors and concatinate along the first dimension."""
-
-    world_size = get_tensor_model_parallel_world_size()
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-
-    dim_size = list(input_.size())
-    dim_size[0] = dim_size[0] * world_size
-
-    output = torch.empty(dim_size, dtype=input_.dtype,
-                         device=get_accelerator().current_device_name())
-    torch.distributed._all_gather_base(output, input_.contiguous(),
-                                       group=get_tensor_model_parallel_group())
-
-    return output
-
-
-def _reduce_scatter_along_first_dim(input_):
-    """Reduce-scatter the input tensor across model parallel group."""
-    world_size = get_tensor_model_parallel_world_size()
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-
-    dim_size = list(input_.size())
-    assert (
-        dim_size[0] % world_size == 0
-    ), "First dimension of the tensor should be divisible by tensor parallel size"
-
-    dim_size[0] = dim_size[0] // world_size
-
-    output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device())
-    torch.distributed._reduce_scatter_base(
-        output, input_.contiguous(), group=get_tensor_model_parallel_group()
-    )
-    return output
-
-
-def _gather_along_first_dim_moe(input_):
-    """Gather tensors and concatenate along the first dimension."""
-    group = get_tensor_and_expert_parallel_group()
-    world_size = torch.distributed.get_world_size(group=group)
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-
-    dim_size = list(input_.size())
-    dim_size[0] = dim_size[0] * world_size
-
-    output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device())
-    torch.distributed._all_gather_base(output, input_.contiguous(), group=group)
-
-    return output
-
-
-def _reduce_scatter_along_first_dim_moe(input_):
-    """Reduce-scatter the input tensor across model parallel group."""
-    group = get_tensor_and_expert_parallel_group()
-    world_size = torch.distributed.get_world_size(group=group)
-    # Bypass the function if we are using only 1 GPU.
-    if world_size == 1:
-        return input_
-
-    dim_size = list(input_.size())
-    assert dim_size[0] % world_size == 0
-    dim_size[0] = dim_size[0] // world_size
-
-    output = torch.empty(dim_size, dtype=input_.dtype, device=torch.cuda.current_device())
-    torch.distributed._reduce_scatter_base(output, input_.contiguous(), group=group)
-    return output
-
-
-class _CopyToModelParallelRegion(torch.autograd.Function):
-    """Pass the input to the model parallel region."""
-
-    @staticmethod
-    def symbolic(graph, input_):
-        return input_
-
-    @staticmethod
-    def forward(ctx, input_):
-        return input_
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return _reduce(grad_output)
-
-
-class _ReduceFromModelParallelRegion(torch.autograd.Function):
-    """All-reduce the input from the model parallel region."""
-
-    @staticmethod
-    def symbolic(graph, input_):
-        return _reduce(input_)
-
-    @staticmethod
-    def forward(ctx, input_):
-        return _reduce(input_)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return grad_output
-
-
-class _ScatterToModelParallelRegion(torch.autograd.Function):
-    """Split the input and keep only the corresponding chuck to the rank."""
-
-    @staticmethod
-    def symbolic(graph, input_):
-        return _split_along_last_dim(input_)
-
-    @staticmethod
-    def forward(ctx, input_):
-        return _split_along_last_dim(input_)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return _gather_along_last_dim(grad_output)
-
-
-class _GatherFromModelParallelRegion(torch.autograd.Function):
-    """Gather the input from model parallel region and concatinate."""
-
-    @staticmethod
-    def symbolic(graph, input_):
-        return _gather_along_last_dim(input_)
-
-    @staticmethod
-    def forward(ctx, input_):
-        return _gather_along_last_dim(input_)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return _split_along_last_dim(grad_output)
-
-
-class _ScatterToSequenceParallelRegion(torch.autograd.Function):
-    """Split the input and keep only the corresponding chuck to the rank."""
-
-    @staticmethod
-    def symbolic(graph, input_):
-        return _split_along_first_dim(input_)
-
-    @staticmethod
-    def forward(ctx, input_):
-        return _split_along_first_dim(input_)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return _gather_along_first_dim(grad_output)
-
-
-class _GatherFromSequenceParallelRegion(torch.autograd.Function):
-    """Gather the input from sequence parallel region and concatinate."""
-
-    @staticmethod
-    def symbolic(graph, input_, tensor_parallel_output_grad=True):
-        return _gather_along_first_dim(input_)
-
-    @staticmethod
-    def forward(ctx, input_, tensor_parallel_output_grad=True):
-        ctx.tensor_parallel_output_grad = tensor_parallel_output_grad
-        return _gather_along_first_dim(input_)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        tensor_parallel_output_grad = ctx.tensor_parallel_output_grad
-
-        # If the computation graph after the gather operation is
-        # in the tensor parallel mode, output gradients need to reduce
-        # scattered and whereas if the computation is duplicated,
-        # output gradients need to be scattered.
-        if tensor_parallel_output_grad:
-            return _reduce_scatter_along_first_dim(grad_output), None
-        else:
-            return _split_along_first_dim(grad_output), None
-
-
-class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function):
-    """Reduce scatter the input from the model parallel region."""
-
-    @staticmethod
-    def symbolic(graph, input_):
-        return _reduce_scatter_along_first_dim(input_)
-
-    @staticmethod
-    def forward(ctx, input_):
-        return _reduce_scatter_along_first_dim(input_)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return _gather_along_first_dim(grad_output)
-
-
-class _GatherFromSequenceParallelRegionToMOE(torch.autograd.Function):
-    """Gather the input from model parallel region and concatenate."""  # TODO
-
-    @staticmethod
-    def symbolic(graph, input_):
-        return _gather_along_first_dim_moe(input_)
-
-    @staticmethod
-    def forward(ctx, input_):
-        return _gather_along_first_dim_moe(input_,)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return _reduce_scatter_along_first_dim_moe(grad_output)
-
-
-class _ReduceScatterToSequenceParallelRegionFromMOE(torch.autograd.Function):
-    """Reduce scatter the input from the model parallel region."""
-
-    @staticmethod
-    def symbolic(graph, input_):
-        return _reduce_scatter_along_first_dim_moe(input_)
-
-    @staticmethod
-    def forward(ctx, input_):
-        return _reduce_scatter_along_first_dim_moe(input_,)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return _gather_along_first_dim_moe(grad_output)
-
-
-# -----------------
-# Helper functions.
-# -----------------
-
-
-def copy_to_tensor_model_parallel_region(input_):
-    return _CopyToModelParallelRegion.apply(input_)
-
-
-def reduce_from_tensor_model_parallel_region(input_):
-    return _ReduceFromModelParallelRegion.apply(input_)
-
-
-def scatter_to_tensor_model_parallel_region(input_):
-    return _ScatterToModelParallelRegion.apply(input_)
-
-
-def gather_from_tensor_model_parallel_region(input_):
-    return _GatherFromModelParallelRegion.apply(input_)
-
-
-def scatter_to_sequence_parallel_region(input_):
-    return _ScatterToSequenceParallelRegion.apply(input_)
-
-
-def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True):
-    return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad)
-
-
-def reduce_scatter_to_sequence_parallel_region(input_):
-    return _ReduceScatterToSequenceParallelRegion.apply(input_)
-
-
-def gather_from_sequence_parallel_region_to_moe(input_):
-    return _GatherFromSequenceParallelRegionToMOE.apply(input_)
-
-
-def reduce_scatter_to_sequence_parallel_region_from_moe(input_):
-    return _ReduceScatterToSequenceParallelRegionFromMOE.apply(input_)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/random.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/random.py
deleted file mode 100644
index af8b7634162656b2039dc7a91bbf9ecc3f31749e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/random.py
+++ /dev/null
@@ -1,288 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-# Parts of the code here are adapted from PyTorch
-# repo: https://github.com/pytorch/pytorch
-
-import contextlib
-
-import torch
-from torch import _C
-from torch.cuda import _lazy_call
-from torch.cuda import device as device_ctx_manager
-from torch.utils.checkpoint import detach_variable
-
-from megatron_ds.core.parallel_state import (
-    get_data_parallel_rank,
-    get_expert_model_parallel_rank,
-    get_tensor_model_parallel_group,
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
-from megatron_ds.core.utils import safely_set_viewless_tensor_data
-
-import deepspeed
-from .utils import gather_split_1d_tensor, split_tensor_into_1d_equal_chunks
-
-# Default name for the model parallel rng tracker.
-_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
-_EXPERT_PARALLEL_RNG_TRACKER_NAME = 'expert-parallel-rng'
-_DATA_PARALLEL_RNG_TRACKER_NAME = 'data-parallel-rng'
-
-
-def _set_cuda_rng_state(new_state, device=-1):
-    """Sets the random number generator state of the current GPU.
-
-    Argumentss:
-        new_state (torch.ByteTensor): The desired state
-    This function is adapted from PyTorch repo (torch.cuda.set_rng_state)
-    with a single change: the input state is not cloned. Cloning caused
-    major performance issues for +4 GPU cases.
-    """
-    if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
-        # older PyTorch
-        def cb():
-            with device_ctx_manager(device):
-                _C._cuda_setRNGState(new_state)
-
-    else:
-        # newer PyTorch
-        if device == -1:
-            device = torch.device('cuda')
-        elif isinstance(device, str):
-            device = torch.device(device)
-        elif isinstance(device, int):
-            device = torch.device('cuda', device)
-
-        def cb():
-            idx = device.index
-            if idx is None:
-                idx = torch.cuda.current_device()
-            default_generator = torch.cuda.default_generators[idx]
-            default_generator.set_state(new_state)
-
-    _lazy_call(cb)
-
-
-def get_expert_parallel_rng_tracker_name():
-    global _EXPERT_PARALLEL_RNG_TRACKER_NAME
-    return _EXPERT_PARALLEL_RNG_TRACKER_NAME
-
-
-def get_data_parallel_rng_tracker_name():
-    global _DATA_PARALLEL_RNG_TRACKER_NAME
-    return _DATA_PARALLEL_RNG_TRACKER_NAME
-
-
-class CudaRNGStatesTracker:
-    """Tracker for the cuda RNG states.
-
-    Using the `add` method, a cuda rng state is initialized based on
-    the input `seed` and is assigned to `name`. Later, by forking the
-    rng state, we can perform operations and return to our starting
-    cuda state.
-    """
-
-    def __init__(self):
-        # Map from a string name to the cuda rng state.
-        self.states_ = {}
-        # Seeds are just for book keeping and ensure no seed is set twice.
-        self.seeds_ = set()
-
-    def reset(self):
-        """Set to the initial state (no tracker)."""
-        self.states_ = {}
-        self.seeds_ = set()
-
-    def get_states(self):
-        """Get rng states. Copy the dictionary so we have direct
-        pointers to the states, not just a pointer to the dictionary."""
-        states = {}
-        for name in self.states_:
-            states[name] = self.states_[name]
-        return states
-
-    def set_states(self, states):
-        """Set the rng states. For efficiency purposes, we do not check
-        the size of seed for compatibility."""
-        self.states_ = states
-
-    def add(self, name, seed):
-        """Track the rng state."""
-        # Check seed is not already used.
-        if seed in self.seeds_:
-            raise Exception('seed {} already exists'.format(seed))
-        self.seeds_.add(seed)
-        # Check that state is not already defined.
-        if name in self.states_:
-            raise Exception('cuda rng state {} already exists'.format(name))
-        # Get the current rng state.
-        orig_rng_state = torch.cuda.get_rng_state()
-        # Set the new state and store it.
-        torch.cuda.manual_seed(seed)
-        self.states_[name] = torch.cuda.get_rng_state()
-        # Reset rng state to what it was.
-        _set_cuda_rng_state(orig_rng_state)
-
-    @contextlib.contextmanager
-    def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
-        """Fork the cuda rng state, perform operations, and exit with
-        the original state."""
-        # Check if we have added the state
-        if name not in self.states_:
-            raise Exception('cuda rng state {} is not added'.format(name))
-        # Store current rng state.
-        orig_cuda_rng_state = torch.cuda.get_rng_state()
-        # Set rng state to the desired one
-        _set_cuda_rng_state(self.states_[name])
-        # Do the stuff we wanted to do.
-        try:
-            yield
-        finally:
-            # Update the current rng state for later use.
-            self.states_[name] = torch.cuda.get_rng_state()
-            # And set the state to the original state we started with.
-            _set_cuda_rng_state(orig_cuda_rng_state)
-
-
-# RNG tracker object.
-_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
-
-def get_cuda_rng_tracker():
-    """Get cuda rng tracker."""
-    if deepspeed.checkpointing.is_configured():
-        return deepspeed.checkpointing.get_cuda_rng_tracker()
-    
-    return _CUDA_RNG_STATE_TRACKER
-
-
-def model_parallel_cuda_manual_seed(seed):
-    """Initialize model parallel cuda seed.
-
-    This function should be called after the model parallel is
-    initialized. Also, no torch.cuda.manual_seed should be called
-    after this function. Basically, this is replacement for that
-    function.
-    Two set of RNG states are tracked:
-        default state: This is for data parallelism and is the same among a
-                       set of model parallel GPUs but different across
-                       different model paralle groups. This is used for
-                       example for dropout in the non-tensor-model-parallel regions.
-        tensor-model-parallel state: This state is different among a set of model
-                              parallel GPUs, but the same across data parallel
-                              groups. This is used for example for dropout in
-                              model parallel regions.
-    """
-    if deepspeed.checkpointing.is_configured():
-        return deepspeed.checkpointing.model_parallel_cuda_manual_seed(seed)
-    
-    # 2718 is just for fun and any POSITIVE value will work.
-    offset = seed + 2718
-    tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank()
-    # Data parallel gets the original seed.
-    data_parallel_seed = seed
-
-    _CUDA_RNG_STATE_TRACKER.reset()
-    # Set the default state.
-    torch.cuda.manual_seed(data_parallel_seed)
-    _CUDA_RNG_STATE_TRACKER.add(_DATA_PARALLEL_RNG_TRACKER_NAME, data_parallel_seed)
-
-    # and model parallel state.
-    _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, tensor_model_parallel_seed)
-
-    expert_parallel_seed = (
-        seed + 1024 + 100 * get_expert_model_parallel_rank() + get_tensor_model_parallel_rank()
-    )
-    _CUDA_RNG_STATE_TRACKER.add(_EXPERT_PARALLEL_RNG_TRACKER_NAME, expert_parallel_seed)
-
-
-def model_parallel_reconfigure_tp_seed(seed):
-    if deepspeed.checkpointing.is_configured():
-        return deepspeed.checkpointing.model_parallel_reconfigure_tp_seed(seed)
-
-    model_parallel_seed = seed + 2718 + get_tensor_model_parallel_rank()
-    with _CUDA_RNG_STATE_TRACKER.fork():
-        get_accelerator().manual_seed(model_parallel_seed)
-
-
-class CheckpointFunction(torch.autograd.Function):
-    """This function is adapted from torch.utils.checkpoint with
-       two main changes:
-           1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
-           2) the states in the model parallel tracker are also properly
-              tracked/set/reset.
-    """
-
-    @staticmethod
-    def forward(ctx, run_function, distribute_saved_activations, *args):
-        ctx.run_function = run_function
-        ctx.distribute_saved_activations = distribute_saved_activations
-
-        # Copy the rng states.
-        ctx.fwd_cpu_rng_state = torch.get_rng_state()
-        ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
-        ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
-
-        with torch.no_grad():
-            outputs = run_function(*args)
-
-        # Divide hidden states across model parallel group and only keep
-        # the chunk corresponding to the current rank.
-        if distribute_saved_activations:
-            ctx.input_0_shape = args[0].data.shape
-            safely_set_viewless_tensor_data(
-                args[0], split_tensor_into_1d_equal_chunks(args[0].data, new_buffer=True)
-            )
-
-        # Store everything.
-        ctx.save_for_backward(*args)
-
-        return outputs
-
-    @staticmethod
-    def backward(ctx, *args):
-        if not torch.autograd._is_checkpoint_valid():
-            raise RuntimeError(
-                "Checkpointing is not compatible with .grad(), "
-                "please use .backward() if possible"
-            )
-        inputs = ctx.saved_tensors
-        if ctx.distribute_saved_activations:
-            safely_set_viewless_tensor_data(
-                inputs[0], gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape)
-            )
-
-        # Store the current states.
-        bwd_cpu_rng_state = torch.get_rng_state()
-        bwd_cuda_rng_state = torch.cuda.get_rng_state()
-        bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
-
-        # Set the states to what it used to be before the forward pass.
-        torch.set_rng_state(ctx.fwd_cpu_rng_state)
-        _set_cuda_rng_state(ctx.fwd_cuda_rng_state)
-        get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)
-
-        # Compute the forward pass.
-        detached_inputs = detach_variable(inputs)
-        with torch.enable_grad():
-            outputs = ctx.run_function(*detached_inputs)
-
-        # Set the states back to what it was at the start of this function.
-        torch.set_rng_state(bwd_cpu_rng_state)
-        _set_cuda_rng_state(bwd_cuda_rng_state)
-        get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
-
-        if isinstance(outputs, torch.Tensor):
-            outputs = (outputs,)
-        torch.autograd.backward(outputs, args)
-        grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp for inp in detached_inputs)
-        return (None, None) + grads
-
-
-def checkpoint(function, distribute_saved_activations, *args):
-    """Checkpoint a model or part of the model.
-    This has been directly copied from torch.utils.checkpoint."""
-    if deepspeed.checkpointing.is_configured():
-        return deepspeed.checkpointing.checkpoint(function, *args)
-    
-    return CheckpointFunction.apply(function,
-                                    distribute_saved_activations, *args)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/utils.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/utils.py
deleted file mode 100644
index 8521bb560e94ea990582ec820eb97ba389acde04..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/utils.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-from typing import List, Sequence
-from packaging import version
-
-import torch
-
-from megatron_ds.core import parallel_state
-from megatron_ds.core.utils import divide
-
-
-def split_tensor_along_last_dim(
-    tensor: torch.Tensor, num_partitions: int, contiguous_split_chunks: bool = False,
-) -> List[torch.Tensor]:
-    """ Split a tensor along its last dimension.
-
-        Arguments:
-            tensor: input tensor.
-            num_partitions: number of partitions to split the tensor
-            contiguous_split_chunks: If True, make each chunk contiguous
-                                     in memory.
-
-        Returns:
-            A list of Tensors
-    """
-    # Get the size and dimension.
-    last_dim = tensor.dim() - 1
-    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
-    # Split.
-    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
-    # Note: torch.split does not create contiguous tensors by default.
-    if contiguous_split_chunks:
-        return tuple(chunk.contiguous() for chunk in tensor_list)
-
-    return tensor_list
-
-
-def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
-    """ Break a tensor into equal 1D chunks across tensor parallel ranks.
-
-        Returns a Tensor or View with this rank's portion of the data.
-
-        Arguments:
-            tensor: The tensor to split
-
-        Keyword Arguments:
-            new_buffer (bool): If True, returns a new Tensor.
-                               If False, returns a view into the existing Tensor.
-                               Default is False
-
-    """
-    partition_size = torch.numel(tensor) // parallel_state.get_tensor_model_parallel_world_size()
-    start_index = partition_size * parallel_state.get_tensor_model_parallel_rank()
-    end_index = start_index + partition_size
-    if new_buffer:
-        data = torch.empty(
-            partition_size,
-            dtype=tensor.dtype,
-            device=torch.cuda.current_device(),
-            requires_grad=False,
-        )
-        data.copy_(tensor.view(-1)[start_index:end_index])
-    else:
-        data = tensor.view(-1)[start_index:end_index]
-    return data
-
-
-def gather_split_1d_tensor(tensor):
-    """ Opposite of split_tensor_into_1d_equal_chunks. Gather values from tensor
-        model parallel ranks.
-
-        Returns a new Tensor with the gathered data.
-
-        Arguments:
-            tensor: A Tensor or view of this rank's portion of the data.
-    """
-    numel_gathered = torch.numel(tensor) * parallel_state.get_tensor_model_parallel_world_size()
-    gathered = torch.empty(
-        numel_gathered, dtype=tensor.dtype, device=torch.cuda.current_device(), requires_grad=False
-    )
-    # TODO: This API is experimental in pytorch (as of Feb 2022) and
-    # this might break in future pytorch releases. We chose this API
-    # as opposed to torch.distributed.all_gather for efficiency reasons.
-    # This API calls directly NCCL all-gather versus the former does
-    # internal copies and can potentially cause slow down.
-    if version.parse(torch.__version__) >= version.parse('1.13'):
-        torch.distributed.all_gather_into_tensor(gathered, tensor,
-                                       group=parallel_state.get_tensor_model_parallel_group())
-    else:
-        torch.distributed._all_gather_base(gathered, tensor,
-                                       group=parallel_state.get_tensor_model_parallel_group())
-
-    return gathered
-
-
-class VocabUtility:
-    """ Split the vocabulary into `world_size` chunks and return the first
-        and last index of the vocabulary belonging to the `rank`
-        partition: Note that indices in [fist, last)
-
-    """
-
-    @staticmethod
-    def vocab_range_from_per_partition_vocab_size(
-        per_partition_vocab_size: int, rank, world_size: int
-    ) -> Sequence[int]:
-        index_f = rank * per_partition_vocab_size
-        index_l = index_f + per_partition_vocab_size
-        return index_f, index_l
-
-    @staticmethod
-    def vocab_range_from_global_vocab_size(
-        global_vocab_size: int, rank: int, world_size: int
-    ) -> Sequence[int]:
-        per_partition_vocab_size = divide(global_vocab_size, world_size)
-        return VocabUtility.vocab_range_from_per_partition_vocab_size(
-            per_partition_vocab_size, rank, world_size
-        )
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/__init__.py
deleted file mode 100644
index 7cc10776b7459542eb35ecb5e768dbef9bd54d05..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-from .module import MegatronModule
-from .spec_utils import ModuleSpec, build_module
-from .transformer_config import TransformerConfig
-from .transformer_layer import TransformerLayer, TransformerLayerSubmodules
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/attention.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/attention.py
deleted file mode 100644
index 7196ef400e78aaf7eaecf5faa3e9310eec088acb..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/attention.py
+++ /dev/null
@@ -1,443 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from typing import Union
-
-import torch
-
-from megatron_ds.core import parallel_state, tensor_parallel
-from megatron_ds.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb
-from megatron_ds.core.transformer.enums import AttnMaskType
-from megatron_ds.core.transformer.identity_op import IdentityFuncOp, IdentityOp
-from megatron_ds.core.transformer.module import MegatronModule
-from megatron_ds.core.transformer.spec_utils import ModuleSpec, build_module
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.utils import divide
-
-from .enums import AttnMaskType
-from .transformer_config import TransformerConfig
-from .utils import make_sharded_tensors_for_checkpoint
-
-
-@dataclass
-class SelfAttentionSubmodules:
-    linear_qkv: Union[ModuleSpec, type] = None
-    core_attention: Union[ModuleSpec, type] = None
-    linear_proj: Union[ModuleSpec, type] = None
-
-
-@dataclass
-class CrossAttentionSubmodules:
-    linear_q: Union[ModuleSpec, type] = None
-    linear_kv: Union[ModuleSpec, type] = None
-    core_attention: Union[ModuleSpec, type] = None
-    linear_proj: Union[ModuleSpec, type] = None
-
-
-class Attention(MegatronModule, ABC):
-    """Attention layer abstract class.
-
-    This layer only contains common modules required for the "self attn" and
-    "cross attn" specializations.
-    """
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        submodules: Union[SelfAttentionSubmodules, CrossAttentionSubmodules],
-        layer_number: int,
-        attn_mask_type: AttnMaskType,
-        attention_type: str,
-    ):
-        super().__init__(config=config)
-
-        self.config = config
-        self.layer_number = layer_number
-        self.attn_mask_type = attn_mask_type
-        self.attention_type = attention_type
-
-        # For normal attention without groups, num_query_groups == num_attention_heads,
-        # so these two will be the same
-        self.query_projection_size = self.config.kv_channels * self.config.num_attention_heads
-        self.kv_projection_size = self.config.kv_channels * self.config.num_query_groups
-
-        # Per attention head and per partition values.
-        world_size = parallel_state.get_tensor_model_parallel_world_size()
-        self.hidden_size_per_attention_head = divide(
-            self.query_projection_size, self.config.num_attention_heads
-        )
-        self.num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size)
-        self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size)
-
-        self.core_attention = build_module(
-            submodules.core_attention,
-            config=self.config,
-            layer_number=self.layer_number,
-            attn_mask_type=self.attn_mask_type,
-            attention_type=self.attention_type,
-        )
-
-        self.checkpoint_core_attention = self.config.recompute_granularity == 'selective'
-
-        # Output.
-        self.linear_proj = build_module(
-            submodules.linear_proj,
-            self.query_projection_size,
-            self.config.hidden_size,
-            config=self.config,
-            init_method=self.config.output_layer_init_method,
-            bias=self.config.add_bias_linear,
-            input_is_parallel=True,
-            skip_bias_add=True,
-            is_expert=False,
-            tp_comm_buffer_name='proj',
-        )
-
-    def _checkpointed_attention_forward(
-        self, query, key, value, attention_mask, rotary_pos_emb=None, attn_mask_type=None
-    ):
-        """Forward method with selective activation checkpointing."""
-
-        def custom_forward(*inputs):
-            query = inputs[0]
-            key = inputs[1]
-            value = inputs[2]
-            attention_mask = inputs[3]
-            attn_mask_type = inputs[5]
-            attn_mask_type = AttnMaskType(attn_mask_type.item())
-            output_ = self.core_attention(
-                query, key, value, attention_mask, attn_mask_type=attn_mask_type
-            )
-            return output_
-
-        if attn_mask_type is None:
-            attn_mask_type = self.attn_mask_type
-        attn_mask_type = torch.tensor([attn_mask_type.value], dtype=torch.int)
-        hidden_states = tensor_parallel.checkpoint(
-            custom_forward, False, query, key, value, attention_mask, rotary_pos_emb, attn_mask_type
-        )
-
-        return hidden_states
-
-    def _allocate_memory(self, inference_max_sequence_length, batch_size, dtype):
-        """Allocate memory to store kv cache during inference."""
-
-        return torch.empty(
-            inference_max_sequence_length,
-            batch_size,
-            self.num_query_groups_per_partition,
-            self.hidden_size_per_attention_head,
-            dtype=dtype,
-            device=torch.cuda.current_device(),
-        )
-
-    def _adjust_key_value_for_inference(self, inference_params, key, value, rotary_pos_emb):
-        """
-        Saves the generated key and value tensors to the end of the buffers in inference_params.
-        Returns the full size keys and values from the provided inference_params, as well as
-        adjusted rotary_pos_emb.
-
-        Returns a tuple: (key, value, rotary_pos_emb)
-
-        """
-        attn_mask_type = self.attn_mask_type
-        if inference_params is None:
-            return key, value, rotary_pos_emb, attn_mask_type
-
-        # =================================================
-        # Pre-allocate memory for key-values for inference.
-        # =================================================
-        is_first_step = False
-        if self.layer_number not in inference_params.key_value_memory_dict:
-            inf_max_seq_length = inference_params.max_sequence_length
-            inf_max_batch_size = inference_params.max_batch_size
-            inference_key_memory = self._allocate_memory(
-                inf_max_seq_length, inf_max_batch_size, key.dtype
-            )
-            inference_value_memory = self._allocate_memory(
-                inf_max_seq_length, inf_max_batch_size, value.dtype
-            )
-            inference_params.key_value_memory_dict[self.layer_number] = (
-                inference_key_memory,
-                inference_value_memory,
-            )
-            is_first_step = True
-        else:
-            # Get the pre-allocated buffers for this layer
-            inference_key_memory, inference_value_memory = inference_params.key_value_memory_dict[
-                self.layer_number
-            ]
-            attn_mask_type = AttnMaskType.no_mask
-
-        batch_start = inference_params.batch_size_offset
-        batch_end = batch_start + key.size(1)
-        assert batch_end <= inference_key_memory.size(1)
-        sequence_start = inference_params.sequence_len_offset
-        sequence_end = sequence_start + key.size(0)
-        assert sequence_end <= inference_key_memory.size(0)
-        # Copy key and values.
-        inference_key_memory[sequence_start:sequence_end, batch_start:batch_end, ...] = key
-        inference_value_memory[sequence_start:sequence_end, batch_start:batch_end, ...] = value
-        key = inference_key_memory[:sequence_end, batch_start:batch_end, ...]
-        value = inference_value_memory[:sequence_end, batch_start:batch_end, ...]
-
-        # adjust the key rotary positional embedding
-        if rotary_pos_emb is not None:
-            q_pos_emb, k_pos_emb = rotary_pos_emb
-            # need to cross check this condition during inference
-            # if not set_inference_key_value_memory:
-            if not is_first_step:
-                # In inference, we compute one token at a time.
-                # Select the correct positional embedding
-                # (only the last token in the sequence)
-                q_pos_emb = q_pos_emb[sequence_end - 1 : sequence_end]
-            else:
-                # In the first forward pass of inference,
-                # we use the entire provided prefix.
-                # q_pos_emb here has the rope embeddings of the entire
-                # prefix + to-be-generated output so
-                # we slice to just the prefix.
-                q_pos_emb = q_pos_emb[:sequence_end, :, :, :]
-            k_pos_emb = k_pos_emb[:sequence_end, :, :, :]
-            rotary_pos_emb = (q_pos_emb, k_pos_emb)
-
-        return key, value, rotary_pos_emb, attn_mask_type
-
-    @abstractmethod
-    def get_query_key_value_tensors(self, hidden_states, key_value_states):
-        """
-        This method needs to be implemented based on whether the derived class
-        is "self-attn" or "cross-attn".
-        """
-
-    def forward(
-        self,
-        hidden_states,
-        attention_mask,
-        key_value_states=None,
-        inference_params=None,
-        rotary_pos_emb=None,
-    ):
-        # hidden_states: [sq, b, h]
-
-        # For self attention we just duplicate the rotary_pos_emb if it isn't already
-        if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple):
-            rotary_pos_emb = (rotary_pos_emb,) * 2
-
-        # =====================
-        # Query, Key, and Value
-        # =====================
-        # Get the query, key and value tensors based on the type of attention -
-        # self or cross attn.
-        query, key, value = self.get_query_key_value_tensors(hidden_states, key_value_states)
-
-        # ===================================================
-        # Adjust key, value, and rotary_pos_emb for inference
-        # ===================================================
-        key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference(
-            inference_params, key, value, rotary_pos_emb
-        )
-
-        # ================================================
-        # relative positional embedding (rotary embedding)
-        # ================================================
-        if rotary_pos_emb is not None:
-            q_pos_emb, k_pos_emb = rotary_pos_emb
-            query = apply_rotary_pos_emb(query, q_pos_emb)
-            key = apply_rotary_pos_emb(key, k_pos_emb)
-            # TODO, can apply positional embedding to value_layer so it has
-            # absolute positional embedding.
-            # otherwise, only relative positional embedding takes effect
-            # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb)
-
-        # ==================================
-        # core attention computation
-        # ==================================
-
-        if self.checkpoint_core_attention:
-            core_attn_out = self._checkpointed_attention_forward(
-                query, key, value, attention_mask, attn_mask_type=attn_mask_type
-            )
-        else:
-            core_attn_out = self.core_attention(
-                query, key, value, attention_mask, attn_mask_type=attn_mask_type
-            )
-
-        # =================
-        # Output. [sq, b, h]
-        # =================
-
-        output, bias = self.linear_proj(core_attn_out)
-
-        return output, bias
-
-
-class SelfAttention(Attention):
-    """Self-attention layer class
-
-    Self-attention layer takes input with size [s, b, h]
-    and returns output of the same size.
-    """
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        submodules: SelfAttentionSubmodules,
-        layer_number: int,
-        attn_mask_type=AttnMaskType.padding,
-    ):
-        super().__init__(
-            config=config,
-            submodules=submodules,
-            layer_number=layer_number,
-            attn_mask_type=attn_mask_type,
-            attention_type="self",
-        )
-
-        self.linear_qkv = build_module(
-            submodules.linear_qkv,
-            self.config.hidden_size,
-            self.query_projection_size + 2 * self.kv_projection_size,
-            config=self.config,
-            init_method=self.config.init_method,
-            gather_output=False,
-            bias=self.config.add_bias_linear,
-            skip_bias_add=False,
-            is_expert=False,
-            tp_comm_buffer_name='qkv',
-        )
-
-    def get_query_key_value_tensors(self, hidden_states, key_value_states=None):
-        """
-        Derives `query`, `key` and `value` tensors from `hidden_states`.
-        """
-        # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)]
-        mixed_qkv, _ = self.linear_qkv(hidden_states)
-
-        # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn]
-        new_tensor_shape = mixed_qkv.size()[:-1] + (
-            self.num_query_groups_per_partition,
-            (
-                (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2)
-                * self.hidden_size_per_attention_head
-            ),
-        )
-        mixed_qkv = mixed_qkv.view(*new_tensor_shape)
-
-        # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn]
-        (query, key, value) = torch.split(
-            mixed_qkv,
-            [
-                (
-                    self.num_attention_heads_per_partition
-                    // self.num_query_groups_per_partition
-                    * self.hidden_size_per_attention_head
-                ),
-                self.hidden_size_per_attention_head,
-                self.hidden_size_per_attention_head,
-            ],
-            dim=3,
-        )
-        # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn]
-        query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head)
-
-        return query, key, value
-
-    def sharded_state_dict(self, prefix='', sharded_key_prefix=None, sharded_offsets=()):
-        sharded_key_prefix = prefix if sharded_key_prefix is None else sharded_key_prefix
-        sharded_state_dict = {}
-        for name, module in (
-            ('linear_qkv', self.linear_qkv),
-            ('linear_proj', self.linear_proj),
-        ):
-            sub_sd = module.sharded_state_dict(
-                prefix=f'{prefix}{name}.',
-                sharded_key_prefix=f'{sharded_key_prefix}{name}.',
-                sharded_offsets=sharded_offsets,
-            )
-            sharded_state_dict.update(sub_sd)
-        return sharded_state_dict
-
-
-class CrossAttention(Attention):
-    """Cross-attention layer class
-
-    Cross-attention layer takes input with size [s, b, h] and context with size
-    [s, b, h] and returns output of the same size.
-    """
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        submodules: CrossAttentionSubmodules,
-        layer_number: int,
-        attn_mask_type=AttnMaskType.padding,
-    ):
-        super().__init__(
-            config=config,
-            submodules=submodules,
-            layer_number=layer_number,
-            attn_mask_type=attn_mask_type,
-            attention_type="cross",
-        )
-
-        if self.config.num_query_groups != self.config.num_attention_heads:
-            raise ValueError(
-                f"Group query attention is not currently supported in cross attention."
-            )
-        assert self.query_projection_size == self.kv_projection_size
-
-        self.linear_q = build_module(
-            submodules.linear_q,
-            self.config.hidden_size,
-            self.query_projection_size,
-            config=self.config,
-            init_method=self.config.init_method,
-            gather_output=False,
-            bias=self.config.add_bias_linear,
-            skip_bias_add=False,
-            is_expert=False,
-        )
-
-        self.linear_kv = build_module(
-            submodules.linear_kv,
-            self.config.hidden_size,
-            2 * self.kv_projection_size,
-            config=self.config,
-            init_method=self.config.init_method,
-            gather_output=False,
-            bias=self.config.add_bias_linear,
-            skip_bias_add=False,
-            is_expert=False,
-        )
-
-    def get_query_key_value_tensors(self, hidden_states, key_value_states):
-        """
-        Derives `query` tensor from `hidden_states`, and `key`/`value` tensors
-        from `key_value_states`.
-        """
-        # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
-        mixed_kv, _ = self.linear_kv(key_value_states)
-
-        # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn]
-        new_tensor_shape = mixed_kv.size()[:-1] + (
-            self.num_attention_heads_per_partition,
-            2 * self.hidden_size_per_attention_head,
-        )
-        mixed_kv = mixed_kv.view(*new_tensor_shape)
-
-        # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn]
-        (key, value) = tensor_parallel.split_tensor_along_last_dim(mixed_kv, 2)
-
-        # Attention head [sq, b, h] --> [sq, b, hp]
-        query, _ = self.linear_q(hidden_states)
-
-        # [sq, b, hp] --> [sq, b, np, hn]
-        new_tensor_shape = query.size()[:-1] + (
-            self.num_attention_heads_per_partition,
-            self.hidden_size_per_attention_head,
-        )
-        query = query.view(*new_tensor_shape)
-
-        return query, key, value
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/custom_layers/transformer_engine.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/custom_layers/transformer_engine.py
deleted file mode 100644
index 638c42036dbc23bac3a81bf25b1c520365cd2bad..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/custom_layers/transformer_engine.py
+++ /dev/null
@@ -1,431 +0,0 @@
-import os
-from importlib.metadata import version
-from typing import Callable
-
-import torch
-import transformer_engine as te
-from pkg_resources import packaging
-from torch import Tensor
-
-from megatron_ds.core import ModelParallelConfig
-from megatron_ds.core.parallel_state import (
-    get_context_parallel_global_ranks,
-    get_context_parallel_group,
-    get_tensor_model_parallel_group,
-)
-from megatron_ds.core.tensor_parallel import get_cuda_rng_tracker
-from megatron_ds.core.transformer.enums import AttnMaskType
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.transformer.utils import make_sharded_tensors_for_checkpoint
-
-
-def _get_extra_te_kwargs(config: TransformerConfig):
-    extra_transformer_engine_kwargs = {
-        "params_dtype": config.params_dtype,
-    }
-
-    te_version = packaging.version.Version(version("transformer-engine"))
-    if te_version >= packaging.version.Version("0.12.0"):
-        if config.use_cpu_initialization:
-            extra_transformer_engine_kwargs["device"] = 'cpu'
-        else:
-            extra_transformer_engine_kwargs["device"] = torch.cuda.current_device()
-    return extra_transformer_engine_kwargs
-
-
-class TENorm:
-    """
-    A conditional wrapper to initialize an instance of Transformer-Engine's
-    `LayerNorm` or `RMSNorm` based on input
-    """
-
-    # TODO should we ditch normalization config and just use spec to choose LayerNorm vs RMSNorm?
-    def __new__(
-        cls, config: TransformerConfig, hidden_size: int, eps: float = 1e-5,
-    ):
-        if config.normalization == "LayerNorm":
-            instance = te.pytorch.LayerNorm(
-                hidden_size=hidden_size,
-                eps=eps,
-                sequence_parallel=config.sequence_parallel,
-                zero_centered_gamma=config.layernorm_zero_centered_gamma,
-                **_get_extra_te_kwargs(config),
-            )
-        elif config.normalization == "RMSNorm":
-            assert hasattr(
-                te.pytorch, "RMSNorm"
-            ), "Transformer-Engine >= v0.11 required to use this feature"
-            instance = te.pytorch.RMSNorm(
-                hidden_size=hidden_size,
-                eps=eps,
-                sequence_parallel=config.sequence_parallel,
-                zero_centered_gamma=config.layernorm_zero_centered_gamma,
-                **_get_extra_te_kwargs(config),
-            )
-        else:
-            raise Exception('Only LayerNorm and RMSNorm are curently supported')
-
-        return instance
-
-
-class TELinear(te.pytorch.Linear):
-    """
-    Wrapper for the Transformer-Engine's `Linear` layer.
-
-    Note that if Megatron's parallel_state has not been initialized
-    yet, the tp_group passed to TE will be None and must be set later
-    via set_tensor_parallel_group().
-    """
-
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int,
-        *,
-        parallel_mode: str,
-        config: ModelParallelConfig,
-        init_method: Callable,
-        bias: bool,
-        skip_bias_add: bool,
-        skip_weight_param_allocation: bool,
-        tp_comm_buffer_name: str = None,
-    ):
-        self.config = config
-
-        # TE returns a zero length Tensor when bias=False and
-        # return_bias=True, but we prefer None.  So in that case we
-        # tell TE to not return the bias, and return None
-        # ourselves. This way our forward always returns two values
-        # and we don't have to deal with the zero length Tensor.
-        self.te_return_bias = skip_bias_add and bias
-
-        if skip_weight_param_allocation:
-            raise ValueError(
-                'Transformer Engine linear layers do not support skip_weight_param_allocation'
-            )
-
-        extra_kwargs = _get_extra_te_kwargs(config)
-
-        te_version = packaging.version.Version(version("transformer-engine"))
-        if te_version >= packaging.version.Version("0.8.0"):
-            if self.config.tp_comm_overlap:
-                extra_kwargs["ub_split_ag"] = self.config.tp_comm_split_ag
-                extra_kwargs["ub_split_rs"] = self.config.tp_comm_split_rs
-                if te_version > packaging.version.Version("1.0.0"):
-                    assert (
-                        tp_comm_buffer_name is not None
-                    ), "Buffer name should be set to configure communication overlap settings"
-                    extra_kwargs["ub_name"] = tp_comm_buffer_name
-
-        super().__init__(
-            in_features=input_size,
-            out_features=output_size,
-            sequence_parallel=self.config.sequence_parallel,
-            fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion,
-            tp_group=get_tensor_model_parallel_group(check_initialized=False),
-            tp_size=self.config.tensor_model_parallel_size,
-            get_rng_state_tracker=get_cuda_rng_tracker,
-            init_method=init_method,
-            bias=bias,
-            return_bias=self.te_return_bias,
-            parallel_mode=parallel_mode,
-            **extra_kwargs,
-        )
-
-    def forward(self, x):
-        out = super().forward(x)
-
-        # TE only returns a tuple when return_bias is True, otherwise
-        # it returns a single Tensor, we always want to return two
-        # values regardless of the arguments.
-        if self.te_return_bias:
-            return out
-        return out, None
-
-
-class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear):
-    """
-    Wrapper for the Transformer-Engine's `LayerNormLinear` layer that combines
-    layernorm and linear layers
-    """
-
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int,
-        *,
-        config: TransformerConfig,
-        init_method: Callable,
-        gather_output: bool,
-        bias: bool,
-        skip_bias_add: bool,
-        is_expert: bool,
-        skip_weight_param_allocation: bool = False,
-        tp_comm_buffer_name: str = None,
-    ):
-        self.config = config
-
-        if gather_output:
-            raise ValueError('Transformer Engine linear layers do not support gather_output = True')
-
-        if is_expert:
-            raise ValueError('Transformer Engine linear layers do not yet support MoE')
-
-        if skip_weight_param_allocation:
-            raise ValueError(
-                'Transformer Engine linear layers do not support skip_weight_param_allocation'
-            )
-
-        # TE returns a zero length Tensor when bias=False and
-        # return_bias=True, but we prefer None.  So in that case we
-        # tell TE to not return the bias, and return None
-        # ourselves. This way our forward always returns two values
-        # and we don't have to deal with the zero length Tensor.
-        self.te_return_bias = skip_bias_add and bias
-
-        extra_kwargs = _get_extra_te_kwargs(config)
-
-        # Only Transformer-Engine version >= 0.11.0 supports `RMSNorm`
-        te_version = packaging.version.Version(version("transformer-engine"))
-        if te_version >= packaging.version.Version("0.11.0"):
-            extra_kwargs["normalization"] = self.config.normalization
-        elif self.config.normalization != "LayerNorm":
-            raise ValueError(
-                f"Transformer Engine v{te_version} does not support {self.config.normalization}."
-            )
-
-        if te_version >= packaging.version.Version("0.8.0"):
-            if self.config.tp_comm_overlap:
-                extra_kwargs["ub_bulk_wgrad"] = self.config.tp_comm_bulk_wgrad
-                extra_kwargs["ub_bulk_dgrad"] = self.config.tp_comm_bulk_dgrad
-                extra_kwargs["ub_split_ag"] = self.config.tp_comm_split_ag
-                if te_version > packaging.version.Version("1.0.0"):
-                    assert (
-                        tp_comm_buffer_name is not None
-                    ), "Buffer name should be set to configure communication overlap settings"
-                    extra_kwargs["ub_name"] = tp_comm_buffer_name
-
-        super().__init__(
-            in_features=input_size,
-            out_features=output_size,
-            eps=self.config.layernorm_epsilon,
-            sequence_parallel=self.config.sequence_parallel,
-            fuse_wgrad_accumulation=self.config.gradient_accumulation_fusion,
-            tp_group=get_tensor_model_parallel_group(check_initialized=False),
-            tp_size=self.config.tensor_model_parallel_size,
-            get_rng_state_tracker=get_cuda_rng_tracker,
-            init_method=init_method,
-            bias=bias,
-            return_bias=self.te_return_bias,
-            parallel_mode="column",
-            return_layernorm_output=False,
-            zero_centered_gamma=self.config.layernorm_zero_centered_gamma,
-            **extra_kwargs,
-        )
-
-    def forward(self, x):
-        out = super().forward(x)
-
-        # TE only returns a tuple when return_bias is True, otherwise
-        # it returns a single Tensor, we always want to return two
-        # values regardless of the arguments.
-        if self.te_return_bias:
-            return out
-        return out, None
-
-    def sharded_state_dict(self, prefix='', sharded_key_prefix=None, sharded_offsets=()):
-        """ Sharding along axis 0, bias sharded """
-        state_dict = self.state_dict(prefix='', keep_vars=True)
-        return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, sharded_key_prefix, {'weight': 0, 'bias': 0}, sharded_offsets
-        )
-
-
-class TEColumnParallelLinear(TELinear):
-    """
-    Wrapper for the Transformer-Engine's `Linear` layer but specialized similar
-    to megatron's `ColumnParallelLinear` layer.
-    """
-
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int,
-        *,
-        config: ModelParallelConfig,
-        init_method: Callable,
-        gather_output: bool,
-        bias: bool,
-        skip_bias_add: bool,
-        is_expert: bool,
-        skip_weight_param_allocation: bool = False,
-        tp_comm_buffer_name: str = None,
-    ):
-        if gather_output:
-            raise ValueError('Transformer Engine linear layers do not support gather_output = True')
-
-        if is_expert:
-            raise ValueError('Transformer Engine linear layers do not yet support MoE')
-
-        super().__init__(
-            input_size=input_size,
-            output_size=output_size,
-            parallel_mode="column",
-            config=config,
-            init_method=init_method,
-            bias=bias,
-            skip_bias_add=skip_bias_add,
-            skip_weight_param_allocation=skip_weight_param_allocation,
-            tp_comm_buffer_name=tp_comm_buffer_name,
-        )
-
-    def sharded_state_dict(self, prefix='', sharded_key_prefix=None, sharded_offsets=()):
-        """ Sharding along axis 0, bias sharded """
-        state_dict = self.state_dict(prefix='', keep_vars=True)
-        return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, sharded_key_prefix, {'weight': 0, 'bias': 0}, sharded_offsets
-        )
-
-
-class TERowParallelLinear(TELinear):
-    """
-    Wrapper for the Transformer-Engine's `Linear` layer but specialized similar
-    to megatron's `RowParallelLinear` layer.
-    """
-
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int,
-        *,
-        config: ModelParallelConfig,
-        init_method: Callable,
-        bias: bool,
-        input_is_parallel: bool,
-        skip_bias_add: bool,
-        is_expert: bool,
-        tp_comm_buffer_name: str = None,
-    ):
-        if not input_is_parallel:
-            raise ValueError(
-                "Transformer Engine linear layers do not support input_is_parallel = False"
-            )
-
-        if is_expert:
-            raise ValueError('Transformer Engine linear layers do not yet support MoE')
-
-        super().__init__(
-            input_size=input_size,
-            output_size=output_size,
-            parallel_mode="row",
-            config=config,
-            init_method=init_method,
-            bias=bias,
-            skip_bias_add=skip_bias_add,
-            skip_weight_param_allocation=False,  # We don't currently use this for row parallel layers
-            tp_comm_buffer_name=tp_comm_buffer_name,
-        )
-
-    def sharded_state_dict(self, prefix='', sharded_key_prefix=None, sharded_offsets=()):
-        """ Sharding along axis 1, bias not sharded """
-        state_dict = self.state_dict(prefix='', keep_vars=True)
-        return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, sharded_key_prefix, {'weight': 1}, sharded_offsets
-        )
-
-
-class TEDotProductAttention(te.pytorch.DotProductAttention):
-    """
-    Wrapper for the Transformer-Engine's `DotProductAttention` layer that also
-    has "flash attention" enabled.
-
-    Note that if Megatron's parallel_state has not been initialized yet, the
-    tp_group and cp_group passed to TE will be None and must be set later
-    via set_tensor_parallel_group() and set_context_parallel_group().
-    """
-
-    cp_stream: torch.cuda.Stream = None
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        layer_number: int,
-        attn_mask_type: AttnMaskType,
-        attention_type: str,
-        attention_dropout: float = None,
-    ):
-        self.config = config
-        self.te_forward_mask_type = False
-
-        if self.config.apply_query_key_layer_scaling != bool(
-            int(os.getenv('NVTE_APPLY_QK_LAYER_SCALING', '0'))
-        ):
-            raise ValueError(
-                f"apply_query_key_layer_scaling is {self.config.apply_query_key_layer_scaling} "
-                f"but environment variable NVTE_APPLY_QK_LAYER_SCALING is "
-                f"{os.getenv('NVTE_APPLY_QK_LAYER_SCALING')}. Transformer Engine does not support "
-                f"setting query key layer scaling via argument, so these two must match."
-            )
-
-        extra_kwargs = {}
-        te_version = packaging.version.Version(version("transformer-engine"))
-        if te_version >= packaging.version.Version("0.11.0"):
-            extra_kwargs["num_gqa_groups"] = self.config.num_query_groups
-        elif self.config.num_query_groups != self.config.num_attention_heads:
-            raise ValueError(
-                f"Transformer Engine v{te_version} does not support Grouped Query Attention, "
-                f"use a newer version of Transformer Engine. "
-                f"(num_query_groups ({self.config.num_query_groups}) != "
-                f"num_attention_heads ({self.config.num_attention_heads}))"
-            )
-
-        if te_version >= packaging.version.Version("0.10.0"):
-            extra_kwargs["attention_type"] = attention_type
-            # older version don't need attention_type
-
-        if te_version > packaging.version.Version("0.12.0"):
-            self.te_forward_mask_type = True
-
-        # Only Transformer-Engine version >= 1.0.0 supports context parallelism
-        if te_version >= packaging.version.Version("1.0.0"):
-            if getattr(TEDotProductAttention, "cp_stream") is None:
-                TEDotProductAttention.cp_stream = torch.cuda.Stream()
-            extra_kwargs["cp_group"] = get_context_parallel_group(check_initialized=False)
-            extra_kwargs["cp_global_ranks"] = get_context_parallel_global_ranks(
-                check_initialized=False
-            )
-            extra_kwargs["cp_stream"] = TEDotProductAttention.cp_stream
-        else:
-            assert (
-                self.config.context_parallel_size == 1
-            ), "Only Transformer-Engine version >= 1.0.0 supports context parallelism!"
-
-        super().__init__(
-            num_attention_heads=self.config.num_attention_heads,
-            kv_channels=self.config.kv_channels,
-            attention_dropout=self.config.attention_dropout
-            if attention_dropout is None
-            else attention_dropout,
-            attn_mask_type=attn_mask_type.name,
-            sequence_parallel=self.config.sequence_parallel,
-            tp_size=self.config.tensor_model_parallel_size,
-            get_rng_state_tracker=get_cuda_rng_tracker,
-            tp_group=get_tensor_model_parallel_group(check_initialized=False),
-            layer_number=layer_number,
-            **extra_kwargs,
-        )
-
-    def forward(
-        self,
-        query: Tensor,
-        key: Tensor,
-        value: Tensor,
-        attention_mask: Tensor,
-        attn_mask_type: AttnMaskType,
-    ):
-        if self.te_forward_mask_type:
-            return super().forward(
-                query, key, value, attention_mask, attn_mask_type=attn_mask_type.name
-            )
-        else:
-            return super().forward(query, key, value, attention_mask)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/dot_product_attention.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/dot_product_attention.py
deleted file mode 100644
index 1fc60b2c56bfa4e581a6dd87310e3346e8647757..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/dot_product_attention.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-
-import math
-
-import torch
-from torch import Tensor
-
-from megatron_ds.core import parallel_state, tensor_parallel
-from megatron_ds.core.fusions.fused_softmax import FusedScaleMaskSoftmax
-from megatron_ds.core.transformer.enums import AttnMaskType
-from megatron_ds.core.transformer.module import MegatronModule
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.transformer.utils import attention_mask_func
-from megatron_ds.core.utils import divide
-
-
-class DotProductAttention(MegatronModule):
-    """
-    Region where selective activation recomputation is applied.
-    This region is memory intensive but less compute intensive which
-    makes activation checkpointing more efficient for LLMs (20B+).
-    See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
-
-    We use the following notation:
-     h: hidden size
-     n: number of attention heads
-     p: number of tensor model parallel partitions
-     b: batch size
-     s: sequence length
-    """
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        layer_number: int,
-        attn_mask_type: AttnMaskType,
-        attention_type: str,
-        attention_dropout: float = None,
-    ):
-        super().__init__(config=config)
-
-        self.config: TransformerConfig = config
-
-        assert (
-            self.config.context_parallel_size == 1
-        ), "Context parallelism is only supported by TEDotProductAttention!"
-
-        self.layer_number = max(1, layer_number)
-        self.attn_mask_type = attn_mask_type
-        self.attention_type = attention_type  # unused for now
-
-        projection_size = self.config.kv_channels * self.config.num_attention_heads
-
-        # Per attention head and per partition values.
-        world_size = parallel_state.get_tensor_model_parallel_world_size()
-        self.hidden_size_per_partition = divide(projection_size, world_size)
-        self.hidden_size_per_attention_head = divide(projection_size, config.num_attention_heads)
-        self.num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size)
-        self.num_query_groups_per_partition = divide(self.config.num_query_groups, world_size)
-
-        coeff = None
-        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
-        if self.config.apply_query_key_layer_scaling:
-            coeff = self.layer_number
-            self.norm_factor *= coeff
-
-        self.scale_mask_softmax = FusedScaleMaskSoftmax(
-            input_in_fp16=self.config.fp16,
-            input_in_bf16=self.config.bf16,
-            attn_mask_type=self.attn_mask_type,
-            scaled_masked_softmax_fusion=self.config.masked_softmax_fusion,
-            mask_func=attention_mask_func,
-            softmax_in_fp32=self.config.attention_softmax_in_fp32,
-            scale=coeff,
-        )
-
-        # Dropout. Note that for a single iteration, this layer will generate
-        # different outputs on different number of parallel partitions but
-        # on average it should not be partition dependent.
-        self.attention_dropout = torch.nn.Dropout(
-            self.config.attention_dropout if attention_dropout is None else attention_dropout
-        )
-
-    def forward(
-        self,
-        query: Tensor,
-        key: Tensor,
-        value: Tensor,
-        attention_mask: Tensor,
-        attn_mask_type: AttnMaskType = None,
-    ):
-
-        # ===================================
-        # Raw attention scores. [b, n/p, s, s]
-        # ===================================
-
-        # expand the key and value [sk, b, ng, hn] -> [sk, b, np, hn]
-        # This is a noop for normal attention where ng == np. When using group query attention this
-        # creates a view that has the keys and values virtually repeated along their dimension to
-        # match the number of queries.
-
-        # attn_mask_type is not used.
-        if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1:
-            key = key.repeat_interleave(
-                self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2
-            )
-            value = value.repeat_interleave(
-                self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2
-            )
-
-        # [b, np, sq, sk]
-        output_size = (
-            query.size(1),
-            query.size(2),
-            query.size(0),
-            key.size(0),
-        )
-
-        # [sq, b, np, hn] -> [sq, b * np, hn]
-        # This will be a simple view when doing normal attention, but in group query attention
-        # the key and value tensors are repeated to match the queries so you can't use simple strides
-        # to extract the queries.
-        query = query.reshape(output_size[2], output_size[0] * output_size[1], -1)
-        # [sk, b, np, hn] -> [sk, b * np, hn]
-        key = key.view(output_size[3], output_size[0] * output_size[1], -1)
-
-        # preallocting input tensor: [b * np, sq, sk]
-        matmul_input_buffer = parallel_state.get_global_memory_buffer().get_tensor(
-            (output_size[0] * output_size[1], output_size[2], output_size[3]), query.dtype, "mpu",
-        )
-
-        # Raw attention scores. [b * np, sq, sk]
-        matmul_result = torch.baddbmm(
-            matmul_input_buffer,
-            query.transpose(0, 1),  # [b * np, sq, hn]
-            key.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
-            beta=0.0,
-            alpha=(1.0 / self.norm_factor),
-        )
-
-        # change view to [b, np, sq, sk]
-        attention_scores = matmul_result.view(*output_size)
-
-        # ===========================
-        # Attention probs and dropout
-        # ===========================
-
-        # attention scores and attention mask [b, np, sq, sk]
-        attention_probs: Tensor = self.scale_mask_softmax(attention_scores, attention_mask)
-
-        # This is actually dropping out entire tokens to attend to, which might
-        # seem a bit unusual, but is taken from the original Transformer paper.
-
-        if not self.config.sequence_parallel:
-            with tensor_parallel.get_cuda_rng_tracker().fork():
-                attention_probs = self.attention_dropout(attention_probs)
-        else:
-            attention_probs = self.attention_dropout(attention_probs)
-
-        # =========================
-        # Context layer. [sq, b, hp]
-        # =========================
-
-        # value -> context layer.
-        # [sk, b, np, hn] --> [b, np, sq, hn]
-
-        # context layer shape: [b, np, sq, hn]
-        output_size = (
-            value.size(1),
-            value.size(2),
-            query.size(0),
-            value.size(3),
-        )
-
-        # change view [sk, b * np, hn]
-        value = value.view(value.size(0), output_size[0] * output_size[1], -1)
-
-        # change view [b * np, sq, sk]
-        attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
-
-        # matmul: [b * np, sq, hn]
-        context = torch.bmm(attention_probs, value.transpose(0, 1))
-
-        # change view [b, np, sq, hn]
-        context = context.view(*output_size)
-
-        # [b, np, sq, hn] --> [sq, b, np, hn]
-        context = context.permute(2, 0, 1, 3).contiguous()
-
-        # [sq, b, np, hn] --> [sq, b, hp]
-        new_context_shape = context.size()[:-2] + (self.hidden_size_per_partition,)
-        context = context.view(*new_context_shape)
-
-        return context
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/enums.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/enums.py
deleted file mode 100644
index ab72f3536854413443eb56455fe96171aef5a72e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/enums.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import enum
-
-
-# can we get rid of this?
-# it's being used in pipeline schedules
-class ModelType(enum.Enum):
-    encoder_or_decoder = 1
-    encoder_and_decoder = 2
-
-
-# class LayerType(enum.Enum):
-#     encoder = 1
-#     decoder = 2
-
-
-class AttnType(enum.Enum):
-    self_attn = 1
-    cross_attn = 2
-
-
-class AttnMaskType(enum.Enum):
-    padding = 1
-    causal = 2
-    no_mask = 3  # only used for TE
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/identity_op.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/identity_op.py
deleted file mode 100644
index 5d9388ffcc628bdd0f04dd5969b9e669153446a8..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/identity_op.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-import torch
-
-
-class IdentityOp(torch.nn.Module):
-    """
-    This is a placeholder for IdentityOp(x) -> x
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__()
-
-    def forward(self, x, *args, **kwargs):
-        return x
-
-
-class IdentityFuncOp(IdentityOp):
-    """
-    This is a placeholder for IdentityFuncOp(...)(x) -> IdentityOp(x) -> x.
-    Such a func is handy for ops like `bias_dropout_fusion` which themselves
-    return a function at runtime based on passed arguments
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__()
-
-    def forward(self, *args, **kwargs):
-        return super().forward
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/mlp.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/mlp.py
deleted file mode 100644
index f7c41b278cd7dbc35195f3c40ea94f1780778d57..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/mlp.py
+++ /dev/null
@@ -1,184 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-from dataclasses import dataclass
-from typing import Tuple, Union
-
-import torch
-import torch.nn.functional as F
-
-from megatron_ds.core import parallel_state
-from megatron_ds.core.dist_checkpointing import ShardedTensor
-from megatron_ds.core.dist_checkpointing.mapping import ShardedTensorFactory
-from megatron_ds.core.fusions.fused_bias_gelu import bias_gelu_impl
-from megatron_ds.core.transformer.module import MegatronModule
-from megatron_ds.core.transformer.spec_utils import ModuleSpec, build_module
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.transformer.utils import make_sharded_tensors_for_checkpoint
-
-
-@dataclass
-class MLPSubmodules:
-    linear_fc1: Union[ModuleSpec, type] = None
-    linear_fc2: Union[ModuleSpec, type] = None
-
-
-class MLP(MegatronModule):
-    """
-    MLP will take the input with h hidden state, project it to 4*h
-    hidden dimension, perform nonlinear transformation, and project the
-    state back into h hidden dimension.
-
-
-    Returns an output and a bias to be added to the output.
-    If config.add_bias_linear is False, the bias returned is None.
-
-    We use the following notation:
-     h: hidden size
-     p: number of tensor model parallel partitions
-     b: batch size
-     s: sequence length
-    """
-
-    def __init__(
-        self, config: TransformerConfig, submodules: MLPSubmodules, is_expert: bool = False
-    ):
-        super().__init__(config=config)
-
-        self.config: TransformerConfig = config
-
-        # If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf
-        ffn_hidden_size = self.config.ffn_hidden_size
-        if self.config.gated_linear_unit:
-            ffn_hidden_size *= 2
-
-        self.linear_fc1 = build_module(
-            submodules.linear_fc1,
-            self.config.hidden_size,
-            ffn_hidden_size,
-            config=self.config,
-            init_method=self.config.init_method,
-            gather_output=False,
-            bias=self.config.add_bias_linear,
-            skip_bias_add=True,
-            is_expert=is_expert,
-            tp_comm_buffer_name='fc1',
-        )
-
-        if self.config.gated_linear_unit:
-
-            def glu(x):
-                x = torch.chunk(x, 2, dim=-1)
-                return self.config.activation_func(x[0]) * x[1]
-
-            self.activation_func = glu
-        else:
-            self.activation_func = self.config.activation_func
-
-        self.linear_fc2 = build_module(
-            submodules.linear_fc2,
-            self.config.ffn_hidden_size,
-            self.config.hidden_size,
-            config=self.config,
-            init_method=self.config.output_layer_init_method,
-            bias=self.config.add_bias_linear,
-            input_is_parallel=True,
-            skip_bias_add=True,
-            is_expert=is_expert,
-            tp_comm_buffer_name='fc2',
-        )
-
-    def forward(self, hidden_states):
-
-        # [s, b, 4 * h/p]
-        intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states)
-
-        if self.config.bias_gelu_fusion:
-            assert self.config.add_bias_linear is True
-            assert self.activation_func == F.gelu
-            intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel)
-        else:
-            if bias_parallel is not None:
-                intermediate_parallel = intermediate_parallel + bias_parallel
-            intermediate_parallel = self.activation_func(intermediate_parallel)
-
-        # [s, b, h]
-        output, output_bias = self.linear_fc2(intermediate_parallel)
-
-        return output, output_bias
-
-    def sharded_state_dict(self, prefix='', sharded_key_prefix=None, sharded_offsets=()):
-        sharded_key_prefix = prefix if sharded_key_prefix is None else sharded_key_prefix
-        sharded_state_dict = {}
-        for name, module in self._modules.items():
-            if name == 'linear_fc1' and self.config.gated_linear_unit:
-                sub_sd = self._sharded_state_dict_for_glu(
-                    name, module, prefix, sharded_key_prefix, sharded_offsets
-                )
-            else:
-                sub_sd = module.sharded_state_dict(
-                    prefix=f'{prefix}{name}.',
-                    sharded_key_prefix=f'{sharded_key_prefix}{name}.',
-                    sharded_offsets=sharded_offsets,
-                )
-            sharded_state_dict.update(sub_sd)
-        return sharded_state_dict
-
-    def _sharded_state_dict_for_glu(
-        self,
-        module_name: str,
-        module: torch.nn.Module,
-        prefix: str,
-        sharded_key_prefix: str,
-        sharded_offsets: Tuple[Tuple[int, int, int]],
-    ):
-        assert module_name == 'linear_fc1', module_name
-        sharded_state_dict = module.sharded_state_dict(
-            prefix=f'{prefix}{module_name}.',
-            sharded_key_prefix=f'{sharded_key_prefix}{module_name}.',
-            sharded_offsets=sharded_offsets,
-        )
-        weight_key = f'{prefix}{module_name}.weight'
-        prev_sh_ten = sharded_state_dict[weight_key]
-
-        # We must split the tensor into 2 parts, each sharded separately.
-        # This requires a ShardedTensorFactory which `chunk`s during saving
-        # and `cat`s during loading
-        tp_rank = parallel_state.get_tensor_model_parallel_rank()
-        tp_size = parallel_state.get_tensor_model_parallel_world_size()
-
-        tp_shard_axis = 0
-        replica_id = prev_sh_ten.replica_id
-        prepend_axis_num = len(sharded_offsets)
-
-        def sh_ten_build_fn(key: str, t: torch.Tensor):
-            offset_w = (tp_shard_axis + prepend_axis_num, tp_rank, tp_size * 2)
-            offset_v = (tp_shard_axis + prepend_axis_num, tp_size + tp_rank, tp_size * 2)
-            with torch.no_grad():
-                tensor_w, tensor_v = torch.chunk(t, 2, dim=tp_shard_axis)
-            return [
-                ShardedTensor.from_rank_offsets(
-                    key,
-                    tensor_w,
-                    *sharded_offsets,
-                    offset_w,
-                    replica_id=replica_id,
-                    prepend_axis_num=1,
-                ),
-                ShardedTensor.from_rank_offsets(
-                    key,
-                    tensor_v,
-                    *sharded_offsets,
-                    offset_v,
-                    replica_id=replica_id,
-                    prepend_axis_num=1,
-                ),
-            ]
-
-        def sh_ten_merge_fn(sub_state_dict):
-            with torch.no_grad():
-                return torch.cat(sub_state_dict)
-
-        sharded_state_dict[weight_key] = ShardedTensorFactory(
-            prev_sh_ten.key, prev_sh_ten.data, sh_ten_build_fn, sh_ten_merge_fn
-        )
-        return sharded_state_dict
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/module.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/module.py
deleted file mode 100644
index f739f0fffe3e6f9624e60844f7836acb949d398f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/module.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-"""Megatron Module."""
-
-import torch
-from torch.autograd import Variable
-from torch.nn.parameter import Parameter
-
-from megatron_ds.core import parallel_state
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-
-_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
-_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
-_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor)
-
-
-def param_is_not_shared(param):
-    return not hasattr(param, 'shared') or not param.shared
-
-
-class MegatronModule(torch.nn.Module):
-    """Base Megatron module inhertied by all Models.
-
-    Megatron specific extensions of torch Module with support
-    for pipelining
-
-    Args:
-        config (TransformerConfig): Transformer config
-    """
-
-    # def __init__(self, config: TransformerConfig, share_word_embeddings=True):
-    def __init__(self, config: TransformerConfig):
-        super().__init__()
-        self.config = config
-
-    def state_dict_for_save_checkpoint(self, prefix: str = '', keep_vars: bool = False):
-        """Override state dict for saving checkpoints Use this function to override the
-        state dict for saving checkpoints.
-
-        Args:
-            prefix (str, optional): _description_. Defaults to ''.
-            keep_vars (bool, optional): _description_. Defaults to False.
-
-        Returns:
-            _type_: _description_
-        """
-
-        return self.state_dict(prefix=prefix, keep_vars=keep_vars)
-
-    def sharded_state_dict(self, prefix: str = ''):
-        """Override sharded state dict with Dist Checkpointing.
-
-        Override sharded_state_dict when using distributed checkpointing. keep_vars must always be set to True so that optimizer states can be sharded.
-
-        Args:
-            prefix (str, optional): _description_. Defaults to ''.
-
-        Returns:
-            _type_: _description_
-        """
-        return self.state_dict(prefix=prefix, keep_vars=True)
-
-
-def conversion_helper(val, conversion):
-    if not isinstance(val, (tuple, list)):
-        return conversion(val)
-    rtn = [conversion_helper(v, conversion) for v in val]
-    if isinstance(val, tuple):
-        rtn = tuple(rtn)
-    return rtn
-
-
-def fp32_to_float16(val, float16_convertor):
-    def half_conversion(val):
-        val_typecheck = val
-        if isinstance(val_typecheck, (Parameter, Variable)):
-            val_typecheck = val.data
-        if isinstance(val_typecheck, _FLOAT_TYPES):
-            val = float16_convertor(val)
-        return val
-
-    return conversion_helper(val, half_conversion)
-
-
-def float16_to_fp32(val):
-    def float_conversion(val):
-        val_typecheck = val
-        if isinstance(val_typecheck, (Parameter, Variable)):
-            val_typecheck = val.data
-        if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)):
-            val = val.float()
-        return val
-
-    return conversion_helper(val, float_conversion)
-
-
-class Float16Module(MegatronModule):
-    """Float 16 Module.
-
-    Attributes:
-        config (TransformerConfig): Transformer config
-        fp16 (bool) : Specifies if the model runs in fp16 mode
-        bf16 (bool) : Specifies if the model runs in bf16 mode
-
-    Args:
-        config (TransformerConfig): The transformer config used to initalize the model
-    """
-
-    def __init__(self, config: TransformerConfig, module: torch.nn.Module):
-        super(Float16Module, self).__init__(config)
-        self.config = config
-        self.fp16 = config.fp16
-        self.bf16 = config.bf16
-
-        if self.fp16:
-            self.add_module('module', module.half())
-
-            def float16_convertor(val):
-                return val.half()
-
-        elif self.bf16:
-            self.add_module('module', module.bfloat16())
-
-            def float16_convertor(val):
-                return val.bfloat16()
-
-        else:
-            raise Exception('Either config.fp16 or config.bf16 should be True.')
-
-        self.float16_convertor = float16_convertor
-
-    def set_input_tensor(self, input_tensor):
-        return self.module.set_input_tensor(input_tensor)
-
-    def forward(self, *inputs, **kwargs):
-        if parallel_state.is_pipeline_first_stage():
-            inputs = fp32_to_float16(inputs, self.float16_convertor)
-        outputs = self.module(*inputs, **kwargs)
-        if parallel_state.is_pipeline_last_stage():
-            outputs = float16_to_fp32(outputs)
-        return outputs
-
-    def state_dict(self, destination=None, prefix='', keep_vars=False):
-        return self.module.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars)
-
-    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
-        """Retrieve state_dict from the module being wrapped."""
-        return self.module.state_dict_for_save_checkpoint(prefix=prefix, keep_vars=keep_vars)
-
-    def sharded_state_dict(self, prefix=''):
-        """Retrieve state_dict from the module being wrapped.
-
-        When using distributed checkpointing, keep_vars must always be set to True.
-        """
-        return self.module.sharded_state_dict(prefix=prefix)
-
-    def load_state_dict(self, state_dict, strict=True):
-        self.module.load_state_dict(state_dict, strict=strict)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/spec_utils.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/spec_utils.py
deleted file mode 100644
index 473933e45297903a76f539db0e1c5990ff2a946d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/spec_utils.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import types
-from dataclasses import dataclass, field
-from typing import Tuple, Union
-
-import torch
-
-
-@dataclass
-class ModuleSpec:
-    """This is a Module Specification dataclass.
-
-    Specification defines the location of the module (to import dynamically)
-    or the imported module itself. It also defines the params that need to be
-    passed to initialize the module.
-
-    Args:
-        module (Union[Tuple, type]): A tuple describing the location of the
-            module class e.g. `(module.location, ModuleClass)` or the imported
-            module class itself e.g. `ModuleClass` (which is already imported
-            using `from module.location import ModuleClass`).
-        params (dict): A dictionary of params that need to be passed while init.
-
-    """
-
-    module: Union[Tuple, type]
-    params: dict = field(default_factory=lambda: {})
-    submodules: type = None
-
-
-def import_module(module_path: Tuple[str]):
-    """Import a named object from a module in the context of this function.
-
-    TODO: make this importer module more robust, at least make sure there
-    are no side effects of using this as is
-    """
-    base_path, name = module_path
-    try:
-        module = __import__(base_path, globals(), locals(), [name])
-    except ImportError as e:
-        print(f"couldn't import module due to {e}")
-        return None
-    return vars(module)[name]
-
-
-def get_module(spec_or_module: Union[ModuleSpec, type], **additional_kwargs):
-    # If a module clas is already provided return it as is
-    if isinstance(spec_or_module, (type, types.FunctionType)):
-        return spec_or_module
-
-    # If the module is provided instead of module path, then return it as is
-    if isinstance(spec_or_module.module, (type, types.FunctionType)):
-        return spec_or_module.module
-
-    # Otherwise, return the dynamically imported module from the module path
-    return import_module(spec_or_module.module)
-
-
-def build_module(spec_or_module: Union[ModuleSpec, type], *args, **kwargs):
-    # If the passed `spec_or_module` is
-    # a `Function`, then return it as it is
-    # NOTE: to support an already initialized module add the following condition
-    # `or isinstance(spec_or_module, torch.nn.Module)` to the following if check
-    if isinstance(spec_or_module, types.FunctionType):
-        return spec_or_module
-
-    # If the passed `spec_or_module` is actually a spec (instance of
-    # `ModuleSpec`) and it specifies a `Function` using its `module`
-    # field, return the `Function` as it is
-    if isinstance(spec_or_module, ModuleSpec) and isinstance(
-        spec_or_module.module, types.FunctionType
-    ):
-        return spec_or_module.module
-
-    # Check if a module class is provided as a spec or if the module path
-    # itself is a class
-    if isinstance(spec_or_module, type):
-        module = spec_or_module
-    elif hasattr(spec_or_module, "module") and isinstance(spec_or_module.module, type):
-        module = spec_or_module.module
-    else:
-        # Otherwise, dynamically import the module from the module path
-        module = import_module(spec_or_module.module)
-
-    # If the imported module is actually a `Function` return it as it is
-    if isinstance(module, types.FunctionType):
-        return module
-
-    # Finally return the initialized module with params from the spec as well
-    # as those passed as **kwargs from the code
-
-    # Add the `submodules` argument to the module init call if it exists in the
-    # spec.
-    if hasattr(spec_or_module, "submodules") and spec_or_module.submodules is not None:
-        kwargs["submodules"] = spec_or_module.submodules
-
-    try:
-        return module(
-            *args, **spec_or_module.params if hasattr(spec_or_module, "params") else {}, **kwargs
-        )
-    except Exception as e:
-        # improve the error message since we hide the module name in the line above
-        import sys
-
-        tb = sys.exc_info()[2]
-        raise type(e)(f"{str(e)} when instantiating {module.__name__}").with_traceback(
-            sys.exc_info()[2]
-        )
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/switch_mlp.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/switch_mlp.py
deleted file mode 100644
index 4cbcba3145bb8254c628c1ccf9125c9f36416c82..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/switch_mlp.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import torch
-
-from megatron_ds.core import parallel_state, tensor_parallel
-from megatron_ds.core.parallel_state import (
-    get_tensor_and_expert_parallel_group,
-    get_tensor_model_parallel_group,
-)
-from megatron_ds.core.tensor_parallel import get_cuda_rng_tracker, get_data_parallel_rng_tracker_name
-from megatron_ds.core.transformer.module import MegatronModule
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-
-from .mlp import MLP, MLPSubmodules
-
-
-def sinkhorn(cost, tol=0.0001):
-    "Sinkhorn based MoE routing function"
-    cost = torch.exp(cost)
-    d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype)
-    d1 = torch.ones(cost.size(1), device=cost.device, dtype=cost.dtype)
-
-    eps = 0.00000001
-    error = 1e9
-    d1_old = d1
-    while error > tol:
-        d0 = (1 / d0.size(0)) * 1 / (torch.sum(d1 * cost, 1) + eps)
-        d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps)
-        error = torch.mean(torch.abs(d1_old - d1))
-        d1_old = d1
-    return d1 * cost * d0.unsqueeze(1)
-
-
-def get_router_linear_layer(config):
-    router = torch.nn.Linear(config.hidden_size, config.num_moe_experts, bias=False)
-    with get_cuda_rng_tracker().fork(get_data_parallel_rng_tracker_name()):
-        config.init_method(router.weight)
-    setattr(router.weight, 'sequence_parallel', config.sequence_parallel)
-    return router
-
-
-class SwitchMLP(MegatronModule):
-    """
-    Top-1 Mixture of Experts Layer. Routes input to one of N MLP "experts"
-    Curently supports Sinkhorn based expert routing.
-    """
-
-    def __init__(self, config: TransformerConfig, submodules: MLPSubmodules):
-        super().__init__(config=config)
-
-        self.config: TransformerConfig = config
-
-        self.router = get_router_linear_layer(self.config)
-        self.add_bias = config.add_bias_linear
-        self.sequence_parallel = config.sequence_parallel
-        self.route_algo = sinkhorn
-        self.router_activation = torch.sigmoid
-        self.expert_parallel_size = parallel_state.get_expert_model_parallel_world_size()
-
-        assert self.config.num_moe_experts % self.expert_parallel_size == 0
-        self.num_local_experts = self.config.num_moe_experts // self.expert_parallel_size
-        local_expert_indices_offset = (
-            parallel_state.get_expert_model_parallel_rank() * self.num_local_experts
-        )
-        self.local_expert_indices = [
-            local_expert_indices_offset + i for i in range(self.num_local_experts)
-        ]
-
-        self.local_experts = torch.nn.ModuleList()
-        for _ in range(self.num_local_experts):
-            expert = MLP(self.config, submodules, is_expert=True)
-            self.local_experts.append(expert)
-
-    def gather_indices(self, local_indices):
-        """ Gather tensors and concatenate along the first dimension."""
-        group = get_tensor_and_expert_parallel_group()
-        world_size = torch.distributed.get_world_size(group=group)
-        # Bypass the function if we are using only 1 GPU.
-        if world_size == 1:
-            return local_indices
-
-        dim_size = list(local_indices.size())
-        dim_size[0] = dim_size[0] * world_size
-
-        # TODO pre allocate memory
-        output = torch.empty(
-            dim_size, dtype=local_indices.dtype, device=torch.cuda.current_device()
-        )
-        torch.distributed._all_gather_base(output, local_indices.contiguous(), group=group)
-        return output
-
-    def forward(self, hidden_states):
-        hidden_shape = hidden_states.shape
-        route = self.router(hidden_states)
-        route = route.view(-1, self.config.num_moe_experts)
-
-        if self.training:
-            with torch.no_grad():
-                norm_route = self.route_algo(
-                    route.detach().to(dtype=torch.float32)
-                )  # explicit fp32 conversion for stability
-                _, max_ind = torch.max(norm_route, dim=1)
-            route = self.router_activation(route)
-            max_prob = route[torch.arange(route.size(0)), max_ind]
-        else:
-            route = self.router_activation(route)
-            max_prob, max_ind = torch.max(route, dim=1)
-
-        max_prob = torch.unsqueeze(max_prob, 1)
-        hidden_states = hidden_states.view(-1, hidden_shape[-1])
-
-        if self.sequence_parallel or (self.expert_parallel_size > 1):
-            global_hidden_states = tensor_parallel.gather_from_sequence_parallel_region_to_moe(
-                hidden_states
-            )
-            global_indices = self.gather_indices(max_ind)
-        else:
-            global_hidden_states = hidden_states
-            global_indices = max_ind
-
-        output_total = torch.zeros_like(global_hidden_states)
-        if self.add_bias:
-            output_bias_total = torch.zeros_like(global_hidden_states)
-
-        for expert_num, expert in enumerate(self.local_experts):
-            local_expert_index = self.local_expert_indices[expert_num]
-            local_indices = (global_indices == local_expert_index).nonzero()
-            hidden = global_hidden_states[local_indices, :]
-            output, output_bias = expert(hidden)
-
-            output_total[local_indices, :] = output
-            if self.add_bias:
-                output_bias = output_bias.expand_as(output)
-                output_bias_total[local_indices, :] = output_bias
-
-        if self.sequence_parallel or (self.expert_parallel_size > 1):
-            output_total = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe(
-                output_total
-            )
-            if self.add_bias:
-                output_bias_total = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe(
-                    output_bias_total
-                )
-                # bias is duplicated across tensor parallelism ranks;
-                # reduce scatter reduces bias across tensor parallel_ranks
-                output_bias_total = (
-                    output_bias_total / parallel_state.get_tensor_model_parallel_world_size()
-                )
-
-        output_total = output_total * max_prob
-        output_total = output_total.view(hidden_shape)
-        if self.add_bias:
-            output_bias_total = output_bias_total * max_prob
-            output_bias_total = output_bias_total.view(hidden_shape)
-        else:
-            output_bias_total = None
-
-        return output_total, output_bias_total
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/transformer_block.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/transformer_block.py
deleted file mode 100644
index 22f0aa34a09762dbed7286a7b983765a81dde78c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/transformer_block.py
+++ /dev/null
@@ -1,349 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import re
-from contextlib import nullcontext
-from dataclasses import dataclass
-from typing import List, Union
-
-import torch
-from torch import Tensor
-
-from megatron_ds.core import InferenceParams, parallel_state, tensor_parallel
-from megatron_ds.core.fusions.fused_layer_norm import FusedLayerNorm
-from megatron_ds.core.transformer.custom_layers.transformer_engine import TENorm
-from megatron_ds.core.transformer.enums import AttnMaskType
-from megatron_ds.core.transformer.module import MegatronModule
-from megatron_ds.core.transformer.spec_utils import ModuleSpec, build_module
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.transformer.transformer_layer import TransformerLayer
-from megatron_ds.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor
-
-
-def get_num_layers_to_build(config: TransformerConfig) -> int:
-
-    num_layers_per_pipeline_rank = (
-        config.num_layers // parallel_state.get_pipeline_model_parallel_world_size()
-    )
-
-    if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None:
-        # Interleaved pipeline parallelism:
-        # Number of layers in each model chunk is the number of layers in the stage,
-        # divided by the number of model chunks in a stage.
-        # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of
-        # layers to stages like (each list is a model chunk):
-        # Stage 0: [0]  [2]  [4]  [6]
-        # Stage 1: [1]  [3]  [5]  [7]
-        # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of
-        # layers to stages like (each list is a model chunk):
-        # Stage 0: [0, 1]  [4, 5]
-        # Stage 1: [2, 3]  [6, 7]
-
-        vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size()
-
-        num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size
-
-        num_layers_to_build = num_layers_per_virtual_rank
-
-    else:
-        # Non-interleaved pipeline parallelism:
-        # Each stage gets a contiguous set of layers.
-
-        num_layers_to_build = num_layers_per_pipeline_rank
-
-    return num_layers_to_build
-
-
-@dataclass
-class TransformerBlockSubmodules:
-    layer_specs: List[ModuleSpec] = None
-
-
-def _get_block_submodules(
-    config: TransformerConfig, spec: Union[TransformerBlockSubmodules, ModuleSpec],
-) -> TransformerBlockSubmodules:
-
-    # Transformer block submodules.
-    if isinstance(spec, TransformerBlockSubmodules):
-        return spec
-
-    # ModuleSpec here is generally assumed to be for a transformer layer.
-    elif isinstance(spec, ModuleSpec):
-        if issubclass(spec.module, TransformerBlock):
-            return spec.submodules
-        elif issubclass(spec.module, TransformerLayer):
-            num_layers = get_num_layers_to_build(config)
-            return TransformerBlockSubmodules(layer_specs=[spec] * num_layers)
-        else:
-            raise Exception(f"specialize for {spec.module.__name__}.")
-    else:
-        raise Exception(f"specialize for {type(spec).__name__}.")
-
-
-class TransformerBlock(MegatronModule):
-    """Transformer class."""
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        spec: Union[TransformerBlockSubmodules, ModuleSpec],
-        post_layer_norm: bool = True,
-        pre_process: bool = True,
-        post_process: bool = True,
-    ):
-        super().__init__(config=config)
-
-        self.submodules = _get_block_submodules(config, spec)
-        self.post_layer_norm = post_layer_norm
-        self.pre_process = pre_process
-        self.post_process = post_process
-
-        # required for pipeline parallel schedules
-        self.input_tensor = None
-
-        self.checkpoint_core_attention = self.config.recompute_granularity == 'selective'
-
-        self._build_layers()
-        self.num_layers_per_pipeline_rank = len(self.layers)
-
-    def _build_layers(self):
-        # Transformer layers.
-        # @jcasper can we improve how we deal with layer_number?
-        # currently it's only used in CoreAttention?
-        # if self.apply_query_key_layer_scaling:
-        #     coeff = self.layer_number
-        #     self.norm_factor *= coeff
-        def build_layer(layer_spec, layer_number):
-            return build_module(layer_spec, config=self.config, layer_number=layer_number,)
-
-        # offset is implicit in TransformerLayer
-        self.layers = torch.nn.ModuleList(
-            [
-                build_layer(layer_spec, i + 1)
-                for i, layer_spec in enumerate(self.submodules.layer_specs)
-            ]
-        )
-
-        # # TODO: add back standalone_embedding_stage
-        # if self.num_layers == 0:
-        #     # When a standalone embedding stage is used (e.g.,
-        #     # args.standalone_embedding_stage == True), virtual pipeline ranks
-        #     # on pipeline rank 0 will have zero transformer layers assigned to
-        #     # them. This results in the model's input and output tensors to be
-        #     # the same, which will cause failure for certain output tensor
-        #     # optimizations (e.g., pipeline output deallocation). To remedy
-        #     # this, we assign a 'no-op' layer on these ranks, which will
-        #     # disconnect the input tensor from the output tensor.
-        #     self.num_layers = 1
-        #     self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)])
-        # else:
-        #     self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)])
-
-        if self.post_process and self.post_layer_norm:
-            # Final layer norm before output.
-            self.final_layernorm = TENorm(
-                config=self.config,
-                hidden_size=self.config.hidden_size,
-                eps=self.config.layernorm_epsilon,
-            )
-
-    def _get_layer(self, layer_number: int):
-        return self.layers[layer_number]
-
-    def _checkpointed_forward(
-        self,
-        hidden_states: Tensor,
-        attention_mask: Tensor,
-        context: Tensor,
-        context_mask: Tensor,
-        rotary_pos_emb: Tensor,
-    ):
-        """Forward method with activation checkpointing."""
-
-        def custom(start: int, end: int):
-            def custom_forward(
-                hidden_states, attention_mask, context, context_mask, rotary_pos_emb,
-            ):
-                for index in range(start, end):
-                    layer = self._get_layer(index)
-                    hidden_states, context = layer(
-                        hidden_states=hidden_states,
-                        attention_mask=attention_mask,
-                        context=context,
-                        context_mask=context_mask,
-                        rotary_pos_emb=rotary_pos_emb,
-                        inference_params=None,
-                    )
-                return hidden_states, context
-
-            return custom_forward
-
-        if self.config.recompute_method == 'uniform':
-            # Uniformly divide the total number of Transformer layers and checkpoint
-            # the input activation of each divided chunk.
-            # A method to further reduce memory usage reducing checkpoints.
-            l = 0
-            while l < self.num_layers_per_pipeline_rank:
-                hidden_states, context = tensor_parallel.checkpoint(
-                    custom(l, l + self.config.recompute_num_layers),
-                    self.config.distribute_saved_activations,
-                    hidden_states,
-                    attention_mask,
-                    context,
-                    context_mask,
-                    rotary_pos_emb,
-                )
-
-                l += self.config.recompute_num_layers
-
-        elif self.config.recompute_method == 'block':
-            # Checkpoint the input activation of only a set number of individual
-            # Transformer layers and skip the rest.
-            # A method fully use the device memory removing redundant re-computation.
-            for l in range(self.num_layers_per_pipeline_rank):
-                if l < self.config.recompute_num_layers:
-                    hidden_states, context = tensor_parallel.checkpoint(
-                        custom(l, l + 1),
-                        self.config.distribute_saved_activations,
-                        hidden_states,
-                        attention_mask,
-                        context,
-                        context_mask,
-                        rotary_pos_emb,
-                    )
-                else:
-                    hidden_states, context = custom(l, l + 1)(
-                        hidden_states, attention_mask, context, context_mask, rotary_pos_emb,
-                    )
-        else:
-            raise ValueError("Invalid activation recompute method.")
-
-        return hidden_states
-
-    def set_input_tensor(self, input_tensor: Tensor):
-        """Set input tensor to be used instead of forward()'s input.
-
-        When doing pipeline parallelism the input from the previous
-        stage comes from communication, not from the input, so the
-        model's forward_step_func won't have it. This function is thus
-        used by internal code to bypass the input provided by the
-        forward_step_func"""
-        self.input_tensor = input_tensor
-
-    def forward(
-        self,
-        hidden_states: Tensor,
-        attention_mask: Tensor,
-        context: Tensor = None,
-        context_mask: Tensor = None,
-        rotary_pos_emb: Tensor = None,
-        inference_params: InferenceParams = None,
-    ):
-        # hidden_states (float): [s, b, h]
-        # attention_mask (bool): [1, 1, s, s]
-
-        if not self.pre_process:
-            # See set_input_tensor()
-            hidden_states = self.input_tensor
-
-        # Viewless tensor.
-        # - We only need to create a viewless tensor in the case of micro batch
-        #   size (mbs) == 1, since in this case, 'hidden_states.transpose()'
-        #   above creates a view tensor, and '.contiguous()' is a pass-through.
-        #   For mbs >= 2, '.contiguous()' creates a new tensor, eliminating
-        #   the need to make it viewless.
-        #
-        #   However, we don't explicitly check mbs == 1 here because
-        #   make_viewless_tensor() has negligible overhead when its input
-        #   is already viewless.
-        #
-        # - For the 'else' case above, calling make_viewless_tensor() here is
-        #   likely redundant, since p2p_communication.py (likely originator)
-        #   already creates viewless tensors. That said, make_viewless_tensor()
-        #   is called here to be future-proof and corner-case-proof.
-        hidden_states = make_viewless_tensor(
-            inp=hidden_states, requires_grad=True, keep_graph=True,
-        )
-
-        if self.config.sequence_parallel:
-            rng_context = tensor_parallel.get_cuda_rng_tracker().fork()
-        else:
-            rng_context = nullcontext()
-
-        if self.config.fp8:
-            import transformer_engine  # To keep out TE dependency when not training in fp8
-
-            if self.config.fp8 == "e4m3":
-                fp8_format = transformer_engine.common.recipe.Format.E4M3
-            elif self.config.fp8 == "hybrid":
-                fp8_format = transformer_engine.common.recipe.Format.HYBRID
-            else:
-                raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.")
-
-            fp8_recipe = transformer_engine.common.recipe.DelayedScaling(
-                margin=self.config.fp8_margin,
-                interval=self.config.fp8_interval,
-                fp8_format=fp8_format,
-                amax_compute_algo=self.config.fp8_amax_compute_algo,
-                amax_history_len=self.config.fp8_amax_history_len,
-                override_linear_precision=(False, False, not self.config.fp8_wgrad),
-            )
-            fp8_group = None
-            if parallel_state.model_parallel_is_initialized():
-                fp8_group = parallel_state.get_amax_reduction_group(with_context_parallel=True)
-            fp8_context = transformer_engine.pytorch.fp8_autocast(
-                enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group
-            )
-        else:
-            fp8_context = nullcontext()
-
-        with rng_context and fp8_context:
-            # Forward pass.
-            if self.config.recompute_granularity == 'full':
-                hidden_states = self._checkpointed_forward(
-                    hidden_states=hidden_states,
-                    attention_mask=attention_mask,
-                    context=context,
-                    context_mask=context_mask,
-                    rotary_pos_emb=rotary_pos_emb,
-                )
-            else:
-                for layer in self.layers:
-                    hidden_states, context = layer(
-                        hidden_states=hidden_states,
-                        attention_mask=attention_mask,
-                        context=context,
-                        context_mask=context_mask,
-                        rotary_pos_emb=rotary_pos_emb,
-                        inference_params=inference_params,
-                    )
-
-        # Final layer norm.
-        if self.post_process and self.post_layer_norm:
-            hidden_states = self.final_layernorm(hidden_states)
-
-        return hidden_states
-
-    def sharded_state_dict(self, prefix: str = ''):
-
-        sharded_state_dict = {}
-
-        layer_prefix = f'{prefix}layers.'
-        for layer in self.layers:
-            sharded_state_dict.update(layer.sharded_state_dict(prefix=layer_prefix))
-
-        if self.post_process and self.post_layer_norm:
-            state_dict = self.state_dict(keep_vars=True)
-
-            tensor = state_dict['final_layernorm.weight']
-            layer_name = f'{prefix}final_layernorm.weight'
-            sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(tensor, layer_name)
-
-            # RMSNorm doesn't have bias.
-            if 'final_layernorm.bias' in state_dict.keys():
-                tensor = state_dict['final_layernorm.bias']
-                layer_name = f'{prefix}final_layernorm.bias'
-                sharded_state_dict[layer_name] = make_sharded_tensor_for_checkpoint(
-                    tensor, layer_name
-                )
-
-        return sharded_state_dict
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/transformer_config.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/transformer_config.py
deleted file mode 100644
index dca83734411d1e8ac43a563b6255449cc7f5b630..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/transformer_config.py
+++ /dev/null
@@ -1,288 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import types
-from dataclasses import dataclass
-from typing import Callable
-
-import torch
-import torch.nn.functional as F
-
-from ..model_parallel_config import ModelParallelConfig
-from ..utils import init_method_normal, scaled_init_method_normal
-
-
-@dataclass
-class TransformerConfig(ModelParallelConfig):
-    """Configuration object for megatron-core transformers.
-
-        Attributes:
-
-        # model architecture
-        num_layers (int): Number of transformer layers in a transformer block.
-        hidden_size (int): Transformer hidden size.
-        ffn_hidden_size (int): Transformer Feed-Forward Network hidden size.
-                                This is set to 4*hidden_size if not provided. Defaults to None.')
-        num_attention_heads (int): Number of transformer attention heads.
-        kv_channels (int): Projection weights dimension in multi-head attention.
-                            This is set to hidden_size // num_attention_heads if not provided.
-                            Defaults to None.
-        num_query_groups (int): Number of query groups for group query attention. If None, normal attention is used.
-
-        hidden_dropout (float): Dropout probability for transformer hidden state. Defaults to 0.1.
-        attention_dropout (float): Post attention dropout probability. Defaults to 0.1.
-        fp32_residual_connection (bool): If true, move residual connections to fp32.
-        apply_residual_connection_post_layernorm (bool): If true, uses the original BERT residule connection ordering.
-                                                         Defaults to False.
-        layernorm_epsilon (float): Layernorm epsilon. Defaults to 1e-5.
-
-        layernorm_zero_centered_gamma (bool): if set to 'True', the LayerNorm is adjusted to center the gamma values
-                                              around 0. This improves numerical stability. Defaults to False.
-
-        add_bias_linear (bool): Include a bias term in all linear layers (QKV projections, after core attention, and two
-                                in MLP layer). Default is True.
-
-        gated_linear_unit (bool): Use a gated linear unit for the first linear layer in the MLP. Defaults to False.
-
-        activation_func (Callable): Activation function to use for the non-linearity in the MLP. Defaults to F.gelu.
-
-        num_moe_experts (int): Number of experts to use for Mixture of Experts. 
-                               When set, it replaces MLP with Switch MLP. Defaults to None (no MoE).
-
-        # initialization
-        init_method (Callable): Method to initialize weights. Note that bias is always set to
-                                zero. Should be a function that takes a single Tensor and
-                                initializes it. Defaults to
-                                megatron_ds.core.utils.init_method_normal(init_method_std) which is
-                                torch.nn.init.normal_ with mean=0.0 and std=init_method_Std.
-
-        output_layer_init_method (Callable): Method to initialize weights of the output layer of
-                                             both attention and MLP blocks. Defaults to
-                                             megatron_ds.core.utils.scaled_init_method_normal(init_method_std)
-                                             which is torch.nn.init.normal_ with mean=0.0 and
-                                             std=init_method_std / math.sqrt(2.0 * num_layers).
-
-        init_method_std (float): Standard deviation of the zero mean normal for the default
-                                 initialization method, not used if init_method and
-                                 output_layer_init_method are provided. Defaults to 0.02.
-
-        # mixed-precision
-        apply_query_key_layer_scaling (bool): If true, scale Q * K^T by 1 / layer-number. Defaults to True.
-        attention_softmax_in_fp32 (bool): If true, run attention masking and softmax in fp32.
-                                          This should be true if apply_query_key_layer_scaling is true.
-
-        # fusion
-        bias_gelu_fustion (bool): If true, fuses bias and gelu. Defaults to False.
-        masked_softmax_fusion (bool): If true, uses softmax fusion.
-        persist_layer_norm (bool): If true, uses the persistent fused layer norm kernel.
-                                   This kernel only supports a fixed set of hidden sizes.
-                                   Defaults to False.
-        bias_dropout_fusion (bool): If true, uses bias dropout fusion.
-
-        # activation recomputation
-
-        recompute_granularity (str): megatron-core supports 'selective' activation checkpointing where only the memory
-                                     intensive part of attention is checkpointed.  These memory intensive activations
-                                     are also less compute intensive which makes activation checkpointing more efficient
-                                     for LLMs (20B+).  See Reducing Activation Recomputation in Large Transformer
-                                     Models: https://arxiv.org/abs/2205.05198 for more details.  'full' will checkpoint
-                                     the entire transformer layer.  Must be 'selective' or 'full'. 'selective' always uses all layers.
-                                     Defaults to None.
-
-        recompute_method (str): uniform will uniformly divide the total number of transformer layers in a transformer
-                                block and recompute the input activation of each divided chunk at the specified
-                                granularity.  block will recompute the input activations for only a set number of
-                                transformer layers per pipeline stage.  The rest of the layers in the pipeline stage
-                                will not have any activations recomputed.  Must be 'uniform' or 'block'. Defaults to
-                                None.
-
-        recompute_num_layers (int): When recompute_method is uniform, recompute_num_layers is the number of transformer
-                                    layers in each uniformly divided recompute unit.  When recompute_method is block,
-                                    recompute_num_layers is the number of transformer layers to recompute within each
-                                    pipeline stage.  Must be None for 'selective' activation checkpointing. Defaults to None.
-
-        distribute_saved_activations (bool): If true, distribute recomputed activations across the model parallel
-                                             group. Defaults to None.
-
-        # fp8 related (via Transformer Engine). For detailed info, refer the the Transformer Engine docs at
-        # https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/api/common.html
-
-        fp8 (str): If set, enables the use of FP8 precision through Transformer Engine. There are 2 predefined choices: (1) 'e4m3'
-                   uniformly uses e4m3 for all FP8 tensors, (2) 'hybrid' uses e4m3 for all FP8 activation and weight tensors and
-                   e5m2 for all FP8 output activation gradient tensors. Defaults to None.
-
-        fp8_margin (int): Margin for the scaling factor computation.
-
-        fp8_interval (int): Controls how often the scaling factor is recomputed.
-
-        fp8_amax_history_len (int): The length of the amax history window used for scaling factor computation.
-
-        fp8_amax_compute_algo (str): Algorithm used for choosing the `amax` value for the scaling factor computation.
-                                     There are 2 predefined choices: `max` chooses the largest `amax` in the history
-                                     window, while `most_recent` always chooses the most recently seen value.
-
-        fp8_wgrad (bool): When set to False, override FP8 config options and do the wgrad computation in higher precision.
-                          Defaults to True.
-
-        # Miscellaneous
-        clone_scatter_output_in_embedding (bool): When set to true, clone the output of scatter_to_sequence_parallel_region
-                                                  in embedding layer to facilitate garbage collection of input.
-
-        # Experimental
-        normalization (str): Swtich b/w `LayerNorm` and `RMSNorm` as normalization layers. For now, these are primarily
-                             used by Transformer-Engine's layers like `LayerNormLinear`. Default value is `LayerNorm`.
-
-
-    """
-
-    # model architecture
-    num_layers: int = 0
-    hidden_size: int = 0
-    num_attention_heads: int = 0
-    num_query_groups: int = None
-
-    ffn_hidden_size: int = None
-    kv_channels: int = None
-    hidden_dropout: float = 0.1
-    attention_dropout: float = 0.1
-    fp32_residual_connection: bool = False
-    # @jcasper should we keep this option?
-    apply_residual_connection_post_layernorm: bool = False
-    layernorm_epsilon: float = 1e-5
-    layernorm_zero_centered_gamma: bool = False
-    add_bias_linear: bool = True
-    gated_linear_unit: bool = False
-    activation_func: Callable = F.gelu
-    num_moe_experts: int = None
-
-    # initialization
-    init_method: Callable = None
-    output_layer_init_method: Callable = None
-    init_method_std: float = 0.02
-
-    # mixed-precision
-    apply_query_key_layer_scaling: bool = False
-    attention_softmax_in_fp32: bool = True
-
-    # communication
-
-    # fusion
-    bias_gelu_fusion: bool = False  # TODO: this should be bias_activation_fusion ?
-    masked_softmax_fusion: bool = False
-    persist_layer_norm: bool = False
-    bias_dropout_fusion: bool = False  # TODO: this should be bias_dropout_add_fusion?
-
-    # activation recomputation
-    recompute_granularity: str = None
-    recompute_method: str = None
-    recompute_num_layers: int = None
-    distribute_saved_activations: bool = None
-    custom_recompute_layers_per_stage: list = None
-
-    # fp8 related
-    fp8: str = None
-    fp8_margin: int = 0
-    fp8_interval: int = 1
-    fp8_amax_history_len: int = 1
-    fp8_amax_compute_algo: str = "most_recent"
-    fp8_wgrad: bool = True
-
-    # miscellaneous
-    clone_scatter_output_in_embedding: bool = True
-
-    # experimental section (TODO: move to apt. section above once stable)
-    normalization: bool = "LayerNorm"  # alt value supported by TE: "RMSNorm"
-
-    def __post_init__(self):
-        """ Python dataclass method that is used to modify attributes after initialization.
-            See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details.
-        """
-        super().__post_init__()
-        if self.fp16 and self.bf16:
-            raise ValueError(
-                f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.'
-            )
-
-        if self.num_attention_heads % self.tensor_model_parallel_size != 0:
-            raise ValueError(
-                f"num_attention_heads ({self.num_attention_heads}) must be a multiple of "
-                f"tensor_model_parallel_size ({self.tensor_model_parallel_size})."
-            )
-
-        if self.ffn_hidden_size is None:
-            self.ffn_hidden_size = 4 * self.hidden_size
-
-        if self.kv_channels is None:
-            self.kv_channels = self.hidden_size // self.num_attention_heads
-
-        if self.num_query_groups is None:
-            self.num_query_groups = self.num_attention_heads
-
-        if self.num_query_groups % self.tensor_model_parallel_size != 0:
-            raise ValueError(
-                f"num_query_groups ({self.num_query_groups}) must be a multiple of "
-                f"tensor_model_parallel_size ({self.tensor_model_parallel_size})."
-            )
-
-        if self.apply_query_key_layer_scaling:
-            self.attention_softmax_in_fp32 = True
-
-        if self.expert_model_parallel_size > 1 and self.num_moe_experts is None:
-            raise ValueError(f'num_moe_experts must be non None to use expert-parallel.')
-
-        if self.recompute_granularity is not None:
-            if not self.recompute_granularity in ['full', 'selective']:
-                raise ValueError(
-                    f'When using recompute_granuarlity: {self.recompute_granularity} must be "full" or "selective".'
-                )
-
-            if self.recompute_method is not None:
-                if not self.recompute_method in ['block', 'uniform']:
-                    raise ValueError(
-                        f'recompute_method: {self.recompute_method} must be "block" or "uniform".'
-                    )
-            elif self.recompute_granularity != 'selective':
-                raise ValueError(
-                    f'Using recompute_granularity: {self.recompute_granularity} so recompute_method must be "block" or "uniform"'
-                )
-
-            if self.recompute_granularity != 'selective' and self.recompute_num_layers is None and self.custom_recompute_layers_per_stage is None:
-                raise ValueError(
-                    f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers or custom_recompute_layers_per_stage must be not None '
-                )
-            elif (
-                self.recompute_granularity == 'selective' and self.recompute_num_layers is not None
-            ):
-                raise ValueError(
-                    f'When using recompute_granularity: {self.recompute_granularity} recompute_num_layers must be None.'
-                )
-
-            if self.distribute_saved_activations and self.sequence_parallel:
-                raise ValueError(
-                    f'distribute_saved_activations: {self.distribute_saved_activations} must be false when sequence parallel is enabled: {self.sequence_parallel}'
-                )
-
-            if self.virtual_pipeline_model_parallel_size is not None:
-                if not self.num_layers % self.virtual_pipeline_model_parallel_size == 0:
-                    raise ValueError(
-                        f'num_layers: {self.num_layers} must be divisible by virtual_model_parallel_size {self.virtual_pipeline_model_parallel_size}'
-                    )
-
-        if self.apply_query_key_layer_scaling:
-            self.attention_softmax_in_fp32 = True
-
-        if self.bias_gelu_fusion:
-            if not self.add_bias_linear:
-                raise ValueError(
-                    "When bias_gelu_fusion is True, add_bias_linear must also be True."
-                )
-
-            if self.activation_func != F.gelu:
-                raise ValueError(f'When bias_gelu_fusion is True, activation_func must be F.gelu.')
-
-        if self.init_method is None:
-            self.init_method = init_method_normal(self.init_method_std)
-
-        if self.output_layer_init_method is None:
-            self.output_layer_init_method = scaled_init_method_normal(
-                self.init_method_std, self.num_layers
-            )
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/transformer_layer.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/transformer_layer.py
deleted file mode 100644
index 75cc5f1a3c7bfbcdc398eb99bbcd4ac13e58c3d8..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/transformer_layer.py
+++ /dev/null
@@ -1,245 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-from dataclasses import dataclass
-from typing import Union
-
-import torch
-
-from megatron_ds.core import parallel_state
-from megatron_ds.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor
-from megatron_ds.core.transformer.enums import AttnMaskType
-from megatron_ds.core.transformer.identity_op import IdentityFuncOp, IdentityOp
-from megatron_ds.core.transformer.module import MegatronModule
-from megatron_ds.core.transformer.spec_utils import ModuleSpec, build_module
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.utils import make_viewless_tensor
-
-
-@dataclass
-class TransformerLayerSubmodules:
-    input_layernorm: Union[ModuleSpec, type] = IdentityOp
-    self_attention: Union[ModuleSpec, type] = IdentityOp
-    self_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp
-
-    pre_cross_attn_layernorm: Union[ModuleSpec, type] = IdentityOp
-    cross_attention: Union[ModuleSpec, type] = IdentityOp
-    cross_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp
-
-    pre_mlp_layernorm: Union[ModuleSpec, type] = IdentityOp
-    mlp: Union[ModuleSpec, type] = IdentityOp
-    mlp_bda: Union[ModuleSpec, type] = IdentityFuncOp
-
-
-class TransformerLayer(MegatronModule):
-    """A single transformer layer.
-
-    Transformer layer takes input with size [s, b, h] and returns an
-    output of the same size.
-    """
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        submodules: TransformerLayerSubmodules,
-        layer_number: int = 1,
-        hidden_dropout: float = None,
-    ):
-        super().__init__(config=config)
-
-        self.layer_number = layer_number + self._get_layer_offset()
-        self.hidden_dropout = config.hidden_dropout if hidden_dropout is None else hidden_dropout
-
-        ## [Module 1: Input Layernorm] Optional Layernorm on the input data
-        # TODO: add pytorch only layernorm
-        self.input_layernorm = build_module(
-            submodules.input_layernorm,
-            config=self.config,
-            hidden_size=self.config.hidden_size,
-            eps=self.config.layernorm_epsilon,
-        )
-
-        ## [Module 2: SelfAttention]
-        self.self_attention = build_module(
-            submodules.self_attention, config=self.config, layer_number=layer_number,
-        )
-
-        ## [Module 3: BiasDropoutFusion]
-        self.self_attn_bda = build_module(submodules.self_attn_bda)
-
-        ## [Module 4: Post SelfAttention] Optional Layernorm after self-attn
-        self.pre_cross_attn_layernorm = build_module(
-            submodules.pre_cross_attn_layernorm,
-            config=self.config,
-            hidden_size=self.config.hidden_size,
-            eps=self.config.layernorm_epsilon,
-        )
-
-        ## [Module 5: CrossAttention]
-        self.cross_attention = build_module(
-            submodules.cross_attention, config=self.config, layer_number=layer_number,
-        )
-
-        ## [Module 6: BiasDropoutFusion]
-        self.cross_attn_bda = build_module(submodules.cross_attn_bda, config=self.config,)
-
-        ## [Module 7: Pre MLP] Optional Layernorm before MLP
-        self.pre_mlp_layernorm = build_module(
-            submodules.pre_mlp_layernorm,
-            config=self.config,
-            hidden_size=self.config.hidden_size,
-            eps=self.config.layernorm_epsilon,
-        )
-
-        ## [Module 8: MLP block]
-        # TODO how to set the gpt_layer_spec.py when we have moe_frequency > 1,
-        #      where MLP and SwitchMLP both appear alternately?
-        self.mlp = build_module(submodules.mlp, config=self.config)
-
-        ## [Module 9: BiasDropoutFusion]
-        self.mlp_bda = build_module(submodules.mlp_bda)
-
-        # @jcasper how should we handle nvfuser?
-        # Set bias+dropout+add fusion grad_enable execution handler.
-        # TORCH_MAJOR = int(torch.__version__.split('.')[0])
-        # TORCH_MINOR = int(torch.__version__.split('.')[1])
-        # use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)
-        # self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad
-        self.bias_dropout_add_exec_handler = torch.enable_grad
-
-    def _get_layer_offset(self):
-
-        pipeline_rank = parallel_state.get_pipeline_model_parallel_rank()
-
-        num_layers_per_pipeline_rank = (
-            self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size()
-        )
-
-        if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None:
-            vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank()
-            vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size()
-
-            total_num_layers = self.config.num_layers
-            num_layers_per_virtual_rank = num_layers_per_pipeline_rank // vp_size
-            total_virtual_chunks = total_num_layers // vp_size
-            offset = vp_rank * total_virtual_chunks + (pipeline_rank * num_layers_per_virtual_rank)
-
-        else:
-            # Each stage gets a contiguous set of layers.
-            if parallel_state.get_pipeline_model_parallel_world_size() > 1:
-                offset = pipeline_rank * num_layers_per_pipeline_rank
-            else:
-                offset = 0
-
-        return offset
-
-    def forward(
-        self,
-        hidden_states,
-        attention_mask,
-        context=None,
-        context_mask=None,
-        rotary_pos_emb=None,
-        inference_params=None,
-    ):
-        # hidden_states: [s, b, h]
-
-        # Residual connection.
-        residual = hidden_states
-
-        # Optional Input Layer norm
-        input_layernorm_output = self.input_layernorm(hidden_states)
-
-        # Self attention.
-        attention_output_with_bias = self.self_attention(
-            input_layernorm_output,
-            attention_mask=attention_mask,
-            inference_params=inference_params,
-            rotary_pos_emb=rotary_pos_emb,
-        )
-
-        # TODO: could we move `bias_dropout_add_exec_handler` itself
-        # inside the module provided in the `bias_dropout_add_spec` module?
-        with self.bias_dropout_add_exec_handler():
-            hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)(
-                attention_output_with_bias, residual, self.hidden_dropout
-            )
-
-        # Residual connection.
-        residual = hidden_states
-
-        # Optional Layer norm after self-attention
-        pre_cross_attn_layernorm_output = self.pre_cross_attn_layernorm(hidden_states)
-
-        # Cross attention.
-        attention_output_with_bias = self.cross_attention(
-            pre_cross_attn_layernorm_output,
-            attention_mask=context_mask,
-            key_value_states=context,
-            inference_params=inference_params,
-        )
-
-        if isinstance(attention_output_with_bias, dict) and "context" in attention_output_with_bias:
-            context = attention_output_with_bias["context"]
-
-        # TODO: could we move `bias_dropout_add_exec_handler` itself
-        # inside the module provided in the `bias_dropout_add_spec` module?
-        with self.bias_dropout_add_exec_handler():
-            hidden_states = self.cross_attn_bda(self.training, self.config.bias_dropout_fusion)(
-                attention_output_with_bias, residual, self.hidden_dropout
-            )
-
-        # Residual connection.
-        residual = hidden_states
-
-        # Optional Layer norm post the cross-attention.
-        pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states)
-
-        # MLP.
-        mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output)
-
-        # TODO: could we move `bias_dropout_add_exec_handler` itself
-        # inside the module provided in the `bias_dropout_add_spec` module?
-        with self.bias_dropout_add_exec_handler():
-            hidden_states = self.mlp_bda(self.training, self.config.bias_dropout_fusion)(
-                mlp_output_with_bias, residual, self.hidden_dropout
-            )
-
-        # Jit compiled function creates 'view' tensor. This tensor
-        # potentially gets saved in the MPU checkpoint function context,
-        # which rejects view tensors. While making a viewless tensor here
-        # won't result in memory savings (like the data loader, or
-        # p2p_communication), it serves to document the origin of this
-        # 'view' tensor.
-        output = make_viewless_tensor(
-            inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True
-        )
-
-        return output, context
-
-    def sharded_state_dict(self, prefix=''):
-        offset = self._get_layer_offset()
-        num_layers = self.config.num_layers
-
-        global_layer_offset = self.layer_number - 1  # self.layer_number starts at 1
-        state_dict_prefix = (
-            f'{prefix}{global_layer_offset - offset}.'  # module list index in TransformerBlock
-        )
-        sharded_pp_offset = [
-            (0, global_layer_offset, num_layers)
-        ]  # PP sharding offset for ShardedTensors
-
-        attn_state_dict = self.self_attention.sharded_state_dict(
-            prefix=f'{state_dict_prefix}self_attention.',
-            sharded_key_prefix=f'{prefix}self_attention.',
-            sharded_offsets=sharded_pp_offset,
-        )
-
-        mlp_state_dict = self.mlp.sharded_state_dict(
-            prefix=f'{state_dict_prefix}mlp.',
-            sharded_key_prefix=f'{prefix}mlp.',
-            sharded_offsets=sharded_pp_offset,
-        )
-
-        sharded_state_dict = {**mlp_state_dict, **attn_state_dict}
-
-        return sharded_state_dict
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/utils.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/utils.py
deleted file mode 100644
index 68c721af6f5352e270afb1799b5367a3c7c5b7c5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/utils.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Utilities for transformer layers."""
-from operator import itemgetter
-from typing import Any, Dict, Iterable, Optional, Tuple, Union
-
-import torch
-
-from megatron_ds.core import parallel_state
-from megatron_ds.core.dist_checkpointing.mapping import ShardedObject, StateDict
-from megatron_ds.core.utils import (
-    make_sharded_tensor_for_checkpoint,
-    make_tp_sharded_tensor_for_checkpoint,
-)
-
-
-def get_linear_layer(rows, columns, init_method, perform_initialization=True):
-    """Simple linear layer with weight initialization."""
-    layer = torch.nn.Linear(rows, columns)
-    if perform_initialization:  # Take from modelparallel config
-        init_method(layer.weight)
-    with torch.no_grad():
-        layer.bias.zero_()
-    return layer
-
-
-def attention_mask_func(attention_scores, attention_mask):
-    attention_scores.masked_fill_(attention_mask, -10000.0)
-    return attention_scores
-
-
-@torch.jit.script
-def gelu_impl(x):
-    """OpenAI's gelu implementation."""
-    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x)))
-
-
-def openai_gelu(x):
-    return gelu_impl(x)
-
-
-# This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter
-@torch.jit.script
-def erf_gelu(x):
-    return (
-        x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype))
-    )
-
-
-def make_sharded_tensors_for_checkpoint(
-    state_dict: StateDict,
-    state_dict_prefix: str,
-    sharded_key_prefix: Optional[str] = None,
-    tensor_parallel_layers_axis_map: Optional[Dict[str, int]] = None,
-    sharded_offsets: Iterable[Tuple[int, int, int]] = (),
-    extra_state_suffix: str = '_extra_state',
-):
-    """Wraps tensors from transformer layers with ShardedTensor or ShardedObject.
-
-    For a given `state_dict`, wraps:
-    - all _extra_states with ShardedObject
-    - all tensors specified in tensor_parallel_layers_axis_map with TP and DP sharded ShardedTensor
-    - other values with DP sharded ShardedTensor
-
-    Args:
-        state_dict (StateDict): state_dict to convert
-        state_dict_prefix (str): prefix appended to keys in final state dict
-        sharded_key_prefix (str, optional): prefix appended to ShardedTensor keys
-        tensor_parallel_layers_axis_map (Dict[str, int], optional): dict mapping layer
-            names to the axis for TP sharding
-        sharded_offsets (Iterable[Tuple[int, int, int]], optional): sharding already
-            applied (e.g. PP related), passed along to ShardedTensor
-        extra_state_suffix (str, default = '_extra_state'): layers with this
-            suffix will be wrapped with ShardedObject instead of ShardedTensor.
-
-    """
-    if sharded_key_prefix is None:
-        sharded_key_prefix = state_dict_prefix
-
-    if tensor_parallel_layers_axis_map is None:
-        tensor_parallel_layers_axis_map = {}
-
-    sharded_state_dict = {}
-    for layer_name in state_dict.keys():
-        tensor = state_dict[layer_name]
-        layer_key = f'{state_dict_prefix}{layer_name}'
-        sharded_key = f'{sharded_key_prefix}{layer_name}'
-
-        if layer_name.endswith(extra_state_suffix):
-            sharded_state_dict[layer_key] = make_sharded_object_for_checkpoint(
-                tensor, sharded_key, sharded_offsets
-            )
-
-        elif layer_name in tensor_parallel_layers_axis_map:
-            tp_axis = tensor_parallel_layers_axis_map[layer_name]
-            sharded_state_dict[layer_key] = make_tp_sharded_tensor_for_checkpoint(
-                tensor, sharded_key, tp_axis, prepend_offsets=sharded_offsets,
-            )
-
-        else:
-            sharded_state_dict[layer_key] = make_sharded_tensor_for_checkpoint(
-                tensor, sharded_key, prepend_offsets=sharded_offsets,
-            )
-
-    return sharded_state_dict
-
-
-def make_sharded_object_for_checkpoint(
-    obj: Any,
-    key: str,
-    sharded_offsets: Iterable[Tuple[int, int, int]] = (),
-    replica_id: Union[None, int, Tuple[int, ...]] = None,
-    **kwargs,
-):
-    """ Helper for instantiating a non-sharded ShardedObject (replicated across TP and DP group).
-
-    Arguments:
-        obj (object): any object to be sharded
-        key (str): unique identifier of the object
-        sharded_offsets (Iterable[Tuple[int, int, int]]): offsets normally
-            prepended to ShardedTensors, will be used as global offsets for
-            ShardedObject
-        replica_id (Union[None, int, Tuple[int, ...]]): replica id
-    """
-    if replica_id is None:
-        replica_id = (
-            0,
-            parallel_state.get_tensor_model_parallel_rank(),
-            parallel_state.get_data_parallel_rank(),
-        )
-
-    return ShardedObject(key, obj, *_get_extra_state_offsets(sharded_offsets), replica_id, **kwargs)
-
-
-def _get_extra_state_offsets(
-    sharded_offsets: Iterable[Tuple[int, int, int]]
-) -> Tuple[Tuple[int, ...], Tuple[int, ...]]:
-    """ Turns ShardedTensor offsets into offsets suitable for ShardedObject. """
-    if sharded_offsets:
-        sharded_offsets = sorted(sharded_offsets, key=itemgetter(0))  # sort by axis
-        axis, extra_state_offset, extra_state_shape = zip(*sharded_offsets)
-        assert list(axis) == list(
-            range(len(axis))
-        ), f'Expected contiguous axis for offsets: {sharded_offsets}'
-    else:
-        extra_state_shape = (1,)
-        extra_state_offset = (0,)
-    return extra_state_shape, extra_state_offset
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/utils.py b/toolbox/Megatron-DeepSpeed/megatron_ds/core/utils.py
deleted file mode 100644
index 0878eef88a0794efb6ec969008abfb3d35390d4b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/utils.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-"""Utility functions used throughout Megatron core"""
-import math
-import operator
-from functools import reduce
-
-import torch
-
-from megatron_ds.core import parallel_state
-from megatron_ds.core.dist_checkpointing.mapping import ShardedTensor
-
-
-def ensure_divisibility(numerator, denominator):
-    """Ensure that numerator is divisible by the denominator."""
-    assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator)
-
-
-def divide(numerator, denominator):
-    """Ensure that numerator is divisible by the denominator and return
-    the division value."""
-    ensure_divisibility(numerator, denominator)
-    return numerator // denominator
-
-
-def get_attr_wrapped_model(model, attr, allow_none=True, return_model_obj=False):
-    """Get an attribute from a wrapped model.
-    If return_model_obj is true, return the object that has the 'attr' attribute;
-    otherwise, return the attribute directly."""
-    if isinstance(model, list):
-        raise RuntimeError("_get_attr_wrapped_model given a list of models")
-
-    if allow_none:
-
-        def condition(model, attr):
-            return not hasattr(model, attr)
-
-    else:
-
-        def condition(model, attr):
-            return getattr(model, attr, None) is None
-
-    while condition(model, attr):
-        if not hasattr(model, "module"):
-            raise RuntimeError(f"_get_attr_wrapped_model couldn't find attribute {attr}")
-
-        model = model.module
-
-    if return_model_obj:
-        return model
-    return getattr(model, attr)
-
-
-def get_model_type(model):
-    return get_attr_wrapped_model(model, 'model_type')
-
-
-def get_model_config(model):
-    return get_attr_wrapped_model(model, 'config', allow_none=False)
-
-
-class GlobalMemoryBuffer:
-    """Global buffer to avoid dynamic memory allocations.
-    Caller should ensure that buffers of the same name
-    are not used concurrently."""
-
-    def __init__(self):
-        self.buffer = {}
-
-    def get_tensor(self, tensor_shape, dtype, name):
-        required_len = reduce(operator.mul, tensor_shape, 1)
-        if (
-            self.buffer.get((name, dtype), None) is None
-            or self.buffer[(name, dtype)].numel() < required_len
-        ):
-            self.buffer[(name, dtype)] = torch.empty(
-                required_len, dtype=dtype, device=torch.cuda.current_device(), requires_grad=False
-            )
-
-        return self.buffer[(name, dtype)][0:required_len].view(*tensor_shape)
-
-
-def _kernel_make_viewless_tensor(inp, requires_grad):
-    '''Make a viewless tensor.
-
-    View tensors have the undesirable side-affect of retaining a reference
-    to the originally-viewed tensor, even after manually setting the '.data'
-    field. This method creates a new tensor that links to the old tensor's
-    data, without linking the viewed tensor, referenced via the '._base'
-    field.
-    '''
-    out = torch.empty((1,), dtype=inp.dtype, device=inp.device, requires_grad=requires_grad,)
-    out.data = inp.data
-    return out
-
-
-class MakeViewlessTensor(torch.autograd.Function):
-    '''
-    Autograd function to make a viewless tensor.
-
-    This function should be used in cases where the computation graph needs
-    to be propagated, but we only want a viewless tensor (e.g.,
-    ParallelTransformer's hidden_states). Call this function by passing
-    'keep_graph = True' to 'make_viewless_tensor()'.
-    '''
-
-    @staticmethod
-    def forward(ctx, inp, requires_grad):
-        return _kernel_make_viewless_tensor(inp, requires_grad)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        return grad_output, None
-
-
-def make_viewless_tensor(inp, requires_grad, keep_graph):
-    '''
-    Entry-point for creating viewless tensors.
-
-    This method should be used, rather than calling 'MakeViewlessTensor'
-    or '_kernel_make_viewless_tensor' directly. This method acts as a
-    switch for determining if an autograd function or a regular method
-    should be used to create the tensor.
-    '''
-
-    # return tensor as-is, if not a 'view'
-    if inp._base is None:
-        return inp
-
-    # create viewless tensor
-    if keep_graph:
-        return MakeViewlessTensor.apply(inp, requires_grad)
-    else:
-        return _kernel_make_viewless_tensor(inp, requires_grad)
-
-
-def assert_viewless_tensor(tensor, extra_msg=None):
-    '''Assert that a tensor is not a view (i.e., its '._base' field is
-    not set).'''
-    if isinstance(tensor, list):
-        [assert_viewless_tensor(t) for t in tensor]
-        return tensor
-    if not isinstance(tensor, torch.Tensor):
-        return tensor
-    assert tensor._base is None, (
-        "Ensure tensor._base is None before setting tensor.data or storing "
-        "tensor to memory buffer. Otherwise, a memory leak will occur (and "
-        "likely accumulate over iterations). %s"
-    ) % extra_msg
-    return tensor
-
-
-def safely_set_viewless_tensor_data(tensor, new_data_tensor):
-    '''Safely set tensor's '.data' field.
-
-    Check first that the tensor is viewless (i.e., '._base' not set). If not,
-    raise an exception.
-    '''
-    assert_viewless_tensor(
-        tensor,
-        extra_msg="FYI, tensor._base has shape %s, and new_data_tensor has shape %s."
-        % ("--" if tensor._base is None else tensor._base.shape, new_data_tensor.shape),
-    )
-    tensor.data = new_data_tensor
-
-
-def init_method_normal(sigma):
-    """Init method based on N(0, sigma)."""
-
-    def init_(tensor):
-        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
-
-    return init_
-
-
-def scaled_init_method_normal(sigma, num_layers):
-    """Init method based on N(0, sigma/sqrt(2*num_layers)."""
-    std = sigma / math.sqrt(2.0 * num_layers)
-
-    def init_(tensor):
-        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
-
-    return init_
-
-
-def make_tp_sharded_tensor_for_checkpoint(
-    tensor, key, tp_axis=0, replica_id=None, prepend_offsets=(), **kwargs
-):
-    """ Helper for instantiating a ShardedTensor where the `tp_axis` dimension is sharded across TP group.
-
-    Optionally, can provide offsets which prepend new dimensions to the tensor.
-    """
-
-    prepend_axis_num = len(prepend_offsets)
-
-    if replica_id is None:
-        replica_id = (0, 0, parallel_state.get_data_parallel_rank())
-
-    return ShardedTensor.from_rank_offsets(
-        key,
-        tensor,
-        *prepend_offsets,
-        (
-            tp_axis + prepend_axis_num,
-            parallel_state.get_tensor_model_parallel_rank(),
-            parallel_state.get_tensor_model_parallel_world_size(),
-        ),
-        replica_id=replica_id,
-        prepend_axis_num=prepend_axis_num,
-        **kwargs,
-    )
-
-
-def make_sharded_tensor_for_checkpoint(tensor, key, prepend_offsets=(), replica_id=None, **kwargs):
-    """ Helper for instantiating a non-sharded ShardedTensor (replicated across TP and DP group).
-
-    Optionally, can provide offsets which prepend new dimensions to the tensor.
-    """
-
-    prepend_axis_num = len(prepend_offsets)
-
-    if replica_id is None:
-        replica_id = (
-            0,
-            parallel_state.get_tensor_model_parallel_rank(),
-            parallel_state.get_data_parallel_rank(),
-        )
-
-    return ShardedTensor.from_rank_offsets(
-        key,
-        tensor,
-        *prepend_offsets,
-        replica_id=replica_id,
-        prepend_axis_num=prepend_axis_num,
-        **kwargs,
-    )
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/dist_signal_handler.py b/toolbox/Megatron-DeepSpeed/megatron_ds/dist_signal_handler.py
deleted file mode 100644
index a60204f004a3f149da0bc059cd875b6ec390c0c4..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/dist_signal_handler.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import signal
-
-import torch
-
-
-def get_world_size():
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
-        world_size = torch.distributed.get_world_size()
-    else:
-        world_size = 1
-    return world_size
-
-
-def get_device(local_rank=None):
-    backend = torch.distributed.get_backend()
-    if backend == 'nccl':
-        if local_rank is None:
-            device = torch.device('cuda')
-        else:
-            device = torch.device(f'cuda:{local_rank}')
-    elif backend == 'gloo':
-        device = torch.device('cpu')
-    else:
-        raise RuntimeError
-    return device
-
-
-def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None):
-    if not torch.distributed.is_available() or \
-       not torch.distributed.is_initialized():
-        return [item]
-
-    device = get_device(local_rank)
-
-    if group is not None:
-        group_size = group.size()
-    else:
-        group_size = get_world_size()
-
-    tensor = torch.tensor([item], device=device, dtype=dtype)
-    output_tensors = [
-        torch.zeros(1, dtype=tensor.dtype, device=tensor.device)
-        for _ in range(group_size)
-    ]
-    torch.distributed.all_gather(output_tensors, tensor, group, async_op)
-    output = [elem.item() for elem in output_tensors]
-    return output
-
-
-class DistributedSignalHandler:
-    def __init__(self, sig=signal.SIGTERM):
-        self.sig = sig
-
-    def signals_received(self):
-        all_received = all_gather_item(
-            self._signal_received, dtype=torch.int32
-        )
-        return all_received
-
-    def __enter__(self):
-        self._signal_received = False
-        self.released = False
-        self.original_handler = signal.getsignal(self.sig)
-
-        def handler(signum, frame):
-            self._signal_received = True
-
-        signal.signal(self.sig, handler)
-
-        return self
-
-    def __exit__(self, type, value, tb):
-        self.release()
-
-    def release(self):
-        if self.released:
-            return False
-
-        signal.signal(self.sig, self.original_handler)
-        self.released = True
-        return True
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/enums.py b/toolbox/Megatron-DeepSpeed/megatron_ds/enums.py
deleted file mode 100644
index d9050462aaf5d542e0f6a669cb293d13664a9b7f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/enums.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import enum
-
-class LayerType(enum.Enum):
-    encoder = 1
-    decoder = 2
- 
-class AttnType(enum.Enum):
-    self_attn = 1
-    cross_attn = 2
-
-class AttnMaskType(enum.Enum):
-    padding = 1
-    causal = 2
-    prefix = 3
-
-class PositionEmbeddingType(enum.Enum):
-    rotary = 1
-    absolute = 2
-    alibi = 3
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/fp16_deprecated/loss_scaler.py b/toolbox/Megatron-DeepSpeed/megatron_ds/fp16_deprecated/loss_scaler.py
deleted file mode 100644
index cb64aa928923e138f504c6d118ff7a67882dd34c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/fp16_deprecated/loss_scaler.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""For backward compatibility, we need the class definitions to deserialize."""
-
-class LossScaler:
-    def __init__(self, scale=1):
-        self.cur_scale = scale
-
-class DynamicLossScaler:
-    def __init__(self,
-                 init_scale=2**32,
-                 scale_factor=2.,
-                 scale_window=1000,
-                 min_scale=1,
-                 delayed_shift=1,
-                 consecutive_hysteresis=False):
-        self.cur_scale = init_scale
-        self.cur_iter = 0
-        self.last_overflow_iter = -1
-        self.scale_factor = scale_factor
-        self.scale_window = scale_window
-        self.min_scale = min_scale
-        self.delayed_shift = delayed_shift
-        self.cur_hysteresis = delayed_shift
-        self.consecutive_hysteresis = consecutive_hysteresis
-
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/fused_kernels/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/fused_kernels/__init__.py
deleted file mode 100644
index 87cceac3e35f983cf9f2264ff651a1067069f9e2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/fused_kernels/__init__.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import os
-import pathlib
-import subprocess
-
-from torch.utils import cpp_extension
-
-# Setting this param to a list has a problem of generating different
-# compilation commands (with diferent order of architectures) and
-# leading to recompilation of fused kernels. Set it to empty string
-# to avoid recompilation and assign arch flags explicity in
-# extra_cuda_cflags below
-os.environ["TORCH_CUDA_ARCH_LIST"] = ""
-
-
-def load(args):
-
-    # Check if cuda 11 is installed for compute capability 8.0
-    cc_flag = []
-    _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version(
-        cpp_extension.CUDA_HOME
-    )
-    if int(bare_metal_major) >= 11:
-        cc_flag.append('-gencode')
-        cc_flag.append('arch=compute_80,code=sm_80')
-        if int(bare_metal_minor) >= 8:
-            cc_flag.append('-gencode')
-            cc_flag.append('arch=compute_90,code=sm_90')
-
-    # Build path
-    srcpath = pathlib.Path(__file__).parent.absolute()
-    buildpath = srcpath / "build"
-    _create_build_dir(buildpath)
-
-    # Helper function to build the kernels.
-    def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
-        return cpp_extension.load(
-            name=name,
-            sources=sources,
-            build_directory=buildpath,
-            extra_cflags=[
-                "-O3",
-            ],
-            extra_cuda_cflags=[
-                "-O3",
-                "-gencode",
-                "arch=compute_70,code=sm_70",
-                "--use_fast_math",
-            ]
-            + extra_cuda_flags
-            + cc_flag,
-            verbose=(args.rank == 0),
-        )
-
-
-def _get_cuda_bare_metal_version(cuda_dir):
-    raw_output = subprocess.check_output(
-        [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
-    )
-    output = raw_output.split()
-    release_idx = output.index("release") + 1
-    release = output[release_idx].split(".")
-    bare_metal_major = release[0]
-    bare_metal_minor = release[1][0]
-
-    return raw_output, bare_metal_major, bare_metal_minor
-
-
-def _create_build_dir(buildpath):
-    try:
-        os.mkdir(buildpath)
-    except OSError:
-        if not os.path.isdir(buildpath):
-            print(f"Creation of the build directory {buildpath} failed")
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/fused_kernels/compat.h b/toolbox/Megatron-DeepSpeed/megatron_ds/fused_kernels/compat.h
deleted file mode 100644
index 5495d7807762d8b4e3dbc11b28dba15f85bd8108..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/fused_kernels/compat.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved. */
-
-/*This code is copied fron NVIDIA apex:
- *     https://github.com/NVIDIA/apex
- *     with minor changes. */
-
-
-
-#ifndef TORCH_CHECK
-#define TORCH_CHECK AT_CHECK
-#endif
-
-#ifdef VERSION_GE_1_3
-#define DATA_PTR data_ptr
-#else
-#define DATA_PTR data
-#endif
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/fused_kernels/tests/test_fused_kernels.py b/toolbox/Megatron-DeepSpeed/megatron_ds/fused_kernels/tests/test_fused_kernels.py
deleted file mode 100644
index 5cd9b758ce2e8514c78e69d4840aa460bbf29879..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/fused_kernels/tests/test_fused_kernels.py
+++ /dev/null
@@ -1,388 +0,0 @@
-import math
-
-import torch
-from torch.nn import LayerNorm
-
-from megatron_ds.model.enums import AttnMaskType
-from megatron_ds.model.fused_layer_norm import MixedFusedLayerNorm
-from megatron_ds.model.fused_softmax import FusedScaleMaskSoftmax
-from megatron_ds.model.utils import attention_mask_func
-from megatron_ds.fused_kernels import load
-
-def test_load_fused_kernels():
-    try:
-        import fused_layer_norm_cuda
-        import scaled_masked_softmax_cuda
-        import scaled_upper_triang_masked_softmax_cuda
-        import torch
-
-        print("[Success] load_fused_kernels")
-    except ImportError as e:
-        print("[Fail] load_fused_kernels")
-        raise e
-
-def test_fused_softmax():
-    bert = BertModel.from_pretrained("bert-base-cased").cuda().half()
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
-    test_text = (
-        "Hello. How are you? I am fine thank you and you? yes Good. "
-        "hi hi hi hi hi hi hi hi hi hi hi hi hi"  # 32
-    )
-
-    tokens = tokenizer(
-        [test_text] * 4,
-        return_tensors="pt",
-    )
-
-    embedding_output = bert.embeddings(
-        input_ids=tokens["input_ids"].cuda(),
-        position_ids=None,
-        token_type_ids=tokens["token_type_ids"].cuda(),
-        inputs_embeds=None,
-        past_key_values_length=0,
-    )
-
-    # (bsz, 1, 1, seq_len)
-    mask = bert.get_extended_attention_mask(
-        attention_mask=tokens["attention_mask"].cuda(),
-        input_shape=tokens["input_ids"].shape,
-        device=bert.device,
-    )
-    # (bsz, 1, seq_len, seq_len)
-    mask = mask.repeat(1, 1, mask.size()[-1], 1)
-
-    attention = bert.encoder.layer[0].attention.self
-    key_layer = attention.transpose_for_scores(attention.key(embedding_output))
-    query_layer = attention.transpose_for_scores(attention.query(embedding_output))
-
-    attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
-    attention_scores /= math.sqrt(key_layer.size()[-1])
-
-    fused_softmax = (
-        FusedScaleMaskSoftmax(
-            input_in_fp16=True,
-            input_in_bf16=False,
-            mask_func=attention_mask_func,
-            scale=None,
-            softmax_in_fp32=False,
-            attn_mask_type=AttnMaskType.padding,
-            scaled_masked_softmax_fusion=True,
-        )
-        .cuda()
-        .half()
-    )
-
-    fused_softmax_output = fused_softmax(
-        attention_scores,
-        (mask != 0),
-    )
-
-    torch_softmax = (
-        FusedScaleMaskSoftmax(
-            input_in_fp16=True,
-            input_in_bf16=False,
-            mask_func=attention_mask_func,
-            scale=None,
-            softmax_in_fp32=False,
-            attn_mask_type=AttnMaskType.padding,
-            scaled_masked_softmax_fusion=False,
-        )
-        .cuda()
-        .half()
-    )
-
-    torch_softmax_output = torch_softmax(
-        attention_scores,
-        (mask != 0),
-    )
-
-    test_result = (fused_softmax_output - torch_softmax_output).abs()
-
-    while test_result.dim() != 1:
-        test_result = test_result.mean(dim=-1)
-
-    diff = test_result.mean(dim=-1)
-
-    if diff <= 1e-3:
-        print(
-            f"\n[Success] test_fused_softmax"
-            f"\n > mean_difference={diff}"
-            f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}"
-            f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}"
-        )
-    else:
-        print(
-            f"\n[Fail] test_fused_softmax"
-            f"\n > mean_difference={diff}, "
-            f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}, "
-            f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}"
-        )
-
-
-def test_fused_upper_triangle_mask_softmax():
-    gpt = GPT2Model.from_pretrained("gpt2").cuda().half()
-    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-    test_text = (
-        "Hello. How are you? I am fine thank you and you? yes Good. "
-        "hi hi hi hi hi hi hi"  # 24
-    )
-
-    tokens = tokenizer(
-        [test_text] * 4,
-        return_tensors="pt",
-    )
-
-    attention_mask = tokens["attention_mask"].cuda()
-    attention_mask = attention_mask.view(attention_mask.size(0), -1)
-    attention_mask = attention_mask[:, None, None, :]
-    attention_mask = (1.0 - attention_mask) * -10000.0
-    attention_mask = attention_mask.repeat(1, 1, attention_mask.size()[-1], 1)
-    attn = gpt.h[0]
-
-    hidden_states = gpt.wte(tokens["input_ids"].cuda())
-    q, k, v = attn.attn.c_attn(hidden_states).split(768, dim=-1)
-    q = attn.attn._split_heads(q, attn.attn.num_heads, attn.attn.head_dim)
-    k = attn.attn._split_heads(k, attn.attn.num_heads, attn.attn.head_dim)
-    attn_weights = torch.matmul(q, k.transpose(-1, -2))
-
-    sq, sk = q.size(-2), k.size(-2)
-    causal_mask = attn.attn.bias[:, :, sk - sq : sk, :sk].bool()
-    total_mask = ~(causal_mask & (attention_mask == 0))
-    """
-    tensor([[[[False,  True,  True,  ...,  True,  True,  True],
-              [False, False,  True,  ...,  True,  True,  True],
-              [False, False, False,  ...,  True,  True,  True],
-              ...,
-              [False, False, False,  ..., False,  True,  True],
-              [False, False, False,  ..., False, False,  True],
-              [False, False, False,  ..., False, False, False]]]
-    """
-
-    fused_softmax = (
-        FusedScaleMaskSoftmax(
-            input_in_fp16=True,
-            input_in_bf16=False,
-            mask_func=attention_mask_func,
-            scale=None,
-            softmax_in_fp32=False,
-            attn_mask_type=AttnMaskType.causal,
-            scaled_masked_softmax_fusion=True,
-        )
-        .cuda()
-        .half()
-    )
-
-    fused_softmax_output = fused_softmax(
-        attn_weights,
-        total_mask,
-    )
-
-    torch_softmax = (
-        FusedScaleMaskSoftmax(
-            input_in_fp16=True,
-            input_in_bf16=False,
-            mask_func=attention_mask_func,
-            scale=None,
-            softmax_in_fp32=False,
-            attn_mask_type=AttnMaskType.causal,
-            scaled_masked_softmax_fusion=False,
-        )
-        .cuda()
-        .half()
-    )
-
-    torch_softmax_output = torch_softmax(
-        attn_weights,
-        total_mask,
-    )
-
-    test_result = (fused_softmax_output - torch_softmax_output).abs()
-
-    while test_result.dim() != 1:
-        test_result = test_result.mean(dim=-1)
-
-    diff = test_result.mean(dim=-1)
-
-    if diff <= 1e-3:
-        print(
-            f"\n[Success] test_fused_upper_triangle_mask_softmax"
-            f"\n > mean_difference={diff}"
-            f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}"
-            f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}"
-        )
-    else:
-        print(
-            f"\n[Fail] test_fused_upper_triangle_mask_softmax"
-            f"\n > mean_difference={diff}, "
-            f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}, "
-            f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}"
-        )
-
-
-def test_layer_norm():
-    bert = BertModel.from_pretrained("bert-base-cased").cuda().half()
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
-    test_text = (
-        "Hello. How are you? I am fine thank you and you? yes Good. "
-        "hi hi hi hi hi hi hi hi hi hi hi hi hi"  # 32
-    )
-
-    tokens = tokenizer(
-        [test_text] * 4,
-        return_tensors="pt",
-    )
-
-    # [bsz, seq_len, d_model]
-    embedding_output = (
-        bert.embeddings(
-            input_ids=tokens["input_ids"].cuda(),
-            position_ids=None,
-            token_type_ids=tokens["token_type_ids"].cuda(),
-            inputs_embeds=None,
-            past_key_values_length=0,
-        )
-        .cuda()
-        .half()
-    )
-
-    fused_layernorm_layer = (
-        MixedFusedLayerNorm(normalized_shape=embedding_output.size(-1)).cuda().half()
-    )
-
-    torch_layernorm_layer = (
-        LayerNorm(normalized_shape=embedding_output.size(-1)).cuda().half()
-    )
-
-    fused_output = fused_layernorm_layer(embedding_output)
-    torch_output = torch_layernorm_layer(embedding_output)
-    test_result = (fused_output - torch_output).abs()
-
-    while test_result.dim() != 1:
-        test_result = test_result.mean(dim=-1)
-
-    diff = test_result.mean(dim=-1)
-
-    if diff <= 1e-3:
-        print(
-            f"\n[Success] test_layer_norm"
-            f"\n > mean_difference={diff}"
-            f"\n > fused_values={fused_output[-1][-1][:5].tolist()}"
-            f"\n > torch_values={torch_output[-1][-1][:5].tolist()}"
-        )
-    else:
-        print(
-            f"\n[Fail] test_layer_norm"
-            f"\n > mean_difference={diff}, "
-            f"\n > fused_values={fused_output[-1][-1][:5].tolist()}, "
-            f"\n > torch_values={torch_output[-1][-1][:5].tolist()}"
-        )
-
-
-def attention_mask_func(attention_scores, attention_mask):
-    attention_scores.masked_fill_(attention_mask, -10000.0)
-    return attention_scores
-
-
-def forward_torch_softmax(input, mask, scale):
-    input = input * scale
-    mask_output = attention_mask_func(input, mask) if mask is not None else input
-    probs = torch.nn.Softmax(dim=-1)(mask_output)
-    return probs
-
-
-def test_masked_softmax_forward():
-    import scaled_masked_softmax_cuda
-
-    batch = 2
-    attn = 16
-    scale_t = torch.tensor([1.0])
-    for qlen in [128, 256, 1024, 2048, 4096]:
-        for klen in [128, 256, 1024, 2048]:
-            inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0')
-            masks = torch.randint(0, 2, (batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0')
-            softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item())
-            softmax_results_torch = forward_torch_softmax(inputs, masks, scale_t[0].item())
-            error = (softmax_results_torch - softmax_results).abs().max()
-            assert error < 1e-3
-
-def test_masked_softmax_backward():
-    import scaled_masked_softmax_cuda
-
-    batch = 2
-    attn = 16
-    scale_t = torch.tensor([1.0])
-    for qlen in [128, 256, 1024, 2048, 4096]:
-        for klen in [128, 256, 1024, 2048]:
-            inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0')
-            backward = torch.rand_like(inputs, dtype=torch.float16, device='cuda:0')
-            masks = torch.randint(0, 2, (batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0')
-            softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item())
-            back_grad = scaled_masked_softmax_cuda.backward(backward, softmax_results, scale_t[0].item())
-
-            inputs.requires_grad = True
-            softmax_results_torch = forward_torch_softmax(inputs, masks, scale_t[0].item())
-            softmax_results_torch.backward(backward)
-            error = (back_grad - inputs.grad).abs().max()
-            assert error < 1e-3
-
-
-def test_allmasked_softmax_forward():
-    import scaled_masked_softmax_cuda
-
-    batch = 2
-    attn = 16
-    scale_t = torch.tensor([1.0])
-    for qlen in [128, 256, 1024, 2048, 4096]:
-        for klen in [128, 256, 1024, 2048]:
-            inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0')
-            masks = torch.ones((batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0')
-            softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item())
-            softmax_results_torch = torch.zeros_like(inputs)
-            error = (softmax_results_torch - softmax_results).abs().max()
-            assert error == 0.0
-
-
-def test_allmasked_softmax_backward():
-    import scaled_masked_softmax_cuda
-
-    batch = 2
-    attn = 16
-    scale_t = torch.tensor([1.0])
-    for qlen in [128, 256, 1024, 2048, 4096]:
-        for klen in [128, 256, 1024, 2048]:
-            inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0')
-            backward = torch.rand_like(inputs, dtype=torch.float16, device='cuda:0')
-            masks = torch.ones((batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0')
-            softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item())
-            back_grad = scaled_masked_softmax_cuda.backward(backward, softmax_results, scale_t[0].item())
-            inputs.requires_grad = True
-            softmax_results_torch = forward_torch_softmax(inputs, masks, scale_t[0].item())
-            softmax_results_torch.backward(backward)
-            error = (back_grad - inputs.grad).abs().max()
-            assert error < 1e-3
-
-
-if __name__ == "__main__":
-    try:
-        from transformers import BertTokenizer, GPT2Tokenizer
-        from transformers.models.bert.modeling_bert import BertModel
-        from transformers.models.gpt2.modeling_gpt2 import GPT2Model
-        import transformers
-
-        transformers.logging.set_verbosity(
-            transformers.logging.FATAL,
-        )
-
-    except:
-        print("\n[Fail] Please install `transformers` package to test fused kernels\n")
-        exit(-1)
-
-    load()
-    test_masked_softmax_forward()
-    test_masked_softmax_backward()
-    test_allmasked_softmax_forward()
-    test_allmasked_softmax_backward()
-    test_load_fused_kernels()
-    test_fused_softmax()
-    test_fused_upper_triangle_mask_softmax()
-    test_layer_norm()
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/fused_kernels/type_shim.h b/toolbox/Megatron-DeepSpeed/megatron_ds/fused_kernels/type_shim.h
deleted file mode 100644
index d60a6f8c6fb50e241f9ddcc852adec71e963e1b2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/fused_kernels/type_shim.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
-
-
-#include <ATen/ATen.h>
-#include "compat.h"
-
-
-#define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...)			\
-  switch(TYPE)								\
-    {									\
-    case at::ScalarType::Half:						\
-      {									\
-	using scalar_t = at::Half;					\
-	__VA_ARGS__;							\
-	break;								\
-      }									\
-    case at::ScalarType::BFloat16:					\
-      {									\
-	using scalar_t = at::BFloat16;					\
-	__VA_ARGS__;							\
-	break;								\
-      }									\
-    default:								\
-      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");	\
-      }
-
-
-#define DISPATCH_HALF_BFLOAT_AND_FLOAT(TYPE, NAME, ...)			\
-  switch(TYPE)								\
-    {									\
-    case at::ScalarType::Half:						\
-      {									\
-	using scalar_t = at::Half;					\
-	__VA_ARGS__;							\
-	break;								\
-      }									\
-    case at::ScalarType::BFloat16:					\
-      {									\
-	using scalar_t = at::BFloat16;					\
-	__VA_ARGS__;							\
-	break;								\
-      }									\
-    case at::ScalarType::Float:						\
-      {									\
-	using scalar_t = float;					\
-	__VA_ARGS__;							\
-	break;								\
-      }									\
-    default:								\
-      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");	\
-      }
-
-
-
-#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
-  switch(TYPEIN)							\
-    {									\
-    case at::ScalarType::Float:						\
-      {									\
-	using scalar_t_in = float;					\
-	switch(TYPEOUT)							\
-	  {								\
-	  case at::ScalarType::Float:					\
-	    {								\
-	      using scalar_t_out = float;				\
-	      __VA_ARGS__;						\
-	      break;							\
-	    }								\
-	  case at::ScalarType::Half:					\
-	    {								\
-	      using scalar_t_out = at::Half;				\
-	      __VA_ARGS__;						\
-	      break;							\
-	    }								\
-	  case at::ScalarType::BFloat16:				\
-	    {								\
-	      using scalar_t_out = at::BFloat16;			\
-	      __VA_ARGS__;						\
-	      break;							\
-	    }								\
-	  default:							\
-	    AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
-	  }								\
-	break;								\
-      }									\
-    case at::ScalarType::Half:						\
-      {									\
-	using scalar_t_in = at::Half;					\
-	using scalar_t_out = at::Half;					\
-	__VA_ARGS__;							\
-	break;								\
-      }									\
-    case at::ScalarType::BFloat16:					\
-      {									\
-	using scalar_t_in = at::BFloat16;				\
-	using scalar_t_out = at::BFloat16;				\
-	__VA_ARGS__;							\
-	break;								\
-      }									\
-    default:								\
-      AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");	\
-    }
-
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/global_vars.py b/toolbox/Megatron-DeepSpeed/megatron_ds/global_vars.py
deleted file mode 100644
index 5b07bba0d2202a2ab76b048e8bd04442ffab1990..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/global_vars.py
+++ /dev/null
@@ -1,234 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Megatron global variables."""
-
-import os
-import sys
-import torch
-
-from megatron_ds import dist_signal_handler
-from megatron_ds.tokenizer import build_tokenizer
-from .microbatches import build_num_microbatches_calculator
-from .timers import Timers
-
-_GLOBAL_ARGS = None
-_GLOBAL_RLHF_ARGS = None
-_GLOBAL_RETRO_ARGS = None
-_GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
-_GLOBAL_TOKENIZER = None
-_GLOBAL_TENSORBOARD_WRITER = None
-_GLOBAL_WANDB_WRITER = None
-_GLOBAL_ADLR_AUTORESUME = None
-_GLOBAL_TIMERS = None
-_GLOBAL_SIGNAL_HANDLER = None
-
-def get_args():
-    """Return arguments."""
-    _ensure_var_is_initialized(_GLOBAL_ARGS, 'args')
-    return _GLOBAL_ARGS
-
-
-def get_retro_args():
-    """Return retro arguments."""
-    return _GLOBAL_RETRO_ARGS
-
-
-def get_rlhf_args():
-    '''Return rlhf arguments.'''
-    return _GLOBAL_RLHF_ARGS
-
-
-def get_num_microbatches():
-    return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get()
-
-
-def get_current_global_batch_size():
-    return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get_current_global_batch_size()
-
-
-def update_num_microbatches(consumed_samples, consistency_check=True):
-    _GLOBAL_NUM_MICROBATCHES_CALCULATOR.update(consumed_samples,
-                                               consistency_check)
-
-
-def get_tokenizer():
-    """Return tokenizer."""
-    _ensure_var_is_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
-    return _GLOBAL_TOKENIZER
-
-
-def get_tensorboard_writer():
-    """Return tensorboard writer. It can be None so no need
-    to check if it is initialized."""
-    return _GLOBAL_TENSORBOARD_WRITER
-
-
-def get_wandb_writer():
-    """Return tensorboard writer. It can be None so no need
-    to check if it is initialized."""
-    return _GLOBAL_WANDB_WRITER
-
-
-def get_adlr_autoresume():
-    """ADLR autoresume object. It can be None so no need
-    to check if it is initialized."""
-    return _GLOBAL_ADLR_AUTORESUME
-
-
-def get_timers():
-    """Return timers."""
-    _ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers')
-    return _GLOBAL_TIMERS
-
-
-def get_signal_handler():
-    _ensure_var_is_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler')
-    return _GLOBAL_SIGNAL_HANDLER
-
-
-def _set_signal_handler():
-    global _GLOBAL_SIGNAL_HANDLER
-    _ensure_var_is_not_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler')
-    _GLOBAL_SIGNAL_HANDLER = dist_signal_handler.DistributedSignalHandler().__enter__()
-
-
-
-def set_global_variables(args, build_tokenizer=True):
-    """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers."""
-
-    assert args is not None
-
-    _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args')
-    set_args(args)
-
-    _build_num_microbatches_calculator(args)
-    if build_tokenizer:
-        _ = _build_tokenizer(args)
-    _set_tensorboard_writer(args)
-    _set_wandb_writer(args)
-    _set_adlr_autoresume(args)
-    _set_timers(args)
-
-    if args.exit_signal_handler:
-        _set_signal_handler()
-
-
-def set_args(args):
-    global _GLOBAL_ARGS
-    _GLOBAL_ARGS = args
-
-
-def set_retro_args(retro_args):
-    global _GLOBAL_RETRO_ARGS
-    _GLOBAL_RETRO_ARGS = retro_args
-
-
-def set_rlhf_args(rlhf_args):
-    global _GLOBAL_RLHF_ARGS
-    _GLOBAL_RLHF_ARGS = rlhf_args
-
-
-def _build_num_microbatches_calculator(args):
-
-    global _GLOBAL_NUM_MICROBATCHES_CALCULATOR
-    _ensure_var_is_not_initialized(_GLOBAL_NUM_MICROBATCHES_CALCULATOR,
-                                   'num microbatches calculator')
-
-    _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator(
-        args)
-
-
-def _build_tokenizer(args):
-    """Initialize tokenizer."""
-    global _GLOBAL_TOKENIZER
-    _ensure_var_is_not_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
-    _GLOBAL_TOKENIZER = build_tokenizer(args)
-    return _GLOBAL_TOKENIZER
-
-
-def rebuild_tokenizer(args):
-    global _GLOBAL_TOKENIZER
-    _GLOBAL_TOKENIZER = None
-    return _build_tokenizer(args)
-
-
-def _set_tensorboard_writer(args):
-    """Set tensorboard writer."""
-    global _GLOBAL_TENSORBOARD_WRITER
-    _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER,
-                                   'tensorboard writer')
-
-    if hasattr(args, 'tensorboard_dir') and \
-       args.tensorboard_dir and args.rank == (args.world_size - 1):
-        try:
-            from torch.utils.tensorboard import SummaryWriter
-            print('> setting tensorboard ...')
-            _GLOBAL_TENSORBOARD_WRITER = SummaryWriter(
-                log_dir=args.tensorboard_dir,
-                max_queue=args.tensorboard_queue_size)
-        except ModuleNotFoundError:
-            print('WARNING: TensorBoard writing requested but is not '
-                  'available (are you using PyTorch 1.1.0 or later?), '
-                  'no TensorBoard logs will be written.', flush=True)
-
-
-def _set_wandb_writer(args):
-    global _GLOBAL_WANDB_WRITER
-    _ensure_var_is_not_initialized(_GLOBAL_WANDB_WRITER,
-                                   'wandb writer')
-    if getattr(args, 'wandb_project', '') and args.rank == (args.world_size - 1):
-        if args.wandb_exp_name == '':
-            raise ValueError("Please specify the wandb experiment name!")
-
-        import wandb
-        if args.wandb_save_dir:
-            save_dir = args.wandb_save_dir
-        else:
-            # Defaults to the save dir.
-            save_dir = os.path.join(args.save, 'wandb')
-        wandb_kwargs = {
-            'dir': save_dir,
-            'name': args.wandb_exp_name,
-            'project': args.wandb_project,
-            'config': vars(args)}
-        os.makedirs(wandb_kwargs['dir'], exist_ok=True)
-        wandb.init(**wandb_kwargs)
-        _GLOBAL_WANDB_WRITER = wandb
-
-
-def _set_adlr_autoresume(args):
-    """Initialize ADLR autoresume."""
-    global _GLOBAL_ADLR_AUTORESUME
-    _ensure_var_is_not_initialized(_GLOBAL_ADLR_AUTORESUME, 'adlr autoresume')
-
-    if args.adlr_autoresume:
-        if args.rank == 0:
-            print('enabling autoresume ...', flush=True)
-        sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))
-        try:
-            from userlib.auto_resume import AutoResume
-        except BaseException:
-            print('ADLR autoresume is not available, exiting ...')
-            sys.exit()
-
-        _GLOBAL_ADLR_AUTORESUME = AutoResume
-
-
-def _set_timers(args):
-    """Initialize timers."""
-    global _GLOBAL_TIMERS
-    _ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers')
-    _GLOBAL_TIMERS = Timers(args.timing_log_level, args.timing_log_option)
-
-
-def _ensure_var_is_initialized(var, name):
-    """Make sure the input variable is not None."""
-    assert var is not None, '{} is not initialized.'.format(name)
-
-
-def _ensure_var_is_not_initialized(var, name):
-    """Make sure the input variable is not None."""
-    assert var is None, '{} is already initialized.'.format(name)
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/indexer.py b/toolbox/Megatron-DeepSpeed/megatron_ds/indexer.py
deleted file mode 100644
index aab244a3b0837c7bd724c63c6c0779411629eac7..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/indexer.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import sys
-import time
-import torch
-import torch.distributed as dist
-
-from megatron_ds import get_args, print_rank_0
-from megatron_ds.core import mpu
-from megatron_ds.checkpointing import load_biencoder_checkpoint
-from megatron_ds.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset
-from megatron_ds.data.orqa_wiki_dataset import get_open_retrieval_batch
-from megatron_ds.data.biencoder_dataset_utils import get_one_epoch_dataloader
-from megatron_ds.data.realm_index import detach, OpenRetreivalDataStore
-from megatron_ds.model.biencoder_model import get_model_provider
-from megatron_ds.training import get_model
-
-
-class IndexBuilder(object):
-    """
-    Object for taking one pass over a dataset and creating a BlockData of its
-    embeddings
-    """
-    def __init__(self):
-        args = get_args()
-        self.model = None
-        self.dataloader = None
-        self.evidence_embedder_obj = None
-        self.biencoder_shared_query_context_model = \
-            args.biencoder_shared_query_context_model
-
-        # need to know whether we're using a REALM checkpoint (args.load)
-        # or ICT checkpoint
-        assert not (args.load and args.ict_load)
-
-        self.log_interval = args.indexer_log_interval
-        self.batch_size = args.indexer_batch_size
-
-        self.load_attributes()
-        self.is_main_builder = mpu.get_data_parallel_rank() == 0
-        self.num_total_builders = mpu.get_data_parallel_world_size()
-        self.iteration = self.total_processed = 0
-
-    def load_attributes(self):
-        """
-        Load the necessary attributes: model, dataloader and empty BlockData
-        """
-        only_context_model = True
-        if self.biencoder_shared_query_context_model:
-            only_context_model = False
-
-        model = get_model(get_model_provider(only_context_model=\
-            only_context_model, biencoder_shared_query_context_model=\
-            self.biencoder_shared_query_context_model))
-
-        self.model = load_biencoder_checkpoint(model,
-                only_context_model=only_context_model)
-
-        assert len(self.model) == 1
-        self.model[0].eval()
-
-        self.dataset = get_open_retrieval_wiki_dataset()
-        self.dataloader = iter(get_one_epoch_dataloader(self.dataset, \
-            self.batch_size))
-
-        self.evidence_embedder_obj = OpenRetreivalDataStore( \
-            load_from_path=False)
-
-    def track_and_report_progress(self, batch_size):
-        """
-        Utility function for tracking progress
-        """
-        self.iteration += 1
-        self.total_processed += batch_size * self.num_total_builders
-        if self.is_main_builder and self.iteration % self.log_interval == 0:
-            print('Batch {:10d} | Total {:10d}'.format(self.iteration,
-                self.total_processed), flush=True)
-
-    def build_and_save_index(self):
-        """
-        Goes through one epoch of the dataloader and adds all data to this
-        instance's BlockData.
-
-        The copy of BlockData is saved as a shard, which when run in a
-        distributed setting will be consolidated by the rank 0 process
-        and saved as a final pickled BlockData.
-        """
-        assert len(self.model) == 1
-        unwrapped_model = self.model[0]
-
-        while not hasattr(unwrapped_model, 'embed_text'):
-            unwrapped_model = unwrapped_model.module
-
-        while True:
-            try:
-                # batch also has query_tokens and query_pad_data
-                row_id, context_tokens, context_mask, context_types, \
-                    context_pad_mask = get_open_retrieval_batch( \
-                    self.dataloader)
-            except (StopIteration, IndexError):
-                break
-
-            # TODO: can we add with torch.no_grad() to reduce memory usage
-            # detach, separate fields and add to BlockData
-            assert context_mask.dtype == torch.bool
-            context_logits = unwrapped_model.embed_text(
-                unwrapped_model.context_model, context_tokens, context_mask,
-                context_types)
-
-            context_logits = detach(context_logits)
-            row_id = detach(row_id)
-
-            self.evidence_embedder_obj.add_block_data(row_id, context_logits)
-            self.track_and_report_progress(batch_size=len(row_id))
-
-        # This process signals to finalize its shard and then synchronize with
-        # the other processes
-        self.evidence_embedder_obj.save_shard()
-        torch.distributed.barrier()
-        del self.model
-
-        # rank 0 process builds the final copy
-        if self.is_main_builder:
-            self.evidence_embedder_obj.merge_shards_and_save()
-            # make sure that every single piece of data was embedded
-            assert len(self.evidence_embedder_obj.embed_data) == \
-                len(self.dataset)
-        self.evidence_embedder_obj.clear()
-
-        # complete building the final copy
-        torch.distributed.barrier()
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/microbatches.py b/toolbox/Megatron-DeepSpeed/megatron_ds/microbatches.py
deleted file mode 100644
index 6449d7479c9c983b4813889ee8f1beec9e027cc3..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/microbatches.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Megatron number of micro-batches calculators."""
-
-from abc import ABC
-from abc import abstractmethod
-
-
-def build_num_microbatches_calculator(args):
-
-    # Constant num micro-batches.
-    if args.rampup_batch_size is None:
-        num_microbatches_calculator = ConstantNumMicroBatches(
-            args.global_batch_size, args.micro_batch_size,
-            args.data_parallel_size)
-        if args.rank == 0:
-            print('setting number of micro-batches to constant {}'.format(
-                num_microbatches_calculator.get()), flush=True)
-
-    else:
-        assert len(args.rampup_batch_size) == 3, 'expected the following ' \
-            'format: --rampup-batch-size <start batch size> ' \
-            '<batch size incerement> <ramp-up samples>'
-        start_batch_size = int(args.rampup_batch_size[0])
-        batch_size_increment = int(args.rampup_batch_size[1])
-        ramup_samples = int(args.rampup_batch_size[2])
-        if args.rank == 0:
-            print('will use batch size rampup starting from global batch '
-                  'size {} to global batch size {} with batch size increments '
-                  '{} over {} samples.'.format(start_batch_size,
-                                               args.global_batch_size,
-                                               batch_size_increment,
-                                               ramup_samples), flush=True)
-        num_microbatches_calculator = RampupBatchsizeNumMicroBatches(
-            start_batch_size, batch_size_increment, ramup_samples,
-            args.global_batch_size, args.micro_batch_size,
-            args.data_parallel_size)
-
-    return num_microbatches_calculator
-
-
-class NumMicroBatchesCalculator(ABC):
-
-    def __init__(self):
-        self.num_micro_batches = None
-        self.current_global_batch_size = None
-
-    def get(self):
-        return self.num_micro_batches
-
-    def get_current_global_batch_size(self):
-        return self.current_global_batch_size
-
-    @abstractmethod
-    def update(self, consumed_samples, consistency_check):
-        pass
-
-
-class ConstantNumMicroBatches(NumMicroBatchesCalculator):
-
-    def __init__(self, global_batch_size, micro_batch_size, data_parallel_size):
-        micro_batch_times_data_parallel = micro_batch_size * \
-                                          data_parallel_size
-        assert global_batch_size % micro_batch_times_data_parallel == 0, \
-            'global batch size ({}) is not divisible by micro batch size ({})' \
-            ' times data parallel size ({})'.format(global_batch_size,
-                                                    micro_batch_size,
-                                                    data_parallel_size)
-        self.num_micro_batches = global_batch_size // \
-                                 micro_batch_times_data_parallel
-        assert self.num_micro_batches >= 1
-        self.current_global_batch_size = global_batch_size
-
-    def update(self, consumed_samples, consistency_check):
-        pass
-
-
-class RampupBatchsizeNumMicroBatches(NumMicroBatchesCalculator):
-
-    def __init__(self, start_batch_size, batch_size_increment, ramup_samples,
-                 global_batch_size, micro_batch_size, data_parallel_size):
-        """Batch size ramp up.
-        Over 
-          steps = (global-batch-size - start-batch-size) / batch_size_increment
-        increment batch size from start-batch-size to global-batch-size using
-          rampup-samples / steps
-        samples.
-        Arguments:
-            start_batch_size: global batch size to start with
-            batch_size_increment: global batch size increments
-            ramup_samples: number of samples to use ramp up global
-               batch size from `start_batch_size` to `global_batch_size`
-            global_batch_size: global batch size post rampup
-            micro_batch_size: micro batch size
-            data_parallel_size: data parallel size.
-        """
-
-        self.micro_batch_size = micro_batch_size
-        self.data_parallel_size = data_parallel_size
-        self.micro_batch_times_data_parallel_size = self.micro_batch_size * \
-                                                    self.data_parallel_size
-        assert self.micro_batch_times_data_parallel_size > 0
-        
-        assert start_batch_size > 0
-        self.start_batch_size = start_batch_size
-
-        assert global_batch_size > 0
-        self.global_batch_size = global_batch_size
-        diff_batch_size = self.global_batch_size - self.start_batch_size
-        assert diff_batch_size >= 0
-        assert batch_size_increment > 0
-        self.batch_size_increment = batch_size_increment
-        assert diff_batch_size % batch_size_increment == 0, 'expected ' \
-            'global batch size interval ({}) to be divisible by global batch ' \
-            'size increment ({})'.format(diff_batch_size, batch_size_increment)
-
-        num_increments = diff_batch_size // self.batch_size_increment
-        self.ramup_samples = ramup_samples
-        assert self.ramup_samples >= 0
-        self.rampup_samples_per_increment = self.ramup_samples / num_increments
-
-        # Initialize number of microbatches.
-        self.update(0, False)
-
-
-    def update(self, consumed_samples, consistency_check):
-
-        if consumed_samples > self.ramup_samples:
-            self.current_global_batch_size = self.global_batch_size
-        else:
-            steps = int(consumed_samples / self.rampup_samples_per_increment)
-            self.current_global_batch_size = self.start_batch_size + \
-                steps * self.batch_size_increment
-            assert self.current_global_batch_size <= self.global_batch_size
-
-        if consistency_check:
-            assert self.current_global_batch_size % \
-                self.micro_batch_times_data_parallel_size == 0, 'current global ' \
-                'batch size ({}) is not divisible by micro-batch-size ({}) times' \
-                'data parallel size ({})'.format(self.current_global_batch_size,
-                                                 self.micro_batch_size,
-                                                 self.data_parallel_size)
-        self.num_micro_batches = self.current_global_batch_size // \
-                                 self.micro_batch_times_data_parallel_size
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/__init__.py
deleted file mode 100644
index 5611d1ddaf6e4bfbd717390d3d127da472b92ef7..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm
-from .fused_layer_norm import MixedFusedRMSNormResidual as RMSNormResidual
-from .rms_norm import RMSNorm
-
-from .distributed import DistributedDataParallel
-#from .bert_model import BertModel
-from .gpt_model import GPTModel, GPTModelPipe
-from .t5_model import T5Model
-from .language_model import get_language_model
-from .module import Float16Module
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/biencoder_model.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/biencoder_model.py
deleted file mode 100644
index 7d4427cda72752000217289b73bb2d8545e7bade..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/biencoder_model.py
+++ /dev/null
@@ -1,328 +0,0 @@
-import os
-import torch
-import sys
-
-from megatron_ds import get_args, print_rank_0, get_tokenizer
-from megatron_ds.core import mpu
-from megatron_ds.checkpointing import fix_query_key_value_ordering
-from megatron_ds.checkpointing import get_checkpoint_tracker_filename
-from megatron_ds.checkpointing import get_checkpoint_name
-from megatron_ds.model.bert_model import bert_position_ids
-from megatron_ds.model.enums import AttnMaskType
-from megatron_ds.model.language_model import get_language_model
-from megatron_ds.model.utils import get_linear_layer
-from megatron_ds.model.utils import init_method_normal
-from megatron_ds.model.utils import scaled_init_method_normal
-from .module import MegatronModule
-
-def get_model_provider(only_query_model=False, only_context_model=False,
-        biencoder_shared_query_context_model=False):
-
-    def model_provider(pre_process=True, post_process=True):
-        """Build the model."""
-
-        print_rank_0('building Bienoder model ...')
-        model = biencoder_model_provider(only_query_model=only_query_model,
-                only_context_model = only_context_model,
-                biencoder_shared_query_context_model = \
-                biencoder_shared_query_context_model,
-                pre_process=pre_process, post_process=post_process)
-
-        return model
-
-    return model_provider
-
-
-def biencoder_model_provider(only_query_model=False,
-                             only_context_model=False,
-                             biencoder_shared_query_context_model=False,
-                             pre_process=True,
-                             post_process=True):
-    """Build the model."""
-
-    assert mpu.get_tensor_model_parallel_world_size() == 1 and \
-        mpu.get_pipeline_model_parallel_world_size() == 1, \
-        "Model parallel size > 1 not supported for ICT"
-
-    print_rank_0('building BiEncoderModel...')
-
-    # simpler to just keep using 2 tokentypes since
-    # the LM we initialize with has 2 tokentypes
-    model = BiEncoderModel(
-        num_tokentypes=2,
-        parallel_output=False,
-        only_query_model=only_query_model,
-        only_context_model=only_context_model,
-        biencoder_shared_query_context_model=\
-        biencoder_shared_query_context_model,
-        pre_process=pre_process,
-        post_process=post_process)
-
-    return model
-
-
-class BiEncoderModel(MegatronModule):
-    """Bert-based module for Biencoder model."""
-
-    def __init__(self,
-                 num_tokentypes=1,
-                 parallel_output=True,
-                 only_query_model=False,
-                 only_context_model=False,
-                 biencoder_shared_query_context_model=False,
-                 pre_process=True,
-                 post_process=True):
-        super(BiEncoderModel, self).__init__()
-        args = get_args()
-
-        bert_kwargs = dict(
-            num_tokentypes=num_tokentypes,
-            parallel_output=parallel_output,
-            pre_process=pre_process,
-            post_process=post_process)
-
-        self.biencoder_shared_query_context_model = \
-            biencoder_shared_query_context_model
-        assert not (only_context_model and only_query_model)
-        self.use_context_model = not only_query_model
-        self.use_query_model = not only_context_model
-        self.biencoder_projection_dim = args.biencoder_projection_dim
-
-        if self.biencoder_shared_query_context_model:
-            self.model = PretrainedBertModel(**bert_kwargs)
-            self._model_key = 'shared_model'
-            self.query_model, self.context_model = self.model, self.model
-        else:
-            if self.use_query_model:
-                # this model embeds (pseudo-)queries - Embed_input in the paper
-                self.query_model = PretrainedBertModel(**bert_kwargs)
-                self._query_key = 'query_model'
-
-            if self.use_context_model:
-                # this model embeds evidence blocks - Embed_doc in the paper
-                self.context_model = PretrainedBertModel(**bert_kwargs)
-                self._context_key = 'context_model'
-
-    def set_input_tensor(self, input_tensor):
-        """See megatron_ds.model.transformer.set_input_tensor()"""
-        # this is just a placeholder and will be needed when model
-        # parallelism will be used
-        # self.language_model.set_input_tensor(input_tensor)
-        return
-
-    def forward(self, query_tokens, query_attention_mask, query_types,
-                context_tokens, context_attention_mask, context_types):
-        """Run a forward pass for each of the models and
-        return the respective embeddings."""
-
-        if self.use_query_model:
-            query_logits = self.embed_text(self.query_model,
-                                           query_tokens,
-                                           query_attention_mask,
-                                           query_types)
-        else:
-            raise ValueError("Cannot embed query without the query model.")
-        if self.use_context_model:
-            context_logits = self.embed_text(self.context_model,
-                                             context_tokens,
-                                             context_attention_mask,
-                                             context_types)
-        else:
-            raise ValueError("Cannot embed block without the block model.")
-        return query_logits, context_logits
-
-    @staticmethod
-    def embed_text(model, tokens, attention_mask, token_types):
-        """Embed a batch of tokens using the model"""
-        logits = model(tokens,
-                              attention_mask,
-                              token_types)
-        return logits
-
-    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
-        """Save dict with state dicts of each of the models."""
-        state_dict_ = {}
-        if self.biencoder_shared_query_context_model:
-            state_dict_[self._model_key] = \
-                self.model.state_dict_for_save_checkpoint(
-                    prefix=prefix, keep_vars=keep_vars)
-        else:
-            if self.use_query_model:
-                state_dict_[self._query_key] = \
-                    self.query_model.state_dict_for_save_checkpoint(
-                        prefix=prefix, keep_vars=keep_vars)
-
-            if self.use_context_model:
-                state_dict_[self._context_key] = \
-                    self.context_model.state_dict_for_save_checkpoint(
-                        prefix=prefix, keep_vars=keep_vars)
-
-        return state_dict_
-
-    def load_state_dict(self, state_dict, strict=True):
-        """Load the state dicts of each of the models"""
-        if self.biencoder_shared_query_context_model:
-            print_rank_0("Loading shared query-context model")
-            self.model.load_state_dict(state_dict[self._model_key], \
-                strict=strict)
-        else:
-            if self.use_query_model:
-                print_rank_0("Loading query model")
-                self.query_model.load_state_dict( \
-                    state_dict[self._query_key], strict=strict)
-
-            if self.use_context_model:
-                print_rank_0("Loading context model")
-                self.context_model.load_state_dict( \
-                    state_dict[self._context_key], strict=strict)
-
-    def init_state_dict_from_bert(self):
-        """Initialize the state from a pretrained BERT model
-        on iteration zero of ICT pretraining"""
-        args = get_args()
-
-        if args.bert_load is None:
-            print_rank_0("bert-load argument is None")
-            return
-
-        tracker_filename = get_checkpoint_tracker_filename(args.bert_load)
-        if not os.path.isfile(tracker_filename):
-            raise FileNotFoundError("Could not find BERT checkpoint")
-        with open(tracker_filename, 'r') as f:
-            iteration = int(f.read().strip())
-            assert iteration > 0
-
-        checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False)
-        if mpu.get_data_parallel_rank() == 0:
-            print('global rank {} is loading BERT checkpoint {}'.format(
-                torch.distributed.get_rank(), checkpoint_name))
-
-        # Load the checkpoint.
-        try:
-            state_dict = torch.load(checkpoint_name, map_location='cpu')
-        except ModuleNotFoundError:
-            from megatron_ds.fp16_deprecated import loss_scaler
-            # For backward compatibility.
-            print_rank_0(' > deserializing using the old code structure ...')
-            sys.modules['fp16.loss_scaler'] = sys.modules[
-                'megatron_ds.fp16_deprecated.loss_scaler']
-            sys.modules['megatron_ds.fp16.loss_scaler'] = sys.modules[
-                'megatron_ds.fp16_deprecated.loss_scaler']
-            state_dict = torch.load(checkpoint_name, map_location='cpu')
-            sys.modules.pop('fp16.loss_scaler', None)
-            sys.modules.pop('megatron_ds.fp16.loss_scaler', None)
-        except BaseException:
-            print_rank_0('could not load the BERT checkpoint')
-            sys.exit()
-
-        checkpoint_version = state_dict.get('checkpoint_version', 0)
-
-        # load the LM state dict into each model
-        model_dict = state_dict['model']['language_model']
-
-        if self.biencoder_shared_query_context_model:
-            self.model.language_model.load_state_dict(model_dict)
-            fix_query_key_value_ordering(self.model, checkpoint_version)
-        else:
-            if self.use_query_model:
-                self.query_model.language_model.load_state_dict(model_dict)
-                # give each model the same ict_head to begin with as well
-                if self.biencoder_projection_dim > 0:
-                    query_proj_state_dict = \
-                        self.state_dict_for_save_checkpoint()\
-                        [self._query_key]['projection_enc']
-                fix_query_key_value_ordering(self.query_model, checkpoint_version)
-
-            if self.use_context_model:
-                self.context_model.language_model.load_state_dict(model_dict)
-                if self.query_model is not None and \
-                    self.biencoder_projection_dim > 0:
-                    self.context_model.projection_enc.load_state_dict\
-                        (query_proj_state_dict)
-                fix_query_key_value_ordering(self.context_model, checkpoint_version)
-
-
-class PretrainedBertModel(MegatronModule):
-    """BERT-based encoder for queries or contexts used for
-    learned information retrieval."""
-
-    def __init__(self, num_tokentypes=2,
-            parallel_output=True, pre_process=True, post_process=True):
-        super(PretrainedBertModel, self).__init__()
-
-        args = get_args()
-        tokenizer = get_tokenizer()
-        self.pad_id = tokenizer.pad
-        self.biencoder_projection_dim = args.biencoder_projection_dim
-        self.parallel_output = parallel_output
-        self.pre_process = pre_process
-        self.post_process = post_process
-        init_method = init_method_normal(args.init_method_std)
-        scaled_init_method = scaled_init_method_normal(
-            args.init_method_std, args.num_layers)
-
-        self.language_model, self._language_model_key = get_language_model(
-            num_tokentypes=num_tokentypes,
-            add_pooler=False,
-            encoder_attn_mask_type=AttnMaskType.padding,
-            init_method=init_method,
-            scaled_init_method=scaled_init_method,
-            pre_process=self.pre_process,
-            post_process=self.post_process)
-
-        if args.biencoder_projection_dim > 0:
-            self.projection_enc = get_linear_layer(args.hidden_size,
-                                                   args.biencoder_projection_dim,
-                                                   init_method)
-            self._projection_enc_key = 'projection_enc'
-
-    def forward(self, input_ids, attention_mask, tokentype_ids=None):
-        extended_attention_mask = attention_mask.unsqueeze(1)
-        #extended_attention_mask = bert_extended_attention_mask(attention_mask)
-        position_ids = bert_position_ids(input_ids)
-
-        lm_output = self.language_model(input_ids,
-                                        position_ids,
-                                        extended_attention_mask,
-                                        tokentype_ids=tokentype_ids)
-        # This mask will be used in average-pooling and max-pooling
-        pool_mask = (input_ids == self.pad_id).unsqueeze(2)
-
-        # Taking the representation of the [CLS] token of BERT
-        pooled_output = lm_output[0, :, :]
-
-        # Converting to float16 dtype
-        pooled_output = pooled_output.to(lm_output.dtype)
-
-        # Output.
-        if self.biencoder_projection_dim:
-            pooled_output = self.projection_enc(pooled_output)
-
-        return pooled_output
-
-    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
-        """For easy load when model is combined with other heads,
-        add an extra key."""
-
-        state_dict_ = {}
-        state_dict_[self._language_model_key] \
-            = self.language_model.state_dict_for_save_checkpoint(
-                prefix=prefix, keep_vars=keep_vars)
-
-        if self.biencoder_projection_dim > 0:
-            state_dict_[self._projection_enc_key] = \
-                self.projection_enc.state_dict(prefix=prefix,
-                                               keep_vars=keep_vars)
-
-        return state_dict_
-
-    def load_state_dict(self, state_dict, strict=True):
-        """Customized load."""
-        print_rank_0("loading pretrained weights")
-        self.language_model.load_state_dict(
-            state_dict[self._language_model_key], strict=strict)
-
-        if self.biencoder_projection_dim > 0:
-            print_rank_0("loading projection head weights")
-            self.projection_enc.load_state_dict(
-                state_dict[self._projection_enc_key], strict=strict)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/enums.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/enums.py
deleted file mode 100644
index 6c5c600e3b20f21f27b10f387f5ad3e16775a100..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/enums.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import enum
-
-class LayerType(enum.Enum):
-    encoder = 1
-    decoder = 2
-    retro_encoder = 3
-    retro_decoder = 4
-    retro_decoder_with_retriever = 5
- 
-class AttnType(enum.Enum):
-    self_attn = 1
-    cross_attn = 2
-
-class AttnMaskType(enum.Enum):
-    padding = 1
-    causal = 2
-
-# For backward compatibility with old model checkpoints
-from megatron_ds.core.enums import ModelType
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/fused_bias_gelu.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/fused_bias_gelu.py
deleted file mode 100644
index 29222db024eb5c5e54c7f38f58be8edd45c49b39..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/fused_bias_gelu.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import torch
-
-
-###### BIAS GELU FUSION/ NO AUTOGRAD ################
-# 1/sqrt(2*pi)-> 0.3989423
-# 1/sqrt(2)   -> 0.70710678
-# sqrt(2/pi)  -> 0.79788456
-# this function is tanh approximation of gelu
-# actual gelu is:
-# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
-
-@torch.jit.script
-def bias_gelu(bias, y):
-    x = bias + y
-    return  x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
-
-# gradient of tanh approximation of gelu
-# gradient of actual gelu is:
-# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
-@torch.jit.script
-def bias_gelu_back(g, bias, y):
-    x = bias + y
-    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
-    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
-    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
-    return ff*g
-
-class GeLUFunction(torch.autograd.Function):
-    @staticmethod
-    # bias is an optional argument
-    def forward(ctx, input, bias):
-        ctx.save_for_backward(input, bias)
-        return bias_gelu(bias, input)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        input, bias = ctx.saved_tensors
-        tmp = bias_gelu_back(grad_output, bias, input)
-        return tmp, tmp
-
-bias_gelu_impl = GeLUFunction.apply
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/fused_layer_norm.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/fused_layer_norm.py
deleted file mode 100644
index d45e4de698da7a5b7c3bdf5e1f6641d3378eb8b4..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/fused_layer_norm.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""This code is copied fron NVIDIA apex:
-      https://github.com/NVIDIA/apex
-   with some changes. """
-
-import numbers
-import torch
-from torch.nn.parameter import Parameter
-from torch.nn import init
-import importlib
-
-from megatron_ds.core.utils import make_viewless_tensor
-import inspect
-try:
-    from apex.contrib.layer_norm.layer_norm import FastLayerNormFN
-    HAVE_PERSIST_LAYER_NORM = True
-except:
-    HAVE_PERSIST_LAYER_NORM = False
-
-try:
-    from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction
-except:
-    FusedLayerNormAffineFunction = None
-from apex.normalization.fused_layer_norm import FusedRMSNormResidualFunction
-global fused_layer_norm_cuda
-fused_layer_norm_cuda = None
-
-
-class MixedFusedLayerNorm(torch.nn.Module):
-
-  def __init__(self, normalized_shape, eps=1e-5,
-               no_persist_layer_norm=True,
-               sequence_parallel=False,
-               apply_layernorm_1p=False,
-               mem_efficient_ln=True):
-        super(MixedFusedLayerNorm, self).__init__()
-
-        self.apply_layernorm_1p = apply_layernorm_1p
-        self.mem_efficient_ln = mem_efficient_ln
-
-        global fused_layer_norm_cuda
-        fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
-
-        # List of hiddens sizes supported in the persistent layer norm kernel
-        # If the hidden size is not supported, fall back to the non-persistent
-        # kernel.
-        persist_ln_hidden_sizes = [1024, 1536, 2048, 2304, 3072, 3840, 4096,
-            5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480,
-            24576, 25600, 30720, 32768, 40960, 49152, 65536]
-        if normalized_shape not in persist_ln_hidden_sizes or \
-                not HAVE_PERSIST_LAYER_NORM:
-            no_persist_layer_norm = True
-
-        if isinstance(normalized_shape, numbers.Integral):
-            normalized_shape = (normalized_shape,)
-        self.normalized_shape = torch.Size(normalized_shape)
-        self.eps = eps
-        self.weight = Parameter(torch.Tensor(*normalized_shape))
-        self.bias = Parameter(torch.Tensor(*normalized_shape))
-        self.reset_parameters()
-        self.no_persist_layer_norm = no_persist_layer_norm
-        self.sequence_parallel = sequence_parallel
-
-        # set sequence parallelism flag on weight and bias parameters
-        setattr(self.weight, 'sequence_parallel', self.sequence_parallel)
-        setattr(self.bias, 'sequence_parallel', self.sequence_parallel)
-
-
-  def reset_parameters(self):
-
-    if self.apply_layernorm_1p:
-        init.zeros_(self.weight)
-        init.zeros_(self.bias)
-    else:
-        init.ones_(self.weight)
-        init.zeros_(self.bias)
-
-  def forward(self, input):
-
-    weight = self.weight + 1 if self.apply_layernorm_1p else self.weight
-
-    if self.no_persist_layer_norm:
-        # Apex does not have versions yet (https://github.com/NVIDIA/apex/pull/1648), so we need to inspect 
-        # the function manually on whether the extra arg introduced in https://github.com/NVIDIA/apex/pull/1715 exists yet
-        assert FusedLayerNormAffineFunction is not None, \
-            "FusedLayerNormAffineFunction is not available, please install apex from https://github.com/NVIDIA/apex"
-        if 'memory_efficient' in inspect.getfullargspec(FusedLayerNormAffineFunction.forward).args:
-            return FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.normalized_shape, self.eps, self.mem_efficient_ln)
-        else:
-            return FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.normalized_shape, self.eps)
-        return FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.normalized_shape, self.eps)
-    else:
-        output = FastLayerNormFN.apply(input, weight, self.bias, self.eps)
-
-        # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
-        # a populated '_base' field). This will result in schedule.py's
-        # deallocate_output_tensor() throwing an error, so a viewless tensor is
-        # created to prevent this.
-        output = make_viewless_tensor(inp = output,
-                                      requires_grad = input.requires_grad,
-                                      keep_graph = True)
-
-        return output
-    
-    
-class MixedFusedRMSNormResidual(torch.nn.Module):
-
-  def __init__(self, normalized_shape, eps=1e-5,
-               no_persist_layer_norm=True,
-               sequence_parallel=False,
-               apply_layernorm_1p=False,
-               apply_layernorm_rms=False,
-               init_weight=None):
-        super(MixedFusedRMSNormResidual, self).__init__()
-
-        self.apply_layernorm_1p = apply_layernorm_1p
-        self.apply_layernorm_rms = apply_layernorm_rms
-        assert not (self.apply_layernorm_1p and self.apply_layernorm_rms), \
-            "Cannot apply both 1p and rms layernorm"
-
-        self.init_weight = init_weight
-        assert self.init_weight is None or isinstance(self.init_weight, float), \
-            "Cannot init_weight of None or of non-float"
-        assert not (self.init_weight is not None and self.apply_layernorm_1p), \
-            "Cannot float init_weight and 1p layernorm"
-
-        global fused_layer_norm_cuda
-        fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
-
-        # List of hiddens sizes supported in the persistent layer norm kernel
-        # If the hidden size is not supported, fall back to the non-persistent
-        # kernel.
-        persist_ln_hidden_sizes = [1024, 1536, 2048, 2304, 3072, 3840, 4096,
-            5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480,
-            24576, 25600, 30720, 32768, 40960, 49152, 65536]
-        if normalized_shape not in persist_ln_hidden_sizes or \
-                not HAVE_PERSIST_LAYER_NORM:
-            no_persist_layer_norm = True
-
-        if isinstance(normalized_shape, numbers.Integral):
-            normalized_shape = (normalized_shape,)
-        self.normalized_shape = torch.Size(normalized_shape)
-        self.eps = eps
-        self.weight = Parameter(torch.Tensor(*normalized_shape))
-        # no bias parameter when using rms layernorm
-        if not self.apply_layernorm_rms:
-            self.bias = Parameter(torch.Tensor(*normalized_shape))
-        self.reset_parameters()
-        self.no_persist_layer_norm = no_persist_layer_norm
-        self.sequence_parallel = sequence_parallel
-
-        # set sequence parallelism flag on weight and bias parameters
-        setattr(self.weight, 'sequence_parallel', self.sequence_parallel)
-        if not self.apply_layernorm_rms:
-            setattr(self.bias, 'sequence_parallel', self.sequence_parallel)
-
-
-  def reset_parameters(self):
-
-    if self.apply_layernorm_1p:
-        init.zeros_(self.weight)
-        init.zeros_(self.bias)
-    else:
-        if self.init_weight:
-            init.constant_(self.weight, self.init_weight)
-        else:
-            init.ones_(self.weight)
-        if not self.apply_layernorm_rms:
-            init.zeros_(self.bias)
-
-  def forward(self, input, residual):
-
-    weight = self.weight + 1 if self.apply_layernorm_1p else self.weight
-
-    return FusedRMSNormResidualFunction.apply(input, weight, residual, self.normalized_shape, self.eps)
-    
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/fused_softmax.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/fused_softmax.py
deleted file mode 100644
index c8809fa60de69375ce7b64fd05ec62311fbe6ab7..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/fused_softmax.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-
-import torch
-import torch.nn as nn
-from megatron_ds.model.enums import AttnMaskType
-
-
-class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
-    """
-    Fused operation which performs following three operations in sequence
-    1. Scale the tensor.
-    2. Apply upper triangular mask (typically used in gpt models).
-    3. Perform softmax.
-    """
-
-    @staticmethod
-    def forward(ctx, inputs, scale):
-        import scaled_upper_triang_masked_softmax_cuda
-
-        scale_t = torch.tensor([scale])
-        softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(
-            inputs, scale_t[0]
-        )
-
-        ctx.save_for_backward(softmax_results, scale_t)
-        return softmax_results
-
-    @staticmethod
-    def backward(ctx, output_grads):
-        import scaled_upper_triang_masked_softmax_cuda
-
-        softmax_results, scale_t = ctx.saved_tensors
-        input_grads = scaled_upper_triang_masked_softmax_cuda.backward(
-            output_grads, softmax_results, scale_t[0]
-        )
-
-        return input_grads, None
-
-
-class ScaledMaskedSoftmax(torch.autograd.Function):
-    """
-    Fused operation which performs following three operations in sequence
-    1. Scale the tensor.
-    2. Apply the mask.
-    3. Perform softmax.
-    """
-
-    @staticmethod
-    def forward(ctx, inputs, mask, scale):
-        import scaled_masked_softmax_cuda
-
-        scale_t = torch.tensor([scale])
-
-        softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0])
-        ctx.save_for_backward(softmax_results, scale_t)
-        return softmax_results
-
-    @staticmethod
-    def backward(ctx, output_grads):
-        import scaled_masked_softmax_cuda
-
-        softmax_results, scale_t = ctx.saved_tensors
-
-        input_grads = scaled_masked_softmax_cuda.backward(
-            output_grads, softmax_results, scale_t[0]
-        )
-        return input_grads, None, None
-
-
-class ScaledSoftmax(torch.autograd.Function):
-    """
-    Fused operation which performs following two operations in sequence
-    1. Scale the tensor.
-    2. Perform softmax.
-    """
-
-    @staticmethod
-    def forward(ctx, inputs, scale):
-        import scaled_softmax_cuda
-
-        scale_t = torch.tensor([scale])
-
-        softmax_results = scaled_softmax_cuda.forward(
-            inputs, scale_t[0]
-        )
-        ctx.save_for_backward(softmax_results, scale_t)
-        return softmax_results
-
-    @staticmethod
-    def backward(ctx, output_grads):
-        import scaled_softmax_cuda
-
-        softmax_results, scale_t = ctx.saved_tensors
-
-        input_grads = scaled_softmax_cuda.backward(
-            output_grads, softmax_results, scale_t[0]
-        )
-        return input_grads, None, None
-
-
-class FusedScaleMaskSoftmax(nn.Module):
-    """
-    fused operation: scaling + mask + softmax
-
-    Arguments:
-        input_in_fp16: flag to indicate if input in fp16 data format.
-        input_in_bf16: flag to indicate if input in bf16 data format.
-        attn_mask_type: attention mask type (pad or causal)
-        scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion
-        mask_func: mask function to be applied.
-        softmax_in_fp32: if true, softmax in performed at fp32 precision.
-        scale: scaling factor used in input tensor scaling.
-    """
-
-    def __init__(
-        self,
-        input_in_fp16,
-        input_in_bf16,
-        attn_mask_type,
-        scaled_masked_softmax_fusion,
-        mask_func,
-        softmax_in_fp32,
-        scale,
-    ):
-        super(FusedScaleMaskSoftmax, self).__init__()
-        self.input_in_fp16 = input_in_fp16
-        self.input_in_bf16 = input_in_bf16
-        assert not (
-            self.input_in_fp16 and self.input_in_bf16
-        ), "both fp16 and bf16 flags cannot be active at the same time."
-        self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
-        self.attn_mask_type = attn_mask_type
-        self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
-        self.mask_func = mask_func
-        self.softmax_in_fp32 = softmax_in_fp32
-        self.scale = scale
-
-        assert (
-            self.scale is None or softmax_in_fp32
-        ), "softmax should be in fp32 when scaled"
-
-    def forward(self, input, mask):
-        # [b, np, sq, sk]
-        assert input.dim() == 4
-
-        if self.is_kernel_available(mask, *input.size()):
-            return self.forward_fused_softmax(input, mask)
-        else:
-            return self.forward_torch_softmax(input, mask)
-
-    def is_kernel_available(self, mask, b, np, sq, sk):
-        attn_batches = b * np
-
-        if (
-            self.scaled_masked_softmax_fusion  # user want to fuse
-            and self.input_in_float16  # input must be fp16
-            and 16 < sk <= 16384  # sk must be 16 ~ 16384
-            and sq % 4 == 0  # sq must be divisor of 4
-            and sk % 4 == 0  # sk must be divisor of 4
-            and attn_batches % 4 == 0  # np * b must be divisor of 4
-        ):
-            if 0 <= sk <= 16384:
-                batch_per_block = self.get_batch_per_block(sq, sk, b, np)
-
-                if self.attn_mask_type == AttnMaskType.causal:
-                    if attn_batches % batch_per_block == 0:
-                        return True
-                else:
-                    if sq % batch_per_block == 0:
-                        return True
-        return False
-
-    def forward_fused_softmax(self, input, mask):
-        b, np, sq, sk = input.size()
-        scale = self.scale if self.scale is not None else 1.0
-
-        if self.attn_mask_type == AttnMaskType.causal:
-            assert sq == sk, "causal mask is only for self attention"
-
-            # input is 3D tensor (attn_batches, sq, sk)
-            input = input.view(-1, sq, sk)
-            probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale)
-            return probs.view(b, np, sq, sk)
-        else:
-            # input is 4D tensor (b, np, sq, sk)
-            if mask is not None:
-                return ScaledMaskedSoftmax.apply(input, mask, scale)
-            else:
-                return ScaledSoftmax.apply(input, scale)
-
-    def forward_torch_softmax(self, input, mask):
-        if self.input_in_float16 and self.softmax_in_fp32:
-            input = input.float()
-
-        if self.scale is not None:
-            input = input * self.scale
-        mask_output = self.mask_func(input, mask) if mask is not None else input
-        probs = torch.nn.Softmax(dim=-1)(mask_output)
-
-        if self.input_in_float16 and self.softmax_in_fp32:
-            if self.input_in_fp16:
-                probs = probs.half()
-            else:
-                probs = probs.bfloat16()
-
-        return probs
-
-    @staticmethod
-    def get_batch_per_block(sq, sk, b, np):
-        import scaled_masked_softmax_cuda
-
-        return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/language_model.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/language_model.py
deleted file mode 100644
index c9304092821b2b5ef3d1811cbf03b00d4b816655..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/language_model.py
+++ /dev/null
@@ -1,699 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-"""Transformer based language model."""
-
-import torch
-import torch.nn.functional as F
-
-from megatron_ds import get_args, get_rlhf_args
-from megatron_ds.core import mpu, tensor_parallel
-from megatron_ds.core.enums import ModelType
-from megatron_ds.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
-
-from .enums import AttnMaskType, LayerType
-from .module import MegatronModule
-from .transformer import ParallelTransformer
-from .utils import get_linear_layer
-from .utils import init_method_normal, scaled_init_method_normal, gather_and_init
-
-
-def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
-                       bias=None, inference_params=None):
-    """LM logits using word embedding weights."""
-    args = get_args()
-    # Parallel logits.
-    if args.async_tensor_model_parallel_allreduce or\
-            args.sequence_parallel:
-        input_parallel = input_
-        model_parallel = mpu.get_tensor_model_parallel_world_size() > 1
-        async_grad_allreduce = args.async_tensor_model_parallel_allreduce and \
-            model_parallel and not args.sequence_parallel
-    else:
-        input_parallel = tensor_parallel.copy_to_tensor_model_parallel_region(input_)
-        async_grad_allreduce = False
-
-    # Matrix multiply.
-    logits_parallel = tensor_parallel.linear_with_grad_accumulation_and_async_allreduce(
-        input=input_parallel,
-        weight=word_embeddings_weight,
-        bias=bias,
-        gradient_accumulation_fusion=args.gradient_accumulation_fusion,
-        async_grad_allreduce=async_grad_allreduce,
-        sequence_parallel=args.sequence_parallel,
-        inference_params=inference_params)
-    # Gather if needed.
-
-    if parallel_output:
-        return logits_parallel
-    if not args.RLHF:
-        return tensor_parallel.gather_from_tensor_model_parallel_region(logits_parallel)
-    else:
-        return logits_parallel
-
-
-def get_language_model(config, num_tokentypes, add_pooler,
-                       encoder_attn_mask_type,
-                       add_encoder=True,
-                       add_decoder=False,
-                       decoder_attn_mask_type=AttnMaskType.causal,
-                       pre_process=True, post_process=True, num_experts=[1],
-                       rlhf_training=False):
-    """Build language model and return along with the key to save."""
-    if config.init_method is None:
-        config.init_method = init_method_normal(config.init_method_std)
-
-    if config.output_layer_init_method is None:
-        config.output_layer_init_method = scaled_init_method_normal(config.init_method_std,
-                                                                    config.num_layers)
-
-    # Language model.
-    language_model = TransformerLanguageModel(
-        config,
-        encoder_attn_mask_type,
-        num_tokentypes=num_tokentypes,
-        add_encoder=add_encoder,
-        add_decoder=add_decoder,
-        decoder_attn_mask_type=decoder_attn_mask_type,
-        add_pooler=add_pooler,
-        pre_process=pre_process,
-        post_process=post_process,
-        num_experts=num_experts,
-        rlhf_training=rlhf_training
-    )
-    # key used for checkpoints.
-    language_model_key = 'language_model'
-
-    return language_model, language_model_key
-
-
-class Pooler(MegatronModule):
-    """Pooler layer.
-
-    Pool hidden states of a specific token (for example start of the
-    sequence) and add a linear transformation followed by a tanh.
-
-    Arguments:
-        hidden_size: hidden size
-        init_method: weight initialization method for the linear layer.
-            bias is set to zero.
-    """
-
-    def __init__(self, hidden_size, init_method):
-        super(Pooler, self).__init__()
-        args = get_args()
-        self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
-        self.sequence_parallel = args.sequence_parallel
-
-
-    def forward(self, hidden_states, sequence_index=0):
-        # hidden_states: [s, b, h]
-        # sequence_index: index of the token to pool.
-
-        # gather data along sequence dimensions
-        # same pooler is run on all tensor parallel nodes
-        if self.sequence_parallel:
-            hidden_states = tensor_parallel.gather_from_sequence_parallel_region(
-                hidden_states,
-                tensor_parallel_output_grad=False)
-
-        pooled = hidden_states[sequence_index, :, :]
-        pooled = self.dense(pooled)
-        pooled = torch.tanh(pooled)
-        return pooled
-
-
-class Embedding(MegatronModule):
-    """Language model embeddings.
-
-    Arguments:
-        hidden_size: hidden size
-        vocab_size: vocabulary size
-        max_sequence_length: maximum size of sequence. This
-                             is used for positional embedding
-        embedding_dropout_prob: dropout probability for embeddings
-        init_method: weight initialization method
-        num_tokentypes: size of the token-type embeddings. 0 value
-                        will ignore this embedding
-    """
-
-    def __init__(self,
-                 hidden_size,
-                 vocab_size,
-                 max_sequence_length,
-                 embedding_dropout_prob,
-                 config,
-                 num_tokentypes=0,
-                 embedding_weights_in_fp32=False,
-                 rlhf_training=False):
-        super(Embedding, self).__init__()
-
-        self.hidden_size = hidden_size
-        self.init_method = config.init_method
-        self.num_tokentypes = num_tokentypes
-
-        if rlhf_training:
-            args = get_rlhf_args()
-        else:
-            args = get_args()
-
-        # Word embeddings (parallel).
-        self.embedding_weights_in_fp32 = embedding_weights_in_fp32
-        self.params_dtype = args.params_dtype
-        self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
-            vocab_size, self.hidden_size, config=config, init_method=config.init_method)
-        self._word_embeddings_key = 'word_embeddings'
-
-        # Position embedding (serial).
-        self.add_position_embedding = args.position_embedding_type == 'learned_absolute'
-        if self.add_position_embedding:
-            self._position_embeddings_key = 'position_embeddings'
-            if args.sequence_parallel:
-                self.position_embeddings = tensor_parallel.layers.SequenceParallelPositionEmbedding(
-                    max_sequence_length, self.hidden_size)
-                # Initialize the position embeddings.
-                self.init_method(self.position_embeddings.local_embeddings.weight)
-            else:
-                self.position_embeddings = torch.nn.Embedding(
-                    max_sequence_length, self.hidden_size)
-                # Initialize the position embeddings.
-                if args.perform_initialization:
-                    if args.zero_stage == 3:
-                        gather_and_init(self.position_embeddings.weight, self.init_method)
-                    else:
-                        self.init_method(self.position_embeddings.weight)
-
-        # Token type embedding.
-        # Add this as an optional field that can be added through
-        # method call so we can load a pretrain model without
-        # token types and add them as needed.
-        self._tokentype_embeddings_key = 'tokentype_embeddings'
-        if self.num_tokentypes > 0:
-            self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes,
-                                                           self.hidden_size)
-            # Initialize the token-type embeddings.
-            if args.perform_initialization:
-                self.init_method(self.tokentype_embeddings.weight)
-        else:
-            self.tokentype_embeddings = None
-
-        self.fp32_residual_connection = args.fp32_residual_connection
-        self.sequence_parallel = args.sequence_parallel
-        self.clone_scatter_output_in_embedding = args.clone_scatter_output_in_embedding
-        # Embeddings dropout
-        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
-
-    def zero_parameters(self):
-        """Zero out all parameters in embedding."""
-        self.word_embeddings.weight.data.fill_(0)
-        self.word_embeddings.weight.shared = True
-        if self.add_position_embedding:
-            self.position_embeddings.weight.data.fill_(0)
-            self.position_embeddings.weight.shared = True
-        if self.num_tokentypes > 0:
-            self.tokentype_embeddings.weight.data.fill_(0)
-            self.tokentype_embeddings.weight.shared = True
-
-    def add_tokentype_embeddings(self, num_tokentypes):
-        """Add token-type embedding. This function is provided so we can add
-        token-type embeddings in case the pretrained model does not have it.
-        This allows us to load the model normally and then add this embedding.
-        """
-        if self.tokentype_embeddings is not None:
-            raise Exception('tokentype embeddings is already initialized')
-        if torch.distributed.get_rank() == 0:
-            print('adding embedding for {} tokentypes'.format(num_tokentypes),
-                  flush=True)
-        self.num_tokentypes = num_tokentypes
-        self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes,
-                                                       self.hidden_size)
-        # Initialize the token-type embeddings.
-        self.init_method(self.tokentype_embeddings.weight)
-
-    def forward(self, input_ids, position_ids, tokentype_ids=None, inference_params=None):
-        # Embeddings.
-        if self.embedding_weights_in_fp32:
-            self.word_embeddings = self.word_embeddings.to(torch.float32)
-        words_embeddings = self.word_embeddings(input_ids)
-        if self.embedding_weights_in_fp32:
-            words_embeddings = words_embeddings.to(self.params_dtype)
-            self.word_embeddings = self.word_embeddings.to(self.params_dtype)
-        if self.add_position_embedding:
-            position_embeddings = self.position_embeddings(position_ids)
-            embeddings = words_embeddings + position_embeddings
-        else:
-            embeddings = words_embeddings
-
-        if tokentype_ids is not None:
-            assert self.tokentype_embeddings is not None
-            embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
-        else:
-            assert self.tokentype_embeddings is None
-
-        # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
-        embeddings = embeddings.transpose(0, 1).contiguous()
-
-        # If the input flag for fp32 residual connection is set, convert for float.
-        if self.fp32_residual_connection:
-            embeddings = embeddings.float()
-
-        # Dropout.
-        if self.sequence_parallel and not inference_params:
-            embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings)
-            with tensor_parallel.get_cuda_rng_tracker().fork():
-                embeddings = self.embedding_dropout(embeddings)
-        else:
-            embeddings = self.embedding_dropout(embeddings)
-
-        return embeddings
-
-    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
-        """For easy load."""
-
-        state_dict_ = {}
-        state_dict_[self._word_embeddings_key] \
-            = self.word_embeddings.state_dict(prefix=prefix,
-                                              keep_vars=keep_vars)
-        if self.add_position_embedding:
-            state_dict_[self._position_embeddings_key] \
-                = self.position_embeddings.state_dict(prefix=prefix,
-                                                  keep_vars=keep_vars)
-        if self.num_tokentypes > 0:
-            state_dict_[self._tokentype_embeddings_key] \
-                = self.tokentype_embeddings.state_dict(prefix=prefix,
-                                                       keep_vars=keep_vars)
-
-        return state_dict_
-
-    def load_state_dict(self, state_dict, strict=True):
-        """Customized load."""
-
-        # Word embedding.
-        if self._word_embeddings_key in state_dict:
-            state_dict_ = state_dict[self._word_embeddings_key]
-        else:
-            # for backward compatibility.
-            state_dict_ = {}
-            for key in state_dict.keys():
-                if 'word_embeddings' in key:
-                    state_dict_[key.split('word_embeddings.')[1]] \
-                        = state_dict[key]
-        self.word_embeddings.load_state_dict(state_dict_, strict=strict)
-
-        # Position embedding.
-        if self.add_position_embedding:
-            if self._position_embeddings_key in state_dict:
-                state_dict_ = state_dict[self._position_embeddings_key]
-            else:
-                # for backward compatibility.
-                state_dict_ = {}
-                for key in state_dict.keys():
-                    if 'position_embeddings' in key:
-                        state_dict_[key.split('position_embeddings.')[1]] \
-                            = state_dict[key]
-            self.position_embeddings.load_state_dict(state_dict_, strict=strict)
-
-        # Tokentype embedding.
-        if self.num_tokentypes > 0:
-            state_dict_ = {}
-            if self._tokentype_embeddings_key in state_dict:
-                state_dict_ = state_dict[self._tokentype_embeddings_key]
-            else:
-                # for backward compatibility.
-                for key in state_dict.keys():
-                    if 'tokentype_embeddings' in key:
-                        state_dict_[key.split('tokentype_embeddings.')[1]] \
-                            = state_dict[key]
-            if len(state_dict_.keys()) > 0:
-                self.tokentype_embeddings.load_state_dict(state_dict_,
-                                                          strict=strict)
-            else:
-                print('***WARNING*** expected tokentype embeddings in the '
-                      'checkpoint but could not find it', flush=True)
-
-
-class EmbeddingPipe(Embedding):
-
-    def forward(self, inputs, **kwargs):
-        if not hasattr(self, '_args'):
-            self._args = get_args()
-
-        input_ids = inputs[0]
-        position_ids = inputs[1]
-        if hasattr(self._args, 'attn_mask'):
-            attention_mask = None
-        else:
-            attention_mask = inputs[2]
-
-        if len(inputs) == 4:
-            tokentype_ids = inputs[3]
-        else:
-            tokentype_ids = None
-        
-        embeddings = super().forward(input_ids, position_ids, tokentype_ids=tokentype_ids)
-
-        # If cmd args has attn_mask, we don't forward it as an activation.
-        if hasattr(self._args, 'attn_mask'):
-            return embeddings
-        else:
-            assert False
-            return embeddings, attention_mask
-
-
-    @property
-    def word_embeddings_weight(self):
-        """Easy accessory for the DeepSpeed pipeline engine to tie embeddings across stages."""
-        return self.word_embeddings.weight
-
-
-class TransformerLanguageModel(MegatronModule):
-    """Transformer language model.
-
-    Arguments:
-        transformer_hparams: transformer hyperparameters
-        vocab_size: vocabulary size
-        max_sequence_length: maximum size of sequence. This
-                             is used for positional embedding
-        embedding_dropout_prob: dropout probability for embeddings
-        num_tokentypes: size of the token-type embeddings. 0 value
-                        will ignore this embedding
-    """
-
-    def __init__(self,
-                 config,
-                 encoder_attn_mask_type,
-                 num_tokentypes=0,
-                 add_encoder=True,
-                 add_decoder=False,
-                 decoder_attn_mask_type=AttnMaskType.causal,
-                 add_pooler=False,
-                 pre_process=True,
-                 post_process=True,
-                 num_experts=[1],
-                 rlhf_training=False):
-        if rlhf_training:
-            args = get_rlhf_args()
-        else:
-            args = get_args()
-
-        # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.
-        if args.untie_embeddings_and_output_weights: assert not add_decoder
-        super(TransformerLanguageModel, self).__init__(share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights)
-
-        self.pre_process = pre_process
-        self.post_process = post_process
-        self.hidden_size = config.hidden_size
-        self.num_tokentypes = num_tokentypes
-        self.init_method = config.init_method
-        self.add_encoder = add_encoder
-        self.encoder_attn_mask_type = encoder_attn_mask_type
-        self.add_decoder = add_decoder
-        self.decoder_attn_mask_type = decoder_attn_mask_type
-        self.add_pooler = add_pooler
-        self.encoder_hidden_state = None
-        self.add_retriever = args.retro_add_retriever
-        self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights
-        self.num_experts = num_experts
-
-        # Embeddings.
-        if self.pre_process:
-            self.embedding = Embedding(self.hidden_size,
-                                       args.padded_vocab_size,
-                                       args.max_position_embeddings,
-                                       args.hidden_dropout,
-                                       config,
-                                       self.num_tokentypes,
-                                       args.embedding_weights_in_fp32,
-                                       rlhf_training=rlhf_training)
-            self._embedding_key = 'embedding'
-
-        # Rotary positional embeddings
-        self.use_rotary_position_embeddings = \
-            args.position_embedding_type == 'rope'
-        if self.use_rotary_position_embeddings:
-            self.seq_length = args.seq_length
-            rotary_dim = args.hidden_size // args.num_attention_heads \
-                if args.kv_channels is None else args.kv_channels
-
-            # partial rotary embeddings, which is better than full rotary
-            # Wang and Komatsuzaki et al
-            # https://github.com/kingoflolz/mesh-transformer-jax/
-            self.rotary_pos_emb = RotaryEmbedding(
-                rotary_dim,
-                args.rotary_percent,
-                seq_len_interpolation_factor=args.rotary_seq_len_interpolation_factor,
-                rotary_base=args.rope_theta
-            )
-
-        # Encoder (usually set to True, False if part of an encoder-decoder
-        # architecture and in encoder-only stage).
-        if self.add_encoder:
-            self.encoder = ParallelTransformer(
-                config,
-                model_type=args.model_type if not args.retro_add_retriever \
-                    else ModelType.retro_decoder,
-                self_attn_mask_type=self.encoder_attn_mask_type,
-                pre_process=self.pre_process,
-                post_process=self.post_process,
-                num_experts=self.num_experts,
-                rlhf_training=rlhf_training)
-            self._encoder_key = 'encoder'
-        else:
-            self.encoder = None
-
-        # Decoder (usually set to False, True if part of an encoder-decoder
-        # architecture and in decoder-only stage).
-        if self.add_decoder:
-            self.decoder = ParallelTransformer(
-                config,
-                model_type=args.model_type,
-                layer_type=LayerType.decoder,
-                self_attn_mask_type=self.decoder_attn_mask_type,
-                pre_process=self.pre_process,
-                post_process=self.post_process,
-                num_experts=self.num_experts,
-                rlhf_training=rlhf_training)
-            self._decoder_key = 'decoder'
-        else:
-            self.decoder = None
-
-        if self.post_process:
-            # Pooler.
-            if self.add_pooler:
-                self.pooler = Pooler(self.hidden_size, self.init_method)
-                self._pooler_key = 'pooler'
-
-            if self.untie_embeddings_and_output_weights:
-                if rlhf_training:
-                    self.output_layer = torch.nn.Linear(args.hidden_size, 1, bias=False, dtype=config.params_dtype)
-                else:
-                    self.output_layer = tensor_parallel.ColumnParallelLinear(
-                        args.hidden_size,
-                        args.padded_vocab_size,
-                        config=config,
-                        init_method=self.init_method,
-                        bias=False) # Setting bias to False always to keep it consistent with embedding tying that also does not have a bias.
-                self._output_layer_key = 'output_layer'
-
-    def set_input_tensor(self, input_tensor):
-        """ See megatron_ds.model.transformer.set_input_tensor()"""
-
-        # This is usually handled in schedules.py but some inference code still
-        # gives us non-lists or None
-        if not isinstance(input_tensor, list):
-            input_tensor = [input_tensor]
-
-        if self.add_encoder and self.add_decoder:
-            assert len(input_tensor) == 1, \
-                'input_tensor should only be length 1 for stage with both encoder and decoder'
-            self.encoder.set_input_tensor(input_tensor[0])
-        elif self.add_encoder:
-            assert len(input_tensor) == 1, \
-                'input_tensor should only be length 1 for stage with only encoder'
-            self.encoder.set_input_tensor(input_tensor[0])
-        elif self.add_decoder:
-            if len(input_tensor) == 2:
-                self.decoder.set_input_tensor(input_tensor[0])
-                self.encoder_hidden_state = input_tensor[1]
-            elif len(input_tensor) == 1:
-                self.decoder.set_input_tensor(None)
-                self.encoder_hidden_state = input_tensor[0]
-            else:
-                raise Exception('input_tensor must have either length 1 or 2')
-        else:
-            raise Exception('Stage must have at least either encoder or decoder')
-
-    def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask,
-                dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None,
-                retriever_input_ids=None,
-                retriever_position_ids=None,
-                retriever_attn_mask=None,
-                enc_dec_attn_mask=None, tokentype_ids=None,
-                inference_params=None,
-                pooling_sequence_index=0,
-                enc_hidden_states=None, output_enc_hidden=False):
-        args = get_args()
-        # Encoder embedding.
-        if self.pre_process:
-            encoder_input = self.embedding(enc_input_ids, enc_position_ids,
-                                           tokentype_ids=tokentype_ids, inference_params=inference_params)
-        else:
-            encoder_input = None
-
-        # Retriever embedding.
-        if self.add_retriever and self.pre_process:
-            retriever_input = self.embedding(retriever_input_ids,
-                                             retriever_position_ids,
-                                             tokentype_ids=tokentype_ids, inference_params=inference_params)
-        else:
-            retriever_input = None
-
-        # Rotary positional embeddings
-        rotary_pos_emb = None
-        if self.use_rotary_position_embeddings:
-            if inference_params is not None:
-                rotary_pos_emb = \
-                    self.rotary_pos_emb(inference_params.max_sequence_length)
-            else:
-                if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning:
-                    rotary_pos_emb = self.rotary_pos_emb(args.curriculum_seqlen)
-                else:
-                    rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
-
-        # Run encoder.
-        if enc_hidden_states is None:
-            if self.encoder is not None:
-                encoder_output = self.encoder(
-                    encoder_input,
-                    enc_attn_mask,
-                    position_ids=enc_position_ids,
-                    retriever_input=retriever_input,
-                    retriever_attn_mask=retriever_attn_mask,
-                    inference_params=inference_params,
-                    rotary_pos_emb=rotary_pos_emb)
-            else:
-                encoder_output = self.encoder_hidden_state
-        else:
-            encoder_output = enc_hidden_states.to(encoder_input.dtype)
-
-        if self.post_process:
-            if self.add_pooler:
-                pooled_output = self.pooler(encoder_output,
-                                            pooling_sequence_index)
-
-        # output_enc_hidden refers to when we just need the encoder's
-        # output. For example, it is helpful to compute
-        # similarity between two sequences by average pooling
-        if not self.add_decoder or output_enc_hidden:
-            if self.add_pooler and self.post_process:
-                return encoder_output, pooled_output
-            else:
-                return encoder_output
-
-        # Decoder embedding.
-        if self.pre_process:
-            decoder_input = self.embedding(dec_input_ids,
-                                           dec_position_ids)
-        else:
-            decoder_input = None
-
-        # Run decoder.
-        decoder_output = self.decoder(
-            decoder_input,
-            dec_attn_mask,
-            encoder_output=encoder_output,
-            enc_dec_attn_mask=enc_dec_attn_mask,
-            inference_params=inference_params,
-            rotary_pos_emb=rotary_pos_emb)
-
-        if self.add_pooler and self.post_process:
-            return decoder_output, encoder_output, pooled_output
-        else:
-            return decoder_output, encoder_output
-
-    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
-        """For easy load."""
-
-        state_dict_ = {}
-        if self.pre_process:
-            state_dict_[self._embedding_key] \
-                = self.embedding.state_dict_for_save_checkpoint(prefix=prefix,
-                                                                keep_vars=keep_vars)
-        if self.add_encoder:
-            state_dict_[self._encoder_key] \
-                = self.encoder.state_dict_for_save_checkpoint(prefix=prefix,
-                                                              keep_vars=keep_vars)
-        if self.post_process:
-            if self.add_pooler:
-                state_dict_[self._pooler_key] \
-                    = self.pooler.state_dict_for_save_checkpoint(prefix=prefix,
-                                                                 keep_vars=keep_vars)
-            if self.untie_embeddings_and_output_weights:
-                state_dict_[self._output_layer_key] \
-                    = self.output_layer.state_dict(prefix=prefix, keep_vars=keep_vars)
-
-        if self.add_decoder:
-            state_dict_[self._decoder_key] \
-                = self.decoder.state_dict_for_save_checkpoint(prefix=prefix,
-                                                              keep_vars=keep_vars)
-
-        return state_dict_
-
-    def load_state_dict(self, state_dict, strict=True):
-        """Customized load."""
-
-        # Embedding.
-        if self.pre_process:
-            if self._embedding_key in state_dict:
-                state_dict_ = state_dict[self._embedding_key]
-            else:
-                # for backward compatibility.
-                state_dict_ = {}
-                for key in state_dict.keys():
-                    if '_embeddings' in key:
-                        state_dict_[key] = state_dict[key]
-            self.embedding.load_state_dict(state_dict_, strict=strict)
-
-        # Encoder.
-        if self.add_encoder:
-            if self._encoder_key in state_dict:
-                state_dict_ = state_dict[self._encoder_key]
-            # For backward compatibility.
-            elif 'transformer' in state_dict:
-                state_dict_ = state_dict['transformer']
-            else:
-                # For backward compatibility.
-                state_dict_ = {}
-                for key in state_dict.keys():
-                    if 'transformer.' in key:
-                        state_dict_[key.split('transformer.')[1]] = state_dict[key]
-
-            # For backward compatibility.
-            state_dict_self_attention = {}
-            for key in state_dict_.keys():
-                if '.attention.' in key:
-                    state_dict_self_attention[key.replace(".attention.",
-                        ".self_attention.")] = state_dict_[key]
-                else:
-                    state_dict_self_attention[key] = state_dict_[key]
-            state_dict_ = state_dict_self_attention
-
-            self.encoder.load_state_dict(state_dict_, strict=strict)
-
-        # Pooler.
-        if self.post_process:
-            if self.add_pooler:
-                assert 'pooler' in state_dict, \
-                    'could not find data for pooler in the checkpoint'
-                self.pooler.load_state_dict(state_dict[self._pooler_key],
-                                            strict=strict)
-            if self.untie_embeddings_and_output_weights:
-                assert 'output_layer' in state_dict, \
-                    'could not find data for output_layer in the checkpoint'
-                self.output_layer.load_state_dict(state_dict[self._output_layer_key],
-                                                  strict=strict)
-        # Decoder.
-        if self.add_decoder:
-            assert 'decoder' in state_dict, \
-                'could not find data for pooler in the checkpoint'
-            self.decoder.load_state_dict(state_dict[self._decoder_key],
-                                         strict=strict)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/module.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/module.py
deleted file mode 100644
index 28a94eab3818668c0aa6ac6ed9fb904e8285d7a6..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/module.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Megatron Module"""
-
-import torch
-from torch.autograd import Variable
-from torch.nn.parameter import Parameter
-
-from megatron_ds import get_args
-from megatron_ds.core import mpu, tensor_parallel
-
-
-_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
-_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
-_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor)
-
-
-
-def param_is_not_shared(param):
-    return not hasattr(param, 'shared') or not param.shared
-
-
-
-class MegatronModule(torch.nn.Module):
-    """Megatron specific extensions of torch Module with support
-    for pipelining."""
-
-    def __init__(self, config=None, share_embeddings_and_output_weights=True):
-        super(MegatronModule, self).__init__()
-        self.config = config
-        self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
-
-
-    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
-        """Use this function to override the state dict for
-        saving checkpoints."""
-        return self.state_dict(prefix=prefix, keep_vars=keep_vars)
-
-
-    def shared_embedding_or_output_weight(self):
-        if self.pre_process:
-            return self.language_model.embedding.word_embeddings.weight
-        else:
-            if not self.share_embeddings_and_output_weights:
-                raise Exception('shared_embedding_or_output_weight() called for last '
-                                'stage, but share_embeddings_and_output_weights is false')
-            return self.word_embeddings.weight
-
-
-    def initialize_word_embeddings(self):
-        args = get_args()
-        if not self.share_embeddings_and_output_weights:
-            raise Exception('initialize_word_embeddings() was called but '
-                            'share_embeddings_and_output_weights is false')
-
-        # This function just initializes the word embeddings in the final stage
-        # when we are using pipeline parallelism. Nothing to do if we aren't
-        # using pipeline parallelism.
-        if args.pipeline_model_parallel_size == 1:
-            return
-
-        # Parameters are shared between the word embeddings layers, and the
-        # heads at the end of the model. In a pipelined setup with more than
-        # one stage, the initial embedding layer and the head are on different
-        # workers, so we do the following:
-        # 1. Create a second copy of word_embeddings on the last stage, with
-        #    initial parameters of 0.0.
-        # 2. Do an all-reduce between the first and last stage to ensure that
-        #    the two copies of word_embeddings start off with the same
-        #    parameter values.
-        # 3. In the training loop, before an all-reduce between the grads of
-        #    the two word_embeddings layers to ensure that every applied weight
-        #    update is the same on both stages.
-        if mpu.is_pipeline_last_stage() and not self.pre_process:
-            assert not mpu.is_pipeline_first_stage()
-            self._word_embeddings_for_head_key = 'word_embeddings_for_head'
-            # set word_embeddings weights to 0 here, then copy first
-            # stage's weights using all_reduce below.
-            self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
-                args.padded_vocab_size, self.config.hidden_size,
-                config=self.config, init_method=self.config.init_method)
-            self.word_embeddings.weight.data.fill_(0)
-            self.word_embeddings.weight.shared = True
-
-        # Zero out initial weights for decoder embedding.
-        # NOTE: We don't currently support T5 with the interleaved schedule.
-        if not mpu.is_pipeline_first_stage(ignore_virtual=True) and \
-                self.pre_process:
-            self.language_model.embedding.zero_parameters()
-
-        if not torch.distributed.is_initialized():
-            if not getattr(MegatronModule, "embedding_warning_printed", False):
-                print("WARNING! Distributed processes aren't initialized, so "
-                      "word embeddings in the last layer are not initialized. "
-                      "If you are just manipulating a model this is fine, but "
-                      "this needs to be handled manually. If you are training "
-                      "something is definitely wrong.")
-                MegatronModule.embedding_warning_printed = True
-            return
-
-        # Ensure that first and last stages have the same initial parameter
-        # values.
-        if mpu.is_rank_in_embedding_group():
-            torch.distributed.all_reduce(self.shared_embedding_or_output_weight().data,
-                                         group=mpu.get_embedding_group())
-
-        # Ensure that encoder(first stage) and decoder(split stage) position
-        # embeddings have the same initial parameter values
-        # NOTE: We don't currently support T5 with the interleaved schedule.
-        if mpu.is_rank_in_position_embedding_group() and \
-                args.pipeline_model_parallel_split_rank is not None:
-            # TODO: Support tokentype embedding.
-            self.language_model.embedding.cuda()
-            position_embeddings = self.language_model.embedding.position_embeddings
-            torch.distributed.all_reduce(position_embeddings.weight.data,
-                                         group=mpu.get_position_embedding_group())
-
-    def universal_checkpoint_info(self):
-        return {}
-
-def conversion_helper(val, conversion):
-    """Apply conversion to val. Recursively apply conversion if `val`
-    #is a nested tuple/list structure."""
-    if not isinstance(val, (tuple, list)):
-        return conversion(val)
-    rtn = [conversion_helper(v, conversion) for v in val]
-    if isinstance(val, tuple):
-        rtn = tuple(rtn)
-    return rtn
-
-
-def fp32_to_float16(val, float16_convertor):
-    """Convert fp32 `val` to fp16/bf16"""
-    def half_conversion(val):
-        val_typecheck = val
-        if isinstance(val_typecheck, (Parameter, Variable)):
-            val_typecheck = val.data
-        if val_typecheck.dtype in _FLOAT_TYPES:
-            val = float16_convertor(val)
-        return val
-    return conversion_helper(val, half_conversion)
-
-
-def float16_to_fp32(val):
-    """Convert fp16/bf16 `val` to fp32"""
-    def float_conversion(val):
-        val_typecheck = val
-        if isinstance(val_typecheck, (Parameter, Variable)):
-            val_typecheck = val.data
-        if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)):
-            val = val.float()
-        return val
-    return conversion_helper(val, float_conversion)
-
-
-
-class Float16Module(MegatronModule):
-
-    def __init__(self, module, args):
-        super(Float16Module, self).__init__()
-
-        if args.fp16:
-            self.add_module('module', module.half())
-            def float16_convertor(val):
-                return val.half()
-        elif args.bf16:
-            self.add_module('module', module.bfloat16())
-            def float16_convertor(val):
-                return val.bfloat16()
-        else:
-            raise Exception('should not be here')
-
-        self.float16_convertor = float16_convertor
-
-
-    def set_input_tensor(self, input_tensor):
-        return self.module.set_input_tensor(input_tensor)
-
-
-    def forward(self, *inputs, **kwargs):
-        if mpu.is_pipeline_first_stage():
-            inputs = fp32_to_float16(inputs, self.float16_convertor)
-        outputs = self.module(*inputs, **kwargs)
-        if mpu.is_pipeline_last_stage():
-            outputs = float16_to_fp32(outputs)
-        return outputs
-
-
-    def state_dict(self, prefix='', keep_vars=False):
-        return self.module.state_dict(prefix=prefix, keep_vars=keep_vars)
-
-
-    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
-        return self.module.state_dict_for_save_checkpoint(prefix=prefix,
-                                                          keep_vars=keep_vars)
-
-
-    def load_state_dict(self, state_dict, strict=True):
-        self.module.load_state_dict(state_dict, strict=strict)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/realm_model.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/realm_model.py
deleted file mode 100644
index 08afd954302dd921b855c15b913c5d2c28a73eb7..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/realm_model.py
+++ /dev/null
@@ -1,204 +0,0 @@
-import os
-import torch
-
-from megatron_ds import get_args, print_rank_0
-from megatron_ds.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name
-from megatron_ds.model import BertModel
-from .module import MegatronModule
-from megatron_ds.core import mpu
-from megatron_ds.model.enums import AttnMaskType
-from megatron_ds.model.utils import get_linear_layer
-from megatron_ds.model.utils import init_method_normal
-from megatron_ds.model.language_model import get_language_model
-from megatron_ds.model.utils import scaled_init_method_normal
-from megatron_ds.model.bert_model import bert_extended_attention_mask, bert_position_ids
-
-
-def general_ict_model_provider(only_query_model=False, only_block_model=False):
-    """Build the model."""
-    args = get_args()
-    assert args.ict_head_size is not None, \
-        "Need to specify --ict-head-size to provide an ICTBertModel"
-    assert mpu.get_tensor_model_parallel_world_size() == 1 and mpu.get_pipeline_model_parallel_world_size() == 1, \
-        "Model parallel size > 1 not supported for ICT"
-
-    print_rank_0('building ICTBertModel...')
-
-    # simpler to just keep using 2 tokentypes since the LM we initialize with has 2 tokentypes
-    model = ICTBertModel(
-        ict_head_size=args.ict_head_size,
-        num_tokentypes=2,
-        parallel_output=True,
-        only_query_model=only_query_model,
-        only_block_model=only_block_model)
-
-    return model
-
-
-class ICTBertModel(MegatronModule):
-    """Bert-based module for Inverse Cloze task."""
-    def __init__(self,
-                 ict_head_size,
-                 num_tokentypes=1,
-                 parallel_output=True,
-                 only_query_model=False,
-                 only_block_model=False):
-        super(ICTBertModel, self).__init__()
-        bert_kwargs = dict(
-            ict_head_size=ict_head_size,
-            num_tokentypes=num_tokentypes,
-            parallel_output=parallel_output
-        )
-        assert not (only_block_model and only_query_model)
-        self.use_block_model = not only_query_model
-        self.use_query_model = not only_block_model
-
-        if self.use_query_model:
-            # this model embeds (pseudo-)queries - Embed_input in the paper
-            self.query_model = IREncoderBertModel(**bert_kwargs)
-            self._query_key = 'question_model'
-
-        if self.use_block_model:
-            # this model embeds evidence blocks - Embed_doc in the paper
-            self.block_model = IREncoderBertModel(**bert_kwargs)
-            self._block_key = 'context_model'
-
-    def forward(self, query_tokens, query_attention_mask, block_tokens, block_attention_mask):
-        """Run a forward pass for each of the models and return the respective embeddings."""
-        query_logits = self.embed_query(query_tokens, query_attention_mask)
-        block_logits = self.embed_block(block_tokens, block_attention_mask)
-        return query_logits, block_logits
-
-    def embed_query(self, query_tokens, query_attention_mask):
-        """Embed a batch of tokens using the query model"""
-        if self.use_query_model:
-            query_types = torch.cuda.LongTensor(*query_tokens.shape).fill_(0)
-            query_ict_logits, _ = self.query_model.forward(query_tokens, query_attention_mask, query_types)
-            return query_ict_logits
-        else:
-            raise ValueError("Cannot embed query without query model.")
-
-    def embed_block(self, block_tokens, block_attention_mask):
-        """Embed a batch of tokens using the block model"""
-        if self.use_block_model:
-            block_types = torch.cuda.LongTensor(*block_tokens.shape).fill_(0)
-            block_ict_logits, _ = self.block_model.forward(block_tokens, block_attention_mask, block_types)
-            return block_ict_logits
-        else:
-            raise ValueError("Cannot embed block without block model.")
-
-    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
-        """Save dict with state dicts of each of the models."""
-        state_dict_ = {}
-        if self.use_query_model:
-            state_dict_[self._query_key] \
-                = self.query_model.state_dict_for_save_checkpoint(
-                    prefix=prefix, keep_vars=keep_vars)
-
-        if self.use_block_model:
-            state_dict_[self._block_key] \
-                = self.block_model.state_dict_for_save_checkpoint(
-                    prefix=prefix, keep_vars=keep_vars)
-
-        return state_dict_
-
-    def load_state_dict(self, state_dict, strict=True):
-        """Load the state dicts of each of the models"""
-        if self.use_query_model:
-            print("Loading ICT query model", flush=True)
-            self.query_model.load_state_dict(
-                state_dict[self._query_key], strict=strict)
-
-        if self.use_block_model:
-            print("Loading ICT block model", flush=True)
-            self.block_model.load_state_dict(
-                state_dict[self._block_key], strict=strict)
-
-    def init_state_dict_from_bert(self):
-        """Initialize the state from a pretrained BERT model on iteration zero of ICT pretraining"""
-        args = get_args()
-        tracker_filename = get_checkpoint_tracker_filename(args.bert_load)
-        if not os.path.isfile(tracker_filename):
-            raise FileNotFoundError("Could not find BERT load for ICT")
-        with open(tracker_filename, 'r') as f:
-            iteration = int(f.read().strip())
-            assert iteration > 0
-
-        checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False)
-        if mpu.get_data_parallel_rank() == 0:
-            print('global rank {} is loading checkpoint {}'.format(
-                torch.distributed.get_rank(), checkpoint_name))
-
-        try:
-            state_dict = torch.load(checkpoint_name, map_location='cpu')
-        except BaseException:
-            raise ValueError("Could not load checkpoint")
-
-        # load the LM state dict into each model
-        model_dict = state_dict['model']['language_model']
-        self.query_model.language_model.load_state_dict(model_dict)
-        self.block_model.language_model.load_state_dict(model_dict)
-
-        # give each model the same ict_head to begin with as well
-        query_ict_head_state_dict = self.state_dict_for_save_checkpoint()[self._query_key]['ict_head']
-        self.block_model.ict_head.load_state_dict(query_ict_head_state_dict)
-
-
-class IREncoderBertModel(MegatronModule):
-    """BERT-based encoder for queries or blocks used for learned information retrieval."""
-    def __init__(self, ict_head_size, num_tokentypes=2, parallel_output=True):
-        super(IREncoderBertModel, self).__init__()
-        args = get_args()
-
-        self.ict_head_size = ict_head_size
-        self.parallel_output = parallel_output
-        init_method = init_method_normal(args.init_method_std)
-        scaled_init_method = scaled_init_method_normal(args.init_method_std,
-                                                       args.num_layers)
-
-        self.language_model, self._language_model_key = get_language_model(
-            num_tokentypes=num_tokentypes,
-            add_pooler=True,
-            encoder_attn_mask_type=AttnMaskType.padding,
-            init_method=init_method,
-            scaled_init_method=scaled_init_method)
-
-        self.ict_head = get_linear_layer(args.hidden_size, ict_head_size, init_method)
-        self._ict_head_key = 'ict_head'
-
-    def forward(self, input_ids, attention_mask, tokentype_ids=None):
-        extended_attention_mask = bert_extended_attention_mask(
-            attention_mask, next(self.language_model.parameters()).dtype)
-        position_ids = bert_position_ids(input_ids)
-
-        lm_output, pooled_output = self.language_model(
-            input_ids,
-            position_ids,
-            extended_attention_mask,
-            tokentype_ids=tokentype_ids)
-
-        # Output.
-        ict_logits = self.ict_head(pooled_output)
-        return ict_logits, None
-
-    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
-        """For easy load when model is combined with other heads,
-        add an extra key."""
-
-        state_dict_ = {}
-        state_dict_[self._language_model_key] \
-            = self.language_model.state_dict_for_save_checkpoint(prefix=prefix,
-                                                                 keep_vars=keep_vars)
-        state_dict_[self._ict_head_key] \
-            = self.ict_head.state_dict(prefix=prefix,
-                                       keep_vars=keep_vars)
-        return state_dict_
-
-    def load_state_dict(self, state_dict, strict=True):
-        """Customized load."""
-        self.language_model.load_state_dict(
-            state_dict[self._language_model_key], strict=strict)
-        self.ict_head.load_state_dict(
-            state_dict[self._ict_head_key], strict=strict)
-
-
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/rms_norm.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/rms_norm.py
deleted file mode 100644
index 342782f683f8645ff745d840b0a0235e820f91e2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/rms_norm.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import torch
-from torch import nn
-from megatron_ds import get_args
-# from apex.normalization.fused_layer_norm import FusedRMSNormAffineMixedDtypesFunction
-# from transformer_engine.pytorch.module.rmsnorm import _RMSNorm
-import ixformer.functions as F
-
-if hasattr(F, "FusedRMSNorm"):
-    use_ixformer = True
-else:
-    Warning("ixformer version is old. RMSNorm uses torch implementation in megatron-deepspeed")
-    use_ixformer = False
-class RMSNorm(torch.nn.Module):
-
-    def __init__(self,
-                 dim: int,
-                 eps: float = 1e-6,
-                 sequence_parallel: bool = False):
-        """RMS Normaliation module
-
-        Arguments:
-            dim (int): The width of input, i.e. hidden size
-            eps (float): epsilon to use for the norm, default to 1e-6
-            sequence_parallel (bool): Set to true if sequence parallelism is being used,
-              this marks the weights as needing to be allreduced.
-        """
-        super().__init__()
-        self.eps = eps
-        self.weight = nn.Parameter(torch.ones(dim))
-        self.normalized_shape = torch.Size((dim,))
-        self.args = get_args()
-
-        setattr(self.weight, 'sequence_parallel', sequence_parallel)
-
-    def _norm(self, x):
-        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
-
-    def forward(self, x):
-
-        ## apex implementation
-        # return FusedRMSNormAffineMixedDtypesFunction.apply(x, self.weight, self.normalized_shape, self.eps)
-
-        ## transformer_engine implementation
-        # dtype = x.dtype
-        # return _RMSNorm.apply(x, self.weight, self.eps, False, False, False, torch.is_grad_enabled(), dtype)
-
-        ## ixformer implementation and torch implementation
-        if use_ixformer and not self.args.RLHF:
-            rmsn = F.FusedRMSNorm(self.normalized_shape, self.eps)
-            rmsn.weight.data = self.weight
-            return rmsn(x)
-        else:
-            output = self._norm(x.float()).type_as(x)
-            return output * self.weight
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/transformer.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/transformer.py
deleted file mode 100644
index 3f8f79dab1187edc99e52baa3a50e785c88667a9..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/transformer.py
+++ /dev/null
@@ -1,2091 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-"""Transformer."""
-from contextlib import nullcontext
-import os
-import math
-import numpy as np
-import torch
-import torch.nn.functional as F
-from typing import Optional
-
-from apex.corex.activations import swiglu 
-from apex.transformer.functional import fused_apply_rotary_pos_emb
-from megatron_ds import get_timers, get_args, get_retro_args, get_rlhf_args, core, get_num_microbatches
-from .module import MegatronModule
-from megatron_ds.core import mpu, tensor_parallel
-from megatron_ds.core.enums import ModelType
-from megatron_ds.model.enums import AttnMaskType, LayerType, AttnType
-from megatron_ds.model.fused_softmax import FusedScaleMaskSoftmax
-from megatron_ds.model.fused_bias_gelu import bias_gelu_impl
-from megatron_ds.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding, apply_rotary_pos_emb
-from megatron_ds.model.utils import attention_mask_func, openai_gelu, erf_gelu, get_norm, get_rmsnorm_residual
-from megatron_ds.core.parallel_state import (
-    get_context_parallel_global_ranks,
-    get_context_parallel_group
-)
-import deepspeed
-from deepspeed.moe.layer import MoE
-from deepspeed.accelerator import get_accelerator
-from megatron_ds.core.tensor_parallel import (
-    gather_from_sequence_parallel_region_to_moe,
-    reduce_scatter_to_sequence_parallel_region_from_moe,
-    get_cuda_rng_tracker,
-    get_data_parallel_rng_tracker_name
-)
-from megatron_ds.core.parallel_state import get_tensor_model_parallel_group, get_tensor_and_expert_parallel_group
-
-try:
-    from einops import rearrange
-except ImportError:
-    rearrange = None
-
-try:
-    from flash_attn.flash_attn_interface import flash_attn_unpadded_func
-except ImportError:
-    try:
-        from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_unpadded_func
-    except ImportError:
-        flash_attn_unpadded_func = None
-
-""" We use the following notation throughout this file:
-     h: hidden size
-     n: number of attention heads
-     p: number of model parallel partitions
-     np: n/p
-     hp: h/p
-     hn: h/n
-     b: batch size
-     s: sequence length
-     l: number of layers
-    Transformer takes input of size [s, b, h] and returns a
-    tensor of the same size. We use the following arguments:
-        hyperparameters: transformer hyperparameters
-"""
-
-class DropPath(MegatronModule):
-    """Drop paths (Stochastic Depth) per sample
-    (when applied in main path of residual blocks).
-    """
-
-    def __init__(self, drop_prob=0.):
-        super(DropPath, self).__init__()
-        self.drop_prob = drop_prob
-
-    def forward(self, hidden_state):
-        if self.drop_prob == 0. or not self.training:
-            return hidden_state
-        keep_prob = 1 - self.drop_prob
-        # work with diff dim tensors, not just 2D ConvNets
-        # hidden_state: [s, b, h]
-        shape = (1,) + (hidden_state.shape[1],) + (1,) * (hidden_state.ndim - 2)
-        random_tensor = keep_prob + \
-            torch.rand(shape, dtype=hidden_state.dtype, device=hidden_state.device)
-        random_tensor.floor_()  # binarize
-        output = hidden_state.div(keep_prob) * random_tensor
-        return output
-
-class ParallelMLP(MegatronModule):
-    """MLP.
-
-    MLP will take the input with h hidden state, project it to 4*h
-    hidden dimension, perform nonlinear transformation, and project the
-    state back into h hidden dimension.
-    """
-
-    def __init__(self, config, is_expert=False, rlhf_training=False):
-        super(ParallelMLP, self).__init__()
-        if rlhf_training:
-            args = get_rlhf_args()
-        else:
-            args = get_args()
-
-        self.add_bias = config.add_bias_linear
-
-        ffn_hidden_size = config.ffn_hidden_size
-        if config.gated_linear_unit:
-            ffn_hidden_size *= 2
-
-        # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
-        self.dense_h_to_4h = tensor_parallel.ColumnParallelLinear(
-            config.hidden_size,
-            ffn_hidden_size,
-            config=config,
-            init_method=config.init_method,
-            bias=self.add_bias,
-            gather_output=False,
-            skip_bias_add=True,
-            is_expert=is_expert,
-        )
-
-        self.bias_gelu_fusion = False
-        self.activation_func = None
-        self.swiglu = args.swiglu
-
-        if args.openai_gelu:
-            self.activation_func = openai_gelu
-        elif args.onnx_safe:
-            self.activation_func = erf_gelu
-        elif args.swiglu:
-            # def swiglu(x):
-            #     x = torch.chunk(x, 2, dim=-1)
-            #     return F.silu(x[0]) * x[1]
-            self.activation_func = swiglu
-        elif args.squared_relu:
-            def squared_relu(x):
-                return torch.pow(F.relu(x), 2)
-            self.activation_func = squared_relu
-        else:
-            self.bias_gelu_fusion = args.bias_gelu_fusion
-            self.activation_func = F.gelu
-
-        # Project back to h.
-        self.dense_4h_to_h = tensor_parallel.RowParallelLinear(
-            config.ffn_hidden_size,
-            config.hidden_size,
-            config=config,
-            init_method=config.output_layer_init_method,
-            bias=self.add_bias,
-            skip_bias_add=True,
-            input_is_parallel=True,
-            is_expert=is_expert,
-        )
-
-    def forward(self, hidden_states, inference_params=None):
-
-        # [s, b, 4hp]
-        intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states, inference_params=inference_params)
-
-        if self.bias_gelu_fusion:
-            assert self.add_bias is True
-            assert self.activation_func == F.gelu
-            intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel)
-        else:
-            if bias_parallel is not None:
-                intermediate_parallel = intermediate_parallel + bias_parallel
-            intermediate_parallel = self.activation_func(intermediate_parallel)
-
-        # [s, b, h]
-        output, output_bias = self.dense_4h_to_h(intermediate_parallel, inference_params=inference_params)
-        return output, output_bias
-
-def sinkhorn(cost, tol=0.0001):
-    cost = torch.exp(cost)
-    d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype)
-    d1 = torch.ones(cost.size(1), device=cost.device, dtype=cost.dtype)
-    
-    eps = 0.00000001
-    error = 1e9
-    d1_old = d1
-    while error > tol:
-        d0 = (1/d0.size(0))*1/(torch.sum(d1*cost,1) + eps)
-        d1 = (1/d1.size(0))*1/(torch.sum(d0.unsqueeze(1)*cost,0)+eps)
-        error = torch.mean(torch.abs(d1_old-d1))
-        d1_old = d1
-    return d1*cost*d0.unsqueeze(1)
-
-
-def get_router_linear_layer(config):
-    args = get_args()
-    router = torch.nn.Linear(args.hidden_size, args.num_experts, bias=False)
-    with get_cuda_rng_tracker().fork(get_data_parallel_rng_tracker_name()):
-        config.init_method(router.weight)
-    setattr(router.weight, 'sequence_parallel',config.sequence_parallel)
-    return router
-
-
-class SwitchMLP(MegatronModule):
-    """
-    Routes input to one of N MLP "experts"
-    """
-    def __init__(self, config):
-        super(SwitchMLP, self).__init__()
-        args = get_args()
-        self.router = get_router_linear_layer(config)
-        self.expert_parallel_size = mpu.get_expert_model_parallel_world_size()
-        self.sequence_parallel = config.sequence_parallel
-        self.add_bias = config.add_bias_linear
-
-        assert args.num_experts_switch % self.expert_parallel_size == 0
-        self.num_local_experts = args.num_experts_switch // self.expert_parallel_size
-        local_expert_indices_offset = mpu.get_expert_model_parallel_rank() * self.num_local_experts
-        self.local_expert_indices = [local_expert_indices_offset + i for i in range(self.num_local_experts)]
-
-        self.local_experts = torch.nn.ModuleList()
-        for i in range(self.num_local_experts):
-            self.local_experts.append(ParallelMLP(config, is_expert=True))
-
-    def gather_indices(self, local_indices):
-        """ Gather tensors and concatinate along the first dimension."""
-        group = get_tensor_and_expert_parallel_group()
-        world_size = torch.distributed.get_world_size(group=group)
-        # Bypass the function if we are using only 1 GPU.
-        if world_size == 1:
-            return local_indices
-
-        dim_size = list(local_indices.size())
-        dim_size[0] = dim_size[0] * world_size
-
-        # TODO pre allocate memory
-        output = torch.empty(dim_size, dtype=local_indices.dtype,
-                             device=torch.cuda.current_device())
-        torch.distributed._all_gather_base(
-            output, local_indices.contiguous(), group=group
-        )
-        return output
-
-    def forward(self, hidden_states):
-        # hidden_states: [b, s, h]
-        args = get_args()
-        s = hidden_states.size(0)
-        b = hidden_states.size(1)
-        h = hidden_states.size(2)
-        route = self.router(hidden_states).view(-1, args.num_experts_switch)
-        
-        # TODO (rprenger) Right now we're just using the sinkhorn algorithm
-        # for load balancing. There should be an option to do no load balancing
-        # and the algorithm and parametets should be further tested
-        if self.training:
-            with torch.no_grad():
-                sinkroute = sinkhorn(route.detach().to(dtype=torch.float32))
-                _, max_ind = torch.max(sinkroute, dim=1)
-            route = torch.sigmoid(route)
-            max_prob = route[torch.arange(route.size(0)), max_ind]
-        else:
-            route = torch.sigmoid(route)
-            max_prob, max_ind = torch.max(route, dim=1)
-
-        max_prob = torch.unsqueeze(max_prob, 1)
-        hidden_states = hidden_states.view(-1, hidden_states.size(2))
-
-        # TODO (rprenger) TODO this could be made easier to read
-        # Converting [s, b, h] to [s*b, h].
-        # Each vector could be routed differently
-        if self.sequence_parallel or (self.expert_parallel_size > 1):
-            global_hidden_states = \
-                gather_from_sequence_parallel_region_to_moe(hidden_states)
-            global_indices = self.gather_indices(max_ind)
-        else:
-            global_hidden_states = hidden_states
-            global_indices = max_ind
-
-        output_total = torch.zeros_like(global_hidden_states)
-        if self.add_bias:
-            output_bias_total = torch.zeros_like(global_hidden_states)
-
-        for expert_num, expert in enumerate(self.local_experts):
-            local_expert_index = self.local_expert_indices[expert_num]
-            local_indices = (global_indices == local_expert_index).nonzero()
-            hidden = global_hidden_states[local_indices, :]
-            output, output_bias = expert(hidden)
-            output_total[local_indices, :] = output
-            if self.add_bias:
-                output_bias = output_bias.expand_as(output)
-                output_bias_total[local_indices, :] = output_bias
-
-        if self.sequence_parallel or (self.expert_parallel_size > 1):
-            output_total = \
-                reduce_scatter_to_sequence_parallel_region_from_moe(output_total)
-            if self.add_bias:
-                output_bias_total = \
-                    reduce_scatter_to_sequence_parallel_region_from_moe(output_bias_total)
-
-                # bias is duplicated across tensor parallelism ranks;
-                # reduce scatter reduces bias across tensor parallel_ranks
-                output_bias_total = \
-                    output_bias_total/mpu.get_tensor_model_parallel_world_size()
-
-        output_total = output_total*max_prob
-        output_total = output_total.view(s, b, h)
-        if self.add_bias:
-            output_bias_total = output_bias_total*max_prob
-            output_bias_total = output_bias_total.view(s, b, h)
-        else:
-            output_bias_total = None
-
-        return output_total, output_bias_total
-
-
-class CoreAttention(MegatronModule):
-
-    def __init__(self, layer_number, config,
-                 attn_mask_type=AttnMaskType.padding):
-        super(CoreAttention, self).__init__()
-        self.fp16 = config.fp16
-        self.bf16 = config.bf16
-
-        self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
-        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
-        if self.apply_query_key_layer_scaling:
-            self.attention_softmax_in_fp32 = True
-        self.layer_number = max(1, layer_number)
-        self.attn_mask_type = attn_mask_type
-        self.sequence_parallel = config.sequence_parallel
-
-        projection_size = config.kv_channels * config.num_attention_heads
-
-        # Per attention head and per partition values.
-        world_size = mpu.get_tensor_model_parallel_world_size()
-        self.hidden_size_per_partition = core.utils.divide(projection_size,
-                                                           world_size)
-        self.hidden_size_per_attention_head = core.utils.divide(
-            projection_size, config.num_attention_heads)
-        self.num_attention_heads_per_partition = core.utils.divide(
-            config.num_attention_heads, world_size)
-
-        coeff = None
-        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
-        if self.apply_query_key_layer_scaling:
-            coeff = self.layer_number
-            self.norm_factor *= coeff
-
-        self.scale_mask_softmax = FusedScaleMaskSoftmax(
-            self.fp16, self.bf16,
-            self.attn_mask_type,
-            config.masked_softmax_fusion,
-            attention_mask_func,
-            self.attention_softmax_in_fp32,
-            coeff)
-
-        # Dropout. Note that for a single iteration, this layer will generate
-        # different outputs on different number of parallel partitions but
-        # on average it should not be partition dependent.
-        self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
-
-    def forward(self, query_layer, key_layer,
-                value_layer, attention_mask):
-
-        # ===================================
-        # Raw attention scores. [b, np, s, s]
-        # ===================================
-
-        # [b, np, sq, sk]
-        output_size = (query_layer.size(1),
-                       query_layer.size(2),
-                       query_layer.size(0),
-                       key_layer.size(0))
-
-        # [sq, b, np, hn] -> [sq, b * np, hn]
-        query_layer = query_layer.reshape(output_size[2],
-                                          output_size[0] * output_size[1], -1)
-        # [sk, b, np, hn] -> [sk, b * np, hn]
-        key_layer = key_layer.view(output_size[3],
-                                   output_size[0] * output_size[1], -1)
-
-        # preallocting input tensor: [b * np, sq, sk]
-        matmul_input_buffer = mpu.get_global_memory_buffer().get_tensor(
-            (output_size[0]*output_size[1], output_size[2], output_size[3]),
-            query_layer.dtype, "mpu")
-
-        # Raw attention scores. [b * np, sq, sk]
-        matmul_result = torch.baddbmm(
-            matmul_input_buffer,
-            query_layer.transpose(0, 1),   # [b * np, sq, hn]
-            key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
-            beta=0.0, alpha=(1.0/self.norm_factor))
-
-        # change view to [b, np, sq, sk]
-        attention_scores = matmul_result.view(*output_size)
-
-        # ===========================
-        # Attention probs and dropout
-        # ===========================
-
-        # attention scores and attention mask [b, np, sq, sk]
-        attention_probs = self.scale_mask_softmax(attention_scores,
-                                                  attention_mask)
-
-        # This is actually dropping out entire tokens to attend to, which might
-        # seem a bit unusual, but is taken from the original Transformer paper.
-        if not self.sequence_parallel:
-            with tensor_parallel.get_cuda_rng_tracker().fork():
-                attention_probs = self.attention_dropout(attention_probs)
-        else:
-            attention_probs = self.attention_dropout(attention_probs)
-
-        # =========================
-        # Context layer. [sq, b, hp]
-        # =========================
-
-        # value_layer -> context layer.
-        # [sk, b, np, hn] --> [b, np, sq, hn]
-
-        # context layer shape: [b, np, sq, hn]
-        output_size = (value_layer.size(1),
-                       value_layer.size(2),
-                       query_layer.size(0),
-                       value_layer.size(3))
-
-        # change view [sk, b * np, hn]
-        value_layer = value_layer.view(value_layer.size(0),
-                                       output_size[0] * output_size[1], -1)
-
-        # change view [b * np, sq, sk]
-        attention_probs = attention_probs.view(output_size[0] * output_size[1],
-                                               output_size[2], -1)
-
-        # matmul: [b * np, sq, hn]
-        context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
-
-        # change view [b, np, sq, hn]
-        context_layer = context_layer.view(*output_size)
-
-        # [b, np, sq, hn] --> [sq, b, np, hn]
-        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
-
-        # [sq, b, np, hn] --> [sq, b, hp]
-        new_context_layer_shape = context_layer.size()[:-2] + \
-            (self.hidden_size_per_partition,)
-        context_layer = context_layer.view(*new_context_layer_shape)
-
-        return context_layer
-
-
-class FlashSelfAttention(torch.nn.Module):
-    """Implement the scaled dot product attention with softmax.
-    Arguments
-    ---------
-        softmax_scale: The temperature to use for the softmax attention.
-                      (default: 1/sqrt(d_keys) where d_keys is computed at
-                      runtime)
-        attention_dropout: The dropout rate to apply to the attention
-                           (default: 0.0)
-    """
-    def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0,
-                 device=None, dtype=None):
-        super().__init__()
-        assert flash_attn_unpadded_func is not None, ('Please install FlashAttention first, '
-                                                      'e.g., with pip install flash-attn')
-        assert rearrange is not None, 'Please install einops first, e.g., with pip install einops'
-        self.causal = causal
-        self.softmax_scale = softmax_scale
-        self.dropout_p = attention_dropout
-
-    def forward(self, q, k, v):
-        """Implements the multihead softmax attention.
-        Arguments
-        ---------
-            q, k, v: The tensor containing the query, key, and value. (B, S, H, D)
-        """
-
-        assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q,k,v)))
-        assert all((i.is_cuda for i in (q,k,v)))
-
-        batch_size, seqlen_q = q.shape[0], q.shape[1]
-        seqlen_k = k.shape[1]
-
-        q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]]
-        # if os.getenv('ENABLE_FLASH_ATTENTION_WITH_IXDNN', '0') != '0':
-        #     cu_seqlens_q = torch.empty((batch_size), dtype=torch.int32, device=q.device)
-        # else:
-        cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32,
-                                        device=q.device)
-
-        if self.training:
-            # during training q,k,v always have same seqlen
-            assert seqlen_k == seqlen_q
-
-            is_causal = self.causal
-            cu_seqlens_k = cu_seqlens_q
-            dropout_p = self.dropout_p
-        else:
-            # turn off FA causal mask after first inference autoregressive iteration
-            # only on first autoregressive step q,k,v have same seqlen
-            is_causal = seqlen_q == seqlen_k
-            cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32,
-                        device=q.device)
-            dropout_p = 0
-
-        output = flash_attn_unpadded_func(
-            q, k, v, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k,
-            dropout_p,
-            softmax_scale=self.softmax_scale, causal=is_causal
-        )
-
-        output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
-        return output
-
-
-class ParallelAttention(MegatronModule):
-    """Parallel self-attention layer abstract class.
-
-    Self-attention layer takes input with size [s, b, h]
-    and returns output of the same size.
-    """
-
-    def __init__(self, config, layer_number,
-                 attention_type=AttnType.self_attn,
-                 attn_mask_type=AttnMaskType.padding,
-                 rlhf_training=False):
-        super(ParallelAttention, self).__init__()
-        if rlhf_training:
-            args = get_rlhf_args()
-        else:
-            args = get_args()
-        self.layer_number = max(1, layer_number)
-        self.attention_type = attention_type
-        self.attn_mask_type = attn_mask_type
-        self.params_dtype = config.params_dtype
-        self.sequence_parallel = config.sequence_parallel
-
-        self.group_query_attention = args.group_query_attention
-        self.num_query_groups = args.num_query_groups
-
-        query_projection_size = config.kv_channels * config.num_attention_heads
-        if self.group_query_attention:
-            kv_projection_size = args.kv_channels * args.num_query_groups
-        else:
-            kv_projection_size = args.kv_channels * args.num_attention_heads
-
-        self.use_flash_attn = args.use_flash_attn \
-            and attention_type == AttnType.self_attn \
-            and self.attn_mask_type == AttnMaskType.causal
-        if self.use_flash_attn:
-            if flash_attn_unpadded_func is None:
-                raise ImportError('FlashAttention is not installed, please install with '
-                                  'pip install flash-attn')
-            assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports '
-                                                          'self-attention for now')
-            assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only '
-                                                                'supports causal mask for now')
-            if rearrange is None:
-                raise ImportError('einops is not installed, please install with pip install einops')
-
-        # Per attention head and per partition values.
-        world_size = mpu.get_tensor_model_parallel_world_size()
-        self.hidden_size_per_attention_head = core.utils.divide(
-            query_projection_size, config.num_attention_heads)
-        self.num_attention_heads_per_partition = core.utils.divide(
-            config.num_attention_heads, world_size)
-
-        if self.group_query_attention:
-            if args.num_query_groups % world_size != 0:
-                raise NotImplementedError('Currently the num_query_groups should be '
-                                          'a multiple of the tensor parallel size')
-            self.num_query_groups_per_partition = core.utils.divide(
-                        args.num_query_groups, world_size)
-        else:
-            self.num_query_groups_per_partition = self.num_attention_heads_per_partition
-
-        # Strided linear layer.
-        if attention_type == AttnType.self_attn:
-            self.query_key_value = tensor_parallel.ColumnParallelLinear(
-                config.hidden_size,
-                query_projection_size + 2 * kv_projection_size,
-                config=config,
-                init_method=config.init_method,
-                bias=args.add_bias_linear,
-                gather_output=False)
-        else:
-            assert attention_type == AttnType.cross_attn
-
-            if self.group_query_attention:
-                raise NotImplementedError("Grouped query attention not implemented for cross-attention.")
-            assert query_projection_size == kv_projection_size
-
-            self.query = tensor_parallel.ColumnParallelLinear(
-                config.hidden_size,
-                query_projection_size,
-                config=config,
-                init_method=config.init_method,
-                bias=config.add_bias_linear,
-                gather_output=False)
-
-            self.key_value = tensor_parallel.ColumnParallelLinear(
-                config.hidden_size,
-                2 * kv_projection_size,
-                config=config,
-                init_method=config.init_method,
-                bias=config.add_bias_linear,
-                gather_output=False)
-
-        self.core_attention = CoreAttention(self.layer_number, config,
-                                            self.attn_mask_type)
-        self.checkpoint_core_attention = config.recompute_granularity == 'selective'
-
-        if self.use_flash_attn:
-            self.core_attention_flash = FlashSelfAttention(
-                causal=True, attention_dropout=config.attention_dropout
-            )
-
-        # Output.
-        self.dense = tensor_parallel.RowParallelLinear(
-            query_projection_size,
-            config.hidden_size,
-            config=config,
-            init_method=config.output_layer_init_method,
-            bias=args.add_bias_linear,
-            input_is_parallel=True,
-            skip_bias_add=True)
-
-
-    def _checkpointed_attention_forward(self, query_layer, key_layer,
-                                        value_layer, attention_mask,
-                                        rotary_pos_emb=None):
-        """Forward method with activation checkpointing."""
-        def custom_forward(*inputs):
-            query_layer = inputs[0]
-            key_layer = inputs[1]
-            value_layer = inputs[2]
-            attention_mask = inputs[3]
-            output_ = self.core_attention(query_layer, key_layer,
-                                          value_layer, attention_mask)
-            return output_
-
-        q_pos_emb, k_pos_emb = (None, None) if rotary_pos_emb is None \
-            else rotary_pos_emb
-
-        hidden_states = tensor_parallel.checkpoint(
-            custom_forward,
-            False, query_layer, key_layer, value_layer, attention_mask,
-            q_pos_emb, k_pos_emb)
-
-        return hidden_states
-
-    def _allocate_memory(self, inference_max_sequence_len, batch_size, num_attention_heads):
-        return torch.empty(
-            inference_max_sequence_len,
-            batch_size,
-            num_attention_heads,
-            self.hidden_size_per_attention_head,
-            dtype=self.params_dtype,
-            device=torch.cuda.current_device())
-
-    def repeat_kv(self, hidden_states, n_rep):
-        slen, batch, num_key_value_heads_per_partition, head_dim = hidden_states.shape
-        if n_rep == 1:
-            return hidden_states
-        hidden_states = hidden_states[:, :, :, None, :].expand(
-            slen, batch, num_key_value_heads_per_partition, n_rep, head_dim)
-        return hidden_states.reshape(slen, batch,
-                                     num_key_value_heads_per_partition * n_rep,
-                                     head_dim)
-                                     
-    def split_tensor(self, mixed_x_layer):
-        query_layer = mixed_x_layer[:, :, :, :-2, :].reshape(mixed_x_layer.shape[:2] + (-1, self.hidden_size_per_attention_head))
-        key_layer = mixed_x_layer[:, :, :, -2, :]
-        value_layer = mixed_x_layer[:, :, :, -1, :]
-
-        return query_layer, key_layer, value_layer
-
-    def forward(self, hidden_states, attention_mask, position_ids=None,
-                encoder_output=None, inference_params=None,
-                rotary_pos_emb=None):
-        # hidden_states: [sq, b, h]
-
-        # Inference or Forward 使用, 会影响 RoPE
-        if position_ids is not None:
-            # position_ids = position_ids.transpose(1, 0) #[s, b]
-            ## 适配rope fused kernel
-            position_ids = position_ids.transpose(1, 0)[:, 0].unsqueeze(-1) #[s, b] -> [s, b] -> [s, 1]  rope position ids embedding 在同一位置是一样的
-
-        # =================================================
-        # Pre-allocate memory for key-values for inference.
-        # =================================================
-        is_first_step = False
-        if inference_params:
-            if self.layer_number not in inference_params.key_value_memory_dict:
-                inf_max_seq_len = inference_params.max_sequence_length
-                inf_max_batch_size = inference_params.max_batch_size
-                inference_key_memory = self._allocate_memory(
-                    inf_max_seq_len, inf_max_batch_size,
-                    self.num_query_groups_per_partition)
-                inference_value_memory = self._allocate_memory(
-                    inf_max_seq_len, inf_max_batch_size,
-                    self.num_query_groups_per_partition)
-
-                inference_params.key_value_memory_dict[self.layer_number] = (
-                    inference_key_memory, inference_value_memory)
-                is_first_step = True
-            else:
-                inference_key_memory, inference_value_memory = \
-                    inference_params.key_value_memory_dict[self.layer_number]
-
-            # 存储 inference position_ids
-            if is_first_step and position_ids is not None \
-                    and "position_ids" not in inference_params.key_value_memory_dict:
-                inference_params.key_value_memory_dict["position_ids"] = position_ids
-
-        # =====================
-        # Query, Key, and Value
-        # =====================
-        if self.attention_type == AttnType.self_attn:
-            # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)]
-            mixed_x_layer, _ = self.query_key_value(hidden_states, inference_params=inference_params)
-
-            # [sq, b, ((nq + 2 * nkv) * hn)] --> [sq, b, nkv, (nq // nkv + 2), hn]
-            new_tensor_shape = mixed_x_layer.size()[:-1] + (
-                self.num_query_groups_per_partition,
-                (
-                    (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2)
-                    * self.hidden_size_per_attention_head
-                ),
-            )
-            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
-
-            # [sq, b, nkv, (nq // nkv + 2), hn] --> 3 [sq, b, np, hn]
-            (query_layer,
-             key_layer,
-             value_layer) = torch.split(
-                mixed_x_layer,
-                [
-                    (
-                        self.num_attention_heads_per_partition // self.num_query_groups_per_partition
-                        * self.hidden_size_per_attention_head
-                    ),
-                    self.hidden_size_per_attention_head,
-                    self.hidden_size_per_attention_head
-                ],
-                dim=3)
-            query_layer = query_layer.contiguous().view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
-
-        else:
-            
-            # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
-            mixed_kv_layer, _ = self.key_value(encoder_output)
-
-            # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn]
-            new_tensor_shape = mixed_kv_layer.size()[:-1] + \
-                (self.num_attention_heads_per_partition,
-                2 * self.hidden_size_per_attention_head)
-            mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape)
-
-            # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn]
-            (key_layer,
-            value_layer) = tensor_parallel.split_tensor_along_last_dim(mixed_kv_layer, 2)
-
-            # Attention head [sq, b, h] --> [sq, b, hp]
-            query_layer, _ = self.query(hidden_states)
-            # [sq, b, hp] --> [sq, b, np, hn]
-            new_tensor_shape = query_layer.size()[:-1] + \
-                (self.num_attention_heads_per_partition,
-                self.hidden_size_per_attention_head)
-            query_layer = query_layer.view(*new_tensor_shape)
-
-        # ==================================
-        # Adjust key and value for inference
-        # ==================================
-
-        # duplicate the pos_emb for self attention
-        if rotary_pos_emb is not None:
-            if isinstance(rotary_pos_emb, tuple):
-                rotary_pos_emb = rotary_pos_emb
-            else:
-                rotary_pos_emb = ((rotary_pos_emb,) * 2)
-
-        if inference_params:
-            batch_start = inference_params.batch_size_offset
-            batch_end = batch_start + key_layer.size(1)
-            assert batch_end <= inference_key_memory.size(1)
-            sequence_start = inference_params.sequence_len_offset
-            sequence_end = sequence_start + key_layer.size(0)
-            assert sequence_end <= inference_key_memory.size(0)
-            # Copy key and values.
-            inference_key_memory[sequence_start:sequence_end,
-                                 batch_start:batch_end, ...] = key_layer
-            inference_value_memory[sequence_start:sequence_end,
-                                   batch_start:batch_end, ...] = value_layer
-            key_layer = inference_key_memory[
-                :sequence_end, batch_start:batch_end, ...]
-            value_layer = inference_value_memory[
-                :sequence_end, batch_start:batch_end, ...]
-
-
-            # adjust the key rotary positional embedding
-            if rotary_pos_emb is not None:
-                q_pos_emb, k_pos_emb = rotary_pos_emb
-                # need to cross check this condition during inference
-                if not is_first_step:
-                    # In inference, we compute one token at a time.
-                    # Select the correct query positional embedding (only the last token in the sequence)
-                    if position_ids is not None:
-                        # 取 last position_id 对应的 q_pos_emb
-                        assert position_ids.shape[0] == 1
-                        # cur_pos_id = position_ids[-1].item()
-                        q_pos_emb = q_pos_emb[position_ids].squeeze(2) # [1, bs, 1, dim]
-
-                        # 取 position_id 对应的 k_pos_emb
-                        k_pos_emb = k_pos_emb.squeeze(1).squeeze(1) # [max_seq, dim]
-                        mem_position_ids = inference_params.key_value_memory_dict["position_ids"]
-                        if mem_position_ids.shape[0] == sequence_end:
-                            k_pos_emb = k_pos_emb[mem_position_ids].unsqueeze(2) # [sequence_end, b, 1, dim]
-                        elif mem_position_ids.shape[0] == sequence_end - 1:
-                            new_position_ids = torch.concat((mem_position_ids, position_ids), 0)
-                            k_pos_emb = k_pos_emb[new_position_ids].unsqueeze(2) # [sequence_end, b, 1, dim]
-                            inference_params.key_value_memory_dict["position_ids"] = new_position_ids # update memory position_ids
-                        else:
-                            raise Exception("input position_ids shape wrong.")
-                    else:
-                        q_pos_emb = q_pos_emb[sequence_end - 1 : sequence_end] # [1, 1, 1, dim]
-                        k_pos_emb = k_pos_emb[:sequence_end, :, :, :] # [sequence_end, 1, 1, dim]
-                else:
-                    # In the first forward pass of inference, we use the entire provided prefix.
-                    # q_pos_emb here has the rope embeddings of the entire prefix + to-be-generated output
-                    # so we slice to just the prefix.
-                    if position_ids is not None:
-                        assert position_ids.shape[0] <= q_pos_emb.shape[0] and q_pos_emb.shape[0] == k_pos_emb.shape[0]
-                        q_pos_emb = q_pos_emb.squeeze(1).squeeze(1) # [max_seq, dim]
-                        q_pos_emb = q_pos_emb[position_ids].unsqueeze(2) # [s, b, 1, dim]
-                        k_pos_emb = k_pos_emb.squeeze(1).squeeze(1) # [max_seq, dim]
-                        k_pos_emb = k_pos_emb[position_ids].unsqueeze(2) # [s, b, 1, dim]
-                    else:
-                        q_pos_emb = q_pos_emb[:sequence_end, :, :, :] # [sequence_end, 1, 1, dim]
-                        k_pos_emb = k_pos_emb[:sequence_end, :, :, :] # [sequence_end, 1, 1, dim]
-
-                rotary_pos_emb = (q_pos_emb, k_pos_emb)
-
-
-        # ==================================
-        # core attention computation
-        # ==================================
-
-        # expand the key_layer and value_layer [sk, b, ng, hn] -> [sk, b, np, hn]
-        if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1:
-            key_layer = key_layer.repeat_interleave(
-                self.num_attention_heads_per_partition // self.num_query_groups_per_partition,
-                dim = 2
-            )
-            value_layer = value_layer.repeat_interleave(
-                self.num_attention_heads_per_partition // self.num_query_groups_per_partition,
-                dim = 2
-            )
-
-        # apply relative positional encoding (rotary embedding)
-        if rotary_pos_emb is not None:
-            q_pos_emb, k_pos_emb = rotary_pos_emb
-            # query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb)
-            # key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb)
-            query_layer = fused_apply_rotary_pos_emb(query_layer, q_pos_emb)
-            key_layer = fused_apply_rotary_pos_emb(key_layer, k_pos_emb)
-            # TODO, can apply positional embedding to value_layer so it has
-            # absolute positional embedding.
-            # otherwise, only relative positional embedding takes effect
-            # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb)
-
-        if not self.use_flash_attn:
-            if self.checkpoint_core_attention:
-                context_layer = self._checkpointed_attention_forward(
-                    query_layer, key_layer, value_layer, attention_mask)
-            else:
-                context_layer = self.core_attention(
-                    query_layer, key_layer, value_layer, attention_mask)
-        else:
-            q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous()
-                       for x in (query_layer, key_layer, value_layer)]
-            if not self.sequence_parallel:
-                with tensor_parallel.get_cuda_rng_tracker().fork():
-                    context_layer = self.core_attention_flash(q, k, v)
-            else:
-                context_layer = self.core_attention_flash(q, k, v)
-            context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous()
-
-        # =================
-        # Output. [sq, b, h]
-        # =================
-
-        output, bias = self.dense(context_layer, inference_params)
-
-        return output, bias
-
-
-def bias_dropout_add(x, bias, residual, prob, training):
-    # type: (Tensor, Optional[Tensor], Tensor, float, bool) -> Tensor
-    if bias is not None:
-        x = x + bias
-    out = torch.nn.functional.dropout(x, p=prob, training=training)
-    out = residual + out
-    return out
-
-
-def get_bias_dropout_add(training):
-    def _bias_dropout_add(x, bias, residual, prob):
-        return bias_dropout_add(x, bias, residual, prob, training)
-    return _bias_dropout_add
-
-
-@torch.jit.script
-def bias_dropout_add_fused_train(x: torch.Tensor,
-                                 bias: Optional[torch.Tensor],
-                                 residual: torch.Tensor,
-                                 prob: float) -> torch.Tensor:
-    return bias_dropout_add(x, bias, residual, prob, True)
-
-
-@torch.jit.script
-def bias_dropout_add_fused_inference(x: torch.Tensor,
-                                     bias: Optional[torch.Tensor],
-                                     residual: torch.Tensor,
-                                     prob: float) -> torch.Tensor:
-    return bias_dropout_add(x, bias, residual, prob, False)
-
-
-class ParallelTransformerLayer(MegatronModule):
-    """A single transformer layer.
-
-    Transformer layer takes input with size [s, b, h] and returns an
-    output of the same size.
-    """
-
-    def __init__(self, config,
-                 layer_number, layer_type=LayerType.encoder,
-                 self_attn_mask_type=AttnMaskType.padding,
-                 drop_path_rate=0., num_experts=1,
-                 rlhf_training=False):
-        if rlhf_training:
-            args = get_rlhf_args()
-        else:
-            args = get_args()
-        self.args = args
-
-        super(ParallelTransformerLayer, self).__init__()
-        self.layer_number = layer_number
-        self.layer_type = layer_type
-        
-        self.normalization = args.normalization
-        self.apply_residual_connection_post_norm \
-            = config.apply_residual_connection_post_layernorm
-
-        self.bf16 = config.bf16
-        self.fp32_residual_connection = config.fp32_residual_connection
-
-        # Normalize the input data.
-        self.input_norm = get_norm(config)
-
-        # Self attention.
-        self.self_attention = ParallelAttention(
-            config,
-            layer_number,
-            attention_type=AttnType.self_attn,
-            attn_mask_type=self_attn_mask_type,
-            rlhf_training=rlhf_training)
-        self.hidden_dropout = config.hidden_dropout
-        self.bias_dropout_fusion = config.bias_dropout_fusion
-        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else None
-
-        # Normalize the attention output
-        # self.post_attention_norm = get_norm(config)
-        if self.normalization != "RMSNorm":
-            self.post_attention_norm = get_norm(config)
-        else:
-            self.post_attention_norm = get_rmsnorm_residual(config)
-
-        # Cross attention.
-        if self.layer_type in (LayerType.decoder,
-                               LayerType.retro_decoder,
-                               LayerType.retro_decoder_with_retriever,
-                               LayerType.retro_encoder):
-            self.inter_attention = ParallelAttention(
-                config,
-                layer_number,
-                attention_type=AttnType.cross_attn,
-                rlhf_training=rlhf_training)
-            # Normalize the attention output.
-            self.post_inter_attention_norm = get_norm(config)
-
-        # MLP
-        self.num_experts = num_experts
-        if args.num_experts_switch is not None:
-            self.mlp = SwitchMLP(config)
-        else:
-            if self.num_experts <= 1: # dense, not MoE
-                self.mlp = ParallelMLP(config, rlhf_training=rlhf_training)
-            else: # DeepSpeed's MoE
-                enable_expert_tensor_parallelism = args.enable_expert_tensor_parallelism
-                self.mlp = MoE(args.hidden_size,
-                                ParallelMLP(config,
-                                    moe=True,
-                                    enable_expert_tensor_parallelism=enable_expert_tensor_parallelism),
-                                num_experts=self.num_experts,
-                                ep_size=args.moe_expert_parallel_size,
-                                k=args.topk,
-                                use_residual=(args.mlp_type == 'residual'),
-                                capacity_factor=args.moe_train_capacity_factor,
-                                eval_capacity_factor=args.moe_eval_capacity_factor,
-                                min_capacity=args.moe_min_capacity,
-                                drop_tokens=args.moe_token_dropping, use_tutel=args.use_tutel,
-                                enable_expert_tensor_parallelism=enable_expert_tensor_parallelism)
-
-        # Set bias+dropout+add fusion grad_enable execution handler.
-        TORCH_MAJOR = int(torch.__version__.split('.')[0])
-        TORCH_MINOR = int(torch.__version__.split('.')[1])
-        use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)
-        self.bias_dropout_add_exec_handler = \
-                nullcontext if use_nvfuser else torch.enable_grad
-
-        if args.retro_add_retriever:
-            retro_args = get_retro_args()
-            self.retro_num_neighbors = args.retro_num_neighbors
-            self.retro_chunk_length = retro_args.retro_gpt_chunk_length
-            self.retro_retrieved_length = retro_args.retro_gpt_retrieved_length
-
-        # Retriever (bi-directional transformer with cross attention)
-        if layer_type == LayerType.retro_decoder_with_retriever:
-            self.retriever = ParallelTransformer(
-                config=config,
-                model_type=ModelType.retro_encoder,
-                self_attn_mask_type=AttnMaskType.padding,
-                pre_process=True,
-                post_process=False,
-            )
-            self._retriever_key = 'retriever'
-        else:
-            self.retriever = None
-
-    def default_decoder_cross_attention(self,
-                                        encoder_output,
-                                        enc_dec_attn_mask,
-                                        norm_input,
-                                        norm_output,
-                                        bias_dropout_add_func):
-        '''Cross attention for a standard encoder-decoder model.'''
-
-        # Attention.
-        attention_output, attention_bias = \
-            self.inter_attention(norm_output,
-                                 enc_dec_attn_mask,
-                                 encoder_output=encoder_output)
-
-        # Residual connection.
-        if self.apply_residual_connection_post_norm:
-            residual = norm_output
-        else:
-            residual = norm_input
-
-        if attention_bias is not None:
-            attention_bias = attention_bias.expand_as(residual)
-
-        # Bias-dropout-add.
-        with self.bias_dropout_add_exec_handler():
-            norm_input = bias_dropout_add_func(
-                attention_output,
-                attention_bias,
-                residual,
-                self.hidden_dropout)
-
-        # Normalize.
-        norm_output = self.post_inter_attention_norm(norm_input)
-
-        return norm_input, norm_output
-
-    def retro_encoder_cross_attention(self,
-                                      retriever_output,
-                                      norm_input,
-                                      norm_output,
-                                      bias_dropout_add_func):
-        """Cross attention for Retro encoder.
-
-        Notation:
-            ns : Sequence length.
-            bs : Batch size.
-            d  : Hidden size.
-            l  : Number of chunks per sample (i.e., seq_length/chunk_length).
-            k  : Number of neighbors.
-            r  : Number of retrieved tokens (neighbors + continuation).
-        """
-
-        ns, bs, d = norm_output.shape # [r, bs * l * k, d]
-
-        # Divide sequence dimension into chunks.
-        chunked_outputs = norm_output.reshape(self.retro_retrieved_length,
-                                              -1,
-                                              self.retro_num_neighbors,
-                                              d)
-        chunked_outputs_before_norm = \
-            norm_input.reshape(self.retro_retrieved_length, -1,
-                               self.retro_num_neighbors, d) # [r, bs*l, k, d]
-
-        # Per-chunk attention.
-        norm_inputs = []
-        norm_outputs = []
-        for k in range(self.retro_num_neighbors):
-
-            # Attention.
-            chunked_output = chunked_outputs[:,:,k].contiguous()
-            attention_output, attention_bias = \
-                self.inter_attention(
-                    chunked_output, # Q (neighbor embedding)
-                    None,
-                    encoder_output=retriever_output) # K, V (hidden act)
-
-            # Residual connection.
-            if self.apply_residual_connection_post_norm:
-                residual = chunked_output
-            else:
-                residual = chunked_outputs_before_norm[:,:,k]
-
-            # Re-enable torch grad to enable fused optimization.
-            with torch.enable_grad():
-                norm_input = bias_dropout_add_func(
-                    attention_output,
-                    None if attention_bias is None else attention_bias.expand_as(residual),
-                    residual,
-                    self.hidden_dropout)
-                norm_inputs.append(norm_input)
-
-            # Layer norm.
-            norm_output = self.post_inter_attention_norm(norm_input)
-            norm_outputs.append(norm_output)
-
-        # Concatenate layer norms.
-        # norm_input : [r, k * bs * l, d]
-        # norm_output : [r, k * bs * l, d]
-        norm_input = torch.stack(norm_inputs, dim=1).reshape(ns, bs, d)
-        norm_output = torch.stack(norm_outputs, dim=1).reshape(ns, bs, d)
-
-        return norm_input, norm_output
-
-    def retro_decoder_cross_attention(self,
-                                      retriever_input,
-                                      retriever_output,
-                                      retriever_attn_mask,
-                                      norm_input,
-                                      norm_output,
-                                      inference_params,
-                                      bias_dropout_add_func):
-        """Cross attention for Retro decoder.
-
-        Notation:
-            ns : Sequence length.
-            bs : Batch size.
-            d  : Hidden size.
-            l  : Number of chunks per sample (i.e., seq_length/chunk_length).
-            m  : Number of tokens per chunk.
-            k  : Number of neighbors.
-            r  : Number of retrieved tokens (neighbors + continuation).
-        """
-
-        ns, bs, d = norm_output.shape
-        l = int(np.ceil(ns / self.retro_chunk_length))
-
-        # Retrieve neighbors.
-        if self.layer_type == LayerType.retro_decoder_with_retriever:
-            first_ns = ns % self.retro_chunk_length
-            if first_ns > 0:
-                raise Exception("test this case.")
-                first_chunk, rest_chunk = \
-                    norm_output[:first_ns], norm_output[first_ns:]
-                first_chunk = torch.nn.functional.pad(
-                    first_chunk,
-                    (0, 0, 0, 0, 0, self.retro_chunk_length - first_ns),
-                    'constant',
-                    0)
-                chunked_output = \
-                    torch.cat((first_chunk, rest_chunk), dim=0) # [l * m, bs, d]
-            else:
-                chunked_output = norm_output # [l * m, bs, d]
-            chunked_output = chunked_output \
-                .reshape(l, self.retro_chunk_length, bs, d) \
-                .permute(1, 2, 0, 3) \
-                .reshape(self.retro_chunk_length, bs * l, d) \
-                .contiguous()
-
-            # Get Encoder Output
-            retriever_output = self.retriever(
-                hidden_states=retriever_input,
-                attention_mask=retriever_attn_mask,
-                retriever_output=chunked_output,
-                retriever_attn_mask=retriever_attn_mask,
-                inference_params=inference_params) # [r, k * bs * l , d]
-            retriever_output = retriever_output.reshape(
-                self.retro_retrieved_length * self.retro_num_neighbors, bs * l, d) # [r * k, bs * l, d]
-
-        # Chunks.
-        pad = (ns - 1) % self.retro_chunk_length
-        attending_chunks = norm_output[pad:]
-        padded_chunks = torch.nn.functional.pad(
-            attending_chunks,
-            (0, 0, 0, 0, 0, self.retro_chunk_length - 1),
-            'constant', 0)
-        padded_chunked_output = padded_chunks \
-            .reshape(l, self.retro_chunk_length, bs, d) \
-            .permute(1, 2, 0, 3)
-        padded_chunked_output = padded_chunked_output.reshape(
-            self.retro_chunk_length, bs * l, d).contiguous()
-
-        # Encoder output.
-        attention_output, attention_bias = \
-            self.inter_attention(padded_chunked_output,
-                                 None,
-                                 encoder_output=retriever_output)
-
-        # Residual connection.
-        if self.apply_residual_connection_post_norm:
-            residual = norm_output
-        else:
-            residual = norm_input
-
-        # Re-enable torch grad to enable fused optimization.
-        with torch.enable_grad():
-            norm_input = bias_dropout_add_func(
-                attention_output,
-                None if attention_bias is None else attention_bias.expand_as(attention_output),
-                torch.zeros_like(attention_output),
-                self.hidden_dropout)
-            norm_input = norm_input \
-                .reshape(self.retro_chunk_length, bs, l, d) \
-                .permute(2, 0, 1, 3) # [l, m, bs, d]
-            norm_input = norm_input.reshape(self.retro_chunk_length * l, bs, d)
-            norm_input = torch.nn.functional.pad(
-                norm_input,
-                (0, 0, 0, 0, pad, 0),
-                'constant', 0)[:ns] # [ns, b, d]
-            norm_input = norm_input + residual
-
-        # Layer norm post the decoder attention
-        norm_output = self.post_inter_attention_norm(norm_input)
-
-        return retriever_output, norm_input, norm_output
-
-
-    def forward(self, hidden_states, attention_mask,
-                position_ids=None,
-                encoder_output=None, enc_dec_attn_mask=None,
-                retriever_input=None,
-                retriever_output=None,
-                retriever_attn_mask=None,
-                inference_params=None,
-                rotary_pos_emb=None):
-        # hidden_states: [s, b, h]
-
-        # Layer norm at the beginning of the transformer layer.
-        norm_output = self.input_norm(hidden_states)
-
-        # Self attention.
-        attention_output, attention_bias = \
-            self.self_attention(
-                norm_output,
-                attention_mask,
-                position_ids=position_ids,
-                inference_params=inference_params,
-                rotary_pos_emb=rotary_pos_emb)
-
-        # Residual connection.
-        if self.apply_residual_connection_post_norm:
-            residual = norm_output
-        else:
-            residual = hidden_states
-
-        if self.drop_path is None:
-            # jit scripting for a nn.module (with dropout) is not
-            # trigerring the fusion kernel. For now, we use two
-            # different nn.functional routines to account for varying
-            # dropout semantics during training and inference phases.
-            if self.bias_dropout_fusion:
-                if self.training:
-                    bias_dropout_add_func = bias_dropout_add_fused_train
-                else:
-                    bias_dropout_add_func = bias_dropout_add_fused_inference
-            else:
-                bias_dropout_add_func = get_bias_dropout_add(self.training)
-
-            if attention_bias is not None:
-                attention_bias = attention_bias.expand_as(residual)
-            with self.bias_dropout_add_exec_handler():
-            #     norm_input = bias_dropout_add_func(
-            #         attention_output,
-            #         attention_bias,
-            #         residual,
-            #         self.hidden_dropout)
-                if self.normalization != "RMSNorm":
-                    norm_input = bias_dropout_add_func(
-                        attention_output,
-                        attention_bias,
-                        residual,
-                        self.hidden_dropout)
-                else:
-                    if attention_bias is not None:
-                        attention_output = attention_output + attention_bias
-                    out = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
-                    norm_output, norm_input = self.post_attention_norm(out, residual)
-            
-        else:
-            out = torch.nn.functional.dropout(attention_output + attention_bias,
-                                              p=self.hidden_dropout,
-                                              training=self.training)
-            # norm_input = residual + self.drop_path(out)
-            if self.normalization != "RMSNorm":
-                if self.normalization != "RMSNorm":
-                    norm_input = residual + self.drop_path(out)
-            else:
-                norm_output, norm_input = self.post_attention_norm(self.drop_path(out), residual)
-                
-            
-            
-            
-        # Layer norm post the self attention.
-        # norm_output = self.post_attention_norm(norm_input)
-        if self.normalization != "RMSNorm":
-            norm_output = self.post_attention_norm(norm_input)
-        
-        # Cross attention.
-        if self.layer_type == LayerType.encoder:
-            pass
-        elif self.layer_type == LayerType.decoder:
-            norm_input, norm_output = \
-                self.default_decoder_cross_attention(
-                    encoder_output,
-                    enc_dec_attn_mask,
-                    norm_input,
-                    norm_output,
-                    bias_dropout_add_func)
-        elif self.layer_type == LayerType.retro_encoder:
-            norm_input, norm_output = \
-                self.retro_encoder_cross_attention(
-                    retriever_output,
-                    norm_input,
-                    norm_output,
-                    bias_dropout_add_func)
-        elif self.layer_type in (LayerType.retro_decoder,
-                                 LayerType.retro_decoder_with_retriever):
-            retriever_output, norm_input, norm_output = \
-                self.retro_decoder_cross_attention(
-                    retriever_input,
-                    retriever_output,
-                    retriever_attn_mask,
-                    norm_input,
-                    norm_output,
-                    inference_params,
-                    bias_dropout_add_func)
-        else:
-            raise Exception("Unsupported layer type, '%s'." %
-                            self.layer_type.name)
-
-        # MLP.
-        mlp_bias = torch.tensor(0.0, device=norm_output.device, dtype=norm_output.dtype)
-        moe_loss = torch.tensor(0.0, device=norm_output.device, dtype=norm_output.dtype)
-
-        mlp_output, mlp_bias = self.mlp(norm_output, inference_params)
-        # Second residual connection.
-        if self.apply_residual_connection_post_norm:
-            residual = norm_output
-        else:
-            residual = norm_input
-
-        if self.drop_path is None:
-            if mlp_bias is not None:
-                mlp_bias = mlp_bias.expand_as(residual)
-            with self.bias_dropout_add_exec_handler():
-                output = bias_dropout_add_func(
-                    mlp_output,
-                    mlp_bias,
-                    residual,
-                    self.hidden_dropout)
-
-            # Jit compiled function creates 'view' tensor. This tensor
-            # potentially gets saved in the MPU checkpoint function context,
-            # which rejects view tensors. While making a viewless tensor here
-            # won't result in memory savings (like the data loader, or
-            # p2p_communication), it serves to document the origin of this
-            # 'view' tensor.
-            output = core.utils.make_viewless_tensor(inp = output,
-                                                     requires_grad = output.requires_grad,
-                                                     keep_graph = True)
-
-        else:
-            if mlp_bias is not None:
-                mlp_output = mlp_output + mlp_bias
-            out = torch.nn.functional.dropout(mlp_output,
-                                              p=self.hidden_dropout,
-                                              training=self.training)
-            output = residual + self.drop_path(out)
-
-        if self.args.deepspeed:
-            if self.layer_type == LayerType.retro_decoder_with_retriever:
-                return output, retriever_output, moe_loss
-            else:
-                return output, moe_loss
-        else:
-            if self.layer_type == LayerType.retro_decoder_with_retriever:
-                return output, retriever_output
-            else:
-                return output
-
-
-class ParallelTransformerLayerPipe(ParallelTransformerLayer):
-    """Extends ParallelTransformerLayer to forward attention_mask through the pipeline.
-
-    Forward has two usages that affect attention mask communication:
-
-    1) forward((input, attn_mask) , **kwargs) -> (output, mask)
-       When the attention mask is provided as the second positional
-       argument, typical pipeline behavior is used and both the output
-       *and* mask are returned in a tuple. This tuple is then forwarded
-       to the next stage in the pipeline.
-
-       This version is useful if masks are dynamic.
-
-    2) forward(input, **kwargs) -> output
-       When the mask is static over all samples, it is advantageous to
-       cache the mask and avoid communicating it.
-
-       If no mask is provided, the module will query `self._args.attn_mask`
-       for the mask and only return `super().forward(...)`
-    """
-    def forward(self, inputs, **kwargs):
-        assert torch.is_tensor(inputs) or isinstance(inputs, tuple)
-        if not hasattr(self, '_args'):
-            self._args = get_args()
-        rotary_pos_emb = self._args.rotary_pos_emb if self._args.use_rotary_position_embeddings else None
-        if torch.is_tensor(inputs) or len(inputs) == 1:
-            # No attention mask forwarded, search for args.attn_mask
-            hidden_states, attention_mask = inputs, self._args.attn_mask
-            # HACK: currently MoE model does not support pipeline parallel, so
-            # here we just ignore the moe_loss returned by forward()
-            return super().forward(hidden_states, attention_mask, **kwargs, rotary_pos_emb=rotary_pos_emb)[0]
-        elif len(inputs) == 2:
-            # Attention mask is an activation.
-            hidden_states, attention_mask = inputs[0], inputs[1]
-            # HACK: currently MoE model does not support pipeline parallel, so
-            # here we just ignore the moe_loss returned by forward()
-            return super().forward(*inputs, **kwargs, rotary_pos_emb=rotary_pos_emb)[0], attention_mask
-        else:
-            raise RuntimeError('Received more inputs than understood.')
-
-
-class NoopTransformerLayer(MegatronModule):
-    """A single 'no-op' transformer layer.
-
-    The sole purpose of this layer is for when a standalone embedding layer
-    is used (i.e., args.standalone_embedding_stage == True). In this case,
-    zero transformer layers are assigned when pipeline rank == 0. Additionally,
-    when virtual pipeline rank >= 1, zero total model parameters are created
-    (virtual rank 0 contains the input embedding). This results in the model's
-    input and output tensors being the same, which causes an error when
-    performing certain memory optimiations on the output tensor (e.g.,
-    deallocating it). Thus, this layer disconnects the input from the output
-    via a clone. Since ranks containing a no-op layer are generally under-
-    utilized (both compute and memory), there's no worry of any performance
-    degredation.
-    """
-
-    def __init__(self, layer_number):
-        super().__init__()
-        self.layer_number = layer_number
-
-    def forward(self, hidden_states, attention_mask,
-                encoder_output=None, enc_dec_attn_mask=None,
-                inference_params=None):
-        return hidden_states.clone()
-
-
-def _get_num_layers(args, model_type, is_decoder=False):
-    """Compute the number of transformer layers resident on the current rank."""
-    is_encoder_and_decoder_model = (model_type == ModelType.encoder_and_decoder)
-    if model_type == ModelType.retro_encoder:
-        num_layers = args.retro_encoder_layers
-    elif mpu.get_pipeline_model_parallel_world_size() > 1:
-        if is_encoder_and_decoder_model:
-            assert args.pipeline_model_parallel_split_rank is not None
-
-            # When a standalone embedding stage is used, a rank is taken from
-            # the encoder's ranks, to be used for the encoder's embedding
-            # layer. This way, the rank referenced by the 'split rank' remains
-            # the same whether or not a standalone embedding stage is used.
-            num_ranks_in_encoder = (
-                args.pipeline_model_parallel_split_rank - 1
-                if args.standalone_embedding_stage else
-                args.pipeline_model_parallel_split_rank
-            )
-            num_ranks_in_decoder = args.transformer_pipeline_model_parallel_size - num_ranks_in_encoder
-            assert args.encoder_num_layers % num_ranks_in_encoder == 0, \
-                    'encoder_num_layers (%d) must be divisible by number of ranks given to encoder (%d)' % (args.encoder_num_layers, num_ranks_in_encoder)
-            assert args.decoder_num_layers % num_ranks_in_decoder == 0, \
-                    'decoder_num_layers (%d) must be divisible by number of ranks given to decoder (%d)' % (args.decoder_num_layers, num_ranks_in_decoder)
-            if mpu.is_pipeline_stage_before_split():
-                num_layers = (
-                    0
-                    if args.standalone_embedding_stage
-                    and mpu.get_pipeline_model_parallel_rank() == 0 else
-                    args.encoder_num_layers // num_ranks_in_encoder
-                )
-            else:
-                num_layers = args.decoder_num_layers // num_ranks_in_decoder
-        else:
-            if args.custom_partition == None:
-                assert args.num_layers % args.transformer_pipeline_model_parallel_size == 0, \
-                    'num_layers must be divisible by transformer_pipeline_model_parallel_size'
-            else:
-                assert args.num_layers == sum(args.custom_partition), \
-                    "total custom partition layers must equal to model transformer layers"
-
-            # When a standalone embedding stage is used, all transformer layers
-            # are divided among pipeline rank >= 1, while on pipeline rank 0,
-            # ranks either contain the input embedding layer (virtual pp rank 0),
-            # or no layers at all (virtual pp rank >= 1).
-
-            if args.custom_partition != None:
-                if args.virtual_pipeline_model_parallel_size is None:
-                    num_layers = args.custom_partition[mpu.get_pipeline_model_parallel_rank()]
-                else:
-                    num_layers =  args.custom_partition[mpu.get_virtual_pipeline_model_parallel_rank() * mpu.get_pipeline_model_parallel_world_size() \
-                                                        + mpu.get_pipeline_model_parallel_rank()]
-            else:
-                num_layers = (
-                    0
-                    if args.standalone_embedding_stage
-                    and mpu.get_pipeline_model_parallel_rank() == 0 else
-                    args.num_layers // args.transformer_pipeline_model_parallel_size
-                )
-    else:
-        num_layers = args.num_layers
-    return num_layers
-
-
-def _get_layer_type(model_type, default_layer_type, retro_layer_numbers,
-                    layer_number):
-    args = get_args()
-    if args.retro_add_retriever and layer_number in retro_layer_numbers:
-        if model_type == ModelType.retro_decoder:
-            return LayerType.retro_decoder_with_retriever \
-                if layer_number == retro_layer_numbers[0] \
-                   else LayerType.retro_decoder
-        elif model_type == ModelType.retro_encoder:
-            return LayerType.retro_encoder
-        else:
-            raise Exception("Unsupported model type, '%s'." % model_type)
-    else:
-        return default_layer_type
-
-
-class ParallelTransformer(MegatronModule):
-    """Transformer class."""
-
-    def __init__(self, config,
-                 model_type, layer_type=LayerType.encoder,
-                 self_attn_mask_type=AttnMaskType.padding,
-                 post_norm=True,
-                 pre_process=True,
-                 post_process=True,
-                 drop_path_rate=0.0,
-                 num_experts=[1],
-                 rlhf_training=False):
-        super(ParallelTransformer, self).__init__()
-        if rlhf_training:
-            args = get_rlhf_args()
-        else:
-            args = get_args()
-
-        self.layer_type = layer_type
-        self.model_type = model_type
-        self.bf16 = config.bf16
-        self.fp32_residual_connection = config.fp32_residual_connection
-        self.post_norm = post_norm
-        self.pre_process = pre_process
-        self.post_process = post_process
-        self.input_tensor = None
-        self.drop_path_rate = drop_path_rate
-        self.transformer_impl = args.transformer_impl
-        self.retro_add_retriever = args.retro_add_retriever
-
-        # Store activation checkpoiting flag.
-        self.recompute_granularity = config.recompute_granularity
-        self.recompute_method = config.recompute_method
-        self.recompute_num_layers = config.recompute_num_layers
-        self.distribute_saved_activations = \
-            config.distribute_saved_activations and not config.sequence_parallel
-
-        if args.custom_recompute_layers_per_stage is not None:
-            if args.virtual_pipeline_model_parallel_size != None:
-                self.recompute_num_layers = args.custom_recompute_layers_per_stage[mpu.get_virtual_pipeline_model_parallel_rank() * args.pipeline_model_parallel_size + mpu.get_pipeline_model_parallel_rank()]
-            else:
-                self.recompute_num_layers = args.custom_recompute_layers_per_stage[mpu.get_pipeline_model_parallel_rank()]
-
-        self.sequence_parallel = config.sequence_parallel
-
-        # Transformer Engine Init.
-        self.transformer_engine_v_0_10 = False
-        self.transformer_engine_v_0_11 = False
-        self.transformer_engine_v_0_8 = False
-        if self.transformer_impl == 'transformer_engine':
-            global transformer_engine
-            import transformer_engine
-            from importlib.metadata import version
-            from pkg_resources import packaging
-
-            # te_version = packaging.version.Version(version("transformer-engine"))
-            te_version = packaging.version.Version("2.4.1")
-            if te_version >= packaging.version.Version("0.8.0"):
-                self.transformer_engine_v_0_8 = True
-            if te_version >= packaging.version.Version("0.10.0"):
-                self.transformer_engine_v_0_10 = True
-            if te_version >= packaging.version.Version("0.11.0"):
-                self.transformer_engine_v_0_11 = True
-
-            del version, packaging
-
-            assert not args.squared_relu, "TransformerEngine does not support squared relu activation."
-
-        self.use_fp8 = args.fp8 is not None
-        self.fp8_recipe = None
-        self.fp8_group = None
-        if self.use_fp8:
-            assert args.transformer_impl == 'transformer_engine', \
-                'transformer-engine required for fp8 training and inference'
-            self.fp8_group = mpu.get_amax_reduction_group()
-            if args.fp8 == "e4m3":
-                fp8_format = transformer_engine.common.recipe.Format.E4M3
-            elif args.fp8 == "hybrid":
-                fp8_format = transformer_engine.common.recipe.Format.HYBRID
-            else:
-                raise ValueError("The DelayedScaling recipe only supports E4M3 and HYBRID formats.")
-            self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling(
-                margin=args.fp8_margin,
-                interval=args.fp8_interval,
-                fp8_format=fp8_format,
-                amax_history_len=args.fp8_amax_history_len,
-                amax_compute_algo=args.fp8_amax_compute_algo,
-                override_linear_precision=(False, False, not args.fp8_wgrad),
-            )
-
-        self.num_microbatches_in_previous_step = -1
-        self.microbatch_count = 0
-        self.checkpoint_core_attention = config.recompute_granularity == 'selective'
-
-        ## check custom parition pp stage
-        if args.custom_partition is not None:
-            assert sum(args.custom_partition) == args.num_layers, \
-                f"total custom partition pp stage transformer layers should equal to model layers" \
-                f"get total custom partition layers ({sum(args.custom_partition)})  !=   model layers ({args.num_layers})"
-            if args.virtual_pipeline_model_parallel_size is None:
-                assert len(args.custom_partition) == mpu.get_pipeline_model_parallel_world_size(), \
-                    f"custom partition pp stage length should equal to PP size" \
-                    f"get custom pp stage length ({len(args.custom_partition)})  != PP size ({mpu.get_pipeline_model_parallel_world_size()})"
-            else:
-                assert len(args.custom_partition) == (mpu.get_virtual_pipeline_model_parallel_world_size() * mpu.get_pipeline_model_parallel_world_size()), \
-                    f"custom partition pp stage length should equal to PP size * vitual size" \
-                    f"get custom pp stage length ({len(args.custom_partition)})  != PP size * virtual size ({mpu.get_virtual_pipeline_model_parallel_world_size() * mpu.get_pipeline_model_parallel_world_size()})"
-
-        # Number of layers.
-        self.num_layers = _get_num_layers(args, model_type,
-                                          layer_type==LayerType.decoder)
-
-        self.drop_path_rates = [
-            rate.item() for rate in
-            torch.linspace(0, self.drop_path_rate, config.num_layers)]
-
-        self.retro_layer_numbers = None
-        if model_type == ModelType.retro_decoder:
-            retro_layer_start = 6 if config.num_layers <= 15 else 9
-            self.retro_layer_numbers = \
-                np.arange(retro_layer_start, args.num_layers + 1, 3).tolist()
-        if model_type == ModelType.retro_encoder:
-            self.retro_layer_numbers = [1]
-
-        # Transformer layers.
-        if args.retro_add_retriever:
-            assert self.recompute_granularity != 'full', \
-                "Full recompute not supported for Retro."
-            assert args.transformer_impl == 'local', \
-                "Transformer engine does not support Retro layers."
-        def build_layer(layer_number, n_e):
-            if args.transformer_impl == 'local':
-                current_layer_type = _get_layer_type(
-                    model_type, layer_type, self.retro_layer_numbers,
-                    layer_number)
-                return ParallelTransformerLayer(
-                    config,
-                    layer_number,
-                    layer_type=current_layer_type,
-                    self_attn_mask_type=self_attn_mask_type,
-                    drop_path_rate=self.drop_path_rates[layer_number - 1],
-                    num_experts=n_e,
-                    rlhf_training=rlhf_training)
-            else:
-                # This argument is only available from TE v0.10 onwards.
-                extra_transformer_engine_kwargs = {}
-                if self.transformer_engine_v_0_8:
-                    extra_transformer_engine_kwargs["bias"] = args.add_bias_linear
-                if self.transformer_engine_v_0_10:
-                    extra_transformer_engine_kwargs["activation"] = "swiglu" if args.swiglu else "gelu"
-                if self.transformer_engine_v_0_11:
-                    extra_transformer_engine_kwargs["normalization"] = args.normalization
-
-                if os.environ.get("ENABLE_TORCH_TP_OVERLAP", "0").lower() in ["1", "t", "on"]:
-                    extra_transformer_engine_kwargs["torch_tp_overlap"] = True
-                if os.environ.get("ENABLE_TORCH_PP_OVERLAP", "0").lower() in ["1", "t", "on"]:
-                    extra_transformer_engine_kwargs["torch_pp_overlap"] = True
-                extra_transformer_engine_kwargs["cp_group"] = get_context_parallel_group(check_initialized=False)
-                extra_transformer_engine_kwargs["cp_global_ranks"] = get_context_parallel_global_ranks(check_initialized=False)
-                extra_transformer_engine_kwargs["cp_stream"] = torch.cuda.Stream()
-
-                return transformer_engine.pytorch.TransformerLayer(
-                    config.hidden_size,
-                    config.ffn_hidden_size,
-                    config.num_attention_heads,
-                    num_gqa_groups = config.num_query_groups,
-                    layernorm_epsilon=config.layernorm_epsilon,
-                    hidden_dropout=config.hidden_dropout,
-                    attention_dropout=config.attention_dropout,
-                    init_method=config.init_method,
-                    output_layer_init_method=config.output_layer_init_method,
-                    layer_number=layer_number,
-                    kv_channels=config.kv_channels,
-                    self_attn_mask_type=self_attn_mask_type.name,
-                    tp_group=mpu.get_tensor_model_parallel_group(),
-                    get_rng_state_tracker=tensor_parallel.get_cuda_rng_tracker,
-                    fuse_wgrad_accumulation=config.gradient_accumulation_fusion,
-                    # apply_query_key_layer_scaling=config.apply_query_key_layer_scaling, # deprecated transformerengine v1.0.0
-                    # attention_softmax_in_fp32=config.attention_softmax_in_fp32, # deprecated transformerengine v1.0.0
-                    seq_length=args.seq_length,
-                    micro_batch_size=args.micro_batch_size,
-                    sequence_parallel=config.sequence_parallel,
-                    params_dtype=config.params_dtype,
-                    apply_residual_connection_post_layernorm=config.apply_residual_connection_post_layernorm,
-                    output_layernorm=False,
-                    layer_type="encoder",
-                    drop_path_rate=self.drop_path_rates[layer_number - 1],
-                    set_parallel_mode=True,
-                    fuse_qkv_params=True,
-                    **extra_transformer_engine_kwargs)
-
-        if config.virtual_pipeline_model_parallel_size is not None:
-            assert config.num_layers % config.virtual_pipeline_model_parallel_size == 0, \
-                'num_layers_per_stage must be divisible by ' \
-                'virtual_pipeline_model_parallel_size'
-            assert args.model_type != ModelType.encoder_and_decoder
-            # Number of layers in each model chunk is the number of layers in the stage,
-            # divided by the number of model chunks in a stage.
-            if args.custom_partition is None:
-                self.num_layers = self.num_layers // config.virtual_pipeline_model_parallel_size
-            # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of
-            # layers to stages like (each list is a model chunk):
-            # Stage 0: [0]  [2]  [4]  [6]
-            # Stage 1: [1]  [3]  [5]  [7]
-            # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of
-            # layers to stages like (each list is a model chunk):
-            # Stage 0: [0, 1]  [4, 5]
-            # Stage 1: [2, 3]  [6, 7]
-            if args.custom_partition == None:
-                offset = mpu.get_virtual_pipeline_model_parallel_rank() * (
-                    config.num_layers // config.virtual_pipeline_model_parallel_size) + \
-                    (mpu.get_pipeline_model_parallel_rank() * self.num_layers)
-            else:
-                offset = sum(args.custom_partition[:mpu.get_virtual_pipeline_model_parallel_rank() * mpu.get_pipeline_model_parallel_world_size() \
-                                              + mpu.get_pipeline_model_parallel_rank()])
-        else:
-            # Each stage gets a contiguous set of layers.
-            if args.model_type == ModelType.encoder_and_decoder and \
-                    mpu.get_pipeline_model_parallel_world_size() > 1:
-                pipeline_rank = mpu.get_pipeline_model_parallel_rank()
-                if layer_type == LayerType.encoder:
-                    if args.custom_partition == None:
-                        offset = pipeline_rank * self.num_layers
-                    else:
-                        offset = sum(args.custom_partition[:pipeline_rank])
-                else:
-                    if args.custom_partition == None:
-                        num_ranks_in_enc = args.pipeline_model_parallel_split_rank
-                        offset = (pipeline_rank - num_ranks_in_enc) * self.num_layers
-                    else:
-                        NotImplementedError("custom pp stage layers doesn`t adapter this case, please delete <custom-partition> parameter")
-            else:
-                if args.custom_partition == None:
-                    offset = mpu.get_pipeline_model_parallel_rank() * self.num_layers
-                else:
-                    offset = sum(args.custom_partition[:mpu.get_pipeline_model_parallel_rank()])
-
-        if self.num_layers == 0:
-            # When a standalone embedding stage is used (e.g.,
-            # args.standalone_embedding_stage == True), virtual pipeline ranks
-            # on pipeline rank 0 will have zero transformer layers assigned to
-            # them. This results in the model's input and output tensors to be
-            # the same, which will cause failure for certain output tensor
-            # optimizations (e.g., pipeline output deallocation). To remedy
-            # this, we assign a 'no-op' layer on these ranks, which will
-            # disconnect the input tensor from the output tensor.
-            self.num_layers = 1
-            self.layers = torch.nn.ModuleList([ NoopTransformerLayer(1) ])
-        else:
-            assert len(num_experts) == 1 or len(num_experts) == args.num_layers // args.expert_interval, \
-            'num_experts must be either a single value or a list of the same length as the number of MoE layers'
-
-            # Create the list of MoE experts
-            if len(num_experts) == 1:
-                num_experts = num_experts * (args.num_layers // args.expert_interval)
-
-            # Build the layers
-            self.layers = []
-            for i in range(self.num_layers):
-                layer_num = i + 1 + offset
-                if layer_num % args.expert_interval == 0:
-                    n_e = num_experts[(layer_num-1) // args.expert_interval]
-                else:
-                    n_e = 1
-                self.layers.append(build_layer(layer_num, n_e))
-            self.layers = torch.nn.ModuleList(self.layers)
-            # self.layers = torch.nn.ModuleList(
-            #     [build_layer(i + 1 + offset) for i in range(self.num_layers)])
-
-            # Update dropout rate for Retro encoder.
-            if model_type == ModelType.retro_encoder:
-                for layer in self.layers:
-                    if layer.self_attention.use_flash_attn:
-                        layer.self_attention.core_attention_flash.dropout_p = \
-                            torch.nn.Dropout(args.retro_encoder_attention_dropout)
-                    else:
-                        layer.self_attention.core_attention.attention_dropout.p =\
-                            args.retro_encoder_attention_dropout
-                    layer.hidden_dropout = args.retro_encoder_hidden_dropout
-
-        if self.post_process and self.post_norm:
-            # Final layer norm before output.
-            self.final_norm = get_norm(config)
-
-    def _get_layer(self, layer_number):
-        return self.layers[layer_number]
-
-    def _checkpointed_forward(self, hidden_states, attention_mask, position_ids,
-                              encoder_output, enc_dec_attn_mask,
-                              rotary_pos_emb, is_first_microbatch):
-        """Forward method with activation checkpointing."""
-        def custom(start, end):
-            def custom_forward(*args, **kwargs):
-                x_, *args = args
-                for index in range(start, end):
-                    layer = self._get_layer(index)
-                    x_ = layer(x_, *args, **kwargs)
-                return x_
-            return custom_forward
-
-        te_forward_kwargs = {}
-        if self.transformer_impl == 'transformer_engine':
-            te_forward_kwargs['is_first_microbatch'] = is_first_microbatch
-            if self.transformer_engine_v_0_10:
-                te_forward_kwargs['rotary_pos_emb'] = rotary_pos_emb
-
-        if self.recompute_method == 'uniform':
-            # Uniformly divide the total number of Transformer layers and
-            # checkpoint the input activation of each divided chunk.
-            # A method to further reduce memory usage reducing checkpoints.
-            l = 0
-            while l < self.num_layers:
-                if self.transformer_impl == 'transformer_engine':
-                    hidden_states = transformer_engine.pytorch.checkpoint(
-                        custom(l, l + self.recompute_num_layers),
-                        self.distribute_saved_activations,
-                        tensor_parallel.get_cuda_rng_tracker,
-                        mpu.get_tensor_model_parallel_group(),
-                        hidden_states, attention_mask, None, None, encoder_output,
-                        enc_dec_attn_mask, **te_forward_kwargs)
-                else:
-                    hidden_states = tensor_parallel.checkpoint(
-                        custom(l, l + self.recompute_num_layers),
-                        self.distribute_saved_activations,
-                        hidden_states, attention_mask, position_ids,
-                        encoder_output, enc_dec_attn_mask,
-                        None, None, None, None, rotary_pos_emb)
-
-                l += self.recompute_num_layers
-
-        elif self.recompute_method == 'block':
-            # Checkpoint the input activation of only a set number of individual
-            # Transformer layers and skip the rest.
-            # A method fully use the device memory removing redundant re-computation.
-            for l in range(self.num_layers):
-                if l < self.recompute_num_layers:
-                    if self.transformer_impl == 'transformer_engine':
-                        hidden_states = transformer_engine.pytorch.checkpoint(
-                            custom(l, l + 1),
-                            self.distribute_saved_activations,
-                            tensor_parallel.get_cuda_rng_tracker,
-                            mpu.get_tensor_model_parallel_group(),
-                            hidden_states, attention_mask, None, None, encoder_output,
-                            enc_dec_attn_mask, **te_forward_kwargs)
-                    else:
-                        hidden_states = tensor_parallel.checkpoint(
-                            custom(l, l + 1),
-                            self.distribute_saved_activations,
-                            hidden_states, attention_mask, position_ids,
-                            encoder_output, enc_dec_attn_mask,
-                            None, None, None, None, rotary_pos_emb)
-                else:
-                    if self.transformer_impl == 'transformer_engine':
-                        hidden_states = custom(l, l + 1)(
-                            hidden_states, attention_mask, None, None, encoder_output,
-                            enc_dec_attn_mask, **te_forward_kwargs)
-                    else:
-                        hidden_states = custom(l, l + 1)(
-                            hidden_states, attention_mask, position_ids,
-                            encoder_output, enc_dec_attn_mask,
-                            None, None, None, None, rotary_pos_emb)
-        else:
-            raise ValueError("Invalid activation recompute method.")
-
-        return hidden_states
-
-    def set_input_tensor(self, input_tensor):
-        """Set input tensor to be used instead of forward()'s input.
-
-        When doing pipeline parallelism the input from the previous
-        stage comes from communication, not from the input, so the
-        model's forward_step_func won't have it. This function is thus
-        used by internal code to bypass the input provided by the
-        forward_step_func"""
-        self.input_tensor = input_tensor
-
-    def forward(self, hidden_states, attention_mask,
-                position_ids=None,
-                encoder_output=None, enc_dec_attn_mask=None,
-                retriever_input=None,
-                retriever_output=None,
-                retriever_attn_mask=None,
-                inference_params=None,
-                rotary_pos_emb=None):
-        # hidden_states: [s, b, h]
-
-        # Checks.
-        if inference_params:
-            assert self.recompute_granularity is None, \
-                'inference does not work with activation checkpointing'
-
-        if not self.pre_process:
-            # See set_input_tensor()
-            hidden_states = self.input_tensor
-
-        # Viewless tensor.
-        # - We only need to create a viewless tensor in the case of micro batch
-        #   size (mbs) == 1, since in this case, 'hidden_states.transpose()'
-        #   above creates a view tensor, and '.contiguous()' is a pass-through.
-        #   For mbs >= 2, '.contiguous()' creates a new tensor, eliminating
-        #   the need to make it viewless.
-        #
-        #   However, we don't explicitly check mbs == 1 here because
-        #   make_viewless_tensor() has negligible overhead when its input
-        #   is already viewless.
-        #
-        # - For the 'else' case above, calling make_viewless_tensor() here is
-        #   likely redundant, since p2p_communication.py (likely originator)
-        #   already creates viewless tensors. That said, make_viewless_tensor()
-        #   is called here to be future-proof and corner-case-proof.
-        hidden_states = core.utils.make_viewless_tensor(
-            hidden_states,
-            requires_grad=True,
-            keep_graph=True,
-        )
-
-        # RNG context.
-        if self.sequence_parallel and not inference_params:
-            rng_context = tensor_parallel.get_cuda_rng_tracker().fork()
-        else:
-            rng_context = nullcontext()
-
-        # Forward layers.
-        with rng_context:
-            # The fp8_autocast context manager is a no-op when enabled=True
-            # The if...else serves to short circuit name resolution for fp8_autocast
-            with transformer_engine.pytorch.fp8_autocast(
-                enabled=self.use_fp8,
-                fp8_recipe=self.fp8_recipe,
-                fp8_group=self.fp8_group
-            ) if self.use_fp8 else nullcontext():
-                # Determine if the current iteration is first microbatch
-                if self.num_microbatches_in_previous_step != get_num_microbatches():
-                    self.microbatch_count = 0 # Reset count on new batch size rampup interval
-                self.num_microbatches_in_previous_step = get_num_microbatches()
-                is_first_microbatch = self.microbatch_count % get_num_microbatches() == 0
-
-                # Forward pass.
-                if self.recompute_granularity == 'full':
-                    hidden_states = self._checkpointed_forward(hidden_states,
-                                                               attention_mask,
-                                                               position_ids,
-                                                               encoder_output,
-                                                               enc_dec_attn_mask,
-                                                               rotary_pos_emb,
-                                                               is_first_microbatch)
-                else:
-                    forward_kwargs = {
-                        'encoder_output': encoder_output,
-                        'enc_dec_attn_mask': enc_dec_attn_mask,
-                        'inference_params': inference_params,
-                    }
-
-                    if self.transformer_impl == 'transformer_engine':
-                        forward_kwargs['is_first_microbatch'] = is_first_microbatch
-                        forward_kwargs['checkpoint_core_attention'] = self.checkpoint_core_attention
-                        if self.transformer_engine_v_0_10:
-                            forward_kwargs['rotary_pos_emb'] = rotary_pos_emb
-                    else:
-                        forward_kwargs['rotary_pos_emb'] = rotary_pos_emb
-                        forward_kwargs['retriever_input'] = retriever_input
-                        forward_kwargs['retriever_output'] = retriever_output
-                        forward_kwargs['retriever_attn_mask'] = retriever_attn_mask
-                        forward_kwargs['position_ids'] = position_ids
-
-                    for index in range(self.num_layers):
-                        layer = self._get_layer(index)
-
-                        hidden_states = layer(
-                            hidden_states,
-                            attention_mask,
-                            **forward_kwargs)
-
-                        # First Retro decoder layer returns both hidden_states
-                        # and retriever_output. Make retriever_output available
-                        # to subsequence Retro layers.
-                        if isinstance(hidden_states, tuple):
-                            assert len(hidden_states) == 2
-                            hidden_states, retriever_output = hidden_states
-                            forward_kwargs["retriever_output"] = retriever_output
-
-                # Skip counter update for eval and activation checkpointing
-                if torch.is_grad_enabled() and self.training:
-                    self.microbatch_count += 1
-
-        # Final layer norm.
-        if self.post_process and self.post_norm:
-            hidden_states = self.final_norm(hidden_states)
-
-        return hidden_states
-
-    def load_state_dict(self, state_dict, strict=True):
-        """Customize load."""
-
-        # Handle renaming layernorm -> norm in component names
-        # state_dict_ = {}
-        # for key in state_dict.keys():
-        #     newkey = key.replace("layernorm", "norm")
-        #     state_dict_[newkey] = state_dict[key]
-
-        super().load_state_dict(state_dict, strict)
-
-class LMHeadPipe(MegatronModule):
-    """
-    Arguments:
-        vocab_size: size of vocabulary.
-        hidden_size: hidden size
-        gather_output: wether output logits being gathered or not.
-        init_method: init method for weight initialization
-        config:
-    """
-
-    def __init__(self, hidden_size, vocab_size, config):
-        super(LMHeadPipe, self).__init__()
-        self.lm_head = tensor_parallel.ColumnParallelLinear(input_size=hidden_size,
-                                                            output_size=vocab_size,
-                                                            bias=False,
-                                                            config=config,
-                                                            init_method=config.init_method,)
-
-    def forward(self, inputs, **kwargs):
-        assert torch.is_tensor(inputs) or isinstance(inputs, tuple)
-        if isinstance(inputs, tuple):
-            hidden_states = inputs[0]
-        else:
-            hidden_states = inputs
-
-        if not hasattr(self, '_args'):
-            self._args = get_args()
-
-        if hasattr(self._args, 'attn_mask'):
-            attention_mask = None
-        else:
-            attention_mask = inputs[1]
-
-        logits, _ = self.lm_head(hidden_states)
-
-        # If cmd args has attn_mask, we don't forward it as an activation.
-        if hasattr(self._args, 'attn_mask'):
-            return logits
-        else:
-            return logits, attention_mask
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/classification.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/classification.py
deleted file mode 100644
index 50ad89f442d4f4c31f81af22fa40d7087e7fb49e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/classification.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Vision Transformer(VIT) model."""
-
-import torch
-from torch.nn.init import trunc_normal_
-from megatron_ds import get_args
-from megatron_ds.model.utils import get_linear_layer
-from megatron_ds.model.vision.vit_backbone import VitBackbone, VitMlpHead
-from megatron_ds.model.vision.mit_backbone import mit_b3_avg
-from megatron_ds.model.module import MegatronModule
-
-class VitClassificationModel(MegatronModule):
-    """Vision Transformer Model."""
-
-    def __init__(self, config, num_classes, finetune=False,
-                 pre_process=True, post_process=True):
-        super(VitClassificationModel, self).__init__()
-        args = get_args()
-        self.config = config
-
-        self.hidden_size = args.hidden_size
-        self.num_classes = num_classes
-        self.finetune = finetune
-        self.pre_process = pre_process
-        self.post_process = post_process
-        self.backbone = VitBackbone(
-            config=config,
-            pre_process=self.pre_process,
-            post_process=self.post_process,
-            single_token_output=True
-        )
-
-        if self.post_process:
-            if not self.finetune:
-                self.head = VitMlpHead(config, self.hidden_size, self.num_classes)
-            else:
-                self.head = get_linear_layer(
-                    self.hidden_size,
-                    self.num_classes,
-                    torch.nn.init.zeros_
-                )
-
-    def set_input_tensor(self, input_tensor):
-        """See megatron_ds.model.transformer.set_input_tensor()"""
-        self.backbone.set_input_tensor(input_tensor)
-
-    def forward(self, input):
-        hidden_states = self.backbone(input)
-
-        if self.post_process:
-            hidden_states = self.head(hidden_states)
-
-        return hidden_states
-
-
-class MitClassificationModel(MegatronModule):
-    """Mix vision Transformer Model."""
-
-    def __init__(self, num_classes,
-                 pre_process=True, post_process=True):
-        super(MitClassificationModel, self).__init__()
-        args = get_args()
-
-        self.hidden_size = args.hidden_size
-        self.num_classes = num_classes
-
-        self.backbone = mit_b3_avg()
-        self.head = torch.nn.Linear(512, num_classes)
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, torch.nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, torch.nn.Linear) and m.bias is not None:
-                torch.nn.init.constant_(m.bias, 0)
-
-    def set_input_tensor(self, input_tensor):
-        """See megatron_ds.model.transformer.set_input_tensor()"""
-        pass
-
-    def forward(self, input):
-        hidden_states = self.backbone(input)
-        hidden_states = self.head(hidden_states)
-
-        return hidden_states
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/dino.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/dino.py
deleted file mode 100644
index 5dfc9172866f04049ca42ebe7f9e927ea9256c9e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/dino.py
+++ /dev/null
@@ -1,291 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the Apache license found in the
-# LICENSE file in the root directory of this source tree.
-
-# copied from https://github.com/facebookresearch/dino/blob/main/main_dino.py
-# reworked/refactored some parts to make it run in Megatron.
-import math
-import apex
-import einops
-import torch
-import numpy as np
-import torch.nn.functional as F
-from torch.nn.init import trunc_normal_
-from megatron_ds import get_args, print_rank_0
-from megatron_ds.model.utils import get_linear_layer
-from megatron_ds.model.vision.vit_backbone import VitBackbone
-from megatron_ds.model.module import MegatronModule
-from megatron_ds.model.vision.mit_backbone import mit_b5_avg
-from megatron_ds.model.vision.esvit_swin_backbone import get_swin
-
-
-class DINOLoss(torch.nn.Module):
-    def __init__(self, out_dim, ncrops, warmup_teacher_temp, teacher_temp,
-                 warmup_teacher_temp_epochs, nepochs, student_temp=0.1,
-                 center_momentum=0.9):
-        super().__init__()
-        self.student_temp = student_temp
-        self.center_momentum = center_momentum
-        self.ncrops = ncrops
-        self.register_buffer("center", torch.zeros(1, out_dim))
-        # we apply a warm up for the teacher temperature because
-        # a too high temperature makes the training instable at the beginning
-        self.teacher_temp_schedule = np.concatenate((
-            np.linspace(warmup_teacher_temp,
-                        teacher_temp, warmup_teacher_temp_epochs),
-            np.ones(nepochs - warmup_teacher_temp_epochs) * teacher_temp
-        ))
-        self.teacher_temp = teacher_temp
-
-    def forward(self, student_output, teacher_output, iteration):
-        """
-        Cross-entropy between softmax outputs of the teacher
-        and student network.
-        """
-        args = get_args()
-        student_out = student_output / self.student_temp
-        student_out = student_out.chunk(self.ncrops)
-
-        epoch = iteration // args.iter_per_epoch
-
-        # teacher centering and sharpening
-        temp = self.teacher_temp_schedule[epoch]
-        teacher_out = F.softmax((teacher_output - self.center) / temp, dim=-1)
-
-        teacher_out = teacher_out.detach().chunk(2)
-
-        total_loss = 0
-        n_loss_terms = 0
-        for iq, q in enumerate(teacher_out):
-            for v in range(len(student_out)):
-                if v == iq:
-                    # we skip cases where student and teacher operate on the same view
-                    continue
-                loss = torch.sum(-q * F.log_softmax(student_out[v], dim=-1), dim=-1)
-                total_loss += loss.mean()
-                n_loss_terms += 1
-        total_loss /= n_loss_terms
-        self.update_center(teacher_output)
-        return total_loss
-
-    @torch.no_grad()
-    def update_center(self, teacher_output):
-        """
-        Update center used for teacher output.
-        """
-        batch_center = torch.sum(teacher_output, dim=0, keepdim=True)
-        torch.distributed.all_reduce(batch_center)
-        batch_center = batch_center / (len(teacher_output) * torch.distributed.get_world_size())
-        self.center = self.center * self.center_momentum + batch_center * (1 - self.center_momentum)
-
-class DINOHead(torch.nn.Module):
-    def __init__(self, in_dim, out_dim, norm_last_layer=True, nlayers=3):
-        super().__init__()
-        args = get_args()
-        hidden_dim = args.dino_head_hidden_size
-        bottleneck_dim = args.dino_bottleneck_size
-        nlayers = max(nlayers, 1)
-        if nlayers == 1:
-            self.mlp = torch.nn.Linear(in_dim, bottleneck_dim)
-        else:
-            layers = [torch.nn.Linear(in_dim, hidden_dim)]
-            layers.append(torch.nn.GELU())
-            for _ in range(nlayers - 2):
-                layers.append(torch.nn.Linear(hidden_dim, hidden_dim))
-                layers.append(torch.nn.GELU())
-            layers.append(torch.nn.Linear(hidden_dim, bottleneck_dim))
-            self.mlp = torch.nn.Sequential(*layers)
-        self.apply(self._init_weights)
-        self.last_layer = torch.nn.utils.weight_norm(torch.nn.Linear(bottleneck_dim, out_dim, bias=False))
-        self.last_layer.weight_g.data.fill_(1)
-        if norm_last_layer:
-            self.last_layer.weight_g.requires_grad = False
-
-    def _init_weights(self, m):
-        if isinstance(m, torch.nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, torch.nn.Linear) and m.bias is not None:
-                torch.nn.init.constant_(m.bias, 0)
-
-    def forward(self, x):
-        x = self.mlp(x)
-        x = torch.nn.functional.normalize(x, dim=-1, p=2)
-        x = self.last_layer(x)
-        return x
-
-
-class MultiCropWrapper(MegatronModule):
-
-    """
-    Perform forward pass separately on each resolution input.
-    The inputs corresponding to a single resolution are clubbed and single
-    forward is run on the same resolution inputs. Hence we do several
-    forward passes = number of different resolutions used. We then
-    concatenate all the output features and run the head forward on these
-    concatenated features.
-    """
-    def __init__(self, backbone, head):
-        super(MultiCropWrapper, self).__init__()
-        # disable layers dedicated to ImageNet labels classification
-        #backbone.fc, backbone.head = torch.nn.Identity(), torch.nn.Identity()
-        self.backbone = backbone
-        self.head = head
-
-    def forward(self, x):
-        # convert to list
-        if not isinstance(x, list):
-            x = [x]
-        idx_crops = torch.cumsum(torch.unique_consecutive(
-            torch.tensor([inp.shape[-1] for inp in x]),
-            return_counts=True,
-        )[1], 0)
-
-        start_idx = 0
-        for end_idx in idx_crops:
-            _out = self.backbone(torch.cat(x[start_idx: end_idx]))
-            if start_idx == 0:
-                output = _out
-            else:
-                output = torch.cat((output, _out))
-            start_idx = end_idx
-        # Run the head forward on the concatenated features.
-        if self.training:
-            return self.head(output)
-        else:
-            return output
-
-
-def cosine_scheduler(base_value, final_value, epochs, niter_per_ep,
-                     warmup_epochs=0, start_warmup_value=0):
-    warmup_schedule = np.array([])
-    warmup_iters = warmup_epochs * niter_per_ep
-    if warmup_epochs > 0:
-        warmup_schedule = \
-                np.linspace(start_warmup_value, base_value, warmup_iters)
-
-    iters = np.arange(epochs * niter_per_ep - warmup_iters)
-    schedule = final_value + 0.5 * (base_value - final_value) \
-        * (1 + np.cos(np.pi * iters / len(iters)))
-
-    schedule = np.concatenate((warmup_schedule, schedule))
-    assert len(schedule) == epochs * niter_per_ep
-    return schedule
-
-
-def get_student_backbone_and_num_features(config, pre_process=True, post_process=True):
-    args = get_args()
-
-    if args.vision_backbone_type == 'vit':
-        student = VitBackbone(config,
-                              pre_process=pre_process,
-                              post_process=post_process,
-                              drop_path_rate=0.1,
-                              single_token_output=True)
-        num_features = args.hidden_size
-    elif args.vision_backbone_type == 'mit':
-        student = mit_b5_avg(drop_path_rate=0.1)
-        num_features = 512
-    elif args.vision_backbone_type == 'swin':
-        student = get_swin()
-        num_features = student.num_features
-    else:
-        raise Exception('{} vision backbone is not supported.'.format(
-                              args.vision_backbone_type))
-
-    return student, num_features
-
-def get_teacher_backbone_and_num_features(config, pre_process=True, post_process=True):
-    args = get_args()
-
-    if args.vision_backbone_type == 'vit':
-        teacher = VitBackbone(config,
-                              pre_process=pre_process,
-                              post_process=post_process,
-                              single_token_output=True)
-        num_features = args.hidden_size
-    elif args.vision_backbone_type == 'mit':
-        teacher = mit_b5_avg(drop_path_rate=0.0)
-        num_features = 512
-    elif args.vision_backbone_type == 'swin':
-        teacher = get_swin(is_teacher=True)
-        num_features = teacher.num_features
-    else:
-        raise Exception('{} vision backbone is not supported.'.format(
-                              args.vision_backbone_type))
-    return teacher, num_features
-
-
-class DINOPretrainModel(MegatronModule):
-    def __init__(self, config, pre_process=True, post_process=True):
-        super(DINOPretrainModel, self).__init__()
-        args = get_args()
-        self.config = config
-        self.out_dim = 65536
-
-        self.dino_loss = DINOLoss(
-            self.out_dim,
-            args.dino_local_crops_number + 2,
-            args.dino_warmup_teacher_temp,
-            args.dino_teacher_temp,
-            args.dino_warmup_teacher_temp_epochs,
-            300,
-        )
-
-        self.pre_process = pre_process
-        self.post_process = post_process
-        self.momentum_teacher = 0.996
-
-        student_backbone, num_features = \
-            get_student_backbone_and_num_features(config, pre_process, post_process)
-
-        self.student = MultiCropWrapper(
-            student_backbone,
-            DINOHead(num_features, self.out_dim,
-                     norm_last_layer=args.dino_norm_last_layer)
-        )
-
-        self.momentum_schedule = cosine_scheduler(
-            self.momentum_teacher, 1,
-            args.train_iters // args.iter_per_epoch,
-            args.iter_per_epoch
-        )
-
-        teacher_backbone, num_features = \
-            get_teacher_backbone_and_num_features(config, pre_process, post_process)
-        self.teacher = MultiCropWrapper(
-            teacher_backbone,
-            DINOHead(num_features, self.out_dim)
-        )
-        self.teacher.load_state_dict(self.student.state_dict())
-
-        for p in self.teacher.parameters():
-            if hasattr(p, "requires_grad") and p.requires_grad is not None:
-                p.requires_grad = False
-
-    def set_input_tensor(self, tensor):
-        pass
-
-    def forward(self, input):
-        student_output = None
-        if self.training:
-            student_output = self.student(input)
-            teacher_output = self.teacher(input[:2])
-        else:
-            teacher_output = self.teacher(input)
-        return student_output, teacher_output
-
-    def cancel_gradients_last_layer(self, iteration):
-        args = get_args()
-        epoch = iteration // args.iter_per_epoch
-        if epoch < args.dino_freeze_last_layer:
-            for n, p in self.student.named_parameters():
-                if "last_layer" in n:
-                    p.grad = None
-
-    def update_momentum(self, iteration):
-        with torch.no_grad():
-            m = self.momentum_schedule[iteration]
-            for param_q, param_k in zip(self.student.parameters(), self.teacher.parameters()):
-                param_k.data.mul_(m).add_((1 - m) * param_q.detach().data)
-
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/esvit_swin_backbone.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/esvit_swin_backbone.py
deleted file mode 100644
index 57778e81df6042d6330fcde4e0f8dcaab36b16ed..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/esvit_swin_backbone.py
+++ /dev/null
@@ -1,849 +0,0 @@
-# Copyright (c) 2021 Microsoft
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# Modified by Chunyuan Li (chunyl@microsoft.com)
-# Swin Transformer
-# --------------------------------------------------------
-
-import os
-import logging
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from functools import partial
-import torch.distributed as dist
-from torch.nn.init import trunc_normal_
-from megatron_ds.model.transformer import DropPath
-from megatron_ds import get_args
-from megatron_ds.model import LayerNorm
-import numpy as np
-from math import sqrt
-
-
-class Mlp(nn.Module):
-    def __init__(self, in_features, hidden_features=None,
-                 out_features=None, act_layer=nn.GELU, drop=0.):
-        super(Mlp, self).__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-        self.fc1 = nn.Linear(in_features, hidden_features)
-        self.act = act_layer()
-        self.fc2 = nn.Linear(hidden_features, out_features)
-        self.drop = nn.Dropout(drop)
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.act(x)
-        x = self.drop(x)
-        x = self.fc2(x)
-        x = self.drop(x)
-        return x
-
-
-def window_partition(x, window_size):
-    """
-    Args:
-        x: (B, H, W, C)
-        window_size (int): window size
-    Returns:
-        windows: (num_windows*B, window_size, window_size, C)
-    """
-    B, H, W, C = x.shape
-    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
-    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
-    return windows
-
-
-def window_reverse(windows, window_size, H, W):
-    """
-    Args:
-        windows: (num_windows*B, window_size, window_size, C)
-        window_size (int): Window size
-        H (int): Height of image
-        W (int): Width of image
-    Returns:
-        x: (B, H, W, C)
-    """
-    B = int(windows.shape[0] / (H * W / window_size / window_size))
-    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
-    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
-    return x
-
-
-class WindowAttention(nn.Module):
-    r"""Window based multi-head self attention (W-MSA) module with relative position bias.
-    It supports both of shifted and non-shifted window.
-    Args:
-        dim (int): Number of input channels.
-        window_size (tuple[int]): The height and width of the window.
-        num_heads (int): Number of attention heads.
-        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
-        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
-        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
-    """
-
-    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
-
-        super(WindowAttention, self).__init__()
-        self.dim = dim
-        self.window_size = window_size  # Wh, Ww
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim ** -0.5
-
-        # define a parameter table of relative position bias
-        self.relative_position_bias_table = nn.Parameter(
-            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH
-
-        # get pair-wise relative position index for each token inside the window
-        coords_h = torch.arange(self.window_size[0])
-        coords_w = torch.arange(self.window_size[1])
-        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
-        coords_flatten = torch.flatten(coords, 1)  # 2 Wh*Ww
-        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
-        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
-        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
-        relative_coords[:, :, 1] += self.window_size[1] - 1
-        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
-        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
-        self.register_buffer("relative_position_index", relative_position_index)
-
-        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(dim, dim)
-        self.proj_drop = nn.Dropout(proj_drop)
-
-        trunc_normal_(self.relative_position_bias_table, std=.02)
-        self.softmax = nn.Softmax(dim=-1)
-
-    def forward(self, x, mask=None):
-        """
-        Args:
-            x: input features with shape of (num_windows*B, N, C)
-            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
-        """
-        B_, N, C = x.shape
-        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
-
-        q = q * self.scale
-        attn = (q @ k.transpose(-2, -1))
-
-        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
-            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
-        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
-        attn = attn + relative_position_bias.unsqueeze(0)
-
-        if mask is not None:
-            nW = mask.shape[0]
-            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0).type(attn.type())
-            attn = attn.view(-1, self.num_heads, N, N)
-            attn = self.softmax(attn)
-        else:
-            attn = self.softmax(attn)
-
-        attn_out = attn
-        attn = self.attn_drop(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x, attn_out
-
-    def extra_repr(self) -> str:
-        return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
-
-    def flops(self, N):
-        # calculate flops for 1 window with token length of N
-        flops = 0
-        # qkv = self.qkv(x)
-        flops += N * self.dim * 3 * self.dim
-        # attn = (q @ k.transpose(-2, -1))
-        flops += self.num_heads * N * (self.dim // self.num_heads) * N
-        #  x = (attn @ v)
-        flops += self.num_heads * N * N * (self.dim // self.num_heads)
-        # x = self.proj(x)
-        flops += N * self.dim * self.dim
-        return flops
-
-    @staticmethod
-    def compute_macs(module, input, output):
-        B, N, C = input[0].shape
-
-        module.__flops__ += module.flops(N) * B
-
-
-class SwinTransformerBlock(nn.Module):
-    r"""Swin Transformer Block.
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resulotion.
-        num_heads (int): Number of attention heads.
-        window_size (int): Window size.
-        shift_size (int): Shift size for SW-MSA.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float, optional): Stochastic depth rate. Default: 0.0
-        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
-        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
-    """
-
-    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
-                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
-                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.num_heads = num_heads
-        self.window_size = window_size
-        self.shift_size = shift_size
-        self.mlp_ratio = mlp_ratio
-        if min(self.input_resolution) <= self.window_size:
-            # if window size is larger than input resolution, we don't partition windows
-            self.shift_size = 0
-            self.window_size = min(self.input_resolution)
-        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
-
-        self.norm1 = norm_layer(dim)
-        self.attn = WindowAttention(
-            dim, window_size=(self.window_size, self.window_size), num_heads=num_heads,
-            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
-
-        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-        self.norm2 = norm_layer(dim)
-        mlp_hidden_dim = int(dim * mlp_ratio)
-        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
-
-        self.H = input_resolution[0]
-        self.W = input_resolution[1]
-
-        self.attn_mask_dict = {}
-
-
-    def create_attn_mask(self, H, W):
-        # calculate attention mask for SW-MSA
-
-        Hp = int(np.ceil(H / self.window_size)) * self.window_size
-        Wp = int(np.ceil(W / self.window_size)) * self.window_size
-        img_mask = torch.zeros((1, Hp, Wp, 1))  # 1 Hp Wp 1
-        h_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        w_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        cnt = 0
-        for h in h_slices:
-            for w in w_slices:
-                img_mask[:, h, w, :] = cnt
-                cnt += 1
-
-        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
-        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
-        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
-        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
-
-        return attn_mask
-
-
-    def forward(self, x):
-        B, L, C = x.shape
-        H = int(sqrt(L))
-        W = H
-
-        shortcut = x
-        x = self.norm1(x)
-        x = x.view(B, H, W, C)
-
-        # pad feature maps to multiples of window size
-        pad_l = pad_t = 0
-        pad_r = (self.window_size - W % self.window_size) % self.window_size
-        pad_b = (self.window_size - H % self.window_size) % self.window_size
-        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
-        _, Hp, Wp, _ = x.shape
-
-        # cyclic shift
-        if self.shift_size > 0:
-            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
-
-            if H in self.attn_mask_dict.keys():
-                attn_mask = self.attn_mask_dict[H]
-            else:
-                self.attn_mask_dict[H] = self.create_attn_mask(self.H, self.W).to(x.device)
-                attn_mask = self.attn_mask_dict[H]
-
-        else:
-            shifted_x = x
-            attn_mask = None
-
-        # partition windows
-        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
-        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C
-
-        # W-MSA/SW-MSA
-        attn_windows, attn = self.attn(x_windows, attn_mask)  # nW*B, window_size*window_size, C
-
-        # merge windows
-        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
-        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C
-
-        # reverse cyclic shift
-        if self.shift_size > 0:
-            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
-        else:
-            x = shifted_x
-
-        if pad_r > 0 or pad_b > 0:
-            x = x[:, :H, :W, :].contiguous()
-
-        x = x.view(B, H * W, C)
-
-        # FFN
-        x = shortcut + self.drop_path(x)
-        x = x + self.drop_path(self.mlp(self.norm2(x)))
-
-        return x, attn
-
-    def extra_repr(self) -> str:
-        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
-               f"window_size={self.window_size}, shift_size={self.shift_size} mlp_ratio={self.mlp_ratio}"
-
-    def flops(self):
-        flops = 0
-        H, W = self.input_resolution
-        # norm1
-        flops += self.dim * H * W
-        # W-MSA/SW-MSA
-        nW = H * W / self.window_size / self.window_size
-        flops += nW * self.attn.flops(self.window_size * self.window_size)
-        # mlp
-        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
-        # norm2
-        flops += self.dim * H * W
-        return flops
-
-
-class PatchMerging(nn.Module):
-    r"""Patch Merging Layer.
-    Args:
-        input_resolution (tuple[int]): Resolution of input feature.
-        dim (int): Number of input channels.
-        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
-    """
-
-    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.input_resolution = input_resolution
-        self.dim = dim
-        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
-        self.norm = norm_layer(4 * dim)
-
-    def forward(self, x):
-        """ Forward function.
-        Args:
-            x: Input feature, tensor size (B, H*W, C).
-            H, W: Spatial resolution of the input feature.
-        """
-        B, L, C = x.shape
-        H = int(sqrt(L))
-        W = H
-
-        x = x.view(B, H, W, C)
-
-        # padding
-        pad_input = (H % 2 == 1) or (W % 2 == 1)
-        if pad_input:
-            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
-
-        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
-        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
-        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
-        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
-        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
-        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
-
-        x = self.norm(x)
-        x = self.reduction(x)
-
-        return x
-
-
-    def extra_repr(self) -> str:
-        return f"input_resolution={self.input_resolution}, dim={self.dim}"
-
-    def flops(self):
-        H, W = self.input_resolution
-        flops = H * W * self.dim
-        flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
-        return flops
-
-
-class BasicLayer(nn.Module):
-    """A basic Swin Transformer layer for one stage.
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resulotion.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-    """
-
-    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
-                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
-                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None):
-
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.depth = depth
-
-        self.blocks = nn.ModuleList([
-            SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
-                                 num_heads=num_heads, window_size=window_size,
-                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
-                                 mlp_ratio=mlp_ratio,
-                                 qkv_bias=qkv_bias, qk_scale=qk_scale,
-                                 drop=drop, attn_drop=attn_drop,
-                                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
-                                 norm_layer=norm_layer)
-            for i in range(depth)])
-        if downsample is not None:
-            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
-        else:
-            self.downsample = None
-
-    def forward(self, x):
-        for blk in self.blocks:
-            x, _ = blk(x)
-        if self.downsample is not None:
-            x = self.downsample(x)
-        return x
-
-    def forward_with_features(self, x):
-        fea = []
-        for blk in self.blocks:
-            x, _ = blk(x)
-            fea.append(x)
-        if self.downsample is not None:
-            x = self.downsample(x)
-        return x, fea
-
-    def forward_with_attention(self, x):
-        attns = []
-        for blk in self.blocks:
-            x, attn = blk(x)
-            attns.append(attn)
-        if self.downsample is not None:
-            x = self.downsample(x)
-        return x, attns
-
-
-    def extra_repr(self) -> str:
-        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
-
-    def flops(self):
-        flops = 0
-        for blk in self.blocks:
-            flops += blk.flops()
-        if self.downsample is not None:
-            flops += self.downsample.flops()
-        return flops
-
-
-class PatchEmbed(nn.Module):
-    """ Image to Patch Embedding
-    """
-
-    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None):
-        super().__init__()
-        img_size = (img_size, img_size)
-        patch_size = (patch_size, patch_size)
-        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.patches_resolution = patches_resolution
-        self.num_patches = patches_resolution[0] * patches_resolution[1]
-
-        self.in_chans = in_chans
-        self.embed_dim = embed_dim
-
-        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
-        if norm_layer is not None:
-            self.norm = norm_layer(embed_dim)
-        else:
-            self.norm = None
-
-    def forward(self, x):
-        B, C, H, W = x.shape
-
-        x = self.proj(x).flatten(2).transpose(1, 2)  # B Ph*Pw C
-        if self.norm is not None:
-            x = self.norm(x)
-        return x
-
-
-    def flops(self):
-        Ho, Wo = self.patches_resolution
-        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
-        if self.norm is not None:
-            flops += Ho * Wo * self.embed_dim
-        return flops
-
-class SwinTransformer(nn.Module):
-    r""" Swin Transformer
-        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
-          https://arxiv.org/pdf/2103.14030
-    Args:
-        img_size (int | tuple(int)): Input image size.
-        patch_size (int | tuple(int)): Patch size.
-        in_chans (int): Number of input channels.
-        num_classes (int): Number of classes for classification head.
-        embed_dim (int): Embedding dimension.
-        depths (tuple(int)): Depth of Swin Transformer layers.
-        num_heads (tuple(int)): Number of attention heads in different layers.
-        window_size (int): Window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: Truee
-        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
-        drop_rate (float): Dropout rate.
-        attn_drop_rate (float): Attention dropout rate.
-        drop_path_rate (float): Stochastic depth rate.
-        norm_layer (nn.Module): normalization layer.
-        ape (bool): If True, add absolute position embedding to the patch embedding.
-        patch_norm (bool): If True, add normalization after patch embedding.
-    """
-
-    def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000,
-                 embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
-                 window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
-                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
-                 norm_layer=nn.LayerNorm, ape=False, patch_norm=True, **kwargs):
-        super().__init__()
-
-        self.num_classes = num_classes
-        self.num_layers = len(depths)
-        self.embed_dim = embed_dim
-        self.ape = ape
-        self.patch_norm = patch_norm
-        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
-        self.mlp_ratio = mlp_ratio
-
-        self.patch_embed = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
-            norm_layer=norm_layer if self.patch_norm else None)
-        num_patches = self.patch_embed.num_patches
-        patches_resolution = self.patch_embed.patches_resolution
-        self.patches_resolution = patches_resolution
-
-        if self.ape:
-            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
-            trunc_normal_(self.absolute_pos_embed, std=.02)
-
-        self.pos_drop = nn.Dropout(p=drop_rate)
-
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
-        self.layers = nn.ModuleList()
-        for i_layer in range(self.num_layers):
-            layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer),
-                               input_resolution=(patches_resolution[0] // (2 ** i_layer),
-                                                 patches_resolution[1] // (2 ** i_layer)),
-                               depth=depths[i_layer],
-                               num_heads=num_heads[i_layer],
-                               window_size=window_size,
-                               mlp_ratio=self.mlp_ratio,
-                               qkv_bias=qkv_bias, qk_scale=qk_scale,
-                               drop=drop_rate, attn_drop=attn_drop_rate,
-                               drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
-                               norm_layer=norm_layer,
-                               downsample=PatchMerging if (i_layer < self.num_layers - 1) else None)
-            self.layers.append(layer)
-
-        self.norm = norm_layer(self.num_features)
-        self.avgpool = nn.AdaptiveAvgPool1d(1)
-
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-
-    @torch.jit.ignore
-    def no_weight_decay(self):
-        return {'absolute_pos_embed'}
-
-    @torch.jit.ignore
-    def no_weight_decay_keywords(self):
-        # todo: to be implemented
-        return {'relative_position_bias_table'}
-
-    def forward(self, x):
-        x = self.patch_embed(x)
-        if self.ape:
-            x = x + self.absolute_pos_embed
-        x = self.pos_drop(x)
-
-        for layer in self.layers:
-            x = layer(x)
-
-        x_region = self.norm(x)  # B L C
-        x = self.avgpool(x_region.transpose(1, 2))  # B C 1
-        x = torch.flatten(x, 1)
-
-        return x
-
-
-    def forward_feature_maps(self, x):
-        x = self.patch_embed(x)
-        if self.ape:
-            x = x + self.absolute_pos_embed
-        x = self.pos_drop(x)
-
-        for layer in self.layers:
-            x = layer(x)
-
-        x_grid = self.norm(x)  # B L C
-        x = self.avgpool(x_grid.transpose(1, 2))  # B C 1
-        x = torch.flatten(x, 1)
-
-        return x, x_grid
-
-
-    def forward_selfattention(self, x, n=1):
-        # n=1 return the last layer attn map; otherwise return attn maps in all layers
-
-        
-        x = self.patch_embed(x)
-        if self.ape:
-            x = x + self.absolute_pos_embed
-        x = self.pos_drop(x)
-
-        if n==1:
-            return self.forward_last_selfattention(x)
-        else:
-            return self.forward_all_selfattention(x)
-
-    def forward_last_selfattention(self, x):
-
-        for i, layer in enumerate(self.layers):
-            if i < len(self.layers) - 1:
-                x = layer(x)
-            else:
-                x, attns = layer.forward_with_attention(x)
-                return attns[-1]
-
-    def forward_all_selfattention(self, x):
-        attn_out = []
-
-        for layer in self.layers:
-            x, attns = layer.forward_with_attention(x)
-            attn_out += attns
-
-        return attn_out
-
-
-    def forward_return_n_last_blocks(self, x, n=1, return_patch_avgpool=False, depth=[]):
-
-        num_blks = sum(depth)
-        start_idx = num_blks - n
-
-        sum_cur = 0
-        for i, d in enumerate(depth):
-            sum_cur_new = sum_cur + d
-            if start_idx >= sum_cur and start_idx < sum_cur_new:
-                start_stage = i
-                start_blk = start_idx - sum_cur
-            sum_cur = sum_cur_new
-
-
-        x = self.patch_embed(x)
-        if self.ape:
-            x = x + self.absolute_pos_embed
-        x = self.pos_drop(x)
-
-        # we will return the averaged token features from the `n` last blocks
-        # note: there is no [CLS] token in Swin Transformer
-        output = []
-        s = 0
-        for i, layer in enumerate(self.layers):
-            x, fea = layer.forward_with_features(x)
-
-            if i >= start_stage:
-                for x_ in fea[start_blk:]:
-
-                    if i == len(self.layers)-1: # use the norm in the last stage
-                        x_ = self.norm(x_)
-
-                    x_avg = torch.flatten(self.avgpool(x_.transpose(1, 2)), 1)  # B C     
-                    # print(f'Stage {i},  x_avg {x_avg.shape}')          
-                    output.append(x_avg)
-
-                start_blk = 0
-
-        return torch.cat(output, dim=-1)
-
-
-
-    def flops(self):
-        flops = 0
-        flops += self.patch_embed.flops()
-        for i, layer in enumerate(self.layers):
-            flops += layer.flops()
-            if dist.get_rank() == 0:
-                print(f"GFLOPs layer_{i}: {layer.flops() / 1e9}")
-        flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers)
-        flops += self.num_features * self.num_classes
-        return flops
-
-    def init_weights(self, pretrained='', pretrained_layers=[], verbose=True):
-        if os.path.isfile(pretrained):
-            pretrained_dict = torch.load(pretrained, map_location='cpu')
-            logging.info(f'=> loading pretrained model {pretrained}')
-            model_dict = self.state_dict()
-            pretrained_dict = {
-                k: v for k, v in pretrained_dict.items()
-                if k in model_dict.keys()
-            }
-            need_init_state_dict = {}
-            for k, v in pretrained_dict.items():
-                need_init = (
-                        k.split('.')[0] in pretrained_layers
-                        or pretrained_layers[0] is '*'
-                        or 'relative_position_index' not in k
-                        or 'attn_mask' not in k
-                )
-
-                if need_init:
-                    if verbose:
-                        logging.info(f'=> init {k} from {pretrained}')
-
-                    if 'relative_position_bias_table' in k and v.size() != model_dict[k].size():
-                        relative_position_bias_table_pretrained = v
-                        relative_position_bias_table_current = model_dict[k]
-                        L1, nH1 = relative_position_bias_table_pretrained.size()
-                        L2, nH2 = relative_position_bias_table_current.size()
-                        if nH1 != nH2:
-                            logging.info(f"Error in loading {k}, passing")
-                        else:
-                            if L1 != L2:
-                                logging.info(
-                                    '=> load_pretrained: resized variant: {} to {}'
-                                        .format((L1, nH1), (L2, nH2))
-                                )
-                                S1 = int(L1 ** 0.5)
-                                S2 = int(L2 ** 0.5)
-                                relative_position_bias_table_pretrained_resized = torch.nn.functional.interpolate(
-                                    relative_position_bias_table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
-                                    size=(S2, S2),
-                                    mode='bicubic')
-                                v = relative_position_bias_table_pretrained_resized.view(nH2, L2).permute(1, 0)
-
-                    if 'absolute_pos_embed' in k and v.size() != model_dict[k].size():
-                        absolute_pos_embed_pretrained = v
-                        absolute_pos_embed_current = model_dict[k]
-                        _, L1, C1 = absolute_pos_embed_pretrained.size()
-                        _, L2, C2 = absolute_pos_embed_current.size()
-                        if C1 != C1:
-                            logging.info(f"Error in loading {k}, passing")
-                        else:
-                            if L1 != L2:
-                                logging.info(
-                                    '=> load_pretrained: resized variant: {} to {}'
-                                        .format((1, L1, C1), (1, L2, C2))
-                                )
-                                S1 = int(L1 ** 0.5)
-                                S2 = int(L2 ** 0.5)
-                                absolute_pos_embed_pretrained = absolute_pos_embed_pretrained.reshape(-1, S1, S1, C1)
-                                absolute_pos_embed_pretrained = absolute_pos_embed_pretrained.permute(0, 3, 1, 2)
-                                absolute_pos_embed_pretrained_resized = torch.nn.functional.interpolate(
-                                    absolute_pos_embed_pretrained, size=(S2, S2), mode='bicubic')
-                                v = absolute_pos_embed_pretrained_resized.permute(0, 2, 3, 1).flatten(1, 2)
-
-                    need_init_state_dict[k] = v
-            self.load_state_dict(need_init_state_dict, strict=False)
-
-    def freeze_pretrained_layers(self, frozen_layers=[]):
-        for name, module in self.named_modules():
-            if (
-                    name.split('.')[0] in frozen_layers
-                    or '.'.join(name.split('.')[0:2]) in frozen_layers
-                    or (len(frozen_layers) > 0 and frozen_layers[0] is '*')
-            ):
-                for _name, param in module.named_parameters():
-                    param.requires_grad = False
-                logging.info(
-                    '=> set param {} requires grad to False'
-                        .format(name)
-                )
-        for name, param in self.named_parameters():
-            if (
-                    name.split('.')[0] in frozen_layers
-                    or (len(frozen_layers) > 0 and frozen_layers[0] is '*')
-                    and param.requires_grad is True
-            ):
-                param.requires_grad = False
-                logging.info(
-                    '=> set param {} requires grad to False'
-                        .format(name)
-                )
-        return self
-
-
-def get_swin(is_teacher=False):
-    args = get_args()
-
-    if args.swin_backbone_type == "tiny":
-        embed_dim = 96
-        depths = [2, 2, 6, 2]
-        num_heads = [3, 6, 12, 24]
-        drop_path_rate = 0.1
-    elif args.swin_backbone_type == 'h3':
-        embed_dim = 384
-        depths = [2, 2, 18, 2]
-        num_heads = [6, 12, 24, 48]
-        drop_path_rate = 0.2
-    else:
-        embed_dim = 128
-        depths = [2, 2, 18, 2]
-        num_heads = [4, 8, 16, 32]
-        drop_path_rate = 0.2
-
-    swin = SwinTransformer(
-        img_size=224,
-        in_chans=3,
-        num_classes=1000,
-        patch_size=4,
-        embed_dim=embed_dim,
-        depths=depths,
-        num_heads=num_heads,
-        window_size=7,
-        mlp_ratio=4,
-        qkv_bias=True,
-        drop_rate=0,
-        attn_drop_rate=0,
-        drop_path_rate=(0.0 if is_teacher else drop_path_rate),
-        norm_layer=partial(LayerNorm, eps=1e-6),
-        ape=False,
-        patch_norm=True,
-    )
-
-    return swin
-
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/inpainting.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/inpainting.py
deleted file mode 100644
index f84faac206e85c11532b6aacebaf7c3e9da12af1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/inpainting.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import math
-import apex
-import einops
-import torch
-import torch.nn.functional as F
-from megatron_ds import get_args, print_rank_0
-from megatron_ds.model.utils import get_linear_layer
-from megatron_ds.model.vision.vit_backbone import VitBackbone
-from megatron_ds.model.module import MegatronModule
-from megatron_ds.model.vision.mit_backbone import mit_b3
-from megatron_ds.model.vision.utils import resize
-
-
-class VitInpaintingModel(MegatronModule):
-
-    def __init__(self, config, pre_process=True, post_process=True):
-        super(VitInpaintingModel, self).__init__()
-        args = get_args()
-
-        self.config = config
-        self.pre_process = pre_process
-        self.post_process = post_process
-        self.hidden_size = config.hidden_size
-        self.backbone = VitBackbone(
-            config=config,
-            pre_process=self.pre_process,
-            post_process=self.post_process,
-            class_token=False,
-        )
-        self.patch_dim = args.patch_dim
-        self.img_h = args.img_h
-        self.img_w = args.img_w
-        self.seq_length = args.seq_length
-        # full mask
-
-        if self.post_process:
-            self.linear_decoder = get_linear_layer(
-                self.hidden_size,
-                self.backbone.flatten_dim,
-                torch.nn.init.zeros_
-            )
-
-    def set_input_tensor(self, input_tensor):
-        self.backbone.set_input_tensor(input_tensor)
-
-    def forward(self, input):
-
-        hidden_states = self.backbone(input)
-
-        if not self.post_process:
-            return hidden_states
-        decoded_output = self.linear_decoder(hidden_states)
-        output = einops.rearrange(
-                decoded_output,
-                "b (h w) (p1 p2 c) -> b c (h p1) (w p2)",
-                p1=self.patch_dim,
-                p2=self.patch_dim,
-                h=self.img_h//self.patch_dim,
-                w=self.img_w//self.patch_dim,
-            )
-
-        return output
-
-
-class MLP(torch.nn.Module):
-    """
-    Linear Embedding
-    """
-    def __init__(self, input_dim=2048, embed_dim=768):
-        super().__init__()
-        self.proj = torch.nn.Linear(input_dim, embed_dim)
-
-    def forward(self, x):
-        x = x.flatten(2).transpose(1, 2)
-        x = self.proj(x)
-        return x
-
-
-class MitInpaintingModel(MegatronModule):
-    """Mix vision Transformer Model."""
-
-    def __init__(self, pre_process=True, post_process=True):
-        super(MitInpaintingModel, self).__init__()
-        self.pre_process = pre_process
-        self.post_process = post_process
-
-        args = get_args()
-        self.patch_dim = args.patch_dim
-        self.img_h = args.img_h
-        self.img_w = args.img_w
-        self.flatten_dim = self.patch_dim * self.patch_dim * 3
-        self.backbone = mit_b3()
-
-        self.in_channels = [64, 128, 320, 512]
-        self.embedding_dim = 768
-
-        c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = self.in_channels
-
-        self.linear_c4 = MLP(input_dim=c4_in_channels, embed_dim=self.embedding_dim)
-        self.linear_c3 = MLP(input_dim=c3_in_channels, embed_dim=self.embedding_dim)
-        self.linear_c2 = MLP(input_dim=c2_in_channels, embed_dim=self.embedding_dim)
-        self.linear_c1 = MLP(input_dim=c1_in_channels, embed_dim=self.embedding_dim)
-
-        self.conv_fuse = torch.nn.Conv2d(self.embedding_dim*4, self.embedding_dim, 1, 1, bias=False)
-        self.norm = apex.parallel.SyncBatchNorm(self.embedding_dim)
-        self.dropout = torch.nn.Dropout2d(0.1)
-
-        self.linear_pred = torch.nn.Conv2d(self.embedding_dim, self.flatten_dim, kernel_size=1)
-
-    def set_input_tensor(self, input_tensor):
-        """See megatron_ds.model.transformer.set_input_tensor()"""
-        pass
-
-    def forward(self, input):
-        c1, c2, c3, c4 = self.backbone(input)
-
-        n, _, h, w = c4.shape
-        _c4 = self.linear_c4(c4).permute(0, 2, 1).reshape(n, -1, c4.shape[2], c4.shape[3])
-        _c4 = resize(_c4, size=c1.size()[2:], mode='bilinear', align_corners=False)
-
-        _c3 = self.linear_c3(c3).permute(0, 2, 1).reshape(n, -1, c3.shape[2], c3.shape[3])
-        _c3 = resize(_c3, size=c1.size()[2:], mode='bilinear', align_corners=False)
-
-        _c2 = self.linear_c2(c2).permute(0, 2, 1).reshape(n, -1, c2.shape[2], c2.shape[3])
-        _c2 = resize(_c2, size=c1.size()[2:], mode='bilinear', align_corners=False)
-
-        _c1 = self.linear_c1(c1).permute(0, 2, 1).reshape(n, -1, c1.shape[2], c1.shape[3])
-
-        _c = torch.cat([_c4, _c3, _c2, _c1], dim=1)
-        _c = self.conv_fuse(_c)
-
-        x = self.norm(_c)
-        x = F.relu(x, inplace=True)
-        x = self.dropout(x)
-
-        x = self.linear_pred(x)
-
-        output = einops.rearrange(
-            x,
-            "b (c p1 p2) h w -> b c (h p1) (w p2)",
-            p1=self.patch_dim,
-            p2=self.patch_dim,
-            h=self.img_h//self.patch_dim,
-            w=self.img_w//self.patch_dim,
-        )
-
-        return output
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/knn_monitor.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/knn_monitor.py
deleted file mode 100644
index 4882a5480fdeba09c8497e99a25aad4728953082..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/knn_monitor.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import torch.nn.functional as F
-import torch
-from megatron_ds import print_rank_0, get_args
-from megatron_ds.core import mpu
-from megatron_ds.data.vit_dataset import ClassificationTransform
-from megatron_ds.data.image_folder import ImageFolder
-
-_FEATURE_BANK = None
-
-
-def build_data_loader(dataset, drop_last=True, shuffle=False):
-    """Data loader. Note that batch-size is the local (per GPU) batch-size."""
-    # Sampler.
-    args = get_args()
-    micro_batch_size = 16
-    num_workers = args.num_workers
-    world_size = mpu.get_data_parallel_world_size()
-    rank = mpu.get_data_parallel_rank()
-    sampler = torch.utils.data.distributed.DistributedSampler(
-        dataset, num_replicas=world_size, rank=rank,
-        drop_last=drop_last, shuffle=shuffle
-    )
-
-    # Data loader. Note that batch size is the per GPU batch size.
-    data_loader = torch.utils.data.DataLoader(
-        dataset,
-        batch_size=micro_batch_size,
-        sampler=sampler,
-        shuffle=False,
-        num_workers=num_workers,
-        drop_last=not drop_last,
-        pin_memory=True,
-    )
-    return data_loader
-
-
-def compute_feature_bank(model):
-    args = get_args()
-    global _FEATURE_BANK
-    feature_bank = []
-    feature_label = []
-
-    train_ds = ImageFolder(
-        root=args.data_path[0],
-        transform=ClassificationTransform((args.img_h, args.img_w), train=False),
-        data_per_class_fraction=1.0
-    )
-    classes = len(train_ds.classes)
-    dataloader = build_data_loader(train_ds)
-     
-    for m in model:
-        m.eval()
-
-    with torch.no_grad():
-        for i, batch in enumerate(dataloader):
-            images = batch[0].cuda().contiguous()
-            labels = batch[1].cuda().contiguous()
-            student_feature, teacher_feature = model[0](images)
-            feature = F.normalize(teacher_feature.float(), dim=1)
-            feature_bank.append(feature)
-            feature_label.append(labels)
-    
-    for m in model:
-        m.train()
-
-    # [N', D]
-    feature_bank = torch.cat(feature_bank, dim=0).contiguous()
-    feature_label = torch.cat(feature_label, dim=0).contiguous()
-
-    feature_banks = [torch.zeros_like(feature_bank)
-                     for i in range(mpu.get_data_parallel_world_size())]
-    torch.distributed.all_gather(feature_banks,
-                                 feature_bank,
-                                 group=mpu.get_data_parallel_group())
-
-    assert torch.all(torch.eq(feature_banks[mpu.get_data_parallel_rank()],
-                              feature_bank))
-
-    feature_labels = [torch.zeros_like(feature_label)
-                      for i in range(mpu.get_data_parallel_world_size())]
-    torch.distributed.all_gather(feature_labels,
-                                 feature_label,
-                                 group=mpu.get_data_parallel_group())
-
-    # [D, N]
-    feature_banks = torch.cat(feature_banks, dim=0).t().contiguous()
-    # [N]
-    feature_labels = torch.cat(feature_labels, dim=0).contiguous()
-    print_rank_0("feature_banks size is {}".format(feature_banks.size()))
-    print_rank_0("feature labels size is {}".format(feature_labels.size()))
-
-    _FEATURE_BANK = (feature_banks, feature_labels, classes)
-
-
-def get_feature_bank():
-    global _FEATURE_BANK
-    assert _FEATURE_BANK is not None
-    return _FEATURE_BANK
-
-
-# knn monitor as in InstDisc https://arxiv.org/abs/1805.01978
-# implementation follows http://github.com/zhirongw/lemniscate.pytorch and
-# https://github.com/leftthomas/SimCLR
-def knn_predict(feature, feature_bank, feature_labels, classes, knn_k, knn_t):
-    # compute cos similarity between each feature vector and feature bank ---> [B, N]
-    sim_matrix = torch.mm(feature, feature_bank)
-    # [B, K]
-    sim_weight, sim_indices = sim_matrix.topk(k=knn_k, dim=-1)
-    # [B, K]
-    sim_labels = torch.gather(feature_labels.expand(feature.size(0), -1),
-                              dim=-1,
-                              index=sim_indices)
-    sim_weight = (sim_weight / knn_t).exp()
-
-    # counts for each class
-    one_hot_label = torch.zeros(feature.size(0) * knn_k,
-                                classes,
-                                device=sim_labels.device)
-    # [B*K, C]
-    one_hot_label = one_hot_label.scatter(dim=-1,
-                                          index=sim_labels.view(-1, 1),
-                                          value=1.0)
-    # weighted score ---> [B, C]
-    pred_scores = torch.sum(
-            one_hot_label.view(feature.size(0), -1, classes) * sim_weight.unsqueeze(dim=-1),
-            dim=1)
-
-    pred_labels = pred_scores.argsort(dim=-1, descending=True)
-    return pred_labels
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/mit_backbone.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/mit_backbone.py
deleted file mode 100644
index 4a3c5f75259e2d3f3acd8bf022b8b56c76222f4b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/mit_backbone.py
+++ /dev/null
@@ -1,415 +0,0 @@
-# Copyright (c) 2023, NVIDIA Corporation. All rights reserved.
-
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from functools import partial
-from torch.nn.init import trunc_normal_
-from megatron_ds.model.transformer import DropPath
-from megatron_ds.model import LayerNorm
-
-
-class Mlp(nn.Module):
-    def __init__(self,
-                 in_features,
-                 hidden_features=None,
-                 out_features=None,
-                 act_layer=nn.GELU,
-                 drop=0.):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-        self.fc1 = nn.Linear(in_features, hidden_features)
-        self.dwconv = DWConv(hidden_features)
-        self.act = act_layer()
-        self.fc2 = nn.Linear(hidden_features, out_features)
-        self.drop = nn.Dropout(drop)
-
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-        elif isinstance(m, nn.Conv2d):
-            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
-            fan_out //= m.groups
-            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
-            if m.bias is not None:
-                m.bias.data.zero_()
-
-    def forward(self, x, H, W):
-        x = self.fc1(x)
-        x = self.dwconv(x, H, W)
-        x = self.act(x)
-        x = self.drop(x)
-        x = self.fc2(x)
-        x = self.drop(x)
-        return x
-
-
-class Attention(nn.Module):
-    def __init__(self,
-                 dim,
-                 num_heads=8,
-                 qkv_bias=False,
-                 qk_scale=None,
-                 attn_drop=0.,
-                 proj_drop=0.,
-                 sr_ratio=1):
-        super().__init__()
-        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
-
-        self.dim = dim
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim ** -0.5
-
-        self.q = nn.Linear(dim, dim, bias=qkv_bias)
-        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(dim, dim)
-        self.proj_drop = nn.Dropout(proj_drop)
-
-        self.sr_ratio = sr_ratio
-        if sr_ratio > 1:
-            self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
-            self.norm = LayerNorm(dim)
-
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-        elif isinstance(m, nn.Conv2d):
-            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
-            fan_out //= m.groups
-            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
-            if m.bias is not None:
-                m.bias.data.zero_()
-
-    def forward(self, x, H, W):
-        B, N, C = x.shape
-        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
-
-        if self.sr_ratio > 1:
-            x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
-            x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
-            x_ = self.norm(x_)
-            kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
-        else:
-            kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
-        k, v = kv[0], kv[1]
-
-        attn = (q @ k.transpose(-2, -1)) * self.scale
-        attn = attn.softmax(dim=-1)
-        attn = self.attn_drop(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-
-        return x
-
-
-class Block(nn.Module):
-
-    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
-                 drop_path=0., act_layer=nn.GELU, norm_layer=LayerNorm, sr_ratio=1):
-        super().__init__()
-        self.norm1 = norm_layer(dim)
-        self.attn = Attention(
-            dim,
-            num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
-            attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio)
-        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
-        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-        self.norm2 = norm_layer(dim)
-        mlp_hidden_dim = int(dim * mlp_ratio)
-        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
-
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-        elif isinstance(m, nn.Conv2d):
-            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
-            fan_out //= m.groups
-            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
-            if m.bias is not None:
-                m.bias.data.zero_()
-
-    def forward(self, x, H, W):
-        x = x + self.drop_path(self.attn(self.norm1(x), H, W))
-        x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
-
-        return x
-
-
-class OverlapPatchEmbed(nn.Module):
-    """ Image to Patch Embedding
-    """
-
-    def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768):
-        super().__init__()
-        img_size = (img_size, img_size)
-        patch_size = (patch_size, patch_size)
-
-        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
-                              padding=(patch_size[0] // 2, patch_size[1] // 2))
-        self.norm = LayerNorm(embed_dim)
-
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-        elif isinstance(m, nn.Conv2d):
-            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
-            fan_out //= m.groups
-            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
-            if m.bias is not None:
-                m.bias.data.zero_()
-
-    def forward(self, x):
-        x = self.proj(x)
-        _, _, H, W = x.shape
-        x = x.flatten(2).transpose(1, 2)
-        x = self.norm(x)
-
-        return x, H, W
-
-
-class MixVisionTransformer(nn.Module):
-    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
-                 num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
-                 attn_drop_rate=0., drop_path_rate=0., norm_layer=LayerNorm,
-                 depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], output_avg=False):
-        super().__init__()
-        self.num_classes = num_classes
-        self.depths = depths
-        self.output_avg = output_avg
-
-        # patch_embed
-        self.patch_embed1 = OverlapPatchEmbed(img_size=img_size, patch_size=7, stride=4, in_chans=in_chans,
-                                              embed_dim=embed_dims[0])
-        self.patch_embed2 = OverlapPatchEmbed(img_size=img_size // 4, patch_size=3, stride=2, in_chans=embed_dims[0],
-                                              embed_dim=embed_dims[1])
-        self.patch_embed3 = OverlapPatchEmbed(img_size=img_size // 8, patch_size=3, stride=2, in_chans=embed_dims[1],
-                                              embed_dim=embed_dims[2])
-        self.patch_embed4 = OverlapPatchEmbed(img_size=img_size // 16, patch_size=3, stride=2, in_chans=embed_dims[2],
-                                              embed_dim=embed_dims[3])
-
-        # transformer encoder
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
-        cur = 0
-        self.block1 = nn.ModuleList([Block(
-            dim=embed_dims[0], num_heads=num_heads[0], mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias, qk_scale=qk_scale,
-            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
-            sr_ratio=sr_ratios[0])
-            for i in range(depths[0])])
-        self.norm1 = norm_layer(embed_dims[0])
-
-        cur += depths[0]
-        self.block2 = nn.ModuleList([Block(
-            dim=embed_dims[1], num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, qk_scale=qk_scale,
-            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
-            sr_ratio=sr_ratios[1])
-            for i in range(depths[1])])
-        self.norm2 = norm_layer(embed_dims[1])
-
-        cur += depths[1]
-        self.block3 = nn.ModuleList([Block(
-            dim=embed_dims[2], num_heads=num_heads[2], mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, qk_scale=qk_scale,
-            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
-            sr_ratio=sr_ratios[2])
-            for i in range(depths[2])])
-        self.norm3 = norm_layer(embed_dims[2])
-
-        cur += depths[2]
-        self.block4 = nn.ModuleList([Block(
-            dim=embed_dims[3], num_heads=num_heads[3], mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, qk_scale=qk_scale,
-            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
-            sr_ratio=sr_ratios[3])
-            for i in range(depths[3])])
-        self.norm4 = norm_layer(embed_dims[3])
-
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-        elif isinstance(m, nn.Conv2d):
-            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
-            fan_out //= m.groups
-            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
-            if m.bias is not None:
-                m.bias.data.zero_()
-
-    def reset_drop_path(self, drop_path_rate):
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))]
-        cur = 0
-        for i in range(self.depths[0]):
-            self.block1[i].drop_path.drop_prob = dpr[cur + i]
-
-        cur += self.depths[0]
-        for i in range(self.depths[1]):
-            self.block2[i].drop_path.drop_prob = dpr[cur + i]
-
-        cur += self.depths[1]
-        for i in range(self.depths[2]):
-            self.block3[i].drop_path.drop_prob = dpr[cur + i]
-
-        cur += self.depths[2]
-        for i in range(self.depths[3]):
-            self.block4[i].drop_path.drop_prob = dpr[cur + i]
-
-    def freeze_patch_emb(self):
-        self.patch_embed1.requires_grad = False
-
-    def forward_features(self, x):
-        B = x.shape[0]
-        outs = []
-
-        # stage 1
-        x, H, W = self.patch_embed1(x)
-        for i, blk in enumerate(self.block1):
-            x = blk(x, H, W)
-        x = self.norm1(x)
-        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
-        outs.append(x)
-
-        # stage 2
-        x, H, W = self.patch_embed2(x)
-        for i, blk in enumerate(self.block2):
-            x = blk(x, H, W)
-        x = self.norm2(x)
-        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
-        outs.append(x)
-
-        # stage 3
-        x, H, W = self.patch_embed3(x)
-        for i, blk in enumerate(self.block3):
-            x = blk(x, H, W)
-        x = self.norm3(x)
-        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
-        outs.append(x)
-
-        # stage 4
-        x, H, W = self.patch_embed4(x)
-        for i, blk in enumerate(self.block4):
-            x = blk(x, H, W)
-        x = self.norm4(x)
-        if not self.output_avg:
-            x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
-        outs.append(x)
-
-        return outs
-
-    def forward(self, x):
-        x = self.forward_features(x)
-    
-        if self.output_avg:
-            x = x[3].mean(dim=1)
-
-        return x
-
-
-class DWConv(nn.Module):
-    def __init__(self, dim=768):
-        super(DWConv, self).__init__()
-        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
-
-    def forward(self, x, H, W):
-        B, N, C = x.shape
-        x = x.transpose(1, 2).view(B, C, H, W)
-        x = self.dwconv(x)
-        x = x.flatten(2).transpose(1, 2)
-
-        return x
-
-class mit_b0(MixVisionTransformer):
-    def __init__(self, **kwargs):
-        super(mit_b0, self).__init__(
-            patch_size=4, embed_dims=[32, 64, 160, 256], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
-            qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
-            drop_rate=0.0, drop_path_rate=0.1)
-
-
-class mit_b1(MixVisionTransformer):
-    def __init__(self, **kwargs):
-        super(mit_b1, self).__init__(
-            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
-            qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
-            drop_rate=0.0, drop_path_rate=0.1)
-
-
-class mit_b2(MixVisionTransformer):
-    def __init__(self, **kwargs):
-        super(mit_b2, self).__init__(
-            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
-            qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1],
-            drop_rate=0.0, drop_path_rate=0.1)
-
- 
-class mit_b3(MixVisionTransformer):
-    def __init__(self, **kwargs):
-        super(mit_b3, self).__init__(
-            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
-            qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1],
-            drop_rate=0.0, drop_path_rate=0.1)
-
-class mit_b3_avg(MixVisionTransformer):
-    def __init__(self, drop_path_rate=0.1, **kwargs):
-        super(mit_b3_avg, self).__init__(
-            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
-            qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1],
-            drop_rate=0.0, drop_path_rate=drop_path_rate, output_avg=True)
-
-class mit_b4(MixVisionTransformer):
-    def __init__(self, **kwargs):
-        super(mit_b4, self).__init__(
-            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
-            qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 8, 27, 3], sr_ratios=[8, 4, 2, 1],
-            drop_rate=0.0, drop_path_rate=0.1)
-
-class mit_b5(MixVisionTransformer):
-    def __init__(self, **kwargs):
-        super(mit_b5, self).__init__(
-            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
-            qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1],
-            drop_rate=0.0, drop_path_rate=0.1)
-
-class mit_b5_avg(MixVisionTransformer):
-    def __init__(self, drop_path_rate=0.1, **kwargs):
-        super(mit_b5_avg, self).__init__(
-            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
-            qkv_bias=True, norm_layer=partial(LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1],
-            drop_rate=0.0, drop_path_rate=drop_path_rate, output_avg=True)
-
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/swin_backbone.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/swin_backbone.py
deleted file mode 100644
index 2c6a88584547116382895e2ad4edef2610a93349..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/swin_backbone.py
+++ /dev/null
@@ -1,625 +0,0 @@
-# Copyright (c) 2021 Microsoft
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-# --------------------------------------------------------
-# Swin Transformer
-# --------------------------------------------------------
-
-import torch
-import torch.nn as nn
-import torch.utils.checkpoint as checkpoint
-from timm.models.layers import DropPath, to_2tuple, trunc_normal_
-from math import sqrt
-
-from megatron_ds import get_args
-from functools import partial
-
-
-class Mlp(nn.Module):
-    def __init__(self, in_features, hidden_features=None,
-                 out_features=None, act_layer=nn.GELU, drop=0.):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-        self.fc1 = nn.Linear(in_features, hidden_features)
-        self.act = act_layer()
-        self.fc2 = nn.Linear(hidden_features, out_features)
-        self.drop = nn.Dropout(drop)
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.act(x)
-        x = self.drop(x)
-        x = self.fc2(x)
-        x = self.drop(x)
-        return x
-
-
-def window_partition(x, window_size):
-    """
-    Args:
-        x: (B, H, W, C)
-        window_size (int): window size
-
-    Returns:
-        windows: (num_windows*B, window_size, window_size, C)
-    """
-    B, H, W, C = x.shape
-    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
-    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
-    return windows
-
-
-def window_reverse(windows, window_size, H, W):
-    """
-    Args:
-        windows: (num_windows*B, window_size, window_size, C)
-        window_size (int): Window size
-        H (int): Height of image
-        W (int): Width of image
-
-    Returns:
-        x: (B, H, W, C)
-    """
-    B = int(windows.shape[0] / (H * W / window_size / window_size))
-    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
-    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
-    return x
-
-
-class WindowAttention(nn.Module):
-    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
-    It supports both of shifted and non-shifted window.
-
-    Args:
-        dim (int): Number of input channels.
-        window_size (tuple[int]): The height and width of the window.
-        num_heads (int): Number of attention heads.
-        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
-        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
-        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
-    """
-
-    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
-
-        super().__init__()
-        self.dim = dim
-        self.window_size = window_size  # Wh, Ww
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim ** -0.5
-
-        # define a parameter table of relative position bias
-        self.relative_position_bias_table = nn.Parameter(
-            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH
-
-        # get pair-wise relative position index for each token inside the window
-        coords_h = torch.arange(self.window_size[0])
-        coords_w = torch.arange(self.window_size[1])
-        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
-        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
-        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
-        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
-        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
-        relative_coords[:, :, 1] += self.window_size[1] - 1
-        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
-        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
-        self.register_buffer("relative_position_index", relative_position_index)
-
-        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(dim, dim)
-        self.proj_drop = nn.Dropout(proj_drop)
-
-        trunc_normal_(self.relative_position_bias_table, std=.02)
-        self.softmax = nn.Softmax(dim=-1)
-
-    def forward(self, x, mask=None):
-        """
-        Args:
-            x: input features with shape of (num_windows*B, N, C)
-            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
-        """
-        B_, N, C = x.shape
-        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
-
-        q = q * self.scale
-        attn = (q @ k.transpose(-2, -1))
-
-        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
-            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
-        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
-        attn = attn + relative_position_bias.unsqueeze(0)
-
-        if mask is not None:
-            nW = mask.shape[0]
-            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
-            attn = attn.view(-1, self.num_heads, N, N)
-            attn = self.softmax(attn)
-        else:
-            attn = self.softmax(attn)
-
-        attn = self.attn_drop(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x
-
-    def extra_repr(self) -> str:
-        return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
-
-    def flops(self, N):
-        # calculate flops for 1 window with token length of N
-        flops = 0
-        # qkv = self.qkv(x)
-        flops += N * self.dim * 3 * self.dim
-        # attn = (q @ k.transpose(-2, -1))
-        flops += self.num_heads * N * (self.dim // self.num_heads) * N
-        #  x = (attn @ v)
-        flops += self.num_heads * N * N * (self.dim // self.num_heads)
-        # x = self.proj(x)
-        flops += N * self.dim * self.dim
-        return flops
-
-
-class SwinTransformerBlock(nn.Module):
-    r""" Swin Transformer Block.
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resulotion.
-        num_heads (int): Number of attention heads.
-        window_size (int): Window size.
-        shift_size (int): Shift size for SW-MSA.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float, optional): Stochastic depth rate. Default: 0.0
-        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
-        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
-    """
-
-    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
-                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
-                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.num_heads = num_heads
-        self.window_size = window_size
-        self.shift_size = shift_size
-        self.mlp_ratio = mlp_ratio
-        if min(self.input_resolution) <= self.window_size:
-            # if window size is larger than input resolution, we don't partition windows
-            self.shift_size = 0
-            self.window_size = min(self.input_resolution)
-        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
-
-        self.norm1 = norm_layer(dim)
-        self.attn = WindowAttention(
-            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
-            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
-
-        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-        self.norm2 = norm_layer(dim)
-        mlp_hidden_dim = int(dim * mlp_ratio)
-        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
-
-        self.H = input_resolution[0]
-        self.W = input_resolution[1]
-
-        self.attn_mask_dict = {} 
-
-    def create_attn_mask(self, H, W):
-        # calculate attention mask for SW-MSA
-
-        Hp = int(np.ceil(H / self.window_size)) * self.window_size
-        Wp = int(np.ceil(W / self.window_size)) * self.window_size
-        img_mask = torch.zeros((1, Hp, Wp, 1))  # 1 Hp Wp 1
-        h_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        w_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        cnt = 0
-        for h in h_slices:
-            for w in w_slices:
-                img_mask[:, h, w, :] = cnt
-                cnt += 1
-
-        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
-        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
-        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
-        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
-
-        return attn_mask
-
-
-    def forward(self, x):
-        B, L, C = x.shape
-        H = int(sqrt(L))
-        W = H
-
-        shortcut = x
-        x = self.norm1(x)
-        x = x.view(B, H, W, C)
-
-        # cyclic shift
-        if self.shift_size > 0:
-            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
-        else:
-            shifted_x = x
-
-        # partition windows
-        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
-        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C
-
-        # W-MSA/SW-MSA
-        attn_windows = self.attn(x_windows, mask=self.attn_mask)  # nW*B, window_size*window_size, C
-
-        # merge windows
-        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
-        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C
-
-        # reverse cyclic shift
-        if self.shift_size > 0:
-            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
-        else:
-            x = shifted_x
-        x = x.view(B, H * W, C)
-
-        # FFN
-        x = shortcut + self.drop_path(x)
-        x = x + self.drop_path(self.mlp(self.norm2(x)))
-
-        return x
-
-    def extra_repr(self) -> str:
-        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
-               f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
-
-    def flops(self):
-        flops = 0
-        H, W = self.input_resolution
-        # norm1
-        flops += self.dim * H * W
-        # W-MSA/SW-MSA
-        nW = H * W / self.window_size / self.window_size
-        flops += nW * self.attn.flops(self.window_size * self.window_size)
-        # mlp
-        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
-        # norm2
-        flops += self.dim * H * W
-        return flops
-
-
-class PatchMerging(nn.Module):
-    r""" Patch Merging Layer.
-
-    Args:
-        input_resolution (tuple[int]): Resolution of input feature.
-        dim (int): Number of input channels.
-        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
-    """
-
-    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.input_resolution = input_resolution
-        self.dim = dim
-        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
-        self.norm = norm_layer(4 * dim)
-
-    def forward(self, x):
-        """
-        x: B, H*W, C
-        """
-        H, W = self.input_resolution
-        B, L, C = x.shape
-        assert L == H * W, "input feature has wrong size"
-        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."
-
-        x = x.view(B, H, W, C)
-
-        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
-        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
-        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
-        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
-        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
-        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
-
-        x = self.norm(x)
-        x = self.reduction(x)
-
-        return x
-
-    def extra_repr(self) -> str:
-        return f"input_resolution={self.input_resolution}, dim={self.dim}"
-
-    def flops(self):
-        H, W = self.input_resolution
-        flops = H * W * self.dim
-        flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
-        return flops
-
-
-class BasicLayer(nn.Module):
-    """ A basic Swin Transformer layer for one stage.
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Local window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
-    """
-
-    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
-                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
-                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False):
-
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.depth = depth
-        self.use_checkpoint = use_checkpoint
-
-        # build blocks
-        self.blocks = nn.ModuleList([
-            SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
-                                 num_heads=num_heads, window_size=window_size,
-                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
-                                 mlp_ratio=mlp_ratio,
-                                 qkv_bias=qkv_bias, qk_scale=qk_scale,
-                                 drop=drop, attn_drop=attn_drop,
-                                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
-                                 norm_layer=norm_layer)
-            for i in range(depth)])
-
-        # patch merging layer
-        if downsample is not None:
-            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
-        else:
-            self.downsample = None
-
-    def forward(self, x):
-        for blk in self.blocks:
-            if self.use_checkpoint:
-                x = checkpoint.checkpoint(blk, x)
-            else:
-                x = blk(x)
-        x_b4_ds = x
-        if self.downsample is not None:
-            x = self.downsample(x)
-        return x_b4_ds, x
-
-    def extra_repr(self) -> str:
-        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
-
-    def flops(self):
-        flops = 0
-        for blk in self.blocks:
-            flops += blk.flops()
-        if self.downsample is not None:
-            flops += self.downsample.flops()
-        return flops
-
-
-class PatchEmbed(nn.Module):
-    r""" Image to Patch Embedding
-
-    Args:
-        img_size (int): Image size.  Default: 224.
-        patch_size (int): Patch token size. Default: 4.
-        in_chans (int): Number of input image channels. Default: 3.
-        embed_dim (int): Number of linear projection output channels. Default: 96.
-        norm_layer (nn.Module, optional): Normalization layer. Default: None
-    """
-
-    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
-        super().__init__()
-        img_size = to_2tuple(img_size)
-        patch_size = to_2tuple(patch_size)
-        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.patches_resolution = patches_resolution
-        self.num_patches = patches_resolution[0] * patches_resolution[1]
-
-        self.in_chans = in_chans
-        self.embed_dim = embed_dim
-
-        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
-        if norm_layer is not None:
-            self.norm = norm_layer(embed_dim)
-        else:
-            self.norm = None
-
-    def forward(self, x):
-        B, C, H, W = x.shape
-        # FIXME look at relaxing size constraints
-        assert H == self.img_size[0] and W == self.img_size[1], \
-            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
-        x = self.proj(x).flatten(2).transpose(1, 2)  # B Ph*Pw C
-        if self.norm is not None:
-            x = self.norm(x)
-        return x
-
-    def flops(self):
-        Ho, Wo = self.patches_resolution
-        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
-        if self.norm is not None:
-            flops += Ho * Wo * self.embed_dim
-        return flops
-
-
-class SwinTransformer(nn.Module):
-    r""" Swin Transformer
-        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
-          https://arxiv.org/pdf/2103.14030
-
-    Args:
-        img_size (int | tuple(int)): Input image size. Default 224
-        patch_size (int | tuple(int)): Patch size. Default: 4
-        in_chans (int): Number of input image channels. Default: 3
-        embed_dim (int): Patch embedding dimension. Default: 96
-        depths (tuple(int)): Depth of each Swin Transformer layer.
-        num_heads (tuple(int)): Number of attention heads in different layers.
-        window_size (int): Window size. Default: 7
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
-        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
-        drop_rate (float): Dropout rate. Default: 0
-        attn_drop_rate (float): Attention dropout rate. Default: 0
-        drop_path_rate (float): Stochastic depth rate. Default: 0.1
-        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
-        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
-        patch_norm (bool): If True, add normalization after patch embedding. Default: True
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
-    """
-
-    def __init__(self, img_size=224, patch_size=4, in_chans=3,
-                 embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
-                 window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
-                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.3,
-                 norm_layer=partial(nn.LayerNorm, eps=1e-6), ape=False, patch_norm=True,
-                 use_checkpoint=False, output_avg=False, **kwargs):
-        super().__init__()
-
-        self.num_layers = len(depths)
-        self.embed_dim = embed_dim
-        self.ape = ape
-        self.patch_norm = patch_norm
-        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
-        self.mlp_ratio = mlp_ratio
-        self.img_size = to_2tuple(img_size)
-        self.patch_size = to_2tuple(patch_size)
-        self.output_avg = output_avg
-        
-        # split image into non-overlapping patches
-        self.patch_embed = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
-            norm_layer=norm_layer if self.patch_norm else None)
-        num_patches = self.patch_embed.num_patches
-        patches_resolution = self.patch_embed.patches_resolution
-        self.patches_resolution = patches_resolution
-
-        # absolute position embedding
-        if self.ape:
-            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
-            trunc_normal_(self.absolute_pos_embed, std=.02)
-
-        self.pos_drop = nn.Dropout(p=drop_rate)
-
-        # stochastic depth
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
-
-        # build layers
-        self.layers = nn.ModuleList()
-        for i_layer in range(self.num_layers):
-            layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer),
-                               input_resolution=(patches_resolution[0] // (2 ** i_layer),
-                                                 patches_resolution[1] // (2 ** i_layer)),
-                               depth=depths[i_layer],
-                               num_heads=num_heads[i_layer],
-                               window_size=window_size,
-                               mlp_ratio=self.mlp_ratio,
-                               qkv_bias=qkv_bias, qk_scale=qk_scale,
-                               drop=drop_rate, attn_drop=attn_drop_rate,
-                               drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
-                               norm_layer=norm_layer,
-                               downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
-                               use_checkpoint=use_checkpoint)
-            self.layers.append(layer)
-
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-
-    @torch.jit.ignore
-    def no_weight_decay(self):
-        return {'absolute_pos_embed'}
-
-    @torch.jit.ignore
-    def no_weight_decay_keywords(self):
-        return {'relative_position_bias_table'}
-
-    def forward(self, x):
-        x = self.patch_embed(x)
-        if self.ape:
-            x = x + self.absolute_pos_embed
-        x = self.pos_drop(x)
-
-        h = self.img_size[0] // self.patch_size[0]
-        w = self.img_size[1] // self.patch_size[1]
-        outs = []
-
-        for i, layer in enumerate(self.layers):
-            px, x = layer(x)
-            b, n, c = px.shape
-
-            if i != len(self.layers) - 1 or not self.output_avg:
-                px = px.permute(0, 2, 1).contiguous()
-                px = px.reshape(b, c, h, w)
-            # is this a fair assumption ?? i think it's baked into the architecture
-            h, w = h//2, w//2
-            outs.append(px)
-
-        if self.output_avg:
-            return outs[-1].mean(dim=1)
-
-        return outs
-
-    def flops(self):
-        flops = 0
-        flops += self.patch_embed.flops()
-        for i, layer in enumerate(self.layers):
-            flops += layer.flops()
-        flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers)
-        flops += self.num_features * self.num_classes
-        return flops
-
-
-def get_swin(drop_path_rate=0.3, output_avg=False):
-    args = get_args()
-
-    window_size = 7
-    embed_dim = 128
-    depths = [2, 2, 18, 2]
-    num_heads = [4, 8, 16, 32]
-    swin = SwinTransformer(
-        img_size=(args.img_h, args.img_w,),
-        in_chans=3,
-        patch_size=args.patch_dim,
-        embed_dim=embed_dim,
-        depths=depths,
-        num_heads=num_heads,
-        window_size=window_size,
-        drop_path_rate=drop_path_rate,
-        output_avg=output_avg,
-    )
-
-    return swin
-
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/utils.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/utils.py
deleted file mode 100644
index b4068912c8bb234eff54d6b4feae499f7e8ab30c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/utils.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import warnings
-import torch
-import torch.nn.functional as F
-
-
-def resize(input,
-           size=None,
-           scale_factor=None,
-           mode='nearest',
-           align_corners=None,
-           warning=True):
-    if warning:
-        if size is not None and align_corners:
-            input_h, input_w = tuple(int(x) for x in input.shape[2:])
-            output_h, output_w = tuple(int(x) for x in size)
-            if output_h > input_h or output_w > output_h:
-                if ((output_h > 1 and output_w > 1 and input_h > 1
-                     and input_w > 1) and (output_h - 1) % (input_h - 1)
-                        and (output_w - 1) % (input_w - 1)):
-                    warnings.warn(
-                        f'When align_corners={align_corners}, '
-                        'the output would more aligned if '
-                        f'input size {(input_h, input_w)} is `x+1` and '
-                        f'out size {(output_h, output_w)} is `nx+1`')
-    if isinstance(size, torch.Size):
-        size = tuple(int(x) for x in size)
-    return F.interpolate(input, size, scale_factor, mode, align_corners)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/vit_backbone.py b/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/vit_backbone.py
deleted file mode 100644
index e4523e885d98797399803f38502f22a8e3188d4f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/vision/vit_backbone.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Vision Transformer(VIT) model."""
-
-import math
-import einops
-import torch
-import apex
-import torch.nn.functional as F
-from megatron_ds import get_args
-from megatron_ds.model.transformer import ParallelTransformer
-from megatron_ds.model.utils import (
-    get_linear_layer,
-    init_method_normal,
-    scaled_init_method_normal,
-)
-from megatron_ds.model.module import MegatronModule
-
-CLASS_TOKEN_LENGTH = 8
-
-class VitMlpHead(MegatronModule):
-    """Pooler layer.
-
-    Pool hidden states of a specific token (for example start of the
-    sequence) and add a linear transformation followed by a tanh.
-
-    Arguments:
-        hidden_size: hidden size
-        init_method: weight initialization method for the linear layer.
-            bias is set to zero.
-    """
-
-    def __init__(self, config, hidden_size, num_classes):
-        super(VitMlpHead, self).__init__()
-        self.config = config
-        self.dense_in = torch.nn.Linear(hidden_size, hidden_size)
-        self.relu = torch.nn.ReLU()
-        self.dense_out = torch.nn.Linear(hidden_size, num_classes)
-        torch.nn.init.constant_(self.dense_out.bias, -10)
-
-    def forward(self, hidden_states):
-        # hidden_states: [b, 1, h]
-        # sequence_index: index of the token to pool.
-        dense_in_result = self.dense_in(hidden_states)
-        tanh_result = torch.tanh(dense_in_result)
-        dense_out_result = self.dense_out(tanh_result)
-        return dense_out_result
-
-
-def isPerfectSquare(x):
-    if(x >= 0):
-        sr = math.sqrt(x)
-        return (int(sr) * int(sr) == x)
-    return False
-
-
-def twod_interpolate_position_embeddings_hook(
-    state_dict,
-    prefix,
-    local_metadata,
-    strict,
-    missing_keys,
-    unexpected_keys,
-    error_msgs,
-):
-
-    args = get_args()
-    num_patches_per_dim_h = args.img_h // args.patch_dim
-    num_patches_per_dim_w = args.img_w // args.patch_dim
-    num_patches = num_patches_per_dim_h * num_patches_per_dim_w
-    hidden_size = args.hidden_size
-
-    key = prefix + "weight"
-
-    assert key in state_dict
-    if key in state_dict:
-        input_param = state_dict[key]
-
-        input_seq_len = input_param.shape[0]
-        assert(isPerfectSquare(input_seq_len) or isPerfectSquare(input_seq_len - CLASS_TOKEN_LENGTH))
-        input_has_class_token = not isPerfectSquare(input_seq_len)
-        num_tok_input = input_seq_len - CLASS_TOKEN_LENGTH if input_has_class_token else input_seq_len
-        num_tok_output = num_patches
-        output_has_class_token = args.class_token_present
-
-        # update input_param and load it to state_dict[key]
-        if input_has_class_token:
-            input_param_tok = input_param[:CLASS_TOKEN_LENGTH, :]
-            input_param_grid = input_param[CLASS_TOKEN_LENGTH:, :]
-        else:
-            input_param_tok = torch.zeros(CLASS_TOKEN_LENGTH, hidden_size)
-            input_param_grid = input_param
-
-        assert input_param.shape[1] == hidden_size
-
-        if num_tok_input != num_tok_output:
-
-            gs_input = int(math.sqrt(num_tok_input))
-            gs_new = (num_patches_per_dim_h, num_patches_per_dim_w)
-
-            input_param_grid = input_param_grid.transpose(0, 1).contiguous()
-            input_param_grid = input_param_grid.reshape(
-                (1, -1, gs_input, gs_input)
-            )
-            input_param_grid = input_param_grid.float()
-            scale_factor = (gs_new[0] / gs_input, gs_new[1] / gs_input)
-
-            input_param_grid = F.interpolate(
-                input_param_grid, scale_factor=scale_factor, mode="bilinear"
-            )
-
-            input_param_grid = input_param_grid.half()
-            input_param_grid = input_param_grid.reshape((-1, num_tok_output))
-            input_param_grid = input_param_grid.transpose(0, 1).contiguous()
-
-            assert input_param_grid.shape[1] == hidden_size
-
-        input_param = input_param_grid
-        assert (
-            input_param.shape[0] == num_tok_output
-            and input_param.shape[1] == hidden_size
-        )
-
-        if output_has_class_token:
-            input_param = torch.cat((input_param_tok, input_param), dim=0)
-
-        state_dict[key] = input_param
-
-
-class VitBackbone(MegatronModule):
-    """Vision Transformer Model."""
-
-    def __init__(self,
-                 config,
-                 pre_process=True,
-                 post_process=True,
-                 class_token=True,
-                 single_token_output=False,
-                 post_layer_norm=True,
-                 drop_path_rate=0.0):
-        super(VitBackbone, self).__init__(share_embeddings_and_output_weights=False)
-        args = get_args()
-        self.config = config
-
-        self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
-
-        self.pre_process = pre_process
-        self.post_process = post_process
-        self.class_token = class_token
-        self.post_layer_norm = post_layer_norm
-        self.hidden_size = args.hidden_size
-        self.patch_dim = args.patch_dim
-        self.img_h = args.img_h
-        self.img_w = args.img_w
-        self.micro_batch_size = args.micro_batch_size
-        self.single_token_output = single_token_output
-        self.drop_path_rate = drop_path_rate
-
-        assert self.img_h % self.patch_dim == 0
-        assert self.img_w % self.patch_dim == 0
-        self.num_patches_per_dim_h = self.img_h // self.patch_dim
-        self.num_patches_per_dim_w = self.img_w // self.patch_dim
-        self.num_patches = self.num_patches_per_dim_h * self.num_patches_per_dim_w
-        self.seq_length = self.num_patches + (CLASS_TOKEN_LENGTH if self.class_token else 0)
-        self.flatten_dim = self.patch_dim * self.patch_dim * args.num_channels
-        self.input_tensor = None
-        self.position_ids = None
-
-        if self.pre_process:
-            # cls_token
-            if self.class_token:
-                self.cls_token = torch.nn.Parameter(
-                    torch.randn(1, CLASS_TOKEN_LENGTH, self.hidden_size)
-                )
-                torch.nn.init.zeros_(self.cls_token)
-            self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda()
-
-            # Linear encoder
-            self.linear_encoder = torch.nn.Linear(
-                self.flatten_dim, self.hidden_size
-            )
-
-            # embedding
-            self.position_embeddings = torch.nn.Embedding(
-                self.seq_length, self.hidden_size
-            )
-            init_method_normal(args.init_method_std)(
-                self.position_embeddings.weight
-            )
-
-            args.class_token_present = self.class_token
-            self.position_embeddings._register_load_state_dict_pre_hook(
-                twod_interpolate_position_embeddings_hook
-            )
-
-            self.embedding_dropout = torch.nn.Dropout(args.hidden_dropout)
-
-        # Transformer
-        self.transformer = ParallelTransformer(
-            config,
-            model_type=args.model_type,
-            pre_process=self.pre_process,
-            post_process=self.post_process,
-            post_layer_norm=self.post_layer_norm,
-            drop_path_rate=self.drop_path_rate
-        )
-
-    def set_input_tensor(self, input_tensor):
-        """See megatron_ds.model.transformer.set_input_tensor()"""
-        self.transformer.set_input_tensor(input_tensor)
-
-    def forward(self, input):
-
-        if self.pre_process:
-            rearranged_input = einops.rearrange(
-                input,
-                "b c (h p1) (w p2) -> b (h w) (p1 p2 c)",
-                p1=self.patch_dim,
-                p2=self.patch_dim,
-            )
-
-            assert rearranged_input.dtype == torch.half
-            encoder_output = self.linear_encoder(rearranged_input)
-
-            concatenated_tokens = encoder_output
-            if self.class_token:
-                cls_tokens = self.cls_token.expand(encoder_output.shape[0], -1, -1)
-                concatenated_tokens = torch.cat((cls_tokens, encoder_output), dim=1)
-
-            token_embeddings = concatenated_tokens + \
-                    self.position_embeddings(self.position_ids[:, :concatenated_tokens.shape[1]])
-            # [b, s, h] => [s, b, h]
-            token_embeddings = token_embeddings.transpose(0, 1).contiguous()
-            hidden_states = self.embedding_dropout(token_embeddings)
-        else:
-            hidden_states = input
-
-        hidden_states = self.transformer(hidden_states, None)
-
-        if self.post_process:
-            # [s b h] => [b s h]
-            if self.single_token_output:
-                hidden_states = hidden_states[0]
-            else:
-                hidden_states = hidden_states.transpose(0, 1).contiguous()
-
-        return hidden_states
-
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/commons.py b/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/commons.py
deleted file mode 100644
index 611daf0f66692426ee5ad59824f3c421d7b94a90..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/commons.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import argparse
-import os
-import random
-import numpy
-import torch
-
-import mpu
-
-
-class IdentityLayer(torch.nn.Module):
-    def __init__(self, size, scale=1.0):
-        super(IdentityLayer, self).__init__()
-        self.weight = torch.nn.Parameter(scale * torch.randn(size))
-
-    def forward(self):
-        return self.weight
-
-
-def set_random_seed(seed):
-    """Set random seed for reproducability."""
-    random.seed(seed)
-    numpy.random.seed(seed)
-    torch.manual_seed(seed)
-    mpu.model_parallel_cuda_manual_seed(seed)
-
-
-def initialize_distributed(backend='nccl'):
-    """Initialize torch.distributed."""
-    # Get local rank in case it is provided.
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--local_rank', type=int, default=None,
-                        help='local rank passed from distributed launcher')
-    args = parser.parse_args()
-    local_rank = args.local_rank
-
-    # Get rank and world size.
-    rank = int(os.getenv('RANK', '0'))
-    world_size = int(os.getenv("WORLD_SIZE", '1'))
-
-    print('> initializing torch.distributed with local rank: {}, '
-          'rank: {}, world size: {}'.format(local_rank, rank, world_size))
-
-    # Set the device id.
-    device = rank % torch.cuda.device_count()
-    if local_rank is not None:
-        device = local_rank
-    torch.cuda.set_device(device)
-
-    # Call the init process.
-    init_method = 'tcp://'
-    master_ip = os.getenv('MASTER_ADDR', 'localhost')
-    master_port = os.getenv('MASTER_PORT', '6000')
-    init_method += master_ip + ':' + master_port
-    torch.distributed.init_process_group(
-        backend=backend,
-        world_size=world_size,
-        rank=rank,
-        init_method=init_method)
-
-
-def print_separator(message):
-    torch.distributed.barrier()
-    filler_len = (78 - len(message)) // 2
-    filler = '-' * filler_len
-    string = '\n' + filler + ' {} '.format(message) + filler
-    if torch.distributed.get_rank() == 0:
-        print(string, flush=True)
-    torch.distributed.barrier()
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_cross_entropy.py b/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_cross_entropy.py
deleted file mode 100644
index 00ae42228a9259e12640034a911899b6386882bc..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_cross_entropy.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-from commons import set_random_seed
-from commons import IdentityLayer
-from commons import print_separator
-from commons import initialize_distributed
-from mpu.cross_entropy import vocab_parallel_cross_entropy
-import mpu
-import torch.nn.functional as F
-import torch
-import random
-import sys
-sys.path.append("../..")
-
-
-def torch_cross_entropy(batch_size, seq_length, vocab_size,
-                        logits_scale, seed):
-    set_random_seed(seed)
-    identity = IdentityLayer((batch_size, seq_length, vocab_size),
-                             scale=logits_scale).cuda()
-    logits = identity()
-    target = torch.cuda.LongTensor(
-        size=(batch_size, seq_length)).random_(0, vocab_size)
-    loss = F.cross_entropy(logits.view(-1, logits.size()[-1]),
-                           target.view(-1),
-                           reduction='none').view_as(target).mean()
-    loss.backward()
-    return loss, identity.weight.grad
-
-
-def mpu_cross_entropy(batch_size, seq_length, vocab_size,
-                      logits_scale, seed):
-    set_random_seed(seed)
-    identity = IdentityLayer((batch_size, seq_length, vocab_size),
-                             scale=logits_scale).cuda()
-    logits = identity()
-    logits_parallel = mpu.scatter_to_tensor_model_parallel_region(logits)
-    target = torch.cuda.LongTensor(
-        size=(batch_size, seq_length)).random_(0, vocab_size)
-    loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
-    loss.backward()
-    return loss, identity.weight.grad
-
-
-def test_cross_entropy(tensor_model_parallel_size):
-
-    if torch.distributed.get_rank() == 0:
-        print('> testing cross entropy with model parallel size {} ...'.
-              format(tensor_model_parallel_size))
-
-    mpu.initialize_model_parallel(tensor_model_parallel_size)
-    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
-
-    batch_size = 13
-    seq_length = 17
-    vocab_size_per_partition = 11
-    logits_scale = 1000.0
-    vocab_size = vocab_size_per_partition * tensor_model_parallel_size
-    seed = 1234
-
-    loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length,
-                                                 vocab_size, logits_scale,
-                                                 seed)
-    loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length,
-                                           vocab_size, logits_scale,
-                                           seed)
-
-    error = loss_torch.sub_(loss_mpu).abs().max()
-    print('   max error in loss on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 1.0e-6
-
-    error = grad_torch.sub_(grad_mpu).abs().max()
-    print('   max error in grad on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 1.0e-6
-
-    # Reset groups
-    mpu.destroy_tensor_model_parallel()
-
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print('>> passed the test :-)')
-
-
-if __name__ == '__main__':
-
-    initialize_distributed()
-    world_size = torch.distributed.get_world_size()
-
-    tensor_model_parallel_size = 1
-    while tensor_model_parallel_size <= world_size:
-        print_separator('test cross entropy')
-        test_cross_entropy(tensor_model_parallel_size)
-        tensor_model_parallel_size *= 2
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_data.py b/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_data.py
deleted file mode 100644
index c30bf4bb8d4dbb0c2d576d20b18b4ae640d00d2c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_data.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-from commons import print_separator
-from commons import initialize_distributed
-from mpu import data as data_utils
-import mpu
-import torch
-import functools
-import operator
-import sys
-sys.path.append("../..")
-
-
-def test_broadcast_data(tensor_model_parallel_size):
-
-    if torch.distributed.get_rank() == 0:
-        print('> testing broadcast_data with model parallel size {} ...'.
-              format(tensor_model_parallel_size))
-
-    mpu.initialize_model_parallel(tensor_model_parallel_size)
-    torch.manual_seed(1234 + mpu.get_data_parallel_rank())
-    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
-
-    key_size_t = {'key1': [7, 11],
-                  'key2': [8, 2, 1],
-                  'key3': [13],
-                  'key4': [5, 1, 2],
-                  'key5': [5, 12]}
-    keys = list(key_size_t.keys())
-
-    data = {}
-    data_t = {}
-    for key in key_size_t:
-        data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
-        data_t[key] = data[key].clone()
-    data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
-    data_t['keyX'] = data['keyX'].clone()
-    if mpu.get_tensor_model_parallel_rank() != 0:
-        data = None
-
-    data_utils._check_data_types(keys, data_t, torch.int64)
-    key_size, key_numel, \
-        total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
-    for key in keys:
-        assert key_size[key] == key_size_t[key]
-    total_numel_t = 0
-    for key in keys:
-        target_size = functools.reduce(operator.mul, key_size_t[key], 1)
-        assert key_numel[key] == target_size
-        total_numel_t += target_size
-    assert total_numel == total_numel_t
-
-    data_b = data_utils.broadcast_data(keys, data, torch.int64)
-    for key in keys:
-        tensor = data_t[key].cuda()
-        assert data_b[key].sub(tensor).abs().max() == 0
-
-    # Reset groups
-    mpu.destroy_tensor_model_parallel()
-
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print('>> passed the test :-)')
-
-
-if __name__ == '__main__':
-
-    initialize_distributed()
-    world_size = torch.distributed.get_world_size()
-
-    tensor_model_parallel_size = 1
-    while tensor_model_parallel_size <= world_size:
-        print_separator('test test broadcast data')
-        test_broadcast_data(tensor_model_parallel_size)
-        tensor_model_parallel_size *= 2
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_initialize.py b/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_initialize.py
deleted file mode 100644
index e5d2be37e269d8176a987b8a6ef5d7f47de98394..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_initialize.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-from commons import print_separator
-from commons import initialize_distributed
-import mpu
-import torch
-import sys
-sys.path.append("../..")
-
-
-def test_initialize_model_parallel(tensor_model_parallel_size):
-
-    if torch.distributed.get_rank() == 0:
-        print('> testing initialize_model_parallel with size {} ...'.format(
-            tensor_model_parallel_size))
-    tensor_model_parallel_size_ = min(tensor_model_parallel_size,
-                               torch.distributed.get_world_size())
-    assert not mpu.model_parallel_is_initialized()
-    mpu.initialize_model_parallel(tensor_model_parallel_size_)
-    assert mpu.model_parallel_is_initialized()
-
-    # Checks.
-    def check(group, world_size, rank):
-        assert world_size == torch.distributed.get_world_size(group=group)
-        assert rank == torch.distributed.get_rank(group=group)
-
-    # Model parallel.
-    world_size = tensor_model_parallel_size_
-    rank = torch.distributed.get_rank() % tensor_model_parallel_size_
-    assert world_size == mpu.get_tensor_model_parallel_world_size()
-    assert rank == mpu.get_tensor_model_parallel_rank()
-    check(mpu.get_tensor_model_parallel_group(), world_size, rank)
-
-    # Data parallel.
-    world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_
-    rank = torch.distributed.get_rank() // tensor_model_parallel_size
-    assert world_size == mpu.get_data_parallel_world_size()
-    assert rank == mpu.get_data_parallel_rank()
-    check(mpu.get_data_parallel_group(), world_size, rank)
-
-    # Reset groups
-    mpu.destroy_model_parallel()
-
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print('>> passed the test :-)')
-
-
-def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_):
-
-    if torch.distributed.get_rank() == 0:
-        print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format(
-            tensor_model_parallel_size_))
-    tensor_model_parallel_size = min(tensor_model_parallel_size_,
-                              torch.distributed.get_world_size())
-    assert not mpu.model_parallel_is_initialized()
-    mpu.initialize_model_parallel(tensor_model_parallel_size)
-    assert mpu.model_parallel_is_initialized()
-
-    # Checks
-    src_rank = torch.distributed.get_rank() - mpu.get_tensor_model_parallel_rank()
-    assert mpu.get_tensor_model_parallel_src_rank() == src_rank
-
-    # Reset groups
-    mpu.destroy_model_parallel()
-
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print('>> passed the test :-)')
-
-
-if __name__ == '__main__':
-
-    initialize_distributed()
-    world_size = torch.distributed.get_world_size()
-    tensor_model_parallel_size = 1
-    while tensor_model_parallel_size <= world_size:
-        print_separator('test initialize model parallel')
-        test_initialize_model_parallel(tensor_model_parallel_size)
-        print_separator('test model parallel source rank')
-        test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size)
-        tensor_model_parallel_size *= 2
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_layers.py b/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_layers.py
deleted file mode 100644
index 73ad4b9459502dc2f68a8e3d0cb66157895eda1d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_layers.py
+++ /dev/null
@@ -1,517 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-from mpu import layers
-from commons import set_random_seed
-from commons import print_separator
-from commons import initialize_distributed
-import mpu
-from torch.nn.parameter import Parameter
-import torch.nn.init as init
-import torch
-import random
-import sys
-sys.path.append("../..")
-
-
-def test_parallel_embedding(tensor_model_parallel_size):
-
-    if torch.distributed.get_rank() == 0:
-        print('> testing parallel embedding with model parallel size {} ...'.
-              format(tensor_model_parallel_size))
-
-    mpu.initialize_model_parallel(tensor_model_parallel_size)
-    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
-
-    batch_size = 17
-    seq_length = 23
-    vocab_size = 48
-    hidden_size = 16
-    seed = 1236
-
-    set_random_seed(123)
-    input_data = torch.LongTensor(
-        size=(batch_size, seq_length)).random_(0, vocab_size).cuda()
-    loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda()
-
-    set_random_seed(seed)
-    embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda()
-
-    output = embedding_original(input_data)
-    loss_original = torch.mul(output, loss_weight).sum()
-    loss_original.backward()
-
-    set_random_seed(seed)
-    embedding_parallel = layers.ParallelEmbedding(
-        vocab_size, hidden_size, init_method=init.normal_).cuda()
-    output = embedding_parallel(input_data)
-    loss_parallel = torch.mul(output, loss_weight).sum()
-    loss_parallel.backward()
-
-    set_random_seed(seed)
-    embedding_vocab_parallel = layers.VocabParallelEmbedding(
-        vocab_size, hidden_size, init_method=init.normal_).cuda()
-    output = embedding_vocab_parallel(input_data)
-    loss_vocab_parallel = torch.mul(output, loss_weight).sum()
-    loss_vocab_parallel.backward()
-
-    torch.distributed.barrier()
-    error = loss_parallel.sub(loss_original).abs()
-    print('   error in loss (parallel) on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 1.0e-12, 'error: {}'.format(error)
-
-    torch.distributed.barrier()
-    error = loss_vocab_parallel.sub(loss_original).abs()
-    print('   error in loss (vocab parallel) on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 1.0e-12, 'error: {}'.format(error)
-
-    weight_grad_orig = torch.split(embedding_original.weight.grad,
-                                   hidden_size // tensor_model_parallel_size,
-                                   1)[mpu.get_tensor_model_parallel_rank()]
-    error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max()
-    print('   error in grad (parallel) on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 1.0e-12, 'error: {}'.format(error)
-
-    weight_grad_orig = torch.split(embedding_original.weight.grad,
-                                   vocab_size // tensor_model_parallel_size,
-                                   0)[mpu.get_tensor_model_parallel_rank()]
-    error = embedding_vocab_parallel.weight.grad.sub(
-        weight_grad_orig).abs().max()
-    print('   error in grad (vocab parallel) on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 1.0e-12, 'error: {}'.format(error)
-
-    # Reset groups
-    mpu.destroy_model_parallel()
-
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print('>> passed the test :-)')
-
-
-def test_initialize_affine_weight(tensor_model_parallel_size):
-
-    mpu.initialize_model_parallel(tensor_model_parallel_size)
-    if torch.distributed.get_rank() == 0:
-        print('> testing initialize_affine_weight with model parallel '
-              'size: {}'.format(tensor_model_parallel_size))
-    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
-
-    seed = 12345
-    input_size_coeff = 13
-    input_size = input_size_coeff * tensor_model_parallel_size
-    output_size_coeff = 17
-    output_size = output_size_coeff * tensor_model_parallel_size
-
-    # ---------------
-    # Column parallel
-    # ---------------
-    weight = torch.empty(output_size_coeff, input_size)
-    set_random_seed(seed)
-    layers._initialize_affine_weight(weight, output_size, input_size,
-
-                                     output_size_coeff, 0,
-                                     torch.nn.init.normal_)
-    # Target.
-    set_random_seed(seed)
-    master_weight = torch.empty(output_size, input_size)
-    torch.nn.init.normal_(master_weight)
-    rank = mpu.get_tensor_model_parallel_rank()
-    my_weight = torch.split(master_weight, output_size_coeff,
-                            dim=0)[rank].contiguous().clone()
-
-    # Compare.
-    error = weight.sub(my_weight).abs().max()
-    torch.distributed.barrier()
-    print('   column parallel max error (should be zero) on global rank '
-          '{}: {}'.format(torch.distributed.get_rank(), error))
-    assert error < 1.0e-6
-
-    # ------------
-    # Row parallel
-    # ------------
-    weight = torch.empty(output_size, input_size_coeff)
-    set_random_seed(seed)
-    mpu.layers._initialize_affine_weight(weight, output_size, input_size,
-                                         input_size_coeff, 1,
-                                         torch.nn.init.normal_)
-    # Target.
-    set_random_seed(seed)
-    master_weight = torch.empty(output_size, input_size)
-    torch.nn.init.normal_(master_weight)
-    rank = mpu.get_tensor_model_parallel_rank()
-    my_weight = torch.split(master_weight, input_size_coeff,
-                            dim=1)[rank].contiguous().clone()
-
-    # Compare.
-    error = weight.sub(my_weight).abs().max()
-    torch.distributed.barrier()
-    print('   row parallel max error (should be zero) on global rank '
-          '{}: {}'.format(torch.distributed.get_rank(), error))
-    assert error < 1.0e-6
-
-    # Reset groups
-    mpu.destroy_model_parallel()
-
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print(' >> passed the test :-)')
-
-
-class IdentityLayer2D(torch.nn.Module):
-    def __init__(self, m, n):
-        super(IdentityLayer2D, self).__init__()
-        self.weight = Parameter(torch.Tensor(m, n))
-        torch.nn.init.xavier_normal_(self.weight)
-
-    def forward(self):
-        return self.weight
-
-
-def test_column_parallel_linear(tensor_model_parallel_size):
-
-    mpu.initialize_model_parallel(tensor_model_parallel_size)
-    if torch.distributed.get_rank() == 0:
-        print('> testing ColumnParallelLinear with model parallel '
-              'size: {}'.format(tensor_model_parallel_size))
-    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
-
-    seed = 12345
-    set_random_seed(seed)
-    input_size_coeff = 13
-    input_size = input_size_coeff * tensor_model_parallel_size
-    output_size_coeff = 17
-    output_size = output_size_coeff * tensor_model_parallel_size
-    batch_size = 7
-
-    # Network
-    identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
-    linear_layer = mpu.ColumnParallelLinear(
-        input_size, output_size, keep_master_weight_for_test=True).cuda()
-    loss_weight = torch.randn([batch_size, output_size]).cuda()
-    # Forward
-    input_ = identity_layer()
-    output = linear_layer(input_)
-    loss = torch.mul(output, loss_weight).sum()
-    # Backward
-    loss.backward()
-
-    # Values.
-    dLdY = loss_weight
-    X = identity_layer.weight
-    A = linear_layer.master_weight.cuda()
-    dLdA = torch.matmul(dLdY.t(), X)
-    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
-    dLdX = torch.matmul(dLdY, A)
-
-    rank = mpu.get_tensor_model_parallel_rank()
-    my_dLdA = torch.split(dLdA, output_size_coeff,
-                          dim=0)[rank].contiguous().clone()
-    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
-    torch.distributed.barrier()
-    print('   error in dLdA on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 1.0e-6
-
-    my_dLdb = torch.split(dLdb, output_size_coeff,
-                          dim=0)[rank].contiguous().clone()
-    error = my_dLdb.sub(linear_layer.bias.grad).abs().max()
-    torch.distributed.barrier()
-    print('   error in dLdb on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 1.0e-6
-
-    error = dLdX.sub(identity_layer.weight.grad).abs().max()
-    torch.distributed.barrier()
-    print('   error in dLdX on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 1.0e-6
-
-    # Reset groups
-    mpu.destroy_model_parallel()
-
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print(' >> passed the test :-)')
-
-
-def test_row_parallel_linear(tensor_model_parallel_size):
-
-    mpu.initialize_model_parallel(tensor_model_parallel_size)
-    if torch.distributed.get_rank() == 0:
-        print('> testing RowParallelLinear with model parallel '
-              'size: {}'.format(tensor_model_parallel_size))
-    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
-
-    seed = 12345
-    set_random_seed(seed)
-    input_size_coeff = 13
-    input_size = input_size_coeff * tensor_model_parallel_size
-    output_size_coeff = 17
-    output_size = output_size_coeff * tensor_model_parallel_size
-    batch_size = 7
-
-    # Network
-    identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
-    linear_layer = mpu.RowParallelLinear(
-        input_size, output_size, keep_master_weight_for_test=True).cuda()
-    loss_weight = torch.randn([batch_size, output_size]).cuda()
-    # Forward
-    input_ = identity_layer()
-    output = linear_layer(input_)
-    loss = torch.mul(output, loss_weight).sum()
-    # Backward
-    loss.backward()
-
-    # Values.
-    dLdY = loss_weight
-    X = identity_layer.weight
-    A = linear_layer.master_weight.cuda()
-    dLdA = torch.matmul(dLdY.t(), X)
-    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
-    dLdX = torch.matmul(dLdY, A)
-
-    rank = mpu.get_tensor_model_parallel_rank()
-    my_dLdA = torch.split(dLdA, input_size_coeff,
-                          dim=1)[rank].contiguous().clone()
-    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
-    torch.distributed.barrier()
-    print('   error in dLdA on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 1.0e-6
-
-    error = dLdb.sub(linear_layer.bias.grad).abs().max()
-    torch.distributed.barrier()
-    print('   error in dLdb on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 1.0e-6
-
-    error = dLdX.sub(identity_layer.weight.grad).abs().max()
-    torch.distributed.barrier()
-    print('   error in dLdX on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 1.0e-6
-
-    # Reset groups
-    mpu.destroy_model_parallel()
-
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print(' >> passed the test :-)')
-
-
-class IdentityLayer3D(torch.nn.Module):
-    def __init__(self, m, n, k):
-        super(IdentityLayer3D, self).__init__()
-        self.weight = Parameter(torch.Tensor(m, n, k))
-        torch.nn.init.xavier_normal_(self.weight)
-
-    def forward(self):
-        return self.weight
-
-
-def parallel_self_attention(tensor_model_parallel_size, num_att_heads_per_partition,
-                            hidden_size_per_att_head, dropout_prob, batch_size,
-                            sequence_length):
-    mpu.initialize_model_parallel(tensor_model_parallel_size)
-    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
-
-    seed = 12345
-    set_random_seed(seed)
-
-    num_att_heads = num_att_heads_per_partition * \
-        torch.distributed.get_world_size()
-    hidden_size = hidden_size_per_att_head * num_att_heads
-
-    # Network
-    identity_layer = IdentityLayer3D(batch_size, sequence_length,
-                                     hidden_size).cuda()
-    attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads,
-                                                    dropout_prob).cuda()
-    loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
-    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
-    # Forward
-    input_ = identity_layer()
-    output = attention_layer(input_, attention_mask)
-    loss = torch.mul(output, loss_weight).sum()
-    # Backward
-    loss.backward()
-
-    rank = mpu.get_tensor_model_parallel_rank()
-    mpu.destroy_model_parallel()
-    return rank, hidden_size, tensor_model_parallel_size, loss, \
-        attention_layer, identity_layer
-
-
-def test_parallel_self_attention(tensor_model_parallel_size):
-
-    if torch.distributed.get_rank() == 0:
-        print('> testing ParallelSelfAttention with model parallel '
-              'size: {}'.format(tensor_model_parallel_size))
-
-    num_att_heads_per_partition = 3
-    hidden_size_per_att_head = 7
-    dropout_prob = 0.0  # has to be zero
-    batch_size = 5
-    sequence_length = 13
-
-    rank_1, hideen_size_1, tensor_model_parallel_size_1, loss_1, \
-        attention_layer_1, identity_layer_1 = parallel_self_attention(
-            1, num_att_heads_per_partition,
-            hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
-
-    rank, hidden_size, tensor_model_parallel_size, loss, \
-        attention_layer, identity_layer = parallel_self_attention(
-            tensor_model_parallel_size, num_att_heads_per_partition,
-            hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
-    assert hideen_size_1 == hidden_size
-
-    error = loss_1.sub(loss).abs().max()
-    torch.distributed.barrier()
-    print('   loss error on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 5.0e-6
-
-    my_lin_grad_list = torch.split(
-        attention_layer_1.query_key_value.weight.grad,
-        hidden_size // tensor_model_parallel_size, 0)[rank::tensor_model_parallel_size]
-    my_lin_grad = torch.cat(my_lin_grad_list, dim=0)
-    error = my_lin_grad.sub(
-        attention_layer.query_key_value.weight.grad).abs().max()
-    torch.distributed.barrier()
-    print('   weight gradient error on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 5.0e-6
-
-    error = identity_layer_1.weight.grad.sub(
-        identity_layer.weight.grad).abs().max()
-    torch.distributed.barrier()
-    print('   input gradient error on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 5.0e-6
-
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print(' >> passed the test :-)')
-
-
-def parallel_transformer(tensor_model_parallel_size, num_att_heads_per_partition,
-                         hidden_size_per_att_head, batch_size, sequence_length):
-
-    mpu.initialize_model_parallel(tensor_model_parallel_size)
-    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
-
-    seed = 12345
-    set_random_seed(seed)
-
-    num_att_heads = num_att_heads_per_partition * \
-        torch.distributed.get_world_size()
-    hidden_size = hidden_size_per_att_head * num_att_heads
-    intermediate_size = 4 * hidden_size
-
-    # Network
-    identity_layer = IdentityLayer3D(batch_size, sequence_length,
-                                     hidden_size).cuda()
-    transformer_layer = mpu.BertParallelTransformerLayer(
-        hidden_size, intermediate_size, num_att_heads, 0.0, 0.0,
-        torch.nn.functional.relu, 1.0e-5).cuda()
-
-    loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
-    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
-    # Forward
-    input_ = identity_layer()
-    output = transformer_layer(input_, attention_mask)
-    loss = torch.mul(output, loss_weight).sum()
-    # Backward
-    loss.backward()
-
-    rank = mpu.get_tensor_model_parallel_rank()
-    mpu.destroy_model_parallel()
-    return rank, hidden_size, tensor_model_parallel_size, loss, \
-        transformer_layer, identity_layer
-
-
-def test_parallel_transformer_layer(tensor_model_parallel_size):
-
-    if torch.distributed.get_rank() == 0:
-        print('> testing ParallelTransformerLayer with model parallel '
-              'size: {}'.format(tensor_model_parallel_size))
-
-    num_att_heads_per_partition = 3
-    hidden_size_per_att_head = 7
-    batch_size = 5
-    sequence_length = 13
-
-    rank_1, hidden_size_1, tensor_model_parallel_size_1, loss_1, \
-        transformer_layer_1, identity_layer_1 = parallel_transformer(
-            1, num_att_heads_per_partition,
-            hidden_size_per_att_head, batch_size, sequence_length)
-
-    rank, hidden_size, tensor_model_parallel_size, loss, \
-        transformer_layer, identity_layer = parallel_transformer(
-            tensor_model_parallel_size, num_att_heads_per_partition,
-            hidden_size_per_att_head, batch_size, sequence_length)
-
-    error = loss_1.sub(loss).abs().max()
-    torch.distributed.barrier()
-    print('   loss error on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 5.0e-5, 'error: {}'.format(error)
-
-    error = identity_layer_1.weight.grad.sub(
-        identity_layer.weight.grad).abs().max()
-    torch.distributed.barrier()
-    print('   input gradient error on global rank {}: {}'.format(
-        torch.distributed.get_rank(), error))
-    assert error < 5.0e-5, 'error: {}'.format(error)
-
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print(' >> passed the test :-)')
-
-
-if __name__ == '__main__':
-
-    torch.backends.cudnn.deterministic = True
-    torch.backends.cudnn.benchmark = False
-
-    initialize_distributed()
-    world_size = torch.distributed.get_world_size()
-
-    print_separator('test initialize affine weight')
-    tensor_model_parallel_size = 1
-    while tensor_model_parallel_size <= world_size:
-        test_initialize_affine_weight(tensor_model_parallel_size)
-        tensor_model_parallel_size *= 2
-
-    tensor_model_parallel_size = 1
-    while tensor_model_parallel_size <= world_size:
-        print_separator('test parallel embedding')
-        test_parallel_embedding(tensor_model_parallel_size)
-        tensor_model_parallel_size *= 2
-
-    print_separator('test column-parallel linear')
-    tensor_model_parallel_size = 1
-    while tensor_model_parallel_size <= world_size:
-        test_column_parallel_linear(tensor_model_parallel_size)
-        tensor_model_parallel_size *= 2
-
-    print_separator('test row-parallel linear')
-    tensor_model_parallel_size = 1
-    while tensor_model_parallel_size <= world_size:
-        test_row_parallel_linear(tensor_model_parallel_size)
-        tensor_model_parallel_size *= 2
-
-    print_separator('test parallel self-attention')
-    tensor_model_parallel_size = 1
-    while tensor_model_parallel_size <= world_size:
-        test_parallel_self_attention(tensor_model_parallel_size)
-        tensor_model_parallel_size *= 2
-
-    print_separator('test parallel transformer')
-    tensor_model_parallel_size = 1
-    while tensor_model_parallel_size <= world_size:
-        test_parallel_transformer_layer(tensor_model_parallel_size)
-        tensor_model_parallel_size *= 2
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_random.py b/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_random.py
deleted file mode 100644
index 8ee6942cf01fd7d9c93012c37f7b5e4b351f3c15..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/mpu/tests/test_random.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-from commons import print_separator
-from commons import initialize_distributed
-import mpu
-import torch
-import sys
-sys.path.append("../..")
-
-
-def test_set_cuda_rng_state(tensor_model_parallel_size):
-
-    if torch.distributed.get_rank() == 0:
-        print('> testing set_rng_state with size {} ...'.
-              format(tensor_model_parallel_size))
-
-    mpu.initialize_model_parallel(tensor_model_parallel_size)
-    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
-
-    size = 123
-    seed = 1234
-    torch.cuda.manual_seed(1234)
-    tensor = torch.cuda.FloatTensor(size)
-
-    # Get the state
-    rng_state = torch.cuda.get_rng_state()
-    rng_state_copy = rng_state.clone()
-
-    # Do some stuff.
-    for _ in range(5):
-        torch.randn(size, out=tensor)
-    result_1 = tensor.clone()
-
-    assert rng_state.sub(rng_state_copy).max() == 0
-    assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0
-
-    # State should be different.
-    new_rng_state = torch.cuda.get_rng_state()
-    max_diff = new_rng_state.sub(rng_state).max()
-    print('   max diff in rng state (should be non-zero) on global rank {}: {}'.
-          format(torch.distributed.get_rank(), max_diff))
-    assert max_diff > 0
-
-    # Reset the rng state and do the same stuff.
-    mpu.random._set_cuda_rng_state(rng_state)
-    for _ in range(5):
-        torch.randn(size, out=tensor)
-    mpu.random._set_cuda_rng_state(rng_state)
-    for _ in range(5):
-        torch.randn(size, out=tensor)
-    result_2 = tensor.clone()
-
-    # Results should be the same
-    error = result_2.sub(result_1).abs().max()
-    print('   max error in generated tensors (should be zero) on '
-          'global rank {}: {}'.format(torch.distributed.get_rank(), error))
-    assert error < 1.0e-6
-
-    # Input state should have remained intact.
-    error = rng_state.sub(rng_state_copy).max()
-    print('   max error in rng state (should be zero) on global rank {}: {}'.
-          format(torch.distributed.get_rank(), error))
-    assert error == 0
-
-    # Reset groups
-    mpu.destroy_model_parallel()
-
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print('>> passed the test :-)')
-
-
-def test_cuda_rng_tracker(tensor_model_parallel_size):
-
-    if torch.distributed.get_rank() == 0:
-        print('> testing cuda rng tracker with size {} ...'.
-              format(tensor_model_parallel_size))
-
-    mpu.initialize_model_parallel(tensor_model_parallel_size)
-    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
-
-    seed_1 = 1234
-    seed_2 = 4321
-    size = [12, 21]
-    tensor = torch.cuda.FloatTensor(size)
-
-    # Set to seed_1 and generate two tensors.
-    torch.cuda.manual_seed(seed_1)
-    torch.randn(size, out=tensor)
-    target_11 = tensor.clone()
-    torch.randn(size, out=tensor)
-    target_12 = tensor.clone()
-
-    # Set to seed_2 and generate two tensors.
-    torch.cuda.manual_seed(seed_2)
-    torch.randn(size, out=tensor)
-    target_21 = tensor.clone()
-    torch.randn(size, out=tensor)
-    target_22 = tensor.clone()
-
-    # Now if we interleave seed_1 and seed_2,
-    # we should still get the same tensors
-    torch.cuda.manual_seed(seed_1)
-    mpu.get_cuda_rng_tracker().add('test', seed_2)
-
-    torch.randn(size, out=tensor)
-    result_11 = tensor.clone()
-
-    with mpu.get_cuda_rng_tracker().fork('test'):
-        torch.randn(size, out=tensor)
-        result_21 = tensor.clone()
-
-    torch.randn(size, out=tensor)
-    result_12 = tensor.clone()
-
-    with mpu.get_cuda_rng_tracker().fork('test'):
-        torch.randn(size, out=tensor)
-        result_22 = tensor.clone()
-
-    diff = result_11.sub(result_21).abs().max()
-    diff = min(diff, result_12.sub(result_22).abs().max())
-    print('   max diff in generated tensors (should be non-zero) on '
-          'global rank {}: {}'.format(torch.distributed.get_rank(), diff))
-    assert diff > 1.0e-6
-    error = max(result_11.sub(target_11).abs().max(),
-                result_12.sub(target_12).abs().max())
-    error = max(error, result_21.sub(target_21).abs().max())
-    error = max(error, result_22.sub(target_22).abs().max())
-    print('   max error in generated tensors (should be zero) on '
-          'global rank {}: {}'.format(torch.distributed.get_rank(), error))
-    assert error < 1.0e-6
-
-    # Reset the tracker
-    mpu.get_cuda_rng_tracker().reset()
-
-    # Reset groups
-    mpu.destroy_model_parallel()
-
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print('>> passed the test :-)')
-
-
-def test_model_parallel_cuda_manual_seed(tensor_model_parallel_size):
-
-    if torch.distributed.get_rank() == 0:
-        print('> testing model parallel cuda manual seed with size {} ...'.
-              format(tensor_model_parallel_size))
-
-    mpu.initialize_model_parallel(tensor_model_parallel_size)
-    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
-
-    mpu.model_parallel_cuda_manual_seed(12345)
-    assert torch.cuda.initial_seed() == 12345
-    with mpu.get_cuda_rng_tracker().fork():
-        assert torch.cuda.initial_seed() == (12345 + 2718 +
-                                             mpu.get_tensor_model_parallel_rank())
-
-    # Reset the tracker
-    mpu.get_cuda_rng_tracker().reset()
-
-    # Reset groups
-    mpu.destroy_model_parallel()
-
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() == 0:
-        print('>> passed the test :-)')
-
-
-if __name__ == '__main__':
-
-    initialize_distributed()
-    world_size = torch.distributed.get_world_size()
-
-    tensor_model_parallel_size = 1
-    while tensor_model_parallel_size <= world_size:
-        print_separator('test set rng state')
-        test_set_cuda_rng_state(tensor_model_parallel_size)
-        tensor_model_parallel_size *= 2
-
-    tensor_model_parallel_size = 1
-    while tensor_model_parallel_size <= world_size:
-        print_separator('test cuda rng tracker')
-        test_cuda_rng_tracker(tensor_model_parallel_size)
-        tensor_model_parallel_size *= 2
-
-    tensor_model_parallel_size = 1
-    while tensor_model_parallel_size <= world_size:
-        print_separator('test model parallel cuda manual seed')
-        test_model_parallel_cuda_manual_seed(tensor_model_parallel_size)
-        tensor_model_parallel_size *= 2
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/__init__.py
deleted file mode 100644
index f7fe6ef23e906e0a8f6a3eda8510dc56e5c705d2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/__init__.py
+++ /dev/null
@@ -1,171 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-from apex.optimizers import FusedAdam as Adam
-from apex.optimizers import FusedSGD as SGD
-
-from megatron_ds import get_args
-
-from .distrib_optimizer import DistributedOptimizer
-from .grad_scaler import ConstantGradScaler, DynamicGradScaler
-from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer
-
-def get_param_groups(modules,
-                     no_weight_decay_cond,
-                     scale_lr_cond,
-                     lr_mult):
-    """creates param groups based on weight decay condition (regularized vs non regularized)
-       and learning rate scale condition (args.lr vs lr_mult * args.lr)
-       scale_lr_cond is used during finetuning where head of the network requires a scaled
-       version of the base learning rate. 
-    """
-    wd_no_scale_lr = []
-    wd_scale_lr = []
-    no_wd_no_scale_lr = []
-    no_wd_scale_lr = []
-    for module in modules:
-        for name, param in module.named_parameters():
-            if not param.requires_grad:
-                continue
-
-            if no_weight_decay_cond is not None:
-                no_wd = no_weight_decay_cond(name, param)
-            else:
-                # do not regularize biases nor Norm parameters
-                no_wd = name.endswith(".bias") or len(param.shape) == 1
-
-            if scale_lr_cond is not None:
-                scale_lr = scale_lr_cond(name, param)
-            else:
-                scale_lr = False
-
-            if not no_wd and not scale_lr:
-                wd_no_scale_lr.append(param)
-            elif not no_wd and scale_lr:
-                wd_scale_lr.append(param)
-            elif no_wd and not scale_lr:
-                no_wd_no_scale_lr.append(param)
-            else:
-                no_wd_scale_lr.append(param)
-
-    param_groups = []
-    if len(wd_no_scale_lr):
-        param_groups.append({'params': wd_no_scale_lr, 'wd_mult': 1.0, 'lr_mult': 1.0})
-    if len(wd_scale_lr):
-        param_groups.append({'params': wd_scale_lr, 'wd_mult': 1.0, 'lr_mult': lr_mult})
-    if len(no_wd_no_scale_lr):
-        param_groups.append({'params': no_wd_no_scale_lr, 'wd_mult': 0.0, 'lr_mult': 1.0})
-    if len(no_wd_scale_lr):
-        param_groups.append({'params': no_wd_scale_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult})
-
-    return param_groups
-
-def get_megatron_optimizer(model,
-                           no_weight_decay_cond=None,
-                           scale_lr_cond=None,
-                           lr_mult=1.0,
-                           lr=None,
-                           weight_decay=None):
-    args = get_args()
-
-    if lr is None:
-        lr = args.lr
-    if weight_decay is None:
-        weight_decay = args.weight_decay
-
-    # Base optimizer.
-    param_groups = get_param_groups(model,
-                                    no_weight_decay_cond,
-                                    scale_lr_cond,
-                                    lr_mult)
-    if args.create_moe_param_group:
-        from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer
-        param_groups = split_params_into_different_moe_groups_for_optimizer(param_groups)
-
-    if args.cpu_optimizer:
-        assert args.optimizer == 'adam', 'CPU offloading is for Adam'
-        if args.cpu_torch_adam:
-            cpu_adam_optimizer = torch.optim.AdamW
-        else:
-            from deepspeed.ops.adam import DeepSpeedCPUAdam
-            cpu_adam_optimizer = DeepSpeedCPUAdam
-        optimizer = cpu_adam_optimizer(param_groups,
-                                       lr=lr,
-                                       weight_decay=weight_decay,
-                                       betas=(args.adam_beta1, args.adam_beta2),
-                                       eps=args.adam_eps)
-    else:
-        if args.optimizer == 'adam':
-            if args.ds_fused_adam:
-                global Adam
-                from deepspeed.ops.adam import FusedAdam
-                Adam = FusedAdam
-            optimizer = Adam(param_groups,
-                            lr=lr,
-                            weight_decay=weight_decay,
-                            betas=(args.adam_beta1, args.adam_beta2),
-                            eps=args.adam_eps)
-        elif args.optimizer == 'sgd':
-            optimizer = SGD(param_groups,
-                            lr=lr,
-                            weight_decay=weight_decay,
-                            momentum=args.sgd_momentum)
-        else:
-            raise Exception('{} optimizer is not supported.'.format(
-            args.optimizer))
-
-    if args.deepspeed:
-        return optimizer
-
-    # Determine whether the params have main-grad field.
-    params_have_main_grad = True
-
-    # Mixed precision optimizer.
-    # - Note: both the Float16Optimizer and the DistributedOptimizer inherit
-    #   from the MixedPrecisionOptimizer, which manages any optimizer where
-    #   the model params and main params are distinct.
-    if args.fp16 or args.bf16 or args.use_distributed_optimizer:
-
-        # Grad scaler:
-        #    if loss-scale is provided, instantiate the constant scaler.
-        #    if we are using fp16 and loss-scale is not present, use a
-        #       dynamic scaler.
-        #    otherwise we are running in bf16 with no loss-scale so
-        #       leave it as None.
-        grad_scaler = None
-
-        # Constant loss scale.
-        if args.loss_scale:
-            grad_scaler = ConstantGradScaler(args.loss_scale)
-
-        # Dynamic loss scale.
-        else:
-            if args.fp16:
-                grad_scaler = DynamicGradScaler(
-                    initial_scale=args.initial_loss_scale,
-                    min_scale=args.min_loss_scale,
-                    growth_factor=2.0,
-                    backoff_factor=0.5,
-                    growth_interval=args.loss_scale_window,
-                    hysteresis=args.hysteresis)
-
-        # Megatron optimizer.
-        opt_ty = DistributedOptimizer \
-            if args.use_distributed_optimizer else \
-            Float16OptimizerWithFloat16Params
-        return opt_ty(optimizer,
-                      args.clip_grad,
-                      args.log_num_zeros_in_grad,
-                      args.check_for_nan_in_loss_and_grad,
-                      params_have_main_grad,
-                      args.fp16,
-                      args.bf16,
-                      args.params_dtype,
-                      grad_scaler,
-                      model)
-
-    # FP32.
-    return FP32Optimizer(optimizer, args.clip_grad,
-                         args.log_num_zeros_in_grad,
-                         args.check_for_nan_in_loss_and_grad,
-                         params_have_main_grad,
-                         model)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/clip_grads.py b/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/clip_grads.py
deleted file mode 100644
index 935aa94fb07be26072a3d1f2fbe9c7a2c875b610..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/clip_grads.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Gradient clipping."""
-
-import os
-
-import torch
-from torch import inf
-
-from apex.multi_tensor_apply import multi_tensor_applier
-import amp_C
-
-from megatron_ds.model.module import param_is_not_shared
-from megatron_ds.core.tensor_parallel import param_is_not_tensor_parallel_duplicate
-
-
-def clip_grad_norm_fp32(parameters, grads_for_norm,
-                        max_norm, check_for_nan_in_grad,
-                        norm_type=2, model_parallel_group=None):
-    """Clips gradient norm of an iterable of parameters whose gradients
-       are in fp32.
-
-    This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
-    added functionality to handle model parallel parameters. Note that
-    the gradients are modified in place.
-
-    Arguments:
-        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
-            single Tensor that will have gradients normalized
-        grads_for_norm (Iterable[Tensor]): an iterable of Tensors or a single
-            Tensor that will be used for calculating the grad norm.
-        max_norm (float or int): max norm of the gradients.
-        check_for_nan_in_grad (bool): check if gradients have a NaN.
-        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
-            infinity norm.
-        model_parallel_group (group): given the nature of the distributed
-            optimizer, this is passed as an argument.
-
-    Returns:
-        Total norm of the parameters (viewed as a single vector).
-    """
-
-    if isinstance(parameters, torch.Tensor):
-        parameters = [parameters]
-    if isinstance(grads_for_norm, torch.Tensor):
-        grads_for_norm = [grads_for_norm]
-
-    # Grads.
-    grads = []
-    for param in parameters:
-        if param.grad is not None:
-            assert param.grad.type() == 'torch.cuda.FloatTensor'
-            grads.append(param.grad.detach())
-
-    # Norm parameters.
-    max_norm = float(max_norm)
-    norm_type = float(norm_type)
-    total_norm = 0.0
-
-    # Calculate norm.
-    if norm_type == inf:
-        total_norm = max(grad.abs().max() for grad in grads_for_norm)
-        total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
-        # Take max across all model-parallel GPUs.
-        torch.distributed.all_reduce(total_norm_cuda,
-                                     op=torch.distributed.ReduceOp.MAX,
-                                     group=model_parallel_group)
-        total_norm = total_norm_cuda[0].item()
-
-    else:
-        if norm_type == 2.0:
-            dummy_overflow_buf = torch.cuda.IntTensor([0])
-            # Use apex's multi-tensor applier for efficiency reasons.
-            # Multi-tensor applier takes a function and a list of list
-            # and performs the operation on that list all in one kernel.
-            if grads_for_norm:
-                grad_norm, _ = multi_tensor_applier(
-                    amp_C.multi_tensor_l2norm,
-                    dummy_overflow_buf,
-                    [grads_for_norm],
-                    False # no per-parameter norm
-                )
-            else:
-                grad_norm = torch.cuda.FloatTensor([0])
-            # Since we will be summing across data parallel groups,
-            # we need the pow(norm-type).
-            total_norm = grad_norm ** norm_type
-
-        else:
-            for grad in grads_for_norm:
-                grad_norm = torch.norm(grad, norm_type)
-                total_norm += grad_norm ** norm_type
-
-        # Check individual rank grad norms are not NaN
-        # prior to model-parallel all-reduce.
-        if check_for_nan_in_grad:
-            global_rank = torch.distributed.get_rank()
-            assert not total_norm.isnan(), (
-                f'Rank {global_rank}: found NaN in local grad norm in '
-                f'backwards pass. Device: {torch.cuda.current_device()}, '
-                f'node: {os.uname()[1]}'
-            )
-
-        # Sum across all model-parallel GPUs.
-        torch.distributed.all_reduce(total_norm,
-                                     op=torch.distributed.ReduceOp.SUM,
-                                     group=model_parallel_group)
-        total_norm = total_norm.item() ** (1.0 / norm_type)
-
-    # Scale.
-    clip_coeff = max_norm / (total_norm + 1.0e-6)
-    if clip_coeff < 1.0:
-        dummy_overflow_buf = torch.cuda.IntTensor([0])
-        multi_tensor_applier(amp_C.multi_tensor_scale,
-                             dummy_overflow_buf,
-                             [grads, grads],
-                             clip_coeff)
-
-    return total_norm
-
-
-def count_zeros_fp32(parameters, model_parallel_group):
-
-    if isinstance(parameters, torch.Tensor):
-        parameters = [parameters]
-
-    # Filter parameters based on:
-    #   - grad should not be none
-    #   - parameter should not be shared
-    #   - should not be a replica due to tensor model parallelism
-    total_num_zeros = torch.cuda.FloatTensor([0.0])
-    for param in parameters:
-        grad_not_none = param.grad is not None
-        is_not_shared = param_is_not_shared(param)
-        is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
-        if grad_not_none and is_not_shared and is_not_tp_duplicate:
-            grad = param.grad.detach()
-            num_zeros = grad.numel() - torch.count_nonzero(grad)
-            total_num_zeros = num_zeros + total_num_zeros
-
-    # Sum across all model-parallel GPUs.
-    torch.distributed.all_reduce(total_num_zeros,
-                                 op=torch.distributed.ReduceOp.SUM,
-                                 group=model_parallel_group)
-
-    total_num_zeros = total_num_zeros.item()
-
-    return total_num_zeros
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/distrib_optimizer.py b/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/distrib_optimizer.py
deleted file mode 100644
index 7c796e1d4643b6d2dd11c1be5dec20710548e724..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/distrib_optimizer.py
+++ /dev/null
@@ -1,1162 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-"""Megatron distributed optimizer."""
-
-
-from apex.optimizers import FusedAdam as Adam
-import math
-import torch
-from packaging import version
-
-from megatron_ds import get_args
-from megatron_ds import get_timers
-from megatron_ds import print_rank_0
-from megatron_ds.core import mpu, tensor_parallel
-
-from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper
-from .utils import shard_buffer
-
-
-
-class Range:
-    """
-    A range represents a start and end points for indexing a shard
-    from a full tensor.
-    """
-    def __init__(self, start, end):
-        self.start = start
-        self.end = end
-        self.size = end - start
-    def normalize(self, start = 0):
-        return Range(start, start + self.size)
-    def __str__(self):
-        return "%d,%d [%d]" % (self.start, self.end, self.size)
-    def __len__(self):
-        return self.end - self.start
-
-
-class DistributedOptimizer(MixedPrecisionOptimizer):
-    """Distributed optimizer, for all data types (fp16, bf16, and fp32).
-
-    Arguments:
-        optimizer: base optimizer such as Adam or SGD
-        clip_grad: clip gradeints with this global L2 norm. Note
-            that clipping is ignored if clip_grad == 0
-        log_num_zeros_in_grad: return number of zeros in the gradients.
-        check_for_nan_in_grad: check if gradients have a NaN.
-        params_have_main_grad: flag indicating if parameters have
-            a `main_grad` field. If this is set, we are assuming
-            that the model parameters are store in the `main_grad`
-            field instead of the typical `grad` field. This happens
-            for the DDP cases where there is a continuous buffer
-            holding the gradients. For example for bfloat16, we want
-            to do gradient accumulation and all-reduces in float32
-            and as a result we store those gradients in the main_grad.
-            Note that main grad is not necessarily in float32.
-        fp16: if true, the model is running in fp16.
-        bf16: if true, the model is running in bfloat16.
-        grad_scaler: used for scaling gradients. Note that this can be
-            None. This case happens when `bf16 = True` and we don't
-            use any loss scale. Note that for `bf16 = True`, we can have
-            a constnat gradient scaler. Also for `bf16 = False`, we
-            always require a grad scaler.
-        models: list of models (i.e., the virtual pipelining models). This
-            is used by the distributed optimizer for mapping parameters.
-    """
-
-    @classmethod
-    def build_model_gbuf_param_range_map(cls, model, dtype, gbuf_world_range, bucket_offset):
-        """
-        Build mapping from param reference to grad buffer shard ranges.
-
-        This method builds a mapping from parameter references to grad
-        buffer shard ranges, specific to each data-parallel (DP) rank's
-        set of 'owned' parameters. Each grad buffer (padded to be an even
-        multiple of DP-world-size) is conceptually divided into DP-world-size
-        contiguous regions, where each DP rank 'owns' a contiguous regions.
-        Ownership in this sense means DP rank is responsible for reducing
-        the relevant subset of grads, and updating the relevant subset of
-        params.
-
-        This conceptual partitioning of the grad buffer does NOT respect
-        parameter boundaries, and as such it is assumed that each created
-        range references a shard (or subset) of the full parameter. It is
-        easiest to think of each DP rank as operating (i.e., reducing,
-        gathering) purely on views into the grad buffer, for all model-to-
-        main & main-to-model operations.
-
-        This method creates four ranges:
-        - The param's range within the entire grad buffer (i.e., world index).
-        - The param's range within the relevant grad bucket's buffer.
-        - The param's range within the DP rank's local view of the grad buffer.
-        - The param's range within itself (i.e., its shard).
-        """
-
-        # Param range map.
-        param_world_index_map = model.grad_buffer_param_index_map[dtype]
-        param_range_map = {}
-        for param, param_world_indexes in param_world_index_map.items():
-
-            # Param range.
-            param_world_start, param_world_end, _ = param_world_indexes
-            param_local_start = max(
-                0,
-                param_world_start - gbuf_world_range.start)
-            param_local_end = min(
-                gbuf_world_range.size,
-                param_world_end - gbuf_world_range.start)
-
-            # Add param, if within local gbuf range.
-            if param_local_end > param_local_start:
-                param_local_range = Range(param_local_start, param_local_end)
-                param_world_range = param_local_range.normalize(
-                    param_local_start + gbuf_world_range.start)
-                param_world_range_in_bucket = Range(param_world_range.start-bucket_offset,
-                                                    param_world_range.end-bucket_offset)
-                sub_param_start = max(0, gbuf_world_range.start-param_world_start)
-                sub_param_range = param_local_range.normalize(sub_param_start)
-                param_range_map[param] = {
-                    "gbuf_world" : param_world_range,
-                    "gbuf_world_in_bucket": param_world_range_in_bucket,
-                    "gbuf_local" : param_local_range,
-                    "param" : sub_param_range,
-                }
-
-        return param_range_map
-
-
-    @classmethod
-    def build_model_gbuf_range(cls, model, dtype, bucket_index):
-        """
-        Build mapping between params and their grad buffers.
-
-        This method does the initial setup for the method above. This setup
-        includes determining the shard ranges into the DDP's grad buffer for
-        each data-parallel (DP) rank. Each DP rank keeps range info for
-        all other DP ranks, for the purpose of creating args for
-        reduce-scatter and all-gather.
-        """
-
-        data_parallel_rank = mpu.get_data_parallel_rank(with_context_parallel=True)
-        data_parallel_world_size = mpu.get_data_parallel_world_size(with_context_parallel=True)
-
-        bucket = model.grad_buffers[dtype].buckets[bucket_index]
-        bucket_buffer = bucket.data
-        gbuf_size = bucket_buffer.numel()
-        assert gbuf_size % data_parallel_world_size == 0, \
-            f"Each bucket's buffer size should be divisible by {data_parallel_world_size}"
-        max_gbuf_range_size = gbuf_size // data_parallel_world_size
-
-        # All world ranges (i.e., across all data parallel ranks).
-        gbuf_world_all_ranges = []
-        for r in range(data_parallel_world_size):
-            # Compute start of chunk in this bucket.
-            gbuf_world_start = r * max_gbuf_range_size
-            gbuf_world_end = min(gbuf_size, gbuf_world_start+max_gbuf_range_size)
-            # Add bucket's offset in grad buffer.
-            gbuf_world_range = Range(gbuf_world_start + bucket.offset,
-                                     gbuf_world_end + bucket.offset)
-            gbuf_world_all_ranges.append(gbuf_world_range)
-
-        # Local DP's ranges.
-        gbuf_world_range = gbuf_world_all_ranges[data_parallel_rank]
-
-        # Get each param's ranges.
-        param_range_map = cls.build_model_gbuf_param_range_map(model,
-                                                               dtype,
-                                                               gbuf_world_range,
-                                                               bucket.offset)
-
-        # Group into dict.
-        data = {
-            "param_map" : param_range_map,
-        }
-
-        return data
-
-
-    @classmethod
-    def build_model_gbuf_range_map(cls, model):
-        """
-        Create param-to-grad-buffer mappings, for grad buffer data types
-        within a specific virtual model.
-        """
-        # Iterate through all buckets to construct param ranges that this rank "owns"
-        # (the dp_rank'th shard of each bucket, where each shard is 1/dp_world_size
-        # of the bucket).
-        return {
-            dtype : [cls.build_model_gbuf_range(model, dtype, bucket_index)
-                     for bucket_index in range(len(model.grad_buffers[dtype].buckets))]
-            for dtype in model.grad_buffers
-        }
-
-
-    @classmethod
-    def build_model_param_gbuf_map(cls, model_gbuf_ranges):
-        """
-        Create a reverse of the model_gbuf_ranges, for referencing in
-        opposite direction.
-        """
-        param_gbuf_map = {}
-        for model_index, model_gbuf_range_map in enumerate(model_gbuf_ranges):
-            for dtype, gbuf_range_map_for_all_buckets in model_gbuf_range_map.items():
-                for bucket_index, gbuf_range_map in enumerate(gbuf_range_map_for_all_buckets):
-                    for param, _ in gbuf_range_map["param_map"].items():
-                        assert param not in param_gbuf_map, \
-                            "Param should not be in param_gbuf_map; each param only belongs to a single bucket"
-                        param_gbuf_map[param] = (model_index, dtype, bucket_index)
-        return param_gbuf_map
-
-
-    @classmethod
-    def build_optimizer_group_ranges(cls, param_groups, model_gbuf_ranges):
-        """
-        Create optimizer groups.
-
-        Given the set of parameter shard ranges that are owned by the current
-        data-parallel (DP) rank, gather the set of parameters that will be
-        used (in the method below) to create the current DP's optimizer
-        groups.
-        """
-
-        num_groups = len(param_groups)
-
-        # Param group map.
-        # World param group map.
-        # - Store a mapping of <model_parameter:group_index> for all parameters
-        #   across all DP ranks. This is necessary because it is our first
-        #   cross reference between the DDP mappings and the optimizer group
-        #   parameters. This mapping only for use in the next step of building
-        #   the local mapping over this DP rank's parameters.
-        world_param_group_map = {}
-        for group_index, group in enumerate(param_groups):
-            for param in group["params"]:
-                assert param.requires_grad
-                world_param_group_map[param] = group_index
-
-        # Optimizer group ranges & param-group mapping.
-        # - Build a mapping from groups to their contained parameters, and also
-        #   from parameters to their containing group index and order within
-        #   the group. The group index and order are particularly important for
-        #   saving and loading checkpoints.
-        local_param_group_map = {}
-        group_ranges = [ {"params": []} for _ in param_groups ]
-        for model_gbuf_range_map in model_gbuf_ranges:
-            for dtype, gbuf_range_map_for_all_buckets in model_gbuf_range_map.items():
-                for gbuf_range_map in gbuf_range_map_for_all_buckets:
-                    for param in gbuf_range_map["param_map"]:
-                        group_index = world_param_group_map[param]
-                        group_range = group_ranges[group_index]
-                        group_range["params"].append(param)
-                        local_param_group_map[param] = \
-                            (group_index, len(group_range["params"]) - 1)
-
-        # Squeeze zero-size group ranges.
-        for group_index, group_range in enumerate(group_ranges):
-            group_range["orig_group"] = param_groups[group_index]
-            group_range["orig_group_idx"] = param_groups[group_index]
-
-        return local_param_group_map, group_ranges
-
-
-    @classmethod
-    def build_model_and_main_param_groups(cls,
-                                          model_gbuf_ranges,
-                                          param_gbuf_map,
-                                          opt_group_ranges):
-        """
-        Create main parameter groups needed for the optimizer step.
-
-        These groups encompass both: 1) groups used by this class, for
-        reducing/gather, and 2) groups used by the inner optimizer for the
-        parameter update. Given that the conceptual grad buffer partitioning
-        (created in earlier method) doesn't respect parameter boundaries,
-        the optimizer operates on shards of the model parameters, rather than
-        the full parameters.
-        """
-
-        # Parameter groups:
-        #   model_float16_groups: original float16 parameters
-        #   model_fp32_groups: original fp32 parameters
-        #   shard_float16_groups: shards of original float16 parameters
-        #   shard_fp32_groups: shards of original fp32 parameters
-        #   shard_fp32_from_float16_groups: fp32 copy of float16 parameters
-        model_float16_groups = []
-        model_fp32_groups = []
-        shard_float16_groups = []
-        shard_fp32_groups = []
-        shard_fp32_from_float16_groups = []
-
-        # Allocate (or slice) each group's param shard.
-        for group_index, group_range in enumerate(opt_group_ranges):
-
-            # Params of this group.
-            model_float16_params_this_group = []
-            model_fp32_params_this_group = []
-            shard_float16_params_this_group = []
-            shard_fp32_params_this_group = []
-            shard_fp32_from_float16_params_this_group = []
-            model_float16_groups.append(model_float16_params_this_group)
-            model_fp32_groups.append(model_fp32_params_this_group)
-            shard_float16_groups.append(shard_float16_params_this_group)
-            shard_fp32_groups.append(shard_fp32_params_this_group)
-            shard_fp32_from_float16_groups.append(
-                shard_fp32_from_float16_params_this_group)
-
-            for model_param in group_range["params"]:
-
-                assert model_param.requires_grad
-
-                model_index, dtype, bucket_index = param_gbuf_map[model_param]
-                gbuf_range = model_gbuf_ranges[model_index][dtype][bucket_index]
-                param_range = gbuf_range["param_map"][model_param]["param"]
-
-                # fp16, bf16 params.
-                if model_param.type() in ['torch.cuda.HalfTensor',
-                                          'torch.cuda.BFloat16Tensor']:
-
-                    # Clone model -> main.
-                    shard_model_param = model_param.detach().view(-1) \
-                        [param_range.start:param_range.end]
-                    shard_main_param = shard_model_param.clone().float()
-                    tensor_parallel.copy_tensor_model_parallel_attributes(
-                        shard_model_param, model_param)
-                    tensor_parallel.copy_tensor_model_parallel_attributes(
-                        shard_main_param, model_param)
-                    if hasattr(model_param, 'shared'):
-                        shard_model_param.shared = model_param.shared
-                        shard_main_param.shared = model_param.shared
-
-                    # Add to group.
-                    model_float16_params_this_group.append(model_param)
-                    shard_float16_params_this_group.append(shard_model_param)
-                    shard_fp32_from_float16_params_this_group.append(shard_main_param)
-
-                # fp32 params.
-                elif model_param.type() == 'torch.cuda.FloatTensor':
-                    shard_model_param = model_param.view(-1) \
-                        [param_range.start:param_range.end]
-                    model_fp32_params_this_group.append(model_param)
-                    shard_fp32_params_this_group.append(shard_model_param)
-                    tensor_parallel.copy_tensor_model_parallel_attributes(
-                        shard_model_param, model_param)
-                    if hasattr(model_param, 'shared'):
-                        shard_model_param.shared = model_param.shared
-
-                else:
-                    raise TypeError('Wrapped parameters must be one of '
-                                    'torch.cuda.FloatTensor,  '
-                                    'torch.cuda.HalfTensor, or '
-                                    'torch.cuda.BFloat16Tensor. '
-                                    'Received {}'.format(model_param.type()))
-
-            # Update optimizer's params.
-            group_range["orig_group"]["params"] = [
-                *shard_fp32_params_this_group,
-                *shard_fp32_from_float16_params_this_group,
-            ]
-
-        return (
-            model_float16_groups,
-            model_fp32_groups,
-            shard_float16_groups,
-            shard_fp32_groups,
-            shard_fp32_from_float16_groups,
-        )
-
-
-    def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
-                 check_for_nan_in_grad, params_have_main_grad, fp16,
-                 bf16, params_dtype, grad_scaler, models):
-        """
-        See top of class definition for argument descriptions.
-
-        The steps in this method create the core mapping between DDP grad
-        buffers, parameters, and parameter shard ranges, that is needed for
-        converting between model param indexes and main parameter shard
-        indexes. This method also updates the optimizer parameter groups
-        with the newly created shards.
-        """
-
-        super().__init__(
-            optimizer, clip_grad, log_num_zeros_in_grad,
-            check_for_nan_in_grad, params_have_main_grad,
-            fp16, bf16, params_dtype, grad_scaler, models)
-
-        assert isinstance(optimizer, Adam), \
-            "Only Adam currently supported, due to checkpointing requirements."
-
-        # Model grad buffer ranges.
-        self.model_gbuf_ranges = []
-        self.per_bucket_numel = []
-        for _, model_chunk in enumerate(self.models):
-            self.per_bucket_numel.append(
-                {dtype: [bucket.data.numel() for bucket in model_chunk.grad_buffers[dtype].buckets]
-                 for dtype in model_chunk.grad_buffers})
-            self.model_gbuf_ranges.append(self.build_model_gbuf_range_map(model_chunk))
-        self.model_param_gbuf_map = \
-            self.build_model_param_gbuf_map(self.model_gbuf_ranges)
-
-        # Optimizer ranges.
-        self.model_param_group_index_map, self.opt_group_ranges = \
-            self.build_optimizer_group_ranges(self.optimizer.param_groups,
-                                              self.model_gbuf_ranges)
-
-        # Allocate main param shards.
-        (
-            self.model_float16_groups,
-            self.model_fp32_groups,
-            self.shard_float16_groups,
-            self.shard_fp32_groups,
-            self.shard_fp32_from_float16_groups,
-        ) = self.build_model_and_main_param_groups(self.model_gbuf_ranges,
-                                                   self.model_param_gbuf_map,
-                                                   self.opt_group_ranges)
-
-        # Initialize param buffers.
-        # - These are views on the DDP model's grad buffers, that share
-        #   storage & have their own dtype. This is safe because the param
-        #   dtype size is always <= grad dtype size.
-        self.param_buffers = []
-        for model_index, model in enumerate(self.models):
-            current_param_buffers = {}
-            for dtype, grad_buffer in model.grad_buffers.items():
-                size_ratio = torch.finfo(dtype).bits // torch.finfo(params_dtype).bits
-                current_param_buffers[dtype] = []
-                for bucket in grad_buffer.buckets:
-
-                    # Handle older/newer method for getting untyped storage.
-                    try:
-                        storage = bucket.data.storage()._untyped()
-                    except:
-                        storage = bucket.data.storage().untyped()
-
-                    # Typed param buffer.
-                    param_buffer = torch.tensor(
-                        storage,
-                        dtype = params_dtype,
-                        device = bucket.data.device)
-
-                    # .storage() ignores views / slices, so param_buffer now points to the start
-                    # of the grad_buffer instead of to the start of each bucket. As a result,
-                    # add bucket.offset to make sure param_buffers point to the right region of
-                    # memory.
-                    # Since we want the start of each bucket's param_buffer to coincide with the
-                    # start of the same bucket's grad_buffer (this ensures that zeroing the grad
-                    # buffer does not zero out params in the param_buffer before they are copied
-                    # into the model_params), multiply the offset by the size ratio of grads and
-                    # params.
-                    offset = bucket.offset * size_ratio
-                    param_buffer = param_buffer[offset:offset+bucket.data.numel()]
-                    assert param_buffer.data_ptr() == bucket.data.data_ptr(), \
-                        "param_buffer and grad_buffer for same bucket should start at the same byte address"
-                    assert param_buffer.numel() == bucket.data.numel(), \
-                        "param_buffer and grad_buffer for same bucket should have the same number of elements"
-                    current_param_buffers[dtype].append(param_buffer)
-            self.param_buffers.append(current_param_buffers)
-
-        # Now construct data structures to manage all-gather handles.
-        self.all_gather_handles = []
-        self.all_gather_handle_index_to_bucket_index_map = []
-        self.model_index_to_all_gather_handle_index_map = {}
-        self.param_to_all_gather_handle_index_map = {}
-        self.param_buffer_copied = []
-
-        self.pbuf_view_items = self.get_model_param_buffer_dp_views()
-        for (model_index, dtype, bucket_index, _, _) in self.pbuf_view_items:
-            self.all_gather_handle_index_to_bucket_index_map.append((model_index, dtype, bucket_index))
-            all_gather_handle_index = len(self.all_gather_handle_index_to_bucket_index_map) - 1
-
-            # Store all all_gather_handle_indices relevant to a particular model chunk.
-            if model_index not in self.model_index_to_all_gather_handle_index_map:
-                self.model_index_to_all_gather_handle_index_map[model_index] = []
-            self.model_index_to_all_gather_handle_index_map[model_index].append(all_gather_handle_index)
-
-            for param in self.models[model_index].grad_buffers[dtype].buckets[bucket_index].params_list:
-                self.param_to_all_gather_handle_index_map[param] = all_gather_handle_index
-            self.param_buffer_copied.append(False)
-        self.num_all_gather_handles = len(self.all_gather_handle_index_to_bucket_index_map)
-
-        self.overlap_param_gather = get_args().overlap_param_gather
-        if self.overlap_param_gather:
-            self.remove_pre_hook_handle = torch.nn.modules.module.register_module_forward_pre_hook(
-                self._make_forward_pre_hook())
-        else:
-            self.remove_pre_hook_handle = None
-
-        self.update_successful = False
-
-        # Update optimizer groups.
-        # - Also, leverage state_dict() and load_state_dict() to
-        #   recast preexisting per-param state tensors.
-        self.optimizer.param_groups = \
-            [ g["orig_group"] for g in self.opt_group_ranges ]
-        self.optimizer.load_state_dict(self.optimizer.state_dict())
-
-
-    def get_model_param_range_map(self, param):
-        """
-        Given a model param, get the index sub-range of the param that this
-        data-parallel rank owns.
-        """
-        model_index, dtype, bucket_index = self.model_param_gbuf_map[param]
-        gbuf_range_map = self.model_gbuf_ranges[model_index][dtype][bucket_index]
-        param_range_map = gbuf_range_map["param_map"][param]
-        return param_range_map
-
-
-    def get_model_parallel_group(self):
-        """
-        With the distributed optimizer, the model parallel group is the
-        entire world.
-        """
-        return None
-
-
-    def state_dict(self):
-        """
-        The state dict contains all non-DP-rank-dependent (i.e., non-parameter-
-        related) optimizer variables. The returned state dict can be stored in
-        the standard model/RNG checkpoint file. The parameter and dependent
-        optimizer state (e.g., exp_avg, exp_avg_sq) are stored in a separate
-        checkpoint file by calling 'save_parameter_state()'.
-        """
-
-        state_dict = {}
-
-        # Optimizer state (do not store parameter state here).
-        state_dict['optimizer'] = {
-            k : v
-            for k, v in self.optimizer.state_dict().items()
-            if k != "state"
-        }
-        for param_group in state_dict["optimizer"]["param_groups"]:
-            del param_group["params"]
-
-        # Grad scaler state.
-        if self.grad_scaler:
-            state_dict['grad_scaler'] = self.grad_scaler.state_dict()
-
-        return state_dict
-
-
-    def load_state_dict(self, state_dict):
-        """Load the state dict.
-
-        As detailed in state_dict(), the state dict contains all non-
-        parameter-related variables. This method is notably longer than
-        state_dict(), because the Torch optimizers state has yet to be
-        allocated at this point, and so we must do a cross referencing between
-        the optimizers state (and the ordering it expects for parameter state)
-        and this DP rank's shards. The optimizer at this point does not contain
-        any tensor dimension information, so we must get these dimensions from
-        the DP shards mapped during DistributedOptimizer.__init__().
-
-        The tensor parameter state is loaded via load_parameter_state(), and
-        so this method also must populate the loaded state dict with dummy
-        tensor data (i.e., via torch.empty() below). This will be overwritten
-        during load_parameter_state().
-
-        ** Note: Torch optimizer's state structure. **
-        The Torch optimizer stores its state in two levels. The top level is a
-        list of groups, where each group contains a list of integer indexes
-        (corresponding to parameters) that index into a master parameter list
-        that is shared by all groups. As such, three values are necessary for
-        maintaining this ordering:
-
-        - group_index : The group to which a parameter belongs.
-        - group_order : The index of a parameter within its group.
-        - state_order : The index of a parameter within the shared parameter
-            list.
-        """
-
-        # Get the Torch optimizer's state dict.
-        # - This 'inner' optimizer at this point is unallocated, and only
-        #   contains an integer odering of parameters within each group, and
-        #   the ordering of parameters within its flattened parameter state
-        #   list.
-        inner_state_dict = self.optimizer.state_dict()
-        state_dict_param_groups = [{
-            **group,
-            "params" : list(inner_state_dict["param_groups"][idx]["params"]),
-        } for idx, group in enumerate(state_dict["optimizer"]["param_groups"])]
-
-        # Allocate 'dummy' data for optimizer state (i.e., torch.empty() below)
-        # - Real data is overwritten during load_parameter_state().
-        state_dict_state = []
-        for gbuf_range_maps in self.model_gbuf_ranges:
-            for gbuf_range_map_for_all_buckets in gbuf_range_maps.values():
-                for gbuf_range_map in gbuf_range_map_for_all_buckets:
-                    for model_param, param_range_map in \
-                        gbuf_range_map["param_map"].items():
-
-                        # Get parameter ordering information (see method docstring
-                        # for details).
-                        group_index, group_order = \
-                            self.model_param_group_index_map[model_param]
-                        state_order = inner_state_dict["param_groups"] \
-                            [group_index]["params"][group_order]
-
-                        # Allocate dummy tensors.
-                        numel = len(param_range_map["gbuf_world"])
-                        init_shard = lambda : torch.empty(
-                            (numel,),
-                            dtype=torch.float32,
-                            device=torch.cuda.current_device())
-
-                        state_dict_state.append((state_order, {
-                            "exp_avg" : init_shard(),
-                            "exp_avg_sq" : init_shard(),
-                        }))
-
-        # Sort by state order (see method docstring for details).
-        state_dict_state.sort(key = lambda s : s[0])
-        state_dict_state = {s[0]:s[1] for s in state_dict_state}
-
-        # Optimizer.
-        self.optimizer.load_state_dict({
-            "state" : state_dict_state,
-            "param_groups" : state_dict_param_groups,
-        })
-
-        # Grad scaler.
-        if 'grad_scaler' not in state_dict:
-            if self.fp16:
-                print_rank_0('***WARNING*** found an old checkpoint, will not '
-                             'load grad scaler ...')
-        else:
-            if self.grad_scaler:
-                self.grad_scaler.load_state_dict(state_dict['grad_scaler'])
-            else:
-                print_rank_0('***WARNING*** fould the grad scaler in the '
-                             'checkpoint but it is None in the class. '
-                             'Skipping loading grad scaler ...')
-
-
-    def save_parameter_state(self, filename):
-        """Save parameter state (i.e., parameter & optimizer tensors).
-
-        This method performs three steps:
-        - For each DP rank, copy param & optimizer shards to contiguous CPU
-          buffers. (e.g., one buffer each for main_param, exp_avg, and
-          exp_avg_sq).
-        - Gather contiguous buffers on DP rank 0 and concatenate to world
-          buffers.
-        - Save world buffers to disk (i.e., distrib_opt.pt).
-        """
-
-        # Data parallelism variables.
-        data_parallel_world_size = mpu.get_data_parallel_world_size(with_context_parallel=True)
-        data_parallel_rank = mpu.get_data_parallel_rank(with_context_parallel=True)
-        data_parallel_group_gloo = mpu.get_data_parallel_group_gloo(with_context_parallel=True)
-        data_parallel_global_ranks = list(mpu._DATA_PARALLEL_GLOBAL_RANKS_WITH_CP)
-
-        # Collect param states.
-        state = {"per_bucket_numel": self.per_bucket_numel}
-        for model_idx, gbuf_range_maps in enumerate(self.model_gbuf_ranges):
-
-            # Iterate grad buffers (by data type).
-            dtype_state = {}
-            assert len(gbuf_range_maps) == 1, "single dtype supported, for now."
-            for dtype, gbuf_range_map_for_all_buckets in gbuf_range_maps.items():
-                world_tensors = {}
-                for bucket_idx, gbuf_range_map in enumerate(gbuf_range_map_for_all_buckets):
-
-                    # Compute local DP contiguous shard's size.
-                    model = self.models[model_idx]
-                    gbuf_world_numel = model.grad_buffers[dtype].buckets[bucket_idx].data.numel()
-                    assert gbuf_world_numel % data_parallel_world_size == 0
-                    gbuf_local_numel = gbuf_world_numel // data_parallel_world_size
-                    local_shards = {key: torch.empty((gbuf_local_numel,),
-                                                     dtype=torch.float32,
-                                                     device="cpu")
-                                    for key in ("param", "exp_avg", "exp_avg_sq")}
-
-                    # Build contiguous DP rank shards (for param + optim states).
-                    for model_param, param_range_map in \
-                        gbuf_range_map["param_map"].items():
-
-                        # Main param & optimizer states.
-                        group_index, group_order = \
-                            self.model_param_group_index_map[model_param]
-                        main_param = self.optimizer.param_groups \
-                            [group_index]["params"][group_order]
-                        optim_state = self.optimizer.state[main_param]
-
-                        tensors = {
-                            "param" : main_param,
-                            **optim_state,
-                        }
-
-                        # Copy states into contiguous shard.
-                        gbuf_local_start = param_range_map["gbuf_local"].start
-                        gbuf_local_end = param_range_map["gbuf_local"].end
-                        for key in local_shards:
-                            local_shards[key][gbuf_local_start:gbuf_local_end] \
-                                .data.copy_(tensors[key].detach().cpu())
-
-                    # Gather contiguous shards on DP rank 0.
-                    for key, send_tensor in local_shards.items():
-
-                        # Gather tensor list.
-                        if data_parallel_rank == 0:
-                            recv_tensors = [torch.empty((gbuf_local_numel,),
-                                                        dtype=torch.float32,
-                                                        device="cpu")
-                                            for _ in range(data_parallel_world_size)]
-                        else:
-                            recv_tensors = None
-
-                        # Gather.
-                        torch.distributed.gather(
-                            send_tensor,
-                            recv_tensors,
-                            data_parallel_global_ranks[0],
-                            data_parallel_group_gloo,
-                        )
-
-                        # Concatenate.
-                        if data_parallel_rank == 0:
-                            if key not in world_tensors:
-                                world_tensors[key] = []
-                            world_tensors[key].append(torch.cat(recv_tensors))
-
-                # Collect world state.
-                dtype_state[dtype] = world_tensors
-            state[model_idx] = dtype_state
-
-        # Save param state.
-        if data_parallel_rank == 0:
-            torch.save(state, filename)
-
-
-    def load_parameter_state(self, filename):
-        """Load parameter state (i.e., parameter & optimizer tensors).
-
-        This method performs the reverse of save_parameter_state():
-        - Load world buffers from disk (i.e., distrib_opt.pt).
-        - Scatter contiguous buffers from DP rank 0 to each DP rank (each DP
-          rank receives its relevant subset of the world buffers).
-        - For each DP rank, copy param & optimizer shards from contiguous CPU
-          buffers. (e.g., one buffer each for main_param, exp_avg, and
-          exp_avg_sq).
-        """
-
-        # Data parallelism variables.
-        data_parallel_world_size = mpu.get_data_parallel_world_size(with_context_parallel=True)
-        data_parallel_rank = mpu.get_data_parallel_rank(with_context_parallel=True)
-        data_parallel_group_gloo = mpu.get_data_parallel_group_gloo(with_context_parallel=True)
-        data_parallel_global_ranks = list(mpu._DATA_PARALLEL_GLOBAL_RANKS_WITH_CP)
-
-        # Load on DP rank 0.
-        if data_parallel_rank == 0:
-            loaded_state = torch.load(filename)
-            if "per_bucket_numel" in loaded_state:
-                per_bucket_numel_in_checkpoint = loaded_state["per_bucket_numel"]
-                assert self.per_bucket_numel == per_bucket_numel_in_checkpoint, \
-                    (f"Number of elements in each bucket need to be the same in current run "
-                     f"({self.per_bucket_numel}) and checkpoint ({per_bucket_numel_in_checkpoint})")
-
-        # Scatter tensors to all DP ranks.
-        for model_idx, gbuf_range_maps in enumerate(self.model_gbuf_ranges):
-            for dtype, gbuf_range_map_for_all_buckets in gbuf_range_maps.items():
-                for bucket_idx, gbuf_range_map in enumerate(gbuf_range_map_for_all_buckets):
-
-                    # Compute local DP contiguous shard's size.
-                    model = self.models[model_idx]
-                    gbuf_world_numel = model.grad_buffers[dtype].buckets[bucket_idx].data.numel()
-                    assert gbuf_world_numel % data_parallel_world_size == 0
-                    gbuf_local_numel = gbuf_world_numel // data_parallel_world_size
-
-                    # Contiguous local shards (received from DP rank 0).
-                    local_shards = {key: torch.empty((gbuf_local_numel,),
-                                                     dtype=torch.float32,
-                                                     device="cpu")
-                                    for key in ("param", "exp_avg", "exp_avg_sq")}
-
-                    # Scatter local shards from DP rank 0.
-                    for key, recv_tensor in local_shards.items():
-
-                        # Scatter tensor list.
-                        if data_parallel_rank == 0:
-                            world_tensor_for_all_buckets = loaded_state[model_idx][dtype][key]
-                            if not isinstance(world_tensor_for_all_buckets, list):
-                                world_tensor_for_all_buckets = [world_tensor_for_all_buckets]
-                            assert bucket_idx < len(world_tensor_for_all_buckets), \
-                                (f"Trying to load state for bucket_id {bucket_idx} (out of "
-                                 f"{len(gbuf_range_map_for_all_buckets)} buckets) from checkpoint; "
-                                 f"checkpoint only has {len(world_tensor_for_all_buckets)} bucket(s)")
-                            world_tensor = world_tensor_for_all_buckets[bucket_idx]
-                            gbuf_start_idxs = \
-                                list(range(0, gbuf_world_numel, gbuf_local_numel))
-                            send_tensors = [world_tensor[i:(i+gbuf_local_numel)]
-                                            for i in gbuf_start_idxs]
-                        else:
-                            send_tensors = None
-
-                        # Scatter.
-                        torch.distributed.scatter(
-                            recv_tensor,
-                            send_tensors,
-                            data_parallel_global_ranks[0],
-                            data_parallel_group_gloo,
-                        )
-
-                    # Copy local contiguous shards to param/optim shards.
-                    for model_param, param_range_map in \
-                        gbuf_range_map["param_map"].items():
-
-                        # Main param & optimizer states.
-                        group_index, group_order = \
-                            self.model_param_group_index_map[model_param]
-                        main_param = self.optimizer.param_groups \
-                            [group_index]["params"][group_order]
-                        optim_state = self.optimizer.state[main_param]
-
-                        tensors = {
-                            "param" : main_param,
-                            **optim_state,
-                        }
-
-                        # Copy states into contiguous shard.
-                        gbuf_local_start = param_range_map["gbuf_local"].start
-                        gbuf_local_end = param_range_map["gbuf_local"].end
-                        for key in local_shards:
-                            tensors[key].data.copy_(
-                                local_shards[key][gbuf_local_start:gbuf_local_end])
-
-
-    def zero_grad(self, set_to_none=True):
-        """
-        Zero grads.
-
-        We only need to zero the model related parameters, i.e.,
-        model_float16_groups & model_fp32_groups. We additionally zero
-        the remaining groups as a memory optimization to reduce
-        fragmentation; in the case of set_to_none==True, the space
-        used by this field can be safely deallocated at this point.
-        """
-        for groups in (
-                self.model_float16_groups,
-                self.model_fp32_groups,
-                self.shard_float16_groups, # grad empty/unused here?
-                self.shard_fp32_groups, # throws grad-access warning
-                self.shard_fp32_from_float16_groups):
-            for group in groups:
-                _zero_grad_group_helper(group, set_to_none)
-
-        # If overlapping param all-gather with forward compute, launch all-gather
-        # for first accessed bucket here before forward compute is initiated.
-        # The all-gather for the next bucket will be launched in the forward
-        # pre-hook when this all-gather finishes (to ensure that the communication
-        # kernels don't head-of-line block the compute kernels since we run with
-        # CUDA_DEVICE_MAX_CONNECTIONS=1 to support sequence parallelism).
-        if self.overlap_param_gather:
-            self._dispatch_gather_model_params(all_gather_handle_index=0)
-
-
-    def get_model_param_buffer_dp_views(self):
-        """
-        Get shard views of each of the param buffers.
-
-        In this nested list, the top level is grouped by the virtual model
-        index and the buffer's data type. The sub-level is a list of
-        shards of that buffer, where each shard in the list represents
-        a contiguous view of the buffer, that is owned by a data-parallel
-        rank. The shard boundary does not respect parameter boundaries, and
-        so the elements of some parameters are split across data parallel
-        ranks.
-
-        Additionally, return references to the entire buffers, for use
-        in _all_gather_base.
-        """
-
-        # Buffer views.
-        # Add in reverse order in each model chunk since buckets start from the end of the model but we want
-        # all-gathers to run first for the start of the model (same order as forward pass).
-        # We keep the view_items in model chunk order since we want to still first run all_gather and
-        # all_gather_handle.wait() for the first model chunk.
-        # In all cases, we want all_gather and all_gather_handle.wait() to be called in the same order,
-        # and all_gather_handle.wait() needs to be called just before the corresponding forward pass.
-        view_items = []
-        for model_index, buffers in enumerate(self.param_buffers):
-            view_items_per_model_chunk = []
-            for dtype, buf_for_all_buckets in buffers.items():
-                for bucket_index, buf in enumerate(buf_for_all_buckets):
-                    buf_views = shard_buffer(buf)
-                    view_items_per_model_chunk.insert(0, (model_index, dtype, bucket_index, buf, buf_views))
-            view_items.extend(view_items_per_model_chunk)
-
-        return view_items
-
-
-    def _dispatch_gather_model_params(self, all_gather_handle_index):
-        """
-        All-gather updated model params.
-
-        The DDP's param buffer is used for the all-gather, and thus no
-        tensors are dynamically allocated. After the all-gather, the params
-        can be copied from the param buffer to the param.
-        """
-        if self.update_successful:
-            data_parallel_rank = mpu.get_data_parallel_rank(with_context_parallel=True)
-            data_parallel_group = mpu.get_data_parallel_group(with_context_parallel=True)
-
-            # All-gather updated main params.
-            # All param_buf views are guaranteed to have the same number of elements
-            # across all data-parallel ranks, due to padding (done in grad_buffer.py),
-            # and extended to the param_bufs. Thus, all sub-views will have consistent
-            # start / end indexes across data-parallel ranks.
-            (model_index, dtype, bucket_index, pbuf, pbuf_views) = self.pbuf_view_items[all_gather_handle_index]
-            assert all_gather_handle_index == len(self.all_gather_handles)
-            all_gather_handle = torch.distributed._all_gather_base(
-                pbuf,
-                pbuf_views[data_parallel_rank],
-                group = data_parallel_group,
-                async_op = self.overlap_param_gather
-            )
-            self.all_gather_handles.append(all_gather_handle)
-            assert self.all_gather_handle_index_to_bucket_index_map[all_gather_handle_index] == \
-                (model_index, dtype, bucket_index)
-            self.param_buffer_copied.append(False)
-
-        if not self.overlap_param_gather:
-            self._copy_params_from_param_buffer(all_gather_handle_index)
-
-
-
-    def _make_forward_pre_hook(self):
-        """
-        Create a forward pre-hook to wait on all-gather handles when necessary (i.e.,
-        when a module uses a parameter in a bucket with a still incomplete all-gather)
-        and then copy the results from the param_buffer into model_params.
-        """
-
-        def hook(module, *unused):
-            assert self.overlap_param_gather, "Should use pre-hook only when overlap_param_gather is True"
-
-            # Make sure all parameters in this module have been all-gathered as necessary.
-            for param in module.parameters(recurse=False):
-                # Skip parameters that don't require grad.
-                if not param.requires_grad:
-                    continue
-
-                assert param in self.param_to_all_gather_handle_index_map
-                all_gather_handle_index = self.param_to_all_gather_handle_index_map[param]
-                self._finish_param_sync_helper(all_gather_handle_index)
-
-        return hook
-
-
-    def finish_param_sync(self, model_index, *unused):
-        """
-        Finishes all necessary param syncs for the model_index'th model chunk.
-        """
-        all_gather_handle_indices = self.model_index_to_all_gather_handle_index_map[model_index]
-        for all_gather_handle_index in all_gather_handle_indices:
-            self._finish_param_sync_helper(all_gather_handle_index)
-
-
-    def _finish_param_sync_helper(self, all_gather_handle_index):
-        """
-        Waits on all_gather_handle if necessary, then copies params from param_buffer
-        into model_params if necessary.
-        """
-
-        # First check if there is an outstanding all-gather handle for this param.
-        # If so, wait on the handle to ensure the communication is finished.
-        if all_gather_handle_index >= len(self.all_gather_handles):
-            return
-
-        all_gather_handle = self.all_gather_handles[all_gather_handle_index]
-        if all_gather_handle is not None:
-            all_gather_handle.wait()
-            self.all_gather_handles[all_gather_handle_index] = None
-
-            # Launch the all-gather for the next bucket now.
-            # We can't pre-launch all-gathers for all buckets at once since we don't
-            # want to head-of-line block the compute kernels with communication kernels
-            # (since we run with CUDA_DEVICE_MAX_CONNECTIONS=1 to support sequence
-            # parallelism).
-            next_all_gather_handle_index = all_gather_handle_index + 1
-            if next_all_gather_handle_index < self.num_all_gather_handles:
-                self._dispatch_gather_model_params(next_all_gather_handle_index)
-
-        # Also check if we have already copied from the param buffer for this
-        # handle; if not, complete the copy and mark as such.
-        if not self.param_buffer_copied[all_gather_handle_index]:
-            self._copy_params_from_param_buffer(all_gather_handle_index)
-            self.param_buffer_copied[all_gather_handle_index] = True
-
-
-    def _copy_params_from_param_buffer(self, all_gather_handle_index):
-        """
-        Copy params from param_buffer to model_params.
-        """
-        (model_index, dtype, bucket_index) = self.all_gather_handle_index_to_bucket_index_map[
-            all_gather_handle_index]
-        model = self.models[model_index]
-        if self.update_successful:
-            # Copy from param buffer to each param.
-            param_map = model.grad_buffer_param_index_map[dtype]
-            for param, (buf_start, buf_end, bucket_index_in_param_map) in param_map.items():
-                if bucket_index == bucket_index_in_param_map:
-                    bucket_offset = model.grad_buffers[dtype].buckets[bucket_index].offset
-                    param_buf = self.param_buffers[model_index][dtype][bucket_index]
-                    # buf_start and buf_end store position of this parameter in the full grad_buffer,
-                    # so need to adjust these indices (by subtracting out bucket_offset) since we
-                    # have independent param_bufs for each bucket.
-                    param_buf_shard = param_buf[buf_start-bucket_offset:buf_end-bucket_offset]
-                    assert param.data.nelement() == param_buf_shard.nelement()
-                    param.view(-1).detach().copy_(param_buf_shard)
-
-        # Zero out the grad buffer in preparation for next set of fwd / bwd passes after copy
-        # completes (since param_buffer and grad_buffer are shared for each bucket).
-        param_buf = self.param_buffers[model_index][dtype][bucket_index]
-        grad_buf = model.grad_buffers[dtype].buckets[bucket_index].data
-        assert param_buf.data_ptr() == grad_buf.data_ptr()
-        grad_buf.zero_()
-
-
-    def _collect_main_grad_data_for_unscaling(self):
-        """
-        Note: this should be equivalent to the float-16 optimizer's method,
-        but writtent differently, so the two should be combined.
-        """
-        return [
-            param.grad.data
-            for group in self.optimizer.param_groups
-            for param in group["params"]
-        ]
-
-
-    def _get_model_and_main_params_data_float16(self):
-        """
-        Get aligned list of model and main params.
-        """
-        model_data = []
-        main_data = []
-        for model_group, main_group in zip(self.shard_float16_groups,
-                                           self.shard_fp32_from_float16_groups):
-            for model_param, main_param in zip(model_group, main_group):
-                model_data.append(model_param.data)
-                main_data.append(main_param.data)
-        return model_data, main_data
-
-
-    def _copy_model_grads_to_main_grads(self):
-        """
-        Copy model grads to main grads.
-
-        Since this step follows a reduce-scatter through the DDP's grad
-        buffer, this method is responsible for copying the updated grads
-        from the grad buffer to the main shard's grad field.
-        """
-
-        # Utility method for copying group grads.
-        def copy_group_grads(model_groups, shard_main_groups):
-            for model_group, shard_main_group in zip(model_groups,
-                                                     shard_main_groups):
-                for model_param, shard_main_param in zip(model_group,
-                                                         shard_main_group):
-
-                    param_range_map = self.get_model_param_range_map(model_param)
-                    param_range = param_range_map["param"]
-                    assert param_range.size == shard_main_param.nelement()
-
-                    model_grad = model_param.main_grad
-                    shard_model_grad = model_grad.view(-1) \
-                        [param_range.start:param_range.end]
-                    shard_main_param.grad = shard_model_grad.float()
-
-        # Copy model groups to shard groups.
-        copy_group_grads(self.model_float16_groups,
-                         self.shard_fp32_from_float16_groups)
-        copy_group_grads(self.model_fp32_groups,
-                         self.shard_fp32_groups)
-
-
-    def _copy_main_params_to_model_params(self):
-        """
-        Copy main params to model params.
-
-        Since this step is followed by an all-gather through the DDP's grad
-        buffer, this method is responsible for copying the updated params
-        from the main shards into the correct position in the grad buffer.
-        """
-
-        # Utility method for copying group params.
-        def copy_group_params(shard_main_groups, model_groups):
-            for shard_main_group, model_group in zip(shard_main_groups,
-                                                     model_groups):
-                for shard_main_param, model_param in zip(shard_main_group,
-                                                         model_group):
-
-                    param_range_map = self.get_model_param_range_map(model_param)
-                    world_range = param_range_map["gbuf_world_in_bucket"]
-
-                    assert world_range.size == shard_main_param.nelement()
-
-                    model_id, dtype, bucket_id = self.model_param_gbuf_map[model_param]
-                    model_param_buffer = self.param_buffers[model_id][dtype][bucket_id]
-
-                    shard_model_param = model_param_buffer.view(-1) \
-                        [world_range.start:world_range.end]
-
-                    shard_model_param.data.copy_(shard_main_param)
-
-        # Copy shard groups to model groups.
-        copy_group_params(self.shard_fp32_from_float16_groups,
-                          self.model_float16_groups)
-        copy_group_params(self.shard_fp32_groups,
-                          self.model_fp32_groups)
-
-
-    def _copy_model_params_to_main_params(self):
-        """
-        Copy model params to main params.
-
-        During finetuning, this method is used to reload the main params from
-        the model params. This copy does not make use of the grad buffer as
-        an intermediary.
-        """
-
-        # Utility method for copying group params.
-        def copy_group_params(model_groups, shard_main_groups):
-            for model_group, shard_main_group in zip(model_groups,
-                                                     shard_main_groups):
-                for model_param, shard_main_param in zip(model_group,
-                                                         shard_main_group):
-
-                    param_range_map = self.get_model_param_range_map(model_param)
-                    param_range = param_range_map["param"]
-                    assert param_range.size == shard_main_param.nelement()
-
-                    shard_model_param = model_param.view(-1) \
-                        [param_range.start:param_range.end]
-                    shard_main_param.data.copy_(shard_model_param)
-
-        # Copy model groups to shard groups.
-        copy_group_params(self.model_float16_groups,
-                          self.shard_fp32_from_float16_groups)
-        copy_group_params(self.model_fp32_groups,
-                          self.shard_fp32_groups)
-
-
-    @torch.no_grad()
-    def step(self, args, timers):
-        self.update_successful, grad_norm, num_zeros_in_grad = super().step(args, timers)
-
-        # Reset metadata needed to track results of all-gathers.
-        self.all_gather_handles = []
-        self.param_buffer_copied = []
-
-        # If not overlapping all-gather for parameters, launch synchronous all-gather
-        # communication calls here.
-        if not self.overlap_param_gather:
-            timers('params-all-gather', log_level=1).start(barrier=args.barrier_with_L1_time)
-            for all_gather_handle_index in range(self.num_all_gather_handles):
-                self._dispatch_gather_model_params(all_gather_handle_index)
-            timers('params-all-gather').stop()
-
-        return self.update_successful, grad_norm, num_zeros_in_grad
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/grad_scaler.py b/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/grad_scaler.py
deleted file mode 100644
index 4bb4475a88f621a7fe2ef2612ef888302095787a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/grad_scaler.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Megatron grad scaler."""
-
-from abc import ABC
-from abc import abstractmethod
-
-import torch
-
-
-class MegatronGradScaler(ABC):
-
-    def __init__(self, initial_scale):
-        """Initialize scale value with the input initial scale."""
-        assert initial_scale > 0.0
-        self._scale = torch.cuda.FloatTensor([initial_scale])
-
-    @property
-    def scale(self):
-        return self._scale
-
-    @property
-    def inv_scale(self):
-        return self._scale.reciprocal().float()
-
-    @abstractmethod
-    def update(self, found_inf):
-        pass
-
-    @abstractmethod
-    def state_dict(self):
-        pass
-
-    @abstractmethod
-    def load_state_dict(self, state_dict):
-        pass
-
-
-
-class ConstantGradScaler(MegatronGradScaler):
-
-    def update(self, found_inf):
-        pass
-
-    def state_dict(self):
-        return dict()
-
-    def load_state_dict(self, state_dict):
-        pass
-
-
-
-class DynamicGradScaler(MegatronGradScaler):
-
-    def __init__(self, initial_scale, min_scale,
-                 growth_factor, backoff_factor,
-                 growth_interval, hysteresis):
-        """"Grad scaler with dynamic scale that gets adjusted
-        during training."""
-        super(DynamicGradScaler, self).__init__(initial_scale)
-
-        # Lower bound on the scale.
-        assert min_scale > 0.0
-        assert min_scale <= initial_scale
-        self.min_scale = torch.cuda.FloatTensor([min_scale])
-        # Growth and backoff factors for the scale.
-        assert growth_factor > 1.0
-        self.growth_factor = torch.cuda.FloatTensor([growth_factor])
-        assert backoff_factor < 1.0
-        assert backoff_factor > 0.0
-        self.backoff_factor = torch.cuda.FloatTensor([backoff_factor])
-        # Interval over which if we don't see any inf/nan,
-        # we will scale the grad scale by the growth factor.
-        assert growth_interval > 0
-        self.growth_interval = growth_interval
-        # Number of inf/nans we should see before scaling down
-        # the grad scale by the backoff factor.
-        assert hysteresis > 0
-        self.hysteresis = hysteresis
-
-        # Trackers.
-        self._growth_tracker = 0
-        self._hysteresis_tracker = self.hysteresis
-
-
-    def update(self, found_inf):
-
-        # If we have an inf/nan, growth tracker is set to 0
-        # and hysterisis tracker is reduced by 1.
-        if found_inf:
-            self._growth_tracker = 0
-            self._hysteresis_tracker -= 1
-            # Now if we are out of hysteresis count, scale down the loss.
-            if self._hysteresis_tracker <= 0:
-                self._scale = torch.max(self._scale * self.backoff_factor,
-                                        self.min_scale)
-        else:
-            # If there is no nan/inf, increment the growth tracker.
-            self._growth_tracker += 1
-            # If we have had enough consequitive intervals with no nan/inf:
-            if self._growth_tracker == self.growth_interval:
-                # Reset the tracker and hysteresis trackers,
-                self._growth_tracker = 0
-                self._hysteresis_tracker = self.hysteresis
-                # and scale up the loss scale.
-                self._scale = self._scale * self.growth_factor
-
-
-    def state_dict(self):
-        state_dict = {}
-        state_dict['scale'] = self._scale
-        state_dict['growth_tracker'] = self._growth_tracker
-        state_dict['hysteresis_tracker'] = self._hysteresis_tracker
-        return state_dict
-
-
-    def load_state_dict(self, state_dict):
-        self._scale = state_dict['scale'].cuda(torch.cuda.current_device())
-        self._growth_tracker = state_dict['growth_tracker']
-        self._hysteresis_tracker = state_dict['hysteresis_tracker']
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/optimizer.py b/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/optimizer.py
deleted file mode 100644
index 1b49a7bb3a41aa36e02178475c0076a6450f4e98..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/optimizer.py
+++ /dev/null
@@ -1,644 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-"""Megatron optimizer."""
-
-from abc import ABC
-from abc import abstractmethod
-from apex.multi_tensor_apply import multi_tensor_applier
-import amp_C
-import torch
-
-from megatron_ds import get_timers
-from megatron_ds import print_rank_0
-from megatron_ds.core import mpu, tensor_parallel
-from megatron_ds.model import Float16Module
-from megatron_ds.model.module import param_is_not_shared
-
-from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32
-
-
-def _zero_grad_group_helper(group, set_to_none):
-    """Zero out the gradient for a group of parameters.
-    Note: copied from torch.optim.optimizer."""
-    for param in group:
-        if param.grad is not None:
-            if set_to_none:
-                param.grad = None
-            else:
-                if param.grad.grad_fn is not None:
-                    param.grad.detach_()
-                else:
-                    param.grad.requires_grad_(False)
-                param.grad.zero_()
-
-
-def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):
-    """Use multi-tensor-applier to copy values from one list to another.
-    We don't have a blfoat16 implementation so for now if the overflow_buf
-    is not provided, we default back to simple loop copy to be compatible
-    with bfloat16."""
-    if overflow_buf:
-        overflow_buf.fill_(0)
-        # Scaling with factor `1.0` is equivalent to copy.
-        multi_tensor_applier(amp_C.multi_tensor_scale,
-                             overflow_buf,
-                             [this, that],
-                             1.0)
-    else:
-        for this_, that_ in zip(this, that):
-            that_.copy_(this_)
-
-
-
-class MegatronOptimizer(ABC):
-
-
-    def __init__(self, optimizer, clip_grad,
-                 log_num_zeros_in_grad,
-                 check_for_nan_in_grad,
-                 params_have_main_grad,
-                 models):
-
-        """Input optimizer is the base optimizer for example Adam."""
-        self.optimizer = optimizer
-        assert self.optimizer, 'no optimizer is provided.'
-        # Set gradient clipping and logging params.
-        self.clip_grad = clip_grad
-        self.log_num_zeros_in_grad = log_num_zeros_in_grad
-        self.check_for_nan_in_grad = check_for_nan_in_grad
-        self.params_have_main_grad = params_have_main_grad
-
-        # 'models' are retained for access to the contiguous grad buffers.
-        # (see distributed optimizer)
-        self.models = models
-
-
-    def get_parameters(self):
-        params = []
-        for param_group in self.optimizer.param_groups:
-            for param in param_group['params']:
-                params.append(param)
-        return params
-
-
-    def get_main_grads_for_grad_norm(self):
-
-        # Filter parameters based on:
-        #   - grad should not be none
-        #   - parameter should not be shared
-        #   - should not be a replica due to tensor model parallelism
-        params = self.get_parameters()
-        grads_for_norm = []
-        for param in params:
-            grad = param.grad
-            grad_not_none = grad is not None
-            is_not_shared = param_is_not_shared(param)
-            is_not_tp_duplicate = tensor_parallel.param_is_not_tensor_parallel_duplicate(param)
-            if grad_not_none and is_not_shared and is_not_tp_duplicate:
-                grads_for_norm.append(grad)
-
-        return grads_for_norm
-
-
-    def get_model_parallel_group(self):
-        """Default returned here, but the distributed optimizer overrides this."""
-        return mpu.get_model_parallel_group()
-
-
-    def clip_grad_norm(self, clip_grad, check_for_nan_in_grad):
-        params = self.get_parameters()
-        grads_for_norm = self.get_main_grads_for_grad_norm()
-        return clip_grad_norm_fp32(
-            params, grads_for_norm, clip_grad,
-            check_for_nan_in_grad,
-            model_parallel_group=self.get_model_parallel_group())
-
-
-    def count_zeros(self):
-        params = self.get_parameters()
-        return count_zeros_fp32(params,
-                                model_parallel_group=self.get_model_parallel_group())
-
-
-    @abstractmethod
-    def zero_grad(self, set_to_none=True):
-        pass
-
-
-    @abstractmethod
-    def get_loss_scale(self):
-        """The output should be a cuda tensor of size 1."""
-        pass
-
-
-    def scale_loss(self, loss):
-        """Simple scaling."""
-        return self.get_loss_scale() * loss
-
-
-    @abstractmethod
-    def reload_model_params(self):
-        """Refreshes any internal state from the current model parameters.
-        Call whenever the parameters are changed outside of the optimizer.
-        For example, when we load a model from a checkpoint  without loading
-        the optimizer, the model parameters are updated but for fp16 optimizer
-        with main parameters, the main parameters need to also be updated."""
-        pass
-
-
-    @abstractmethod
-    def state_dict(self):
-        pass
-
-
-    @abstractmethod
-    def load_state_dict(self, state_dict):
-        pass
-
-
-    # Promote state so it can be retrieved or set via
-    # "optimizer_instance.state"
-    def _get_state(self):
-        return self.optimizer.state
-
-    def _set_state(self, value):
-        self.optimizer.state = value
-
-    state = property(_get_state, _set_state)
-
-
-    # Promote param_groups so it can be retrieved or set via
-    # "optimizer_instance.param_groups"
-    # (for example, to adjust the learning rate)
-    def _get_param_groups(self):
-        return self.optimizer.param_groups
-
-    def _set_param_groups(self, value):
-        self.optimizer.param_groups = value
-
-    param_groups = property(_get_param_groups, _set_param_groups)
-
-
-    @abstractmethod
-    def step(self, args, timers):
-        pass
-
-
-
-class MixedPrecisionOptimizer(MegatronOptimizer):
-    """Base class for both the float-16 and the distributed optimizer.
-
-    Arguments:
-        optimizer: base optimizer such as Adam or SGD
-        clip_grad: clip gradeints with this global L2 norm. Note
-            that clipping is ignored if clip_grad == 0
-        log_num_zeros_in_grad: return number of zeros in the gradients.
-        check_for_nan_in_grad: check if gradients have a NaN.
-        params_have_main_grad: flag indicating if parameters have
-            a `main_grad` field. If this is set, we are assuming
-            that the model parameters are store in the `main_grad`
-            field instead of the typical `grad` field. This happens
-            for the DDP cases where there is a continuous buffer
-            holding the gradients. For example for bfloat16, we want
-            to do gradient accumulation and all-reduces in float32
-            and as a result we store those gradients in the main_grad.
-            Note that main grad is not necessarily in float32.
-        fp16: if true, the model is running in fp16.
-        bf16: if true, the model is running in bfloat16.
-        params_dtype: used by distributed optimizer.
-        grad_scaler: used for scaling gradients. Note that this can be
-            None. This case happens when `bf16 = True` and we don't
-            use any loss scale. Note that for `bf16 = True`, we can have
-            a constnat gradient scaler. Also for `bf16 = False`, we
-            always require a grad scaler.
-        models: list of models (i.e., the virtual pipelining models). This
-            is used by the distributed optimizer for mapping parameters.
-    """
-
-    def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
-                 check_for_nan_in_grad, params_have_main_grad,
-                 fp16, bf16, params_dtype, grad_scaler, models):
-
-        super().__init__(
-            optimizer, clip_grad, log_num_zeros_in_grad,
-            check_for_nan_in_grad, params_have_main_grad,
-            models)
-
-        self.fp16 = fp16
-        self.bf16 = bf16
-        self.params_dtype = params_dtype
-        self.grad_scaler = grad_scaler
-
-        # None grad scaler is only supported for bf16.
-        if self.grad_scaler is None:
-            assert not self.fp16, 'fp16 expects a grad scaler.'
-
-        # Tensor used to determine if a nan/if has happend.
-        # Any non-zero value indicates inf/nan.
-        # Note that we keep this for the cases that grad scaler is none.
-        # We still record nan/inf if we have a bfloat16 with a grad scaler.
-        if self.grad_scaler:
-            self.found_inf = torch.cuda.FloatTensor([0.0])
-
-        # Dummy tensor needed for apex multi-apply tensor.
-        # For bfloat, we don't have multi-tensor apply and for now
-        # we set it to none so the multi-tensor apply gets ignored.
-        if bf16:
-            self._dummy_overflow_buf = None
-        else:
-            self._dummy_overflow_buf = torch.cuda.IntTensor([0])
-
-        # In case grad scaler is not passed, define the unity scale.
-        if self.grad_scaler is None:
-            self._scale_one = torch.cuda.FloatTensor([1.0])
-
-
-    def get_loss_scale(self):
-        if self.grad_scaler is None:
-            return self._scale_one
-        return self.grad_scaler.scale
-
-
-    def reload_model_params(self):
-        self._copy_model_params_to_main_params()
-
-
-    def _unscale_main_grads_and_check_for_nan(self):
-
-        # Collect main grads.
-        main_grads = self._collect_main_grad_data_for_unscaling()
-
-        # Reset found inf.
-        self.found_inf.fill_(0.0)
-
-        # Unscale and set found inf/nan
-        torch._amp_foreach_non_finite_check_and_unscale_(
-            main_grads, self.found_inf, self.grad_scaler.inv_scale)
-
-        # Update across all model parallel instances.
-        torch.distributed.all_reduce(self.found_inf,
-                                     op=torch.distributed.ReduceOp.MAX,
-                                     group=self.get_model_parallel_group())
-
-        # Check for nan.
-        found_inf_flag = (self.found_inf.item() > 0)
-
-        return found_inf_flag
-
-
-    @torch.no_grad()
-    def step(self, args, timers):
-
-        # Copy gradients from model params to main params.
-        timers('optimizer-copy-to-main-grad', log_level=1).start(
-            barrier=args.barrier_with_L1_time)
-        self._copy_model_grads_to_main_grads()
-        timers('optimizer-copy-to-main-grad').stop()
-
-        # Do unscale, check for inf, and update grad scaler only for
-        # the case that grad scaler is provided.
-        if self.grad_scaler:
-
-            # Unscale and check for inf/nan.
-            timers('optimizer-unscale-and-check-inf', log_level=1).start(
-                barrier=args.barrier_with_L1_time)
-            found_inf_flag = self._unscale_main_grads_and_check_for_nan()
-            timers('optimizer-unscale-and-check-inf').stop()
-
-            # We are done with scaling gradients
-            # so we can update the loss scale.
-            self.grad_scaler.update(found_inf_flag)
-
-            # If we found inf/nan, skip the update.
-            if found_inf_flag:
-                return False, None, None
-
-        # Clip the main gradients.
-        timers('optimizer-clip-main-grad', log_level=1).start(
-            barrier=args.barrier_with_L1_time)
-        grad_norm = None
-        if self.clip_grad > 0.0:
-            grad_norm = self.clip_grad_norm(self.clip_grad,
-                                            self.check_for_nan_in_grad)
-        timers('optimizer-clip-main-grad').stop()
-
-        # Count the zeros in the grads.
-        timers('optimizer-count-zeros', log_level=1).start(
-            barrier=args.barrier_with_L1_time)
-        num_zeros_in_grad = self.count_zeros() if \
-                            self.log_num_zeros_in_grad else None
-        timers('optimizer-count-zeros').stop()
-
-        # Step the optimizer.
-        timers('optimizer-inner-step', log_level=1).start(
-            barrier=args.barrier_with_L1_time)
-        self.optimizer.step()
-        timers('optimizer-inner-step').stop()
-
-        # Update params from main params.
-        timers('optimizer-copy-main-to-model-params', log_level=1).start(
-            barrier=args.barrier_with_L1_time)
-        self._copy_main_params_to_model_params()
-        timers('optimizer-copy-main-to-model-params').stop()
-
-        # Successful update.
-        return True, grad_norm, num_zeros_in_grad
-
-
-class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
-    """Float16 optimizer for fp16 and bf16 data types.
-
-    Arguments:
-        optimizer: base optimizer such as Adam or SGD
-        clip_grad: clip gradeints with this global L2 norm. Note
-            that clipping is ignored if clip_grad == 0
-        log_num_zeros_in_grad: return number of zeros in the gradients.
-        check_for_nan_in_grad: check if gradients have a NaN.
-        params_have_main_grad: flag indicating if parameters have
-            a `main_grad` field. If this is set, we are assuming
-            that the model parameters are store in the `main_grad`
-            field instead of the typical `grad` field. This happens
-            for the DDP cases where there is a continuous buffer
-            holding the gradients. For example for bfloat16, we want
-            to do gradient accumulation and all-reduces in float32
-            and as a result we store those gradients in the main_grad.
-            Note that main grad is not necessarily in float32.
-        fp16: if true, the model is running in fp16.
-        bf16: if true, the model is running in bfloat16.
-        grad_scaler: used for scaling gradients. Note that this can be
-            None. This case happens when `bf16 = True` and we don't
-            use any loss scale. Note that for `bf16 = True`, we can have
-            a constnat gradient scaler. Also for `bf16 = False`, we
-            always require a grad scaler.
-        models: list of models (i.e., the virtual pipelining models). This
-            is used by the distributed optimizer for mapping parameters.
-    """
-
-    def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
-                 check_for_nan_in_grad, params_have_main_grad, fp16, bf16,
-                 params_dtype, grad_scaler, models):
-
-        super().__init__(
-            optimizer, clip_grad, log_num_zeros_in_grad,
-            check_for_nan_in_grad, params_have_main_grad,
-            fp16, bf16, params_dtype, grad_scaler, models)
-
-        # ======================
-        # main parameter stuff
-        # ======================
-
-        # Three groups of parameters:
-        #   float16_groups: original float16 parameters
-        #   fp32_from_float16_groups: fp32 copy of float16 parameters
-        #   fp32_from_fp32_groups: original fp32 parameters
-        self.float16_groups = []
-        self.fp32_from_float16_groups = []
-        self.fp32_from_fp32_groups = []
-
-        # For all the groups in the original optimizer:
-        for param_group in self.optimizer.param_groups:
-            float16_params_this_group = []
-            fp32_params_this_group = []
-            fp32_from_float16_params_this_group = []
-            # For all the parameters in this group:
-            for i, param in enumerate(param_group['params']):
-                if param.requires_grad:
-
-                    # float16 params:
-                    if param.type() in ['torch.cuda.HalfTensor',
-                                        'torch.cuda.BFloat16Tensor']:
-                        float16_params_this_group.append(param)
-                        # Create a copy
-                        main_param = param.detach().clone().float()
-                        # Copy tensor model parallel attributes.
-                        tensor_parallel.copy_tensor_model_parallel_attributes(main_param,
-                                                                              param)
-                        if hasattr(param, 'shared'):
-                            main_param.shared = param.shared
-                        # Replace the optimizer params with the new fp32 copy.
-                        param_group['params'][i] = main_param
-
-                        fp32_from_float16_params_this_group.append(main_param)
-                        # Reset existing state dict key to the new main param.
-                        if param in self.optimizer.state:
-                            self.optimizer.state[main_param] \
-                                = self.optimizer.state.pop(param)
-                    # fp32 params.
-                    elif param.type() == 'torch.cuda.FloatTensor':
-                        fp32_params_this_group.append(param)
-                        param_group['params'][i] = param
-
-                    else:
-                        raise TypeError('Wrapped parameters must be one of '
-                                        'torch.cuda.FloatTensor,  '
-                                        'torch.cuda.HalfTensor, or '
-                                        'torch.cuda.BFloat16Tensor. '
-                                        'Received {}'.format(param.type()))
-
-            self.float16_groups.append(float16_params_this_group)
-            self.fp32_from_float16_groups.append(
-                fp32_from_float16_params_this_group)
-            self.fp32_from_fp32_groups.append(fp32_params_this_group)
-
-
-    def zero_grad(self, set_to_none=True):
-        """We only need to zero the model related parameters, i.e.,
-        float16_groups & fp32_from_fp32_groups. We additionally zero
-        fp32_from_float16_groups as a memory optimization to reduce
-        fragmentation; in the case of set_to_none==True, the space
-        used by this field can be safely deallocated at this point."""
-        for group in self.float16_groups:
-            _zero_grad_group_helper(group, set_to_none)
-        for group in self.fp32_from_float16_groups:
-            _zero_grad_group_helper(group, set_to_none)
-        for group in self.fp32_from_fp32_groups:
-            _zero_grad_group_helper(group, set_to_none)
-
-
-    def _collect_main_grad_data_for_unscaling(self):
-
-        main_grads = []
-
-        # fp32 params from float16 ones.
-        for main_group in self.fp32_from_float16_groups:
-            for main_param in main_group:
-                if main_param.grad is not None:
-                    main_grads.append(main_param.grad.data)
-
-        # Append fp32 parameters.
-        for main_group in self.fp32_from_fp32_groups:
-            for main_param in main_group:
-                if main_param.grad is not None:
-                    main_grads.append(main_param.grad.data)
-        
-        return main_grads
-
-
-    def _get_model_and_main_params_data_float16(self):
-        model_data = []
-        main_data = []
-        for model_group, main_group in zip(self.float16_groups,
-                                           self.fp32_from_float16_groups):
-            for model_param, main_param in zip(model_group, main_group):
-                model_data.append(model_param.data)
-                main_data.append(main_param.data)
-        return model_data, main_data
-
-
-    def _copy_model_grads_to_main_grads(self):
-        # This only needs to be done for the float16 group.
-        for model_group, main_group in zip(self.float16_groups,
-                                           self.fp32_from_float16_groups):
-            for model_param, main_param in zip(model_group, main_group):
-                if self.params_have_main_grad and hasattr(model_param, 'main_grad'):
-                    main_param.grad = model_param.main_grad.float()
-                else:
-                    if model_param.grad is not None:
-                        main_param.grad = model_param.grad.float()
-
-                # Safe to deallocate model's grad/main_grad after copying.
-                # (If using contiguous buffers, main_grad's memory should
-                # persist and therefore should not be deallocated.)
-                model_param.grad = None
-
-        # For fp32 grads, we need to reset the grads to main grad.
-        if self.params_have_main_grad:
-            for model_group in self.fp32_from_fp32_groups:
-                for model_param in model_group:
-                    model_param.grad = model_param.main_grad
-
-
-    def _copy_main_params_to_model_params(self):
-        # Only needed for the float16 params.
-        model_data, main_data = self._get_model_and_main_params_data_float16()
-        _multi_tensor_copy_this_to_that(this=main_data, that=model_data,
-                                        overflow_buf=self._dummy_overflow_buf)
-
-
-    def _copy_model_params_to_main_params(self):
-        # Only needed for the float16 params.
-        model_data, main_data = self._get_model_and_main_params_data_float16()
-        _multi_tensor_copy_this_to_that(this=model_data, that=main_data,
-                                        overflow_buf=self._dummy_overflow_buf)
-
-
-    def state_dict(self):
-        state_dict = {}
-        state_dict['optimizer'] = self.optimizer.state_dict()
-        if self.grad_scaler:
-            state_dict['grad_scaler'] = self.grad_scaler.state_dict()
-        state_dict['fp32_from_fp16_params'] = self.fp32_from_float16_groups
-        return state_dict
-
-
-    def load_state_dict(self, state_dict):
-        # Optimizer.
-        optimizer_key = 'optimizer'
-        if optimizer_key not in state_dict:
-            optimizer_key = 'optimizer_state_dict'
-            print_rank_0('***WARNING*** loading optimizer from '
-                         'an old checkpoint ...')
-        self.optimizer.load_state_dict(state_dict[optimizer_key])
-
-        # Grad scaler.
-        if 'grad_scaler' not in state_dict:
-            if self.fp16:
-                print_rank_0('***WARNING*** found an old checkpoint, will not '
-                             'load grad scaler ...')
-        else:
-            if self.grad_scaler:
-                self.grad_scaler.load_state_dict(state_dict['grad_scaler'])
-            else:
-                print_rank_0('***WARNING*** fould the grad scaler in the '
-                             'checkpoint but it is None in the class. '
-                             'Skipping loading grad scaler ...')
-
-        # Copy data for the main params.
-        fp32_from_float16_params_key = 'fp32_from_fp16_params'
-        if fp32_from_float16_params_key not in state_dict:
-            fp32_from_float16_params_key = 'fp32_from_fp16'
-        for current_group, saved_group in zip(
-                self.fp32_from_float16_groups,
-                state_dict[fp32_from_float16_params_key]):
-            for current_param, saved_param in zip(current_group, saved_group):
-                current_param.data.copy_(saved_param.data)
-
-
-class FP32Optimizer(MegatronOptimizer):
-
-    def __init__(self, optimizer, clip_grad,
-                 log_num_zeros_in_grad,
-                 check_for_nan_in_grad,
-                 params_have_main_grad,
-                 models):
-
-        super(FP32Optimizer, self).__init__(
-            optimizer, clip_grad, log_num_zeros_in_grad,
-            check_for_nan_in_grad, params_have_main_grad,
-            models)
-
-        self._scale = torch.cuda.FloatTensor([1.0])
-
-
-    def zero_grad(self, set_to_none=True):
-        """Copied from torch.optim.optimizer"""
-        for group in self.optimizer.param_groups:
-            _zero_grad_group_helper(group['params'], set_to_none)
-
-
-    def get_loss_scale(self):
-        """FP32 optimizer does not do any scaling."""
-        return self._scale
-
-
-    @torch.no_grad()
-    def step(self, args, timers):
-        """Clip gradients (if needed) and step the base optimizer.
-        Always return successful since there is no overflow."""
-
-        # Copy main_grads to grads.
-        timers('optimizer-copy-to-main-grad', log_level=1).start(
-            barrier=args.barrier_with_L1_time)
-        if self.params_have_main_grad:
-            for param_group in self.optimizer.param_groups:
-                for param in param_group['params']:
-                    param.grad = param.main_grad
-
-        timers('optimizer-copy-to-main-grad').stop()
-
-        # Clip gradients.
-        timers('optimizer-clip-main-grad', log_level=1).start(
-            barrier=args.barrier_with_L1_time)
-        grad_norm = None
-        if self.clip_grad > 0.0:
-            grad_norm = self.clip_grad_norm(self.clip_grad,
-                                            self.check_for_nan_in_grad)
-        timers('optimizer-clip-main-grad').stop()
-
-        # count the zeros in the grads
-        timers('optimizer-count-zeros', log_level=1).start(
-            barrier=args.barrier_with_L1_time)
-        num_zeros_in_grad = self.count_zeros() if \
-                            self.log_num_zeros_in_grad else None
-        timers('optimizer-count-zeros').stop()
-
-        # Update parameters.
-        timers('optimizer-inner-step', log_level=1).start(
-            barrier=args.barrier_with_L1_time)
-        self.optimizer.step()
-        timers('optimizer-inner-step').stop()
-
-        # No overflow for FP32 optimizer.
-        return True, grad_norm, num_zeros_in_grad
-
-
-    def reload_model_params(self):
-        pass
-
-
-    def state_dict(self):
-        return self.optimizer.state_dict()
-
-
-    def load_state_dict(self, state_dict):
-        self.optimizer.load_state_dict(state_dict)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/utils.py b/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/utils.py
deleted file mode 100644
index 88518760bdbc44d79014e997810d1f22a9a634c6..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer/utils.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-"""Utility functions for Megatron optimizer."""
-
-
-from megatron_ds.core import mpu
-
-
-def shard_buffer(buffer):
-    """
-    Shard buffer into dp_size chunks of equal size.
-    """
-    data_parallel_world_size = mpu.get_data_parallel_world_size(with_context_parallel=True)
-    assert buffer.numel() % data_parallel_world_size == 0
-    shard_size = buffer.numel() // data_parallel_world_size
-    sharded_buffer = [buffer[(r*shard_size):((r+1)*shard_size)]
-                      for r in range(data_parallel_world_size)]
-    return sharded_buffer
-
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer_param_scheduler.py b/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer_param_scheduler.py
deleted file mode 100644
index e6ee659b1bbdb3b49260b379ceb6bb4ae2337e98..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/optimizer_param_scheduler.py
+++ /dev/null
@@ -1,235 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Learning rate decay and weight decay incr functions."""
-
-import math
-
-from megatron_ds import print_rank_0
-
-class OptimizerParamScheduler(object):
-    """Anneals learning rate and weight decay"""
-
-    def __init__(self, optimizer, init_lr, max_lr, min_lr,
-                 lr_warmup_steps, lr_decay_steps, lr_decay_style,
-                 start_wd, end_wd, wd_incr_steps, wd_incr_style,
-                 use_checkpoint_opt_param_scheduler=True,
-                 override_opt_param_scheduler=False):
-
-        # Class values.
-        self.optimizer = optimizer
-
-        self.init_lr = init_lr
-        self.max_lr = float(max_lr)
-        self.min_lr = min_lr
-        assert self.min_lr >= 0.0
-        assert self.max_lr >= self.min_lr
-        assert self.init_lr <= self.max_lr
-
-        self.lr_warmup_steps = lr_warmup_steps
-        self.num_steps = 0
-        self.lr_decay_steps = lr_decay_steps
-        assert self.lr_decay_steps > 0
-        assert self.lr_warmup_steps < self.lr_decay_steps
-
-        self.lr_decay_style = lr_decay_style
-
-        self.start_wd = start_wd
-        self.end_wd = end_wd
-        assert self.start_wd >= 0.0
-        assert self.end_wd >= self.start_wd
-        self.wd_incr_steps = wd_incr_steps
-        self.wd_incr_style = wd_incr_style
-
-        self.override_opt_param_scheduler = override_opt_param_scheduler
-        self.use_checkpoint_opt_param_scheduler = use_checkpoint_opt_param_scheduler
-        if self.override_opt_param_scheduler:
-            assert not self.use_checkpoint_opt_param_scheduler, 'both override and '\
-                'use-checkpoint are set.'
-
-        # Set the learning rate
-        self.step(0)
-        print_rank_0('> learning rate decay style: {}'.format(self.lr_decay_style))
-
-
-    def get_wd(self):
-        """ Weight decay incr functions"""
-        if self.num_steps > self.wd_incr_steps:
-            return self.end_wd
-
-        if self.wd_incr_style == 'constant':
-            assert self.start_wd == self.end_wd
-            return self.end_wd
-
-        incr_ratio = float(self.num_steps) / float(self.wd_incr_steps)
-        assert incr_ratio >= 0.0
-        assert incr_ratio <= 1.0
-        delta_wd = self.end_wd - self.start_wd
-
-        if self.wd_incr_style == 'linear':
-            coeff = incr_ratio
-        elif self.wd_incr_style == 'cosine':
-            coeff = 0.5 * (math.cos(math.pi * (1 - incr_ratio)) + 1.0)
-        else:
-            raise Exception('{} weight decay increment style is not supported.'.format(
-                self.wd_incr_style))
-
-        return self.start_wd + coeff * delta_wd
-
-
-    def get_lr(self):
-        """Learning rate decay functions from:
-              https://openreview.net/pdf?id=BJYwwY9ll pg. 4"""
-
-        # Use linear warmup for the initial part.
-        if self.lr_warmup_steps > 0 and self.num_steps <= self.lr_warmup_steps:
-            return (
-                self.init_lr
-                + (
-                    (self.max_lr - self.init_lr)
-                    * float(self.num_steps)
-                    / float(self.lr_warmup_steps)
-                )
-            )
-
-        # If the learning rate is constant, just return the initial value.
-        if self.lr_decay_style == 'constant':
-            return self.max_lr
-
-        # For any steps larger than `self.lr_decay_steps`, use `self.min_lr`.
-        if self.num_steps > self.lr_decay_steps:
-            return self.min_lr
-
-        # If we are done with the warmup period, use the decay style.
-        if self.lr_decay_style == 'inverse-square-root':
-            warmup_steps = max(self.lr_warmup_steps, 1)
-            num_steps = max(self.num_steps, 1)
-            lr = self.max_lr * warmup_steps ** 0.5 / (num_steps ** 0.5)
-            return max(self.min_lr, lr)
-
-        num_steps_ = self.num_steps - self.lr_warmup_steps
-        decay_steps_ = self.lr_decay_steps - self.lr_warmup_steps
-        decay_ratio = float(num_steps_) / float(decay_steps_)
-        assert decay_ratio >= 0.0
-        assert decay_ratio <= 1.0
-        delta_lr = self.max_lr - self.min_lr
-
-        if self.lr_decay_style == 'linear':
-            coeff = (1.0 - decay_ratio)
-        elif self.lr_decay_style == 'cosine':
-            coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0)
-        else:
-            raise Exception('{} decay style is not supported.'.format(
-                self.lr_decay_style))
-
-        return self.min_lr + coeff * delta_lr
-
-
-    def step(self, increment):
-        """Set lr for all parameters groups."""
-        self.num_steps += increment
-        new_lr = self.get_lr()
-        new_wd = self.get_wd()
-        for group in self.optimizer.param_groups:
-            group['lr'] = new_lr * group.get('lr_mult', 1.0)
-            group['weight_decay'] = new_wd * group.get('wd_mult', 1.0)
-
-
-    def state_dict(self):
-        state_dict = {
-            'max_lr': self.max_lr,
-            'lr_warmup_steps': self.lr_warmup_steps,
-            'num_steps': self.num_steps,
-            'lr_decay_style': self.lr_decay_style,
-            'lr_decay_steps': self.lr_decay_steps,
-            'min_lr': self.min_lr,
-            'start_wd': self.start_wd,
-            'end_wd': self.end_wd,
-            'wd_incr_style': self.wd_incr_style,
-            'wd_incr_steps': self.wd_incr_steps
-        }
-        return state_dict
-
-
-    def _check_and_set(self, cls_value, sd_value, name):
-        """Auxiliary function for checking the values in the checkpoint and
-        setting them."""
-        if self.override_opt_param_scheduler:
-            print_rank_0(' > overriding {} value to {}'.format(name, cls_value))
-            return cls_value
-
-        if not self.use_checkpoint_opt_param_scheduler:
-            assert cls_value == sd_value, \
-                f'OptimizerParamScheduler: class input value {cls_value} and checkpoint' \
-                f'value {sd_value} for {name} do not match'
-        print_rank_0(' > using checkpoint value {} for {}'.format(sd_value,
-                                                                  name))
-        return sd_value
-
-
-    def load_state_dict(self, sd):
-
-        if 'start_lr' in sd:
-            max_lr_ = sd['start_lr']
-        else:
-            max_lr_ = sd['max_lr']
-        self.max_lr = self._check_and_set(self.max_lr, max_lr_,
-                                          'learning rate')
-        
-        self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'],
-                                          'minimum learning rate')
-
-        if 'warmup_iter' in sd:
-            lr_warmup_steps_ = sd['warmup_iter']
-        elif 'warmup_steps' in sd:
-            lr_warmup_steps_ = sd['warmup_steps']
-        else:
-            lr_warmup_steps_ = sd['lr_warmup_steps']
-        self.lr_warmup_steps = self._check_and_set(self.lr_warmup_steps,
-                                                lr_warmup_steps_,
-                                                'warmup iterations')
-
-        if 'end_iter' in sd:
-            lr_decay_steps_ = sd['end_iter']
-        elif 'decay_steps' in sd:
-            lr_decay_steps_  = sd['decay_steps']
-        else:
-            lr_decay_steps_ = sd['lr_decay_steps']
-        self.lr_decay_steps = self._check_and_set(self.lr_decay_steps, lr_decay_steps_,
-                                               'total number of iterations')
-
-        if 'decay_style' in sd:
-            lr_decay_style_ = sd['decay_style']
-        else:
-            lr_decay_style_ = sd['lr_decay_style']
-        self.lr_decay_style = self._check_and_set(self.lr_decay_style,
-                                               lr_decay_style_,
-                                               'learning rate decay style')
-
-        if 'num_iters' in sd:
-            num_steps = sd['num_iters']
-        else:
-            num_steps = sd['num_steps']
-        self.step(increment=num_steps)
-
-
-        if 'start_wd' in sd:
-            self.start_wd = self._check_and_set(self.start_wd,
-                                                sd['start_wd'],
-                                                "start weight decay")
-            self.end_wd = self._check_and_set(self.end_wd,
-                                                sd['end_wd'],
-                                                "end weight decay")
-            self.wd_incr_steps = self._check_and_set(self.wd_incr_steps,
-                                                sd['wd_incr_steps'],
-                                                "total number of weight decay iterations")
-            self.wd_incr_style = self._check_and_set(self.wd_incr_style,
-                                                sd['wd_incr_style'],
-                                                "weight decay incr style")
-            
-
-
-
-
-
-
-
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/generation/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/generation/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/static/index.html b/toolbox/Megatron-DeepSpeed/megatron_ds/static/index.html
deleted file mode 100644
index 806287955bcc02e2d4148855af5ddb36ba94ae72..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/static/index.html
+++ /dev/null
@@ -1,124 +0,0 @@
-<!-- coding=utf-8-->
-<!-- Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.-->
-
-<!DOCTYPE html>
-<html lang="en">
-<head>
-<meta charset="utf-8" />
-<title>Megatron</title>
-<style>
-
-.wrapper {
-max-width: 75%;
-margin: auto;
-}
-
-h1 { 
-margin: 3rem 0 1rem 0; 
-padding: 0;
-font-size: 1.5rem;
-}
-
-textarea {
-width: 100%;
-min-height: 300px;
-resize: none;
-border-radius: 8px;
-border: 1px solid #ddd;
-padding: 0.5rem;
-box-shadow: inset 0 0 0.25rem #ddd;
-&:focus {
-outline: none;
-border: 1px solid darken(#ddd, 5%);
-box-shadow: inset 0 0 0.5rem darken(#ddd, 5%);
-}
-}
-
-#the-count {
-float: right;
-padding: 0.1rem 0 0 0;
-font-size: 0.875rem;
-}
-/* Chat containers */
-.container {
-font-family: 'Arial', sans-serif;
-font-size: 16px;
-border: 2px solid #dedede;
-background-color: #f1f1f1;
-border-radius: 5px;
-padding: 15px;
-margin: 10px 0;
-}
-
-
-/* Clear floats */
-.container::after {
-content: "";
-clear: both;
-display: table;
-}
-
-/* Style images */
-.container img {
-float: left;
-max-width: 60px;
-width: 100%;
-margin-right: 20px;
-border-radius: 50%;
-}
-
-</style>
-</head>
-<body>
-<div class="wrapper">
-<h1>Prompt Megatron</h1>
-<textarea name="prompt" id="prompt" maxlength="1024" placeholder="Add prompt"autofocus></textarea>
-<label for="tokens_to_generate">Number tokens to generate (1-1024):</label>
-<input type="number" id="tokens_to_generate" name="tokens_to_generate" min="10" max="256", value=32>
-<button onclick="submit_query()">Submit</button>
-
-<div id="the-count">
-<span id="current">0</span>
-<span id="maximum">/ 1000</span>
-</div>
-<textarea name="response" id="response" maxlength="2048" placeholder="Megatron response..."></textarea>
-</div>
-<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
-<script type="text/javascript">
-
-	function submit_query() {
-		$("#response").val("Waiting for Megatron response...");
-		$.ajax({
-			url:"api",
-			type:"PUT",
-			data:JSON.stringify({prompts: [$("#prompt").val()], tokens_to_generate: parseInt($("#tokens_to_generate").val(),10)}),
-			contentType:"application/json; charset=utf-8",
-			dataType:"json",
-			success: function(data){
-				data.max_len=35;
-				$("#response").val(data.text);
-			}
-		});
-	}
-	
-$('textarea').keyup(function() {
-var characterCount = $(this).val().length,
-current = $('#current'),
-maximum = $('#maximum'),
-theCount = $('#the-count');
-
-current.text(characterCount);
-
-if (characterCount >= 800) {
-maximum.css('color', '#8f0001');
-current.css('color', '#8f0001');
-theCount.css('font-weight','bold');
-} else {
-maximum.css('color','#666');
-theCount.css('font-weight','normal');
-}
-});
-</script>
-</body>
-</html>
-
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/__init__.py b/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/__init__.py
deleted file mode 100644
index 77da7be30ae4d02bd7ab1e4bae86afc8923d4e23..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-
-from .api import (
-    generate,
-    generate_and_post_process,
-    beam_search_and_post_process)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/api.py b/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/api.py
deleted file mode 100644
index 801b584ed3dca645a3645d98f1e966f5a4ba24a9..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/api.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Inference API."""
-
-
-import torch
-
-from megatron_ds.core import mpu
-from .communication import broadcast_float_list
-from .generation import (
-        generate_tokens_probs_and_return_on_first_stage,
-        score_and_return_on_first_stage,
-        beam_search_and_return_on_first_stage)
-from .tokenization import (
-    tokenize_prompts,
-    detokenize_generations)
-
-def generate_and_post_process(model,
-                              prompts=None,
-                              tokens_to_generate=0,
-                              return_output_log_probs=False,
-                              top_k_sampling=0,
-                              top_p_sampling=0.0,
-                              top_p_decay=0.0,
-                              top_p_bound=0.0,
-                              temperature=1.0,
-                              add_BOS=False,
-                              use_eod_token_for_early_termination=True,
-                              stop_on_double_eol=False,
-                              stop_on_eol=False,
-                              prevent_newline_after_colon=False,
-                              random_seed=-1,
-                              return_logits=False):
-    """Run inference and post-process outputs, i.e., detokenize,
-    move to cpu and convert to list."""
-
-    # Main inference.
-    tokens, lengths, output_log_probs, logits = generate(
-        model,
-        prompts=prompts,
-        tokens_to_generate=tokens_to_generate,
-        return_output_log_probs=return_output_log_probs,
-        top_k_sampling=top_k_sampling,
-        top_p_sampling=top_p_sampling,
-        top_p_decay=top_p_decay,
-        top_p_bound=top_p_bound,
-        temperature=temperature,
-        add_BOS=add_BOS,
-        use_eod_token_for_early_termination=use_eod_token_for_early_termination,
-        stop_on_double_eol=stop_on_double_eol,
-        stop_on_eol=stop_on_eol,
-        prevent_newline_after_colon=prevent_newline_after_colon,
-        random_seed=random_seed)
-
-    # Only post-process on first stage.
-    if mpu.is_pipeline_first_stage():
-        tokens, prompts_plus_generations, prompts_plus_generations_segments = \
-            detokenize_generations(tokens, lengths, True)
-
-        if return_output_log_probs:
-            output_log_probs = output_log_probs.cpu().numpy().tolist()
-            for i, (prob, seg) in enumerate(zip(output_log_probs, prompts_plus_generations_segments)):
-                output_log_probs[i] = prob[:len(seg)-1]
-
-        if return_logits:
-            assert(tokens_to_generate == 0)
-            assert(mpu.get_pipeline_model_parallel_world_size() == 1)
-            return prompts_plus_generations, prompts_plus_generations_segments, \
-            output_log_probs, tokens, logits
-        else:
-            return prompts_plus_generations, prompts_plus_generations_segments, \
-            output_log_probs, tokens
-
-    return None
-
-def generate(model,
-             prompts=None,
-             tokens_to_generate=0,
-             return_output_log_probs=False,
-             top_k_sampling=0,
-             top_p_sampling=0.0,
-             top_p_decay=0.0,
-             top_p_bound=0.0,
-             temperature=1.0,
-             add_BOS=False,
-             use_eod_token_for_early_termination=True,
-             stop_on_double_eol=False,
-             stop_on_eol=False,
-             prevent_newline_after_colon=False,
-             random_seed=-1):
-    """Given prompts and input parameters, run inference and return:
-       tokens: prompts plus the generated tokens.
-       lengths: length of the prompt + generations. Note that we can
-           discard tokens in the tokens tensor that are after the
-           corresponding length.
-       output_log_probs: log probs of the tokens.
-    """
-
-    # Make sure input params are avaialble to all ranks.
-    values = [tokens_to_generate,
-              return_output_log_probs,
-              top_k_sampling, top_p_sampling, top_p_decay, top_p_bound,
-              temperature, add_BOS, use_eod_token_for_early_termination,
-              stop_on_double_eol,
-              stop_on_eol,
-              prevent_newline_after_colon,
-              random_seed]
-    values_float_tensor = broadcast_float_list(len(values), float_list=values)
-    tokens_to_generate = int(values_float_tensor[0].item())
-    return_output_log_probs = bool(values_float_tensor[1].item())
-    top_k_sampling = int(values_float_tensor[2].item())
-    top_p_sampling = values_float_tensor[3].item()
-    top_p_decay = values_float_tensor[4].item()
-    top_p_bound = values_float_tensor[5].item()
-    temperature = values_float_tensor[6].item()
-    add_BOS = bool(values_float_tensor[7].item())
-    use_eod_token_for_early_termination = bool(values_float_tensor[8].item())
-    stop_on_double_eol = bool(values_float_tensor[9].item())
-    stop_on_eol = bool(values_float_tensor[10].item())
-    prevent_newline_after_colon = bool(values_float_tensor[11].item())
-    random_seed = int(values_float_tensor[12].item())
-
-    if random_seed != -1:
-        torch.random.manual_seed(random_seed)
-
-    # Tokenize prompts and get the batch.
-    # Note that these tensors are broadcaseted to all ranks.
-    if torch.distributed.get_rank() == 0:
-        assert prompts is not None
-    
-    context_tokens_tensor, context_length_tensor = tokenize_prompts(
-        prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS)
-
-    if tokens_to_generate == 0:
-        return score_and_return_on_first_stage(
-            model, context_tokens_tensor, context_length_tensor)
-    
-    # Main inference function.
-    # Note that the outputs are available on the first stage.
-    return generate_tokens_probs_and_return_on_first_stage(
-        model, context_tokens_tensor, context_length_tensor,
-        return_output_log_probs=return_output_log_probs,
-        top_k=top_k_sampling,
-        top_p=top_p_sampling,
-        top_p_decay=top_p_decay,
-        top_p_bound=top_p_bound,
-        temperature=temperature,
-        use_eod_token_for_early_termination=use_eod_token_for_early_termination,
-        stop_on_double_eol=stop_on_double_eol,
-        stop_on_eol=stop_on_eol,
-        prevent_newline_after_colon=prevent_newline_after_colon)
-
-def beam_search_and_post_process(model,
-                                 prompts=None,
-                                 tokens_to_generate=0,
-                                 beam_size=0,
-                                 add_BOS=False,
-                                 stop_token=50256,
-                                 num_return_gen=1,
-                                 length_penalty=1,
-                                 prevent_newline_after_colon=False):
-    """Run beam search and post-process outputs, i.e., detokenize,
-    move to cpu and convert to list."""
-
-    # Main inference.
-    tokens, scores = beam_search(model,
-                                 prompts=prompts,
-                                 tokens_to_generate=tokens_to_generate,
-                                 beam_size=beam_size,
-                                 add_BOS=add_BOS,
-                                 stop_token=stop_token,
-                                 num_return_gen=num_return_gen,
-                                 length_penalty=length_penalty,
-                                 prevent_newline_after_colon=prevent_newline_after_colon)
-    # Only post-process on first stage.
-    if mpu.is_pipeline_first_stage():
-        lengths = tokens.size(1)*torch.ones(beam_size, dtype=torch.int64, device=torch.cuda.current_device()) 
-        tokens, prompts_plus_generations, prompts_plus_generations_segments = detokenize_generations(tokens, lengths, True)
-        scores = scores.cpu().numpy().tolist()
-        return prompts_plus_generations, prompts_plus_generations_segments, scores
-
-    return None
-
-def beam_search(model, prompts=None, tokens_to_generate=0, beam_size=0, add_BOS=False, stop_token=50256, num_return_gen=1, length_penalty=1, prevent_newline_after_colon=False):
-    # Make sure input params are avaialble to all ranks.
-    values = [tokens_to_generate,
-              beam_size,
-              add_BOS,
-              stop_token,
-              num_return_gen,
-              length_penalty,
-              prevent_newline_after_colon]
-    values_float_tensor = broadcast_float_list(len(values), float_list=values)
-    tokens_to_generate = int(values_float_tensor[0].item())
-    beam_size = int(values_float_tensor[1].item())
-    add_BOS = bool(values_float_tensor[2].item())
-    stop_token = int(values_float_tensor[3].item())
-    num_return_gen = int(values_float_tensor[4].item())
-    length_penalty = values_float_tensor[5].item()
-    prevent_newline_after_colon = values_float_tensor[6].item()
-
-    context_tokens_tensor, context_length_tensor = tokenize_prompts(
-        prompts=prompts, tokens_to_generate=tokens_to_generate, add_BOS=add_BOS)
-    
-    return beam_search_and_return_on_first_stage(model, context_tokens_tensor, context_length_tensor, 
-            beam_size, stop_token=stop_token, num_return_gen=num_return_gen, length_penalty=length_penalty,
-            prevent_newline_after_colon=prevent_newline_after_colon)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/beam_utils.py b/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/beam_utils.py
deleted file mode 100644
index 911a64143a86c8521abd9741df22de528a82f692..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/beam_utils.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-## from huggingface beam search
-class BeamHypotheses(object):
-    def __init__(self, num_beams, length_penalty=1.0, early_stopping=False):
-        """
-        Initialize n-best list of hypotheses.
-        """
-        self.length_penalty = length_penalty
-        self.early_stopping = early_stopping
-        self.num_beams = num_beams
-        self.beams = []
-        self.worst_score = 1e9
-
-    def __len__(self):
-        """
-        Number of hypotheses in the list.
-        """
-        return len(self.beams)
-
-    def add(self, hyp, sum_logprobs, length):
-        """
-        Add a new hypothesis to the list.
-        """
-        score = sum_logprobs / length ** self.length_penalty
-        if len(self) < self.num_beams or score > self.worst_score:
-            self.beams.append((score, hyp))
-            if len(self) > self.num_beams:
-                sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)])
-                del self.beams[sorted_scores[0][1]]
-                self.worst_score = sorted_scores[1][0]
-            else:
-                self.worst_score = min(score, self.worst_score)
-
-    def is_done(self, best_sum_logprobs, cur_len):
-        """
-        If there are enough hypotheses and that none of the hypotheses being generated
-        can become better than the worst one in the heap, then we are done with this sentence.
-        """
-
-        if len(self) < self.num_beams:
-            return False
-        elif self.early_stopping:
-            return True
-        else:
-            cur_score = best_sum_logprobs / cur_len ** self.length_penalty
-            ret = self.worst_score >= cur_score
-            return ret
-
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/communication.py b/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/communication.py
deleted file mode 100644
index ecfbb43858b1f100fe1c649067ff5eeb1c2c931b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/communication.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Communications utilities."""
-
-
-import torch
-
-from megatron_ds.core import mpu
-
-
-
-# TODO: use functions from megatron/p2p
-def recv_from_prev_pipeline_rank_(recv_buffer=None):
-    """Receive from previous pipeline stage and update the
-    input buffer inplace."""
-    if not mpu.is_pipeline_first_stage():
-        assert recv_buffer is not None
-        recv_prev_op = torch.distributed.P2POp(
-            torch.distributed.irecv, recv_buffer,
-            mpu.get_pipeline_model_parallel_prev_rank())
-        reqs = torch.distributed.batch_isend_irecv([recv_prev_op])
-        for req in reqs:
-            req.wait()
-        # To protect against race condition when using batch_isend_irecv().
-        torch.cuda.synchronize()
-
-
-
-# TODO: use functions from megatron/p2p
-def send_to_next_pipeline_rank(tensor=None):
-    """Send output to the next pipeline stage."""
-    if not mpu.is_pipeline_last_stage():
-        assert tensor is not None
-        send_next_op = torch.distributed.P2POp(
-            torch.distributed.isend, tensor,
-            mpu.get_pipeline_model_parallel_next_rank())
-        reqs = torch.distributed.batch_isend_irecv([send_next_op])
-        for req in reqs:
-            req.wait()
-        # To protect against race condition when using batch_isend_irecv().
-        torch.cuda.synchronize()
-
-
-
-def _is_cuda(tensor):
-    """Check if a tensor is not none and is cuda."""
-    assert tensor is not None
-    assert tensor.is_cuda
-
-
-
-def _is_cuda_contiguous(tensor):
-    """Check if a tensor is not none, is cuda, and is contiguous."""
-    _is_cuda(tensor)
-    assert tensor.is_contiguous()
-
-
-
-def broadcast_from_last_pipeline_stage(size, dtype, tensor=None):
-    """Broadcast a tensor from last pipeline stage to all ranks."""
-
-    is_last_stage = mpu.is_pipeline_last_stage()
-    # If first stage and last state are the same, then there is no
-    # pipeline parallelism and no need to communicate.
-    if mpu.is_pipeline_first_stage() and is_last_stage:
-        return tensor
-
-    if is_last_stage:
-        _is_cuda_contiguous(tensor)
-    else:
-        tensor = torch.empty(size,
-                             dtype=dtype,
-                             device=torch.cuda.current_device())
-    # Get the group and corresponding source rank.
-    src = mpu.get_pipeline_model_parallel_last_rank()
-    group = mpu.get_pipeline_model_parallel_group()
-    torch.distributed.broadcast(tensor, src, group)
-
-    return tensor
-
-
-
-def broadcast_from_last_to_first_pipeline_stage(size, dtype, tensor=None):
-    """Broadcast tensor values from last stage into the first stage."""
-
-    is_last_stage = mpu.is_pipeline_last_stage()
-    is_first_stage = mpu.is_pipeline_first_stage()
-    # If first stage and last state are the same, then there is no
-    # pipeline parallelism and no need to communicate.
-    if is_first_stage and is_last_stage:
-        return tensor
-    # Only first and last stage pipeline stages need to be involved.
-    if is_last_stage or is_first_stage:
-        if is_last_stage:
-            _is_cuda_contiguous(tensor)
-        else:
-            tensor = torch.empty(size,
-                                 dtype=dtype,
-                                 device=torch.cuda.current_device())
-        src = mpu.get_pipeline_model_parallel_last_rank()
-        group = mpu.get_embedding_group()
-        # Broadcast from last stage into the first stage.
-        torch.distributed.broadcast(tensor, src, group)
-    else:
-        tensor = None
-
-    return tensor
-
-
-
-def copy_from_last_to_first_pipeline_stage(size, dtype, tensor=None):
-    """Copy tensor values from last stage into the first stage.
-    Note that the input tensor is updated in place."""
-
-    is_last_stage = mpu.is_pipeline_last_stage()
-    is_first_stage = mpu.is_pipeline_first_stage()
-    # If first stage and last state are the same, then there is no
-    # pipeline parallelism and no need to communicate.
-    if is_first_stage and is_last_stage:
-        return
-    # Only first and last stage pipeline stages need to be involved.
-    if is_last_stage or is_first_stage:
-        _is_cuda(tensor)
-        is_contiguous = tensor.is_contiguous()
-        src = mpu.get_pipeline_model_parallel_last_rank()
-        group = mpu.get_embedding_group()
-        if is_contiguous:
-            tensor_ = tensor
-        else:
-            if is_last_stage:
-                tensor_ = tensor.contiguous()
-            else:
-                tensor_ = torch.empty(size,
-                                      dtype=dtype,
-                                      device=torch.cuda.current_device())
-        # Broadcast from last stage into the first stage.
-        torch.distributed.broadcast(tensor_, src, group)
-        # Update the first stage tensor
-        if is_first_stage and not is_contiguous:
-            tensor[...] = tensor_
-
-
-
-def broadcast_tensor(size, dtype, tensor=None, rank=0):
-    """ Given size and type of a tensor on all ranks and the tensor value
-        only on a specific rank, broadcast from that rank to all other ranks.
-    """
-
-    if torch.distributed.get_rank() == rank:
-        _is_cuda_contiguous(tensor)
-    else:
-        tensor = torch.empty(size,
-                             dtype=dtype,
-                             device=torch.cuda.current_device())
-
-    torch.distributed.broadcast(tensor, rank)
-
-    return tensor
-
-
-
-def broadcast_list(size, dtype, list_values=None, rank=0):
-    """Broadcast a list of values with a given type."""
-
-    tensor = None
-    if torch.distributed.get_rank() == rank:
-        tensor = torch.tensor(list_values, dtype=dtype,
-                              device=torch.cuda.current_device())
-
-    return broadcast_tensor(size, dtype, tensor=tensor, rank=rank)
-
-
-
-def broadcast_int_list(size, int_list=None, rank=0):
-    """Broadcast a list of interger values."""
-
-    return broadcast_list(size, torch.int64, list_values=int_list, rank=rank)
-
-
-
-def broadcast_float_list(size, float_list=None, rank=0):
-    """Broadcast a list of float values."""
-
-    return broadcast_list(size, torch.float32, list_values=float_list,
-                          rank=rank)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/forward_step.py b/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/forward_step.py
deleted file mode 100644
index e8590226af4fe00cec4bd5080d54e167cc2a6936..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/forward_step.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Forward step utilities."""
-
-from collections.abc import Iterable
-
-import torch
-
-from megatron_ds import get_args
-from megatron_ds.core import mpu, InferenceParams
-from .communication import (
-    send_to_next_pipeline_rank,
-    recv_from_prev_pipeline_rank_)
-
-
-class ForwardStep:
-    """Forward step function with all the communications.
-    We use a class here to hide the inference parameters
-    from the outside caller."""
-
-    def __init__(self, model, max_batch_size, max_sequence_length):
-        """Set values so we don't need to do it multiple times."""
-        # Make sure model is in eval mode.
-        assert not isinstance(model, Iterable), \
-            'interleaving schedule is not supported for inference'
-        model.eval()
-        self.model = model
-        # Initialize inference parameters.
-        self.inference_params = InferenceParams(max_batch_size,
-                                                max_sequence_length)
-        # Pipelining arguments.
-        args = get_args()
-        self.pipeline_size_larger_than_one = (
-            args.pipeline_model_parallel_size > 1)
-        # Threshold of pipelining.
-        self.pipelining_batch_x_seqlen = \
-            args.inference_batch_times_seqlen_threshold
-
-
-    def __call__(self, tokens, position_ids, attention_mask):
-        """Invocation of the forward methods. Note that self.inference_params
-        is being modified by the forward step."""
-        # Pipelining case.
-        if self.pipeline_size_larger_than_one:
-            current_batch_x_seqlen = tokens.size(0) * tokens.size(1)
-            if current_batch_x_seqlen >= self.pipelining_batch_x_seqlen:
-                micro_batch_size = \
-                    max(1, self.pipelining_batch_x_seqlen // tokens.size(1))
-                return _with_pipelining_forward_step(self.model,
-                                                     tokens,
-                                                     position_ids,
-                                                     attention_mask,
-                                                     self.inference_params,
-                                                     micro_batch_size)
-
-        return _no_pipelining_forward_step(self.model,
-                                           tokens,
-                                           position_ids,
-                                           attention_mask,
-                                           self.inference_params)
-
-
-
-def _get_recv_buffer_dtype(args):
-    """Receive happens between the layers."""
-    if args.fp32_residual_connection:
-        return torch.float
-    return args.params_dtype
-
-
-
-def _allocate_recv_buffer(batch_size, sequence_length):
-    """Receive happens between the layers with size [s, b, h]."""
-    if mpu.is_pipeline_first_stage():
-        return None
-    args = get_args()
-    recv_size = (sequence_length, batch_size, args.hidden_size)
-    return torch.empty(recv_size,
-                       dtype=_get_recv_buffer_dtype(args),
-                       device=torch.cuda.current_device())
-
-
-
-def _forward_step_helper(model, tokens, position_ids, attention_mask,
-                         inference_params, recv_buffer=None):
-    """Single forward step. Update the allocate memory flag so
-    only the first time the memory is allocated."""
-    batch_size = tokens.size(0)
-    sequence_length = tokens.size(1)
-    if recv_buffer is None:
-        recv_buffer = _allocate_recv_buffer(batch_size, sequence_length)
-
-    # Receive from previous stage.
-    recv_from_prev_pipeline_rank_(recv_buffer)
-
-    # Forward pass through the model.
-    model.set_input_tensor(recv_buffer)
-    output_tensor = model(tokens, position_ids, attention_mask,
-                          inference_params=inference_params)
-
-    # Send output to the next stage.
-    send_to_next_pipeline_rank(output_tensor)
-
-    return output_tensor
-
-
-
-def _no_pipelining_forward_step(model, tokens, position_ids, attention_mask,
-                                inference_params, recv_buffer=None):
-    """If recv_buffer is none, we will allocate one on the fly."""
-    # Run a simple forward pass.
-    output_tensor = _forward_step_helper(model, tokens, position_ids,
-                                         attention_mask, inference_params,
-                                         recv_buffer=recv_buffer)
-    # Update the sequence length offset.
-    inference_params.sequence_len_offset += tokens.size(1)
-
-    logits = None
-    if mpu.is_pipeline_last_stage():
-        logits = output_tensor
-
-    return logits
-
-
-
-def _with_pipelining_forward_step(model, tokens, position_ids, attention_mask,
-                                  inference_params, micro_batch_size):
-    """No interleaving is supported."""
-    sequence_length = tokens.size(1)
-    batch_size = tokens.size(0)
-
-    # Divide the batch dimension into micro batches.
-    num_micro_batches, last_chunk = divmod(batch_size,
-                                           micro_batch_size)
-    if last_chunk > 0:
-        num_micro_batches += 1
-
-    # Preallocate memory for output logits.
-    logits = None
-    if mpu.is_pipeline_last_stage():
-        args = get_args()
-        logits = torch.empty(
-            (batch_size, sequence_length, args.padded_vocab_size),
-            dtype=torch.float32, device=torch.cuda.current_device())
-
-    # Preallocate recv buffer.
-    recv_buffer = _allocate_recv_buffer(micro_batch_size, sequence_length)
-
-    for micro_batch_index in range(num_micro_batches):
-        # Slice among the batch dimenion.
-        start = micro_batch_index * micro_batch_size
-        end = min(start + micro_batch_size, batch_size)
-        this_micro_batch_size = end - start
-        tokens2use = tokens[start:end, ...]
-        position_ids2use = position_ids[start:end, ...]
-
-        # Run a simple forward pass.
-        if this_micro_batch_size != micro_batch_size:
-            recv_buffer = None
-        output = _forward_step_helper(model, tokens2use, position_ids2use,
-                                      attention_mask, inference_params,
-                                      recv_buffer=recv_buffer)
-
-        # Adjust the batch size offset to account for the micro-batch.
-        inference_params.batch_size_offset += this_micro_batch_size
-
-        # Copy logits.
-        if mpu.is_pipeline_last_stage():
-            logits[start:end, ...] = output
-
-    # Once we are done with all the micro-batches, we can
-    # adjust the sequence length offset.
-    inference_params.sequence_len_offset += sequence_length
-    # and reset the batch size offset
-    inference_params.batch_size_offset = 0
-
-    return logits
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/generation.py b/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/generation.py
deleted file mode 100644
index 249d1343f521ba9515b0fac01920ddc513235d7b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/generation.py
+++ /dev/null
@@ -1,428 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Generation utilities."""
-
-import torch
-import torch.nn.functional as F
-
-from megatron_ds import get_args, get_tokenizer
-from megatron_ds.core import mpu
-from megatron_ds.utils import get_ltor_masks_and_position_ids
-from .communication import (
-    copy_from_last_to_first_pipeline_stage,
-    broadcast_from_last_pipeline_stage,
-    broadcast_from_last_to_first_pipeline_stage)
-from .forward_step import ForwardStep
-from .sampling import sample
-from .beam_utils import BeamHypotheses
-
-def score_and_return_on_first_stage(model, tokens, lengths):
-    """Function for just scoring.
-    Arguments:
-        model: no interleaving is supported.
-        tokens: prompt tokens extended to be of size [b, max_prompt_length]
-        lengths: original prompt length, size: [b]
-    Note: Outside of model, other parameters only need to be available on
-          rank 0.
-    Outputs: 
-        output_log_probs: log probability of the selected tokens. size: [b, s]
-    """
-
-    args = get_args()
-
-    batch_size = tokens.size(0)
-    max_prompt_length = lengths.max().item()
-    assert max_prompt_length == tokens.size(1)
-    
-    if max_prompt_length > args.max_position_embeddings:
-        raise ValueError("Length of prompt + tokens_to_generate longer than allowed")
-    
-    if max_prompt_length * batch_size > args.max_tokens_to_oom:
-        raise ValueError("Too many tokens.  " + str(max_prompt_length*batch_size)+ " is greater than "+str(args.max_tokens_to_oom))
-
-    # forward step.
-    forward_step = ForwardStep(model, batch_size, max_prompt_length)
-
-    # ===================
-    # Pre-allocate memory
-    # ===================
-
-    # Log probability of the sequence (prompt + generated tokens).
-    output_log_probs = None
-    output_log_probs_size = (batch_size, max_prompt_length - 1)
-    
-    if mpu.is_pipeline_last_stage():
-        output_log_probs = torch.empty(output_log_probs_size,
-                                       dtype=torch.float32,
-                                       device=torch.cuda.current_device())
-    
-    # =============
-    # Run infernece
-    # =============
-    with torch.no_grad():
-        attention_mask, position_ids = _build_attention_mask_and_position_ids(tokens)
-        
-        # logits will be meanigful only in the last pipeline stage.
-        logits = forward_step(tokens, position_ids, attention_mask)
-
-        if mpu.is_pipeline_last_stage():
-            # Always the last stage should have an output.
-            assert logits is not None
-            log_probs = F.log_softmax(logits, dim=2)
-            
-            # Pick the tokens that we need to get the log
-            # probabilities for. Note that next input token is
-            # the token which we selected in the current logits,
-            # so shift by 1.
-            indices = torch.unsqueeze(tokens[:, 1:], 2)
-            output_log_probs = torch.gather(log_probs, 2, indices).squeeze(2)
-    
-    # ======================================
-    # Broadcast to the first pipeline stage.
-    # ======================================
-    output_log_probs = broadcast_from_last_to_first_pipeline_stage(
-        output_log_probs_size, torch.float32, output_log_probs)
-    
-    return tokens, lengths, output_log_probs, logits
-
-def generate_tokens_probs_and_return_on_first_stage(
-        model, tokens, lengths,
-        return_output_log_probs=False,
-        top_k=0, top_p=0.0, top_p_decay=0.0, top_p_bound=0.0,
-        temperature=1.0,
-        use_eod_token_for_early_termination=True,
-        stop_on_double_eol=False,
-        stop_on_eol=False,
-        prevent_newline_after_colon=True
-        ):
-    """Main token generation function.
-    Arguments:
-        model: no interleaving is supported.
-        tokens: prompt tokens extended to be of size [b, max-sequence-length]
-        lengths: original prompt length, size: [b]
-        return_output_log_probs: flag to calculate the log probability of
-            the generated tokens. Note that the log probability is the one
-            from the original logit.
-        top_k, top_p: top-k and top-p sampling parameters.
-            Note that top-k = 1 is gready. Also, these paramters are
-            exclusive meaning that:
-                if top-k > 0 then we expect top-p=0.
-                if top-p > 0 then we check for top-k=0.
-        temperature: sampling temperature.
-        use_eod_token_for_early_termination: if True, do early termination if
-            all the sequences have reached this token.
-        prevent_newline_after_colon: if True, it will disable generating new line \n after :
-    Note: Outside of model, other parameters only need to be available on
-          rank 0.
-    Outputs: Note that is size is adjusted to a lower value than
-             max-sequence-length if generation is terminated early.
-        tokens: prompt and generated tokens. size: [b, :]
-        generated_sequence_lengths: total length (including prompt) of
-            the generated sequence. size: [b]
-        output_log_probs: log probability of the selected tokens. size: [b, s]
-    """
-
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    batch_size = tokens.size(0)
-    min_prompt_length = lengths.min().item()
-    max_sequence_length = tokens.size(1)
-
-    if max_sequence_length > args.max_position_embeddings:
-        raise ValueError("Length of prompt + tokens_to_generate longer than allowed")
-    
-    if max_sequence_length * batch_size > args.max_tokens_to_oom:
-        raise ValueError("Too many tokens.  " + str(max_sequence_length*batch_size)+ " is greater than "+str(args.max_tokens_to_oom))
-
-    # forward step.
-    forward_step = ForwardStep(model, batch_size, max_sequence_length)
-
-    # Added termination_id to support the case that we want to terminate the
-    # generation once that id is generated.
-    if hasattr(args, 'eos_id'):
-        termination_id = args.eos_id
-    else:
-        termination_id = tokenizer.eod
-
-    # ===================
-    # Pre-allocate memory
-    # ===================
-
-    # Log probability of the sequence (prompt + generated tokens).
-    output_log_probs = None
-    output_log_probs_size = (batch_size, max_sequence_length - 1)
-    # Lengths of generated seuquence including including prompts.
-    generated_sequence_lengths = None
-    if mpu.is_pipeline_last_stage():
-        if return_output_log_probs:
-            output_log_probs = torch.empty(output_log_probs_size,
-                                           dtype=torch.float32,
-                                           device=torch.cuda.current_device())
-        generated_sequence_lengths = torch.ones(
-                batch_size, dtype=torch.int64,
-                device=torch.cuda.current_device()) * max_sequence_length
-    
-    # Whether we have reached a termination id.
-    is_generation_done = torch.zeros(batch_size, dtype=torch.uint8,
-                                     device=torch.cuda.current_device())
-
-    # =============
-    # Run infernece
-    # =============
-
-    with torch.no_grad():
-        attention_mask, position_ids = _build_attention_mask_and_position_ids(
-            tokens)
-        prev_context_length = 0
-        for context_length in range(min_prompt_length, max_sequence_length):
-
-            # Pick the slice that we need to pass through the network.
-            tokens2use = tokens[:, prev_context_length:context_length]
-            positions2use = position_ids[:, prev_context_length:context_length]
-            attention_mask2use = attention_mask[
-                ..., prev_context_length:context_length, :context_length]
-
-            # logits will be meanigful only in the last pipeline stage.
-            logits = forward_step(tokens2use, positions2use, attention_mask2use)
-
-            if mpu.is_pipeline_last_stage():
-                if prevent_newline_after_colon:
-                    logits[tokens2use[:, -1] == tokenizer.tokenize(':')[0], -1, tokenizer.tokenize('\n')[0]] = -1e10 # disable "\n" after ":"
-                # Always the last stage should have an output.
-                assert logits is not None
-
-                # Sample.
-                last_token_logits = logits[:, -1, :]
-                new_sample = sample(last_token_logits,
-                                    top_k=top_k,
-                                    top_p=top_p,
-                                    temperature=temperature,
-                                    vocab_size=tokenizer.vocab_size)
-                if top_p > 0.0 and top_p_decay > 0.0:
-                    top_p = top_p * top_p_decay
-                    if top_p_bound > 0.0:
-                        top_p = max(top_p, top_p_bound)
-
-                # If a prompt length is smaller or equal th current context
-                # length, it means we have started generating tokens
-                started = lengths <= context_length
-                # Update the tokens.
-                tokens[started, context_length] = new_sample[started]
-
-                # Calculate the log probabilities.
-                if return_output_log_probs:
-                    log_probs = F.log_softmax(logits, dim=2)
-                    if return_output_log_probs:
-                        # Pick the tokens that we need to get the log
-                        # probabilities for. Note that next input token is
-                        # the token which we selected in the current logits,
-                        # so shift by 1.
-                        indices = torch.unsqueeze(
-                            tokens[
-                                :,
-                                (prev_context_length + 1):(context_length + 1)],
-                            2)
-                        output_log_probs[:,
-                                         prev_context_length:context_length] = \
-                            torch.gather(log_probs, 2, indices).squeeze(2)
-
-            # Update the tokens on the first stage so the next input to
-            # the network is correct.
-            copy_from_last_to_first_pipeline_stage(batch_size, torch.int64,
-                                                   tokens[:, context_length])
-
-            # Update the context length for the next token generation.
-            prev_context_length = context_length
-
-            # Check if all the sequences have hit the termination_id.
-            done = None
-            if mpu.is_pipeline_last_stage():
-                # TODO(rprenger) These stopping methods are tokenizer dependent
-                # instead tokenization should be in the inference loop so stop sequences can be used
-                if stop_on_double_eol:
-                    hit_double_eol = (new_sample == 628).byte() & started.byte()
-                    hit_two_eols = (new_sample == 198).byte() & (tokens[:, context_length-1] == 198).byte() & started.byte()
-                    done_token = hit_double_eol | hit_two_eols
-                elif stop_on_eol:
-                    hit_double_eol = (new_sample == 628).byte() & started.byte()
-                    hit_eol = (new_sample == 198).byte() & started.byte()
-                    done_token = hit_double_eol | hit_eol
-                else: 
-                    done_token = (new_sample == termination_id).byte() & \
-                        started.byte()
-                
-                just_finished = (done_token & ~is_generation_done).bool()
-                generated_sequence_lengths[just_finished.view(-1)] = \
-                    context_length + 1
-                is_generation_done = is_generation_done | done_token
-                done = torch.all(is_generation_done)
-            done = broadcast_from_last_pipeline_stage(1, torch.uint8,
-                                                      tensor=done)
-            if use_eod_token_for_early_termination and done:
-                break
-            
-    # ===================================================
-    # Update the length of based on max generated length.
-    # ===================================================
-
-    tokens = tokens[:, :(context_length + 1)]
-    if mpu.is_pipeline_last_stage():
-        if return_output_log_probs:
-            output_log_probs = output_log_probs[:, :context_length]
-
-    # ======================================
-    # Broadcast to the first pipeline stage.
-    # ======================================
-
-    generated_sequence_lengths = broadcast_from_last_to_first_pipeline_stage(
-        batch_size, torch.int64, generated_sequence_lengths)
-    if return_output_log_probs:
-        output_log_probs_size = (batch_size, context_length)
-        output_log_probs = broadcast_from_last_to_first_pipeline_stage(
-            output_log_probs_size, torch.float32, output_log_probs)
-
-    return tokens, generated_sequence_lengths, output_log_probs, None
-
-def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, stop_token, num_return_gen, length_penalty, prevent_newline_after_colon=True):
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    batch_size = tokens.size(0)
-    assert(batch_size == 1)
-    prompt_length = lengths.item()
-    final_sequence_length = tokens.size(1)
-    final_sequence_length = min(final_sequence_length, args.max_position_embeddings)
-    
-    # If the context is too big, this happens
-    if prompt_length >= final_sequence_length:
-        raise ValueError("context length + tokens_to_generate too large")
-
-    # forward step.
-    forward_step = ForwardStep(model, beam_size, final_sequence_length)
-
-    beam_hyp = BeamHypotheses(beam_size, length_penalty)
-    best_batches = None
-    done = torch.zeros(1, dtype=torch.uint8, device=torch.cuda.current_device())
-    scores = torch.zeros(beam_size,
-                         dtype=torch.float32,
-                         device=torch.cuda.current_device()).unsqueeze(1)
-    scores_size_tensor, tokens_size_tensor = None, None
-    # =============
-    # Run infernece
-    # =============
-    with torch.no_grad():
-        tokens = tokens.repeat(beam_size, 1)
-        attention_mask, position_ids = _build_attention_mask_and_position_ids(tokens)
-        prev_context_length = 0
-        for context_length in range(prompt_length, final_sequence_length):
-
-            # Pick the slice that we need to pass through the network.
-            tokens2use = tokens[:, prev_context_length:context_length]
-            positions2use = position_ids[:, prev_context_length:context_length]
-            attention_mask2use = attention_mask[
-                ..., prev_context_length:context_length, :context_length]
-
-            # logits will be meanigful only in the last pipeline stage.
-            logits = forward_step(tokens2use, positions2use, attention_mask2use)
-
-            if mpu.is_pipeline_last_stage():
-                if prevent_newline_after_colon:
-                    logits[tokens2use[:, -1] == tokenizer.tokenize(':')[0], -1, tokenizer.tokenize('\n')[0]] = -1e10 # disable "\n" after ":"
-                vocab_size = logits.size(2)
-                log_probs = F.log_softmax(logits, dim=2)
-                new_scores = log_probs[:, -1, :] + scores
-
-                if context_length == prompt_length:  # if this is the first one
-                    sorted_scores, indices = torch.sort(new_scores[0,:], descending=True)
-                else:
-                    sorted_scores, indices = torch.sort(new_scores.view(-1), descending=True)
-
-                best_beam_ids = torch.div(indices[: 2 * beam_size], vocab_size).trunc().long()
-                best_words = indices[:2 * beam_size] % vocab_size
-                best_scores = sorted_scores[: 2 * beam_size]
-
-                next_beams = []
-                for beam_token_rank, (token_id, beam_score, beam_id) in enumerate(
-                    zip(best_words, best_scores, best_beam_ids)
-                ):
-                    if token_id.item() == stop_token:
-                        # if beam_token does not belong to top num_beams tokens, it should not be added
-                        is_beam_token_worse_than_top_num_beams = beam_token_rank >= beam_size
-                        if is_beam_token_worse_than_top_num_beams:
-                            continue
-                        beam_hyp.add(
-                            tokens[beam_id].clone(),
-                            beam_score,
-                            context_length + 1 - prompt_length
-                        )
-                    else:
-                        # add next predicted token since it is not eos_token
-                        next_beams.append((token_id, beam_score, beam_id))
-
-                    if len(next_beams) == beam_size:
-                        break
-
-                if beam_hyp.is_done(best_scores.max().item(), context_length + 1 - prompt_length):
-                    done = torch.ones(1, dtype=torch.uint8, device=torch.cuda.current_device())
-            
-                best_batches = tokens.new([item[2] for item in next_beams])
-                tokens = tokens[best_batches,:]
-                tokens[:, context_length] = tokens.new([item[0] for item in next_beams])
-                scores = scores.new([item[1] for item in next_beams]).unsqueeze(1)
-          
-            # torch.distributed.barrier()
-            done = broadcast_from_last_pipeline_stage(1, torch.uint8, done)
-            if done:
-                break
-
-            # Update the tokens on the first stage so the next input to
-            # the network is correct.
-            copy_from_last_to_first_pipeline_stage(tokens.size(), torch.int64,
-                                                   tokens)
-
-            # set inference key values to make it consistent with best beam index
-            best_batches = broadcast_from_last_pipeline_stage(beam_size, torch.int64, best_batches)
-            forward_step.inference_params.swap_key_value_dict(best_batches)
-
-            # Update the context length for the next token generation.
-            prev_context_length = context_length
-
-        if mpu.is_pipeline_last_stage():
-            # if cannot find stop token, add open beams to hyps
-            if not done:
-                for beam_id in range(beam_size):
-                    beam_hyp.add(tokens[beam_id].clone(), scores[beam_id].squeeze(), context_length + 1 - prompt_length)
-
-            # rank based on scores
-            sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0], reverse=True)
-            num_return_gen = min(num_return_gen, len(sorted_hyps))
-            scores = [sorted_hyps[i][0] for i in range(num_return_gen)]
-            tokens = [sorted_hyps[i][1] for i in range(num_return_gen)]
-            scores = torch.stack(scores, dim=0)
-            tokens = torch.stack(tokens, dim=0)
-            scores_size_tensor = torch.tensor(scores.shape, dtype=torch.int64, device=torch.cuda.current_device())
-            tokens_size_tensor = torch.tensor(tokens.shape, dtype=torch.int64, device=torch.cuda.current_device())
-
-        scores_size_tensor = broadcast_from_last_pipeline_stage(1, torch.int64, scores_size_tensor)
-        tokens_size_tensor = broadcast_from_last_pipeline_stage(2, torch.int64, tokens_size_tensor)
-
-        scores = broadcast_from_last_to_first_pipeline_stage(tuple(scores_size_tensor), torch.float32, scores)
-        tokens = broadcast_from_last_to_first_pipeline_stage(tuple(tokens_size_tensor), torch.int64, tokens)
-
-    return tokens, scores
-
-
-def _build_attention_mask_and_position_ids(tokens):
-    """Build the attention mask and postition ids for the input tokens."""
-
-    # Since we are not interested in loss-mask and reset attention/position
-    # is also False, eod_token is not used so it is safe to set it to None.
-    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
-        data=tokens,
-        eod_token=None,
-        reset_position_ids=False,
-        reset_attention_mask=False,
-        eod_mask_loss=False)
-
-    return attention_mask, position_ids
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/sampling.py b/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/sampling.py
deleted file mode 100644
index 370773a36c087d01e75731e38724cfb35d4acd74..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/sampling.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Sampling utilities.
-Part of this code is inspired by:
- - https://github.com/ari-holtzman/degen/blob/master/gen.py
- - https://huggingface.co/transformers/_modules/transformers/generation_logits_process.html
-"""
-
-
-import torch
-
-
-
-def modify_logits_for_top_k_filtering(logits, top_k):
-    """Set the logits for none top-k values to -inf."""
-
-    filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None]
-    logits.masked_fill_(filter_, float('-Inf'))
-
-
-
-def modify_logits_for_top_p_filtering(logits, top_p):
-    """Set the logits for none top-p values to -inf."""
-
-    # First sort and calculate cumulative sum of probabilities.
-    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
-    cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
-
-    # Filteration based on the cumulative sum.
-    filter_ = cumulative_probs > top_p
-    # This shift by 1 is weird and I cannot justify it. This existed
-    # in the original implementation:
-    #   https://github.com/ari-holtzman/degen/blob/master/gen.py
-    # and I guess it is needed so keeping it for now.
-    filter_[:, 1:] = filter_[:, :-1].clone()
-    # Make sure we at least have one token to select from.
-    filter_[..., 0] = 0
-
-    # Fill in the filtered part
-    filter_ = filter_.scatter(1, sorted_indices, filter_)
-    logits.masked_fill_(filter_, float('-Inf'))
-
-
-
-def sample(logits, top_k=0, top_p=0.0, temperature=1.0, vocab_size=None):
-    """ Sample and generate a token.
-    Note: logits has the dimension [b, v] where b is the batch size
-          and v is the vocabulary size.
-    If vocab_size is provided, we will make sure the sample that is
-    generated is in [0, vocab-size). This will avoid out of vocabulary
-    generations due to padding.
-    """
-
-    # Check logits for consistency.
-    assert logits.ndim == 2, 'expected the logits to be of [b, v] shape.'
-    assert logits.type() == 'torch.cuda.FloatTensor', \
-        'input logits should be floats.'
-
-
-    # Greedy is just simple argmax.
-    if top_k == 1:
-        assert top_p == 0.0, 'cannot set both greedy and top-p samplings.'
-        samples = torch.argmax(logits, dim=-1)
-
-    # Top-k or top-p sampling.
-    else:
-        # Clone so we do not modify the inputs,
-        logits = logits.clone()
-        # Apply temperature in place.
-        if temperature != 1.0:
-            logits.div_(temperature)
-
-        if top_k > 1:
-            assert top_p == 0.0, 'cannot set both top-k and top-p samplings.'
-            assert top_k <= logits.size(1), 'top-k is larger than logit size.'
-            if vocab_size:
-                assert top_k < vocab_size, 'top-k is larger than vocab size.'
-            modify_logits_for_top_k_filtering(logits, top_k)
-
-        elif top_p > 0.0:
-            assert top_p <= 1.0, 'top-p should be in (0, 1].'
-            modify_logits_for_top_p_filtering(logits, top_p)
-
-        # After filtering, we need to recalculate the distribution.
-        probs = logits.softmax(dim=-1)
-        samples = torch.multinomial(probs, num_samples=1).view(-1)
-
-    # If vocab size is provided, make sure the samples are in
-    # in the range [0, vocab-size).
-    if vocab_size:
-        samples = torch.clamp(samples, min=0, max=(vocab_size - 1))
-
-    return samples
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/tokenization.py b/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/tokenization.py
deleted file mode 100644
index 9a8d74d50ece29c10706254bd6e28fe782bdb785..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation/tokenization.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Tokenization utilities."""
-
-
-import torch
-
-
-from megatron_ds import get_tokenizer, get_args
-from .communication import broadcast_int_list, broadcast_tensor
-
-
-def detokenize_generations(tokens_gpu_tensor,
-                           lengths_gpu_tensor,
-                           return_segments):
-    """Detokenize the generated tokens."""
-
-    tokenizer = get_tokenizer()
-    args = get_args()
-    prompts_plus_generations = []
-    if return_segments:
-        prompts_plus_generations_segments = []
-
-    tokens = tokens_gpu_tensor.cpu().numpy().tolist()
-    lengths = lengths_gpu_tensor.cpu().numpy().tolist()
-    for sequence_tokens, length in zip(tokens, lengths):
-        sequence_tokens = sequence_tokens[:length]
-        prompts_plus_generations.append(
-            tokenizer.detokenize(sequence_tokens))
-        if return_segments:
-            words = []
-            for token in sequence_tokens:
-                if args.tokenizer_type in ['SentencePieceTokenizer', 
-                                           'GPTSentencePieceTokenizer',
-                                           'Llama2Tokenizer']:
-                    word = tokenizer.decoder[token]
-                elif args.tokenizer_type == 'NullTokenizer':
-                    word = str(token)
-                else:
-                    word = tokenizer.tokenizer.decoder[token]
-                    word = bytearray(
-                        [tokenizer.tokenizer.byte_decoder[c] for c in word]).decode(
-                            'utf-8', errors='replace')
-                words.append(word)
-            prompts_plus_generations_segments.append(words)
-
-    if return_segments:
-        return tokens, prompts_plus_generations, \
-            prompts_plus_generations_segments
-
-    return tokens, prompts_plus_generations
-
-
-def tokenize_prompts(prompts=None, tokens_to_generate=None,
-                     add_BOS=None, rank=0):
-    """Tokenize prompts and make them avaiable on all ranks."""
-
-    # On all ranks set to None so we can pass them to functions
-    sizes_list = None
-    prompts_tokens_cuda_long_tensor = None
-    prompts_length_cuda_long_tensor = None
-
-    # On the specified rank, build the above.
-    if torch.distributed.get_rank() == rank:
-        assert prompts is not None
-        assert tokens_to_generate is not None
-        # Tensor of tokens padded and their unpadded length.
-        prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor = \
-            _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS)
-        # We need the sizes of these tensors for the boradcast
-        sizes_list = [prompts_tokens_cuda_long_tensor.size(0), # Batch size
-                      prompts_tokens_cuda_long_tensor.size(1)] # Sequence lenght
-
-    # First, broadcast the sizes.
-    sizes_tensor = broadcast_int_list(2, int_list=sizes_list, rank=rank)
-
-    # Now that we have the sizes, we can boradcast the tokens
-    # and length tensors.
-    sizes = sizes_tensor.tolist()
-    prompts_tokens_cuda_long_tensor = broadcast_tensor(
-        sizes, torch.int64, tensor=prompts_tokens_cuda_long_tensor, rank=rank)
-    prompts_length_cuda_long_tensor = broadcast_tensor(
-        sizes[0], torch.int64, tensor=prompts_length_cuda_long_tensor,
-        rank=rank)
-
-    return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor
-
-
-def _tokenize_prompts_and_batch(prompts, tokens_to_generate, add_BOS):
-    """Given a set of prompts and number of tokens to generate:
-        - tokenize prompts
-        - set the sequence length to be the max of length of prompts
-          plus the number of tokens we would like to generate
-        - pad all the sequences to this length so we can convert them
-          into a 2D tensor.
-    """
-
-    # Tokenize all the prompts.
-    tokenizer = get_tokenizer()
-    if add_BOS:
-        prompts_tokens = [[tokenizer.eod] + tokenizer.tokenize(prompt)
-                          for prompt in prompts]
-    else:
-        prompts_tokens = [tokenizer.tokenize(prompt) for prompt in prompts]
-
-    # Now we have a list of list of tokens which each list has a different
-    # size. We want to extend this list to:
-    #   - incorporate the tokens that need to be generated
-    #   - make all the sequences equal length.
-    # Get the prompts length.
-    prompts_length = [len(prompt_tokens) for prompt_tokens in prompts_tokens]
-    # Get the max prompts length.
-    max_prompt_len = max(prompts_length)
-    # Number of tokens in the each sample of the batch.
-    samples_length = max_prompt_len + tokens_to_generate
-    # Now update the list of list to be of the same size: samples_length.
-    for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length):
-        padding_size = samples_length - prompt_length
-        prompt_tokens.extend([tokenizer.eod] * padding_size)
-
-    # Now we are in a structured format, we can convert to tensors.
-    prompts_tokens_tensor = torch.cuda.LongTensor(prompts_tokens)
-    prompts_length_tensor = torch.cuda.LongTensor(prompts_length)
-
-    return prompts_tokens_tensor, prompts_length_tensor
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation_server.py b/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation_server.py
deleted file mode 100644
index f242defa51e26608e62eb7825499b0500af0f04f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation_server.py
+++ /dev/null
@@ -1,241 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-import datetime
-import torch
-import json
-import threading
-from flask import Flask, request, jsonify, current_app
-from flask_restful import Resource, Api
-from megatron_ds import get_args
-from megatron_ds.text_generation import generate_and_post_process
-from megatron_ds.text_generation import beam_search_and_post_process
-
-
-GENERATE_NUM = 0
-BEAM_NUM = 1
-lock = threading.Lock()
-
-class MegatronGenerate(Resource):
-    def __init__(self, model):
-        self.model = model
-
-    @staticmethod
-    def send_do_generate():
-        choice = torch.cuda.LongTensor([GENERATE_NUM])
-        torch.distributed.broadcast(choice, 0)
-     
-    @staticmethod
-    def send_do_beam_search():
-        choice = torch.cuda.LongTensor([BEAM_NUM])
-        torch.distributed.broadcast(choice, 0)
-    
-    def put(self):
-        args = get_args()
-       
-        if not "prompts" in request.get_json():
-            return "prompts argument required", 400
-        
-        if "max_len" in request.get_json():
-            return "max_len is no longer used.  Replace with tokens_to_generate", 400
-        
-        if "sentences" in request.get_json():
-            return "sentences is no longer used.  Replace with prompts", 400
-
-        prompts = request.get_json()["prompts"]
-        if not isinstance(prompts, list):
-            return "prompts is not a list of strings", 400
-
-        if len(prompts) == 0:
-            return "prompts is empty", 400
-        
-        if len(prompts) > 128:
-            return "Maximum number of prompts is 128", 400
-        
-        tokens_to_generate = 64  # Choosing hopefully sane default.  Full sequence is slow
-        if "tokens_to_generate" in request.get_json():
-            tokens_to_generate = request.get_json()["tokens_to_generate"]
-            if not isinstance(tokens_to_generate, int):
-                return "tokens_to_generate must be an integer greater than 0"
-            if tokens_to_generate < 0:
-                return "tokens_to_generate must be an integer greater than or equal to 0"
-
-        logprobs = False
-        if "logprobs" in request.get_json():
-            logprobs = request.get_json()["logprobs"]
-            if not isinstance(logprobs, bool):
-                return "logprobs must be a boolean value"
-        
-        if tokens_to_generate == 0 and not logprobs:
-            return "tokens_to_generate=0 implies logprobs should be True"
-        
-        temperature = 1.0
-        if "temperature" in request.get_json():
-            temperature = request.get_json()["temperature"]
-            if not (type(temperature) == int or type(temperature) == float):
-                return "temperature must be a positive number less than or equal to 100.0"
-            if not (0.0 < temperature <= 100.0):
-                return "temperature must be a positive number less than or equal to 100.0"
-        
-        top_k = 0.0
-        if "top_k" in request.get_json():
-            top_k = request.get_json()["top_k"]
-            if not (type(top_k) == int):
-                return "top_k must be an integer equal to or greater than 0 and less than or equal to 1000"
-            if not (0 <= top_k <= 1000):
-                return "top_k must be equal to or greater than 0 and less than or equal to 1000"
-        
-        top_p = 0.0
-        if "top_p" in request.get_json():
-            top_p = request.get_json()["top_p"]
-            if not (type(top_p) == float):
-                return "top_p must be a positive float less than or equal to 1.0"
-            if top_p > 0.0 and top_k > 0.0:
-                return "cannot set both top-k and top-p samplings."
-            if not (0 <= top_p <= 1.0):
-                return "top_p must be less than or equal to 1.0"
-        
-        top_p_decay = 0.0
-        if "top_p_decay" in request.get_json():
-            top_p_decay = request.get_json()["top_p_decay"]
-            if not (type(top_p_decay) == float):
-                return "top_p_decay must be a positive float less than or equal to 1.0"
-            if top_p == 0.0:
-                return "top_p_decay cannot be set without top_p"
-            if not (0 <= top_p_decay <= 1.0):
-                return "top_p_decay must be less than or equal to 1.0"
-        
-        top_p_bound = 0.0
-        if "top_p_bound" in request.get_json():
-            top_p_bound = request.get_json()["top_p_bound"]
-            if not (type(top_p_bound) == float):
-                return "top_p_bound must be a positive float less than or equal to top_p"
-            if top_p == 0.0:
-                return "top_p_bound cannot be set without top_p"
-            if not (0.0 < top_p_bound <= top_p):
-                return "top_p_bound must be greater than 0 and less than top_p"
-        
-        add_BOS = False
-        if "add_BOS" in request.get_json():
-            add_BOS = request.get_json()["add_BOS"]
-            if not isinstance(add_BOS, bool):
-                return "add_BOS must be a boolean value"
-        
-        if any([len(prompt) == 0 for prompt in prompts]) and not add_BOS:
-            return "Empty prompts require add_BOS=true"
-
-        stop_on_double_eol = False
-        if "stop_on_double_eol" in request.get_json():
-            stop_on_double_eol = request.get_json()["stop_on_double_eol"]
-            if not isinstance(stop_on_double_eol, bool):
-                return "stop_on_double_eol must be a boolean value"
-        
-        stop_on_eol = False
-        if "stop_on_eol" in request.get_json():
-            stop_on_eol = request.get_json()["stop_on_eol"]
-            if not isinstance(stop_on_eol, bool):
-                return "stop_on_eol must be a boolean value"
-
-        prevent_newline_after_colon = False
-        if "prevent_newline_after_colon" in request.get_json():
-            prevent_newline_after_colon = request.get_json()["prevent_newline_after_colon"]
-            if not isinstance(prevent_newline_after_colon, bool):
-                return "prevent_newline_after_colon must be a boolean value"
-
-        random_seed = -1
-        if "random_seed" in request.get_json():
-            random_seed = request.get_json()["random_seed"]
-            if not isinstance(random_seed, int):
-                return "random_seed must be integer"
-            if random_seed < 0: 
-                return "random_seed must be a positive integer"
-
-        no_log = False
-        if "no_log" in request.get_json():
-            no_log = request.get_json()["no_log"]
-            if not isinstance(no_log, bool):
-                return "no_log must be a boolean value"
-        
-        beam_width = None
-        if "beam_width" in request.get_json():
-            beam_width = request.get_json()["beam_width"]
-            if not isinstance(beam_width, int):
-                return "beam_width must be integer"
-            if beam_width < 1:
-                return "beam_width must be an integer > 1"
-            if len(prompts) > 1:
-                return "When doing beam_search, batch size must be 1"
-
-        stop_token=50256
-        if "stop_token" in request.get_json():
-            stop_token = request.get_json()["stop_token"]
-            if not isinstance(stop_token, int):
-                return "stop_token must be an integer"
-        
-        length_penalty = 1 
-        if "length_penalty" in request.get_json():
-            length_penalty = request.get_json()["length_penalty"]
-            if not isinstance(length_penalty, float):
-                return "length_penalty must be a float"
-        
-        with lock:  # Need to get lock to keep multiple threads from hitting code
-            
-            if not no_log:
-                print("request IP: " + str(request.remote_addr))
-                print(json.dumps(request.get_json()),flush=True)
-                print("start time: ", datetime.datetime.now())
-            
-            try:
-                if beam_width is not None:
-                    MegatronGenerate.send_do_beam_search()  # Tell other ranks we're doing beam_search
-                    response, response_seg, response_scores = \
-                        beam_search_and_post_process(
-                        self.model,
-                        prompts=prompts,
-                        tokens_to_generate=tokens_to_generate,
-                        beam_size = beam_width,
-                        add_BOS=add_BOS,
-                        stop_token=stop_token,
-                        num_return_gen=beam_width,  # Returning whole beam
-                        length_penalty=length_penalty,
-                        prevent_newline_after_colon=prevent_newline_after_colon
-                        )
-                    
-                    return jsonify({"text": response,
-                        "segments": response_seg,
-                        "scores": response_scores})
-                else:
-                    MegatronGenerate.send_do_generate()  # Tell other ranks we're doing generate
-                    response, response_seg, response_logprobs, _ = \
-                        generate_and_post_process(
-                        self.model,
-                        prompts=prompts,
-                        tokens_to_generate=tokens_to_generate,
-                        return_output_log_probs=logprobs,
-                        top_k_sampling=top_k,
-                        top_p_sampling=top_p,
-                        top_p_decay=top_p_decay,
-                        top_p_bound=top_p_bound,
-                        temperature=temperature,
-                        add_BOS=add_BOS,
-                        use_eod_token_for_early_termination=True,
-                        stop_on_double_eol=stop_on_double_eol,
-                        stop_on_eol=stop_on_eol,
-                        prevent_newline_after_colon=prevent_newline_after_colon,
-                        random_seed=random_seed)
-
-                    return jsonify({"text": response,
-                        "segments": response_seg,
-                        "logprobs": response_logprobs})
-
-            except ValueError as ve:
-                return ve.args[0]
-            print("end time: ", datetime.datetime.now())
-        
-
-class MegatronServer(object):
-    def __init__(self, model):
-        self.app = Flask(__name__, static_url_path='')
-        api = Api(self.app)
-        api.add_resource(MegatronGenerate, '/api', resource_class_args=[model])
-        
-    def run(self, url, port): 
-        self.app.run(url, threaded=True, debug=False, port=port)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/timers.py b/toolbox/Megatron-DeepSpeed/megatron_ds/timers.py
deleted file mode 100644
index 90216edf0410d9c12e408a89b6a1c06f76701d1f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/timers.py
+++ /dev/null
@@ -1,309 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
-
-"""Megatron timers."""
-
-from abc import ABC
-from abc import abstractmethod
-import time
-
-import torch
-from deepspeed.accelerator import get_accelerator
-from packaging import version
-
-
-class TimerBase(ABC):
-
-    def __init__(self, name):
-        self.name = name
-
-    @abstractmethod
-    def start(self, barrier=False):
-        pass
-
-    @abstractmethod
-    def stop(self, barrier=False):
-        pass
-
-    @abstractmethod
-    def reset(self):
-        pass
-
-    @abstractmethod
-    def elapsed(self, reset=True, barrier=False):
-        pass
-
-
-
-class DummyTimer(TimerBase):
-
-    def __init__(self):
-        super().__init__('dummy timer')
-
-    def start(self, barrier=False):
-        return
-
-    def stop(self, barrier=False):
-        return
-
-    def reset(self):
-        return
-
-    def elapsed(self, reset=True, barrier=False):
-        raise Exception('dummy timer should not be used to '
-                        'calculate elapsed time')
-
-
-
-class Timer(TimerBase):
-    """
-    Comment on using `barrier`: If this flag is passed, then all
-    the caller processes will wait till all reach the timing routine.
-    It is up to the user to make sure all the ranks in `barrier_group`
-    call it otherwise, it will result in a hang.
-    Comment on `barrier_group`: By default it is set to None which
-    in torch distributed land, it will result in the global communicator.
-    """
-
-    def __init__(self, name):
-        super().__init__(name)
-        self._elapsed = 0.0
-        self._started = False
-        # Note that None will default to the global process group
-        self._barrier_group = None
-        self._start_time = time.time()
-
-
-    def set_barrier_group(self, barrier_group):
-        self._barrier_group = barrier_group
-
-
-    def start(self, barrier=False):
-        """Start the timer."""
-        assert not self._started, 'timer has already been started'
-        if barrier:
-            torch.distributed.barrier(group=self._barrier_group)
-        torch.cuda.synchronize()
-        self._start_time = time.time()
-        self._started = True
-
-
-    def stop(self, barrier=False):
-        """Stop the timer."""
-        assert self._started, 'timer is not started'
-        if barrier:
-            torch.distributed.barrier(group=self._barrier_group)
-        torch.cuda.synchronize()
-        self._elapsed += (time.time() - self._start_time)
-        self._started = False
-
-
-    def reset(self):
-        """Reset timer."""
-        self._elapsed = 0.0
-        self._started = False
-
-
-    def elapsed(self, reset=True, barrier=False):
-        """Calculate the elapsed time."""
-        _started = self._started
-        # If the timing in progress, end it first.
-        if self._started:
-            self.stop(barrier=barrier)
-        # Get the elapsed time.
-        _elapsed = self._elapsed
-        # Reset the elapsed time
-        if reset:
-            self.reset()
-        # If timing was in progress, set it back.
-        if _started:
-            self.start(barrier=barrier)
-        return _elapsed
-
-
-
-class Timers:
-    """Group of timers."""
-
-    def __init__(self, log_level, log_option):
-        self._log_level = log_level
-        self._log_option = log_option
-        self._timers = {}
-        self._log_levels = {}
-        self._dummy_timer = DummyTimer()
-        self._max_log_level = 2
-
-
-    def __call__(self, name, log_level=None):
-        # If the timer has already been set, then check if the log-level
-        # is provided, it matches the one that the timer was created with.
-        if name in self._timers:
-            if log_level is not None:
-                assert log_level == self._log_levels[name], \
-                    'input log level {} does not match already existing '\
-                    'log level {} for {} timer'.format(
-                        log_level, self._log_levels[name], name)
-            return self._timers[name]
-        # If timer does not exist and no log level is provided,
-        # set it to the max log level which is 2.
-        if log_level is None:
-            log_level = self._max_log_level
-        assert log_level <= self._max_log_level, \
-            'log level {} is larger than max supported log level {}'.format(
-                log_level, self._max_log_level)
-        # Now if the input log level is larger than the one set for
-        # the timers class, just ignore it and return a dummy timer.
-        if log_level > self._log_level:
-            return self._dummy_timer
-        # Otherwise, initalize the timer and set the level.
-        self._timers[name] = Timer(name)
-        self._log_levels[name] = log_level
-        return self._timers[name]
-
-
-    def _get_elapsed_time_all_ranks(self, names, reset, barrier):
-        """
-        Assumptions:
-            - All the ranks call this function.
-            - `names` are identical on all ranks.
-        If the above assumptions are not met, calling this function will
-        result in hang.
-        Arguments:
-            - names: list of timer names
-            - reset: reset the timer after recording the elapsed time
-            - barrier: if set, do a global barrier before time measurments
-        """
-
-        # First make sure all the callers are in sync.
-        if barrier:
-            torch.distributed.barrier()
-
-        world_size = torch.distributed.get_world_size()
-        rank = torch.distributed.get_rank()
-
-        # Here we can use gather on the rank we want to print the
-        # timing, however, there is no gather_base support in
-        # pytorch yet. It is simpler to deal with a single tensor
-        # and since we are only gathering a small amount of data,
-        # it should be ok to use all-gather instead of gather.
-        rank_name_to_time = torch.zeros((world_size, len(names)),
-                                        dtype=torch.float,
-                                        device=torch.cuda.current_device())
-        for i, name in enumerate(names):
-            if name in self._timers:
-                # Here we don't need to pass the barrier flag as all
-                # the processes are already in sync. This avoids the
-                # issue of different timers having different barrier
-                # groups inside their class.
-                rank_name_to_time[rank, i] = self._timers[name].elapsed(
-                    reset=reset)
-
-        # See the note above for why we are not using gather.
-        if version.parse(torch.__version__) >= version.parse('1.13'):
-            torch.distributed.all_gather_into_tensor(rank_name_to_time.view(-1),
-                                           rank_name_to_time[rank, :].view(-1))
-        else:
-            torch.distributed._all_gather_base(rank_name_to_time.view(-1),
-                                         rank_name_to_time[rank, :].view(-1))
-
-        return rank_name_to_time
-
-
-    def _get_global_min_max_time(self, names, reset, barrier, normalizer):
-        """Report only min and max times across all ranks."""
-
-        rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset,
-                                                             barrier)
-        name_to_min_max_time = {}
-        for i, name in enumerate(names):
-            rank_to_time = rank_name_to_time[:, i]
-            # filter out the ones we did not have any timings for
-            rank_to_time = rank_to_time[rank_to_time > 0.0]
-            # If the timer exists:
-            if rank_to_time.numel() > 0:
-                name_to_min_max_time[name] = (
-                    rank_to_time.min().item() / normalizer,
-                    rank_to_time.max().item() / normalizer)
-        return name_to_min_max_time
-
-
-    def _get_global_min_max_time_string(self, names, reset, barrier,
-                                        normalizer, max_only):
-        name_to_min_max_time = self._get_global_min_max_time(
-            names, reset, barrier, normalizer)
-        if not name_to_min_max_time:
-            return None
-        output_string = '(min, max) time across ranks (ms):'
-        for name in name_to_min_max_time:
-            min_time, max_time = name_to_min_max_time[name]
-            if max_only:
-                output_string += '\n    {}: {:.2f}'.format(
-                    (name+' ').ljust(48, '.'), max_time)
-            else:
-                output_string += '\n    {}: ({:.2f}, {:.2f})'.format(
-                    (name+' ').ljust(48, '.'), min_time, max_time)
-        return output_string
-
-
-    def _get_all_ranks_time_string(self, names, reset, barrier, normalizer):
-        """Report times across all ranks."""
-        rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset,
-                                                             barrier)
-
-        output_string = 'times across ranks (ms):'
-        no_reported_timing = True
-        for i, name in enumerate(names):
-            not_yet_found = True
-            for rank in range(torch.distributed.get_world_size()):
-                if rank_name_to_time[rank, i] > 0:
-                    no_reported_timing = False
-                    if not_yet_found:
-                        not_yet_found = False
-                        output_string += '\n  {}:'.format(name)
-                    output_string += '\n     rank {:2d}: {:.2f}'.format(
-                        rank, rank_name_to_time[rank, i] / normalizer)
-        if no_reported_timing:
-            return None
-        return output_string
-
-
-    def log(self, names, rank=None, normalizer=1.0, reset=True, barrier=False):
-        """Log a group of timers."""
-
-        # Print.
-        assert normalizer > 0.0
-        if self._log_option in ['max', 'minmax']:
-            max_only = False
-            if self._log_option == 'max':
-                max_only = True
-            output_string = self._get_global_min_max_time_string(
-                names, reset, barrier, normalizer/1000.0, max_only)
-        elif self._log_option == 'all':
-            output_string = self._get_all_ranks_time_string(names,
-                                                            reset, barrier,
-                                                            normalizer/1000.0)
-        else:
-            raise Exception('unknown timing log option {}'.format(
-                self._log_option))
-
-        # If no input rank is provided, log on last rank.
-        if rank is None:
-            rank = torch.distributed.get_world_size() - 1
-        if rank == torch.distributed.get_rank() and output_string is not None:
-            print(output_string, flush=True)
-
-
-    def write(self, names, writer, iteration, normalizer=1.0,
-              reset=False, barrier=False):
-        """Write timers to a tensorboard writer
-        Note that we only report maximum time across ranks to tensorboard.
-        """
-        # currently when using add_scalars,
-        # torch.utils.add_scalars makes each timer its own run, which
-        # polutes the runs list, so we just add each as a scalar
-        assert normalizer > 0.0
-        name_to_min_max_time = self._get_global_min_max_time(
-            names, reset, barrier, normalizer)
-        if writer is not None:
-            for name in name_to_min_max_time:
-                _, max_time = name_to_min_max_time[name]
-                writer.add_scalar(name + '-time', max_time, iteration)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/__init__.py b/toolbox/Megatron-DeepSpeed/megatronspeed/__init__.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/core/datasets/__init__.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/__init__.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/README.md b/toolbox/Megatron-DeepSpeed/megatronspeed/core/README.md
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/core/README.md
rename to toolbox/Megatron-DeepSpeed/megatronspeed/core/README.md
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/fusions/__init__.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/__init__.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/core/fusions/__init__.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/core/__init__.py
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/core/optimizer/__init__.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/optimizer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c064bba7744bf0daf6dee8a752d25ddc92c00b2
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/core/optimizer/__init__.py
@@ -0,0 +1,264 @@
+import logging
+from functools import wraps
+from typing import Callable, Dict, List, Optional
+
+import torch
+
+try:
+    from apex.optimizers import FusedAdam as Adam
+    from apex.optimizers import FusedSGD as SGD
+except ImportError:
+    import warnings
+
+    warnings.warn(
+        f'Transformer Engine and Apex are not installed. Falling back to Torch optimizers.'
+    )
+
+    ## apex's FusedAdam is a drop-in replacement for torch's AdamW
+    ## see https://github.com/NVIDIA/apex/blob/7b73b12361068a10b0f44844534613f252a5ea75/apex/optimizers/fused_adam.py#L16
+    from torch.optim import AdamW as Adam, SGD
+
+from megatron.core import mpu
+
+from megatron.training.global_vars import get_args
+from megatron.core.distributed import ParamAndGradBuffer
+from megatron.core.transformer.module import MegatronModule
+from megatron.core.utils import log_single_rank
+from megatron.core.optimizer.distrib_optimizer import DistributedOptimizer
+from megatron.core.optimizer.grad_scaler import ConstantGradScaler, DynamicGradScaler
+from megatron.core.optimizer.optimizer import (
+    ChainedOptimizer,
+    Float16OptimizerWithFloat16Params,
+    FP32Optimizer,
+    MegatronOptimizer,
+)
+from megatron.core.optimizer.optimizer_config import OptimizerConfig
+
+from megatron.core.optimizer import (
+    logger,
+    _get_param_groups,
+    _update_min_and_max_lr_in_param_groups
+)
+
+
+def get_param_groups(modules,
+                     no_weight_decay_cond,
+                     scale_lr_cond,
+                     lr_mult):
+    """creates param groups based on weight decay condition (regularized vs non regularized)
+       and learning rate scale condition (args.lr vs lr_mult * args.lr)
+       scale_lr_cond is used during finetuning where head of the network requires a scaled
+       version of the base learning rate. 
+    """
+    wd_no_scale_lr = []
+    wd_scale_lr = []
+    no_wd_no_scale_lr = []
+    no_wd_scale_lr = []
+    for module in modules:
+        for name, param in module.named_parameters():
+            if not param.requires_grad:
+                continue
+
+            if no_weight_decay_cond is not None:
+                no_wd = no_weight_decay_cond(name, param)
+            else:
+                # do not regularize biases nor Norm parameters
+                no_wd = name.endswith(".bias") or len(param.shape) == 1
+
+            if scale_lr_cond is not None:
+                scale_lr = scale_lr_cond(name, param)
+            else:
+                scale_lr = False
+
+            if not no_wd and not scale_lr:
+                wd_no_scale_lr.append(param)
+            elif not no_wd and scale_lr:
+                wd_scale_lr.append(param)
+            elif no_wd and not scale_lr:
+                no_wd_no_scale_lr.append(param)
+            else:
+                no_wd_scale_lr.append(param)
+
+    param_groups = []
+    if len(wd_no_scale_lr):
+        param_groups.append({'name': 'wd_no_scale_lr', 'params': wd_no_scale_lr, 'wd_mult': 1.0, 'lr_mult': 1.0})
+    if len(wd_scale_lr):
+        param_groups.append({'name': 'wd_scale_lr', 'params': wd_scale_lr, 'wd_mult': 1.0, 'lr_mult': lr_mult})
+    if len(no_wd_no_scale_lr):
+        param_groups.append({'name': 'no_wd_no_scale_lr', 'params': no_wd_no_scale_lr, 'wd_mult': 0.0, 'lr_mult': 1.0})
+    if len(no_wd_scale_lr):
+        param_groups.append({'name': 'no_wd_scale_lr', 'params': no_wd_scale_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult})
+
+    return param_groups
+
+def _get_param_groups_mod(
+    model_chunks: List[MegatronModule],
+    no_weight_decay_cond: Callable,
+    scale_lr_cond: Callable,
+    lr_mult: float,
+    use_decoupled_learning_rate: bool,
+) -> List[Dict]:
+    """Create parameter groups for optimizer.
+
+    Creates parameter groups based on weight decay condition (regularized vs
+    non regularized), learning rate scale condition (lr vs lr_mult * lr),
+    and whether it is expert parameters. scale_lr_cond is used during finetuning
+    where head of the network requires a scaled version of the base learning rate.
+
+    Args:
+        model_chunks (List[MegatronModule]): model chunks to create parameter
+            groups for.
+        no_weight_decay_cond (func): function to determine whether a parameter
+            should not perform weight decay.
+        scale_lr_cond (func): function to determine whether a parameter
+            should have a scaled learning rate.
+        lr_mult (float): learning rate multiplier for parameters that
+            satisfy scale_lr_cond.
+        use_decoupled_learning_rate (bool): true if using decoupled learning rate.
+
+    Returns:
+        List of parameter groups.
+    """
+
+    # Map (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr) to params.
+    params_map = {}
+    for model_chunk in model_chunks:
+        for name, param in model_chunk.named_parameters():
+            if not param.requires_grad:
+                continue
+
+            is_expert_parallel = not getattr(param, 'allreduce', True)
+
+            if no_weight_decay_cond is not None:
+                no_wd = no_weight_decay_cond(name, param)
+            else:
+                # Do not regularize biases and norm parameters.
+                no_wd = name.endswith(".bias") or len(param.shape) == 1
+
+            if scale_lr_cond is not None:
+                scale_lr = scale_lr_cond(name, param)
+            else:
+                scale_lr = False
+
+            if not no_wd and not scale_lr:
+                wd_mult, _lr_mult = 1.0, 1.0
+            elif not no_wd and scale_lr:
+                wd_mult, _lr_mult = 1.0, lr_mult
+            elif no_wd and not scale_lr:
+                wd_mult, _lr_mult = 0.0, 1.0
+            else:
+                wd_mult, _lr_mult = 0.0, lr_mult
+
+            is_decoupled_lr = False
+            # For input/embedding and output layer: embedding.word_embeddings.weight / output_layer.weight.
+            if use_decoupled_learning_rate and getattr(
+                param, 'is_embedding_or_output_parameter', False
+            ):
+                is_decoupled_lr = True
+
+            key = (wd_mult, _lr_mult, is_expert_parallel, is_decoupled_lr)
+            if key not in params_map:
+                params_map[key] = []
+            params_map[key].append(param)
+
+    param_groups = []
+    for (wd_mult, _lr_mult, is_expert_parallel, is_decoupled_lr), params in params_map.items():
+        assert len(params) > 0
+        if wd_mult == 1.0 and _lr_mult == 1.0:
+            name = 'wd_no_scale_lr'
+        elif wd_mult == 1.0 and _lr_mult == lr_mult:
+            name = 'wd_scale_lr'
+        elif wd_mult == 0.0 and _lr_mult == 1.0:
+            name = 'no_wd_no_scale_lr'
+        else:
+            name = 'no_wd_scale_lr'
+        param_groups.append(
+            {
+                'name': name,
+                'params': params,
+                'wd_mult': wd_mult,
+                'lr_mult': _lr_mult,
+                'is_expert_parallel': is_expert_parallel,
+                'is_decoupled_lr': is_decoupled_lr,
+            }
+        )
+
+    return param_groups
+
+def get_megatron_optimizer_wrapper(get_megatron_optimizer):
+    @wraps(get_megatron_optimizer)
+    def wrapper(
+        config: OptimizerConfig,
+        model_chunks: List[MegatronModule],
+        no_weight_decay_cond: Optional[Callable] = None,
+        scale_lr_cond: Optional[Callable] = None,
+        lr_mult: float = 1.0,
+        ):
+        args = get_args()
+        
+        if not args.deepspeed:
+            return get_megatron_optimizer(
+                    config,
+                    model_chunks,
+                    no_weight_decay_cond,
+                    scale_lr_cond,
+                    lr_mult)
+
+        log_single_rank(logger, logging.INFO, f'Setting up optimizer with config {config}')
+
+        args = get_args()
+
+        # Base optimizer.
+        param_groups = _get_param_groups(
+            model_chunks,
+            no_weight_decay_cond,
+            scale_lr_cond,
+            lr_mult,
+            use_decoupled_learning_rate=config.decoupled_lr is not None,
+        )
+        param_groups = _update_min_and_max_lr_in_param_groups(
+            param_groups,
+            lr=config.lr,
+            min_lr=config.min_lr,
+            decoupled_lr=config.decoupled_lr,
+            decoupled_min_lr=config.decoupled_min_lr,
+        )
+        if args.create_moe_param_group:
+            from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer
+            param_groups = split_params_into_different_moe_groups_for_optimizer(param_groups)
+
+        if args.cpu_optimizer:
+            assert args.optimizer == 'adam', 'CPU offloading is for Adam'
+            if args.cpu_torch_adam:
+                cpu_adam_optimizer = torch.optim.AdamW
+            else:
+                from deepspeed.ops.adam import DeepSpeedCPUAdam
+                cpu_adam_optimizer = DeepSpeedCPUAdam
+            optimizer = cpu_adam_optimizer(param_groups,
+                                        lr=args.lr,
+                                        weight_decay=args.weight_decay,
+                                        betas=(args.adam_beta1, args.adam_beta2),
+                                        eps=args.adam_eps)
+        else:
+            if args.optimizer == 'adam':
+                if args.ds_fused_adam:
+                    global Adam
+                    from deepspeed.ops.adam import FusedAdam
+                    Adam = FusedAdam
+                optimizer = Adam(param_groups,
+                                lr=args.lr,
+                                weight_decay=args.weight_decay,
+                                betas=(args.adam_beta1, args.adam_beta2),
+                                eps=args.adam_eps)
+            elif args.optimizer == 'sgd':
+                optimizer = SGD(param_groups,
+                                lr=args.lr,
+                                weight_decay=args.weight_decay,
+                                momentum=args.sgd_momentum)
+            else:
+                raise Exception('{} optimizer is not supported.'.format(
+                args.optimizer))
+
+        if args.deepspeed:
+            return optimizer
+    return wrapper
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/package_info.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/package_info.py
similarity index 98%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/core/package_info.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/core/package_info.py
index 9cea5cbc1ce23c48a9af0f1beff8f923b8b074c7..5be292b42215054481fa822af7bab36b6073b604 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/package_info.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/core/package_info.py
@@ -2,8 +2,8 @@
 
 
 MAJOR = 2
-MINOR = 4
-PATCH = 1
+MINOR = 5
+PATCH = 0
 PRE_RELEASE = 'rc0'
 
 # Use the following formatting: (major, minor, patch, pre-release)
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/core/parallel_state.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/parallel_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..b30984e7cae896ba3ee50084d4be61cb7589e442
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/core/parallel_state.py
@@ -0,0 +1,502 @@
+"""Model and data parallel groups."""
+
+import os
+from functools import wraps
+import warnings
+from datetime import timedelta
+from typing import List, Optional
+
+import torch
+
+from megatron.training.global_vars import get_args
+
+import megatron.core.parallel_state as ps
+# from megatron.core.parallel_state import (
+#     _TENSOR_MODEL_PARALLEL_GROUP,
+#     _PIPELINE_MODEL_PARALLEL_GROUP,
+#     _MODEL_PARALLEL_GROUP,
+#     _MODEL_AND_EXPERT_PARALLEL_GROUP,
+#     _EMBEDDING_GROUP,
+#     _EMBEDDING_AR_GROUP,
+#     _POSITION_EMBEDDING_GROUP,
+#     _DATA_PARALLEL_GROUP,
+#     _DATA_PARALLEL_GROUP_GLOO,
+#     _TENSOR_AND_DATA_PARALLEL_GROUP,
+#     _EXPERT_MODEL_PARALLEL_GROUP,
+#     _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK,
+#     _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE,
+#     _PIPELINE_MODEL_PARALLEL_SPLIT_RANK,
+#     _EMBEDDING_GLOBAL_RANKS,
+#     _POSITION_EMBEDDING_GLOBAL_RANKS,
+#     _PIPELINE_GLOBAL_RANKS,
+#     _TENSOR_GLOBAL_RANKS,
+#     _DATA_PARALLEL_GLOBAL_RANKS,
+#     _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS,
+#     _DATA_PARALLEL_DEVICE_GROUP,
+# )
+from megatron.core.parallel_state import (
+    get_nccl_options,
+    RankGenerator,
+    get_data_parallel_group,
+    get_context_parallel_world_size,
+    _set_global_memory_buffer,
+    get_pipeline_model_parallel_world_size,
+    get_tensor_model_parallel_world_size,
+    get_tensor_model_parallel_rank,
+)
+
+from megatronspeed.megatron_adaptor import get_megatronspeed_args
+
+# For DeepSpeed's sequence parallel
+_SEQUENCE_PARALLEL_GROUP = None
+_SEQUENCE_PARALLEL_WORLD_SIZE = None
+_SEQUENCE_PARALLEL_RANK = None
+
+# This group includes processes for both data and sequence parallelisms.
+# We use this group to reduce gradients and shard parameters and optimizer stages for ZeRO.
+_SEQUENCE_DATA_PARALLEL_GROUP = None
+_SEQUENCE_DATA_PARALLEL_WORLD_SIZE = None
+_SEQUENCE_DATA_PARALLEL_RANK = None
+
+def initialize_model_parallel_wrapper(initialize_model_parallel):
+    @wraps(initialize_model_parallel)
+    def wrapper(
+        tensor_model_parallel_size: int = 1,
+        pipeline_model_parallel_size: int = 1,
+        virtual_pipeline_model_parallel_size: Optional[int] = None,
+        pipeline_model_parallel_split_rank: Optional[int] = None,
+        sequence_parallel_size: int = 1,
+        use_sharp: bool = False,
+        context_parallel_size: int = 1,
+        expert_model_parallel_size: int = 1,
+        nccl_communicator_config_path: Optional[str] = None,
+        distributed_timeout_minutes: int = 30,
+        order: str = "tp-cp-ep-dp-pp",
+    ):
+        print("megatrospeed initialize_model_parallel_wrapper.")
+        try:
+            args = get_args()
+        except AssertionError:
+            args = get_megatronspeed_args()
+
+        # import pdb
+        # if torch.distributed.get_rank() == 0:
+        #     pdb.set_trace()
+
+        # if not args.deepspeed:
+        #     initialize_model_parallel(
+        #         tensor_model_parallel_size,
+        #         pipeline_model_parallel_size,
+        #         virtual_pipeline_model_parallel_size,
+        #         pipeline_model_parallel_split_rank,
+        #         use_sharp,
+        #         context_parallel_size,
+        #         expert_model_parallel_size,
+        #         nccl_communicator_config_path,
+        #         distributed_timeout_minutes,
+        #         order,
+        #     )
+        #     return
+        
+        print(f"tensor_model_parallel_size: {tensor_model_parallel_size}, pipeline_model_parallel_size: {pipeline_model_parallel_size}")
+
+        if sequence_parallel_size > 1:
+            assert args.context_parallel_size <= 1, "Megatron-lm CP is not compatible with Deppspeed SP"
+
+        # Get world size and rank. Ensure some consistencies.
+        assert torch.distributed.is_initialized()
+        world_size: int = torch.distributed.get_world_size()
+
+        if world_size % (tensor_model_parallel_size * pipeline_model_parallel_size) != 0:
+            raise RuntimeError(
+                f"world_size ({world_size}) is not divisible by tensor_model_parallel_size "
+                f"({tensor_model_parallel_size}) x pipeline_model_parallel_size ({pipeline_model_parallel_size})"
+            )
+
+        enable_ds_sequence_parallel = sequence_parallel_size > 1
+        if enable_ds_sequence_parallel:
+            assert tensor_model_parallel_size == 1 and pipeline_model_parallel_size == 1, \
+            'DeepSpeed\'s sequence parallel does not work with tensor parallel or pipeline parallel'
+
+            if world_size % sequence_parallel_size != 0:
+                raise RuntimeError(
+                    f"world_size ({world_size}) is not divisible by sequence_parallel_size {sequence_parallel_size})"
+                )
+
+        data_parallel_size: int = world_size // (tensor_model_parallel_size * pipeline_model_parallel_size * sequence_parallel_size)
+        sequence_data_parallel_size: int = sequence_parallel_size * data_parallel_size
+
+        num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
+        num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
+        num_data_parallel_groups: int = world_size // data_parallel_size
+        num_sequence_parallel_groups: int = world_size // sequence_parallel_size
+        num_sequence_data_parallel_groups: int = world_size // sequence_parallel_size // data_parallel_size
+
+        if virtual_pipeline_model_parallel_size is not None:
+            if not pipeline_model_parallel_size > 2:
+                raise RuntimeError("pipeline-model-parallel size should be greater than 2 with " "interleaved schedule")
+            ps._VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
+            ps._VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size
+
+        if pipeline_model_parallel_split_rank is not None:
+            ps._PIPELINE_MODEL_PARALLEL_SPLIT_RANK = pipeline_model_parallel_split_rank
+
+        rank = torch.distributed.get_rank()
+
+        nccl_comm_cfgs = {}
+        if nccl_communicator_config_path is not None:
+            try:
+                import yaml
+            except ImportError:
+                raise RuntimeError(
+                    "Cannot import `yaml`. Setting custom nccl communicator configs "
+                    "requires the yaml package."
+                )
+
+            with open(nccl_communicator_config_path, "r") as stream:
+                nccl_comm_cfgs = yaml.safe_load(stream)
+
+        rank_generator = RankGenerator(
+            tp=tensor_model_parallel_size,
+            ep=expert_model_parallel_size,
+            dp=data_parallel_size,
+            pp=pipeline_model_parallel_size,
+            cp=context_parallel_size,
+            order=order,
+        )
+        timeout = timedelta(minutes=distributed_timeout_minutes)
+
+        # Build the data-parallel groups.
+        assert ps._DATA_PARALLEL_GROUP is None, 'data parallel group is already initialized'
+        all_data_parallel_group_ranks = []
+        for i in range(pipeline_model_parallel_size):
+            start_rank = i * num_pipeline_model_parallel_groups
+            end_rank = (i + 1) * num_pipeline_model_parallel_groups
+
+            if sequence_parallel_size > 1:
+                tp_or_sp_size = sequence_parallel_size
+            else:
+                tp_or_sp_size = tensor_model_parallel_size
+
+            for j in range(tp_or_sp_size):
+                ranks = range(start_rank + j, end_rank, tp_or_sp_size)
+                all_data_parallel_group_ranks.append(list(ranks))
+                group = torch.distributed.new_group(ranks)
+                if getattr(args, "use_distributed_optimizer", None):
+                    group_gloo = torch.distributed.new_group(ranks, backend="gloo")
+                else:
+                    group_gloo = None
+                if rank in ranks:
+                    ps._DATA_PARALLEL_GROUP = group
+                    ps._DATA_PARALLEL_GROUP_GLOO = group_gloo
+                    ps._DATA_PARALLEL_GLOBAL_RANKS = ranks
+        for ranks_with_cp in rank_generator.get_ranks('dp-cp'):
+            group_with_cp = torch.distributed.new_group(
+                ranks_with_cp, timeout=timeout, pg_options=get_nccl_options('dp_cp', nccl_comm_cfgs)
+            )
+            group_with_cp_gloo = torch.distributed.new_group(
+                ranks_with_cp, timeout=timeout, backend="gloo"
+            )
+            if rank in ranks_with_cp:
+                ps._DATA_PARALLEL_GROUP_WITH_CP = group_with_cp
+                ps._DATA_PARALLEL_GROUP_WITH_CP_GLOO = group_with_cp_gloo
+                ps._DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = ranks_with_cp
+
+        # Build the sequence parallel groups.
+        global _SEQUENCE_PARALLEL_GROUP
+        assert _SEQUENCE_PARALLEL_GROUP is None, \
+            'sequence parallel group is already initialized'
+        for i in range(num_sequence_parallel_groups):
+            ranks = range(i * sequence_parallel_size,
+                        (i + 1) * sequence_parallel_size)
+            group = torch.distributed.new_group(ranks)
+            if rank in ranks:
+                _SEQUENCE_PARALLEL_GROUP = group
+
+        # Build the sequence data parallel groups.
+        global _SEQUENCE_DATA_PARALLEL_GROUP
+        assert _SEQUENCE_DATA_PARALLEL_GROUP is None, \
+            'sequence data parallel group is already initialized'
+        all_data_sequence_parallel_group_ranks = []
+        if enable_ds_sequence_parallel:
+            for i in range(num_sequence_data_parallel_groups):
+                ranks = range(i * sequence_data_parallel_size,
+                            (i + 1) * sequence_data_parallel_size)
+                group = torch.distributed.new_group(ranks)
+                all_data_sequence_parallel_group_ranks.append(list(ranks))
+                if rank in ranks:
+                    _SEQUENCE_DATA_PARALLEL_GROUP = group
+        else:
+            _SEQUENCE_DATA_PARALLEL_GROUP = ps._DATA_PARALLEL_GROUP
+
+        # Build the context-parallel groups.
+        assert ps._CONTEXT_PARALLEL_GROUP is None, 'context parallel group is already initialized'
+        for ranks in rank_generator.get_ranks('cp'):
+            group = torch.distributed.new_group(
+                ranks, timeout=timeout, pg_options=get_nccl_options('cp', nccl_comm_cfgs)
+            )
+            if rank in ranks:
+                ps._CONTEXT_PARALLEL_GROUP = group
+                ps._CONTEXT_PARALLEL_GLOBAL_RANKS = ranks
+
+        # Build the model-parallel groups.
+        assert ps._MODEL_PARALLEL_GROUP is None, 'model parallel group is already initialized'
+        num_model_parallel_groups = sequence_data_parallel_size if enable_ds_sequence_parallel else data_parallel_size
+        model_parallel_group_ranks = all_data_sequence_parallel_group_ranks if enable_ds_sequence_parallel else all_data_parallel_group_ranks
+        for i in range(num_model_parallel_groups):
+            ranks = [parallel_group_ranks[i] for parallel_group_ranks in model_parallel_group_ranks]
+            group = torch.distributed.new_group(ranks)
+            if rank in ranks:
+                ps._MODEL_PARALLEL_GROUP = group
+
+        # Build the model-parallel groups with expert parallel
+        assert (
+            ps._MODEL_AND_EXPERT_PARALLEL_GROUP is None
+        ), 'model and expert parallel group is already initialized'
+        for ranks in rank_generator.get_ranks('tp-ep-pp', independent_ep=True):
+            group = torch.distributed.new_group(
+                ranks, timeout=timeout, pg_options=get_nccl_options('mp_exp', nccl_comm_cfgs)
+            )
+            if rank in ranks:
+                ps._MODEL_AND_EXPERT_PARALLEL_GROUP = group
+
+        # Build the tensor model-parallel groups.
+        assert ps._TENSOR_MODEL_PARALLEL_GROUP is None, 'tensor model parallel group is already initialized'
+        for i in range(num_tensor_model_parallel_groups):
+            ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
+            group = torch.distributed.new_group(ranks)
+            if rank in ranks:
+                ps._TENSOR_MODEL_PARALLEL_GROUP = group
+                ps._TENSOR_MODEL_PARALLEL_GLOBAL_RANKS = ranks
+
+
+        # Build the pipeline model-parallel groups and embedding groups
+        # (first and last rank in each pipeline model-parallel group).
+        assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, 'pipeline model parallel group is already initialized'
+        assert ps._EMBEDDING_GROUP is None, 'embedding group is already initialized'
+        assert ps._POSITION_EMBEDDING_GROUP is None, 'position embedding group is already initialized'
+        for i in range(num_pipeline_model_parallel_groups):
+            ranks = range(i, world_size, num_pipeline_model_parallel_groups)
+            group = torch.distributed.new_group(ranks)
+            if rank in ranks:
+                torch.distributed.barrier(group=group, device_ids=[torch.cuda.current_device(),])
+                ps._PIPELINE_MODEL_PARALLEL_GROUP = group
+                ps._PIPELINE_GLOBAL_RANKS = ranks
+            # Setup embedding group (to exchange gradients between
+            # first and last stages).
+            if len(ranks) > 1:
+                embedding_ranks = [ranks[0], ranks[-1]]
+                position_embedding_ranks = [ranks[0]]
+                if pipeline_model_parallel_split_rank is not None:
+                    if ranks[pipeline_model_parallel_split_rank] not in embedding_ranks:
+                        embedding_ranks = [ranks[0], ranks[pipeline_model_parallel_split_rank], ranks[-1]]
+                    if ranks[pipeline_model_parallel_split_rank] not in position_embedding_ranks:
+                        position_embedding_ranks = [ranks[0], ranks[pipeline_model_parallel_split_rank]]
+            else:
+                embedding_ranks = ranks
+                position_embedding_ranks = ranks
+
+            group = torch.distributed.new_group(embedding_ranks)
+            if rank in embedding_ranks:
+                ps._EMBEDDING_GROUP = group
+            if rank in ranks:
+                ps._EMBEDDING_GLOBAL_RANKS = embedding_ranks
+
+            group = torch.distributed.new_group(position_embedding_ranks)
+            if rank in position_embedding_ranks:
+                ps._POSITION_EMBEDDING_GROUP = group
+            if rank in ranks:
+                ps._POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks
+
+        # Build the tensor + data parallel groups.
+        assert (
+            ps._TENSOR_AND_DATA_PARALLEL_GROUP is None
+        ), 'Tensor + data parallel group is already initialized'
+        for ranks in rank_generator.get_ranks('tp-dp-cp'):
+            group = torch.distributed.new_group(
+                ranks, timeout=timeout, pg_options=get_nccl_options('tp_dp_cp', nccl_comm_cfgs)
+            )
+            if rank in ranks:
+                ps._TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = group
+        for ranks in rank_generator.get_ranks('tp-dp'):
+            group = torch.distributed.new_group(
+                ranks, timeout=timeout, pg_options=get_nccl_options('tp_dp', nccl_comm_cfgs)
+            )
+            if rank in ranks:
+                ps._TENSOR_AND_DATA_PARALLEL_GROUP = group
+
+        assert (
+            ps._TENSOR_AND_CONTEXT_PARALLEL_GROUP is None
+        ), 'Tensor + context parallel group is already initialized'
+        for ranks in rank_generator.get_ranks('tp-cp'):
+            group = torch.distributed.new_group(
+                ranks, timeout=timeout, pg_options=get_nccl_options('tp_cp', nccl_comm_cfgs)
+            )
+            if rank in ranks:
+                ps._TENSOR_AND_CONTEXT_PARALLEL_GROUP = group
+
+        # Build the tensor + expert parallel groups
+        assert ps._EXPERT_MODEL_PARALLEL_GROUP is None, 'Expert parallel group is already initialized'
+        assert (
+            ps._TENSOR_AND_EXPERT_PARALLEL_GROUP is None
+        ), 'Tensor + expert parallel group is already initialized'
+        assert (
+            ps._DATA_MODULO_EXPERT_PARALLEL_GROUP is None
+        ), 'Data modulo expert group is already initialized'
+        assert (
+            ps._DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP is None
+        ), 'Data modulo expert group with context parallel is already initialized'
+
+        for ranks in rank_generator.get_ranks('tp-ep', independent_ep=True):
+            group = torch.distributed.new_group(
+                ranks, timeout=timeout, pg_options=get_nccl_options('tp_exp', nccl_comm_cfgs)
+            )
+            if rank in ranks:
+                ps._TENSOR_AND_EXPERT_PARALLEL_GROUP = group
+
+        for ranks in rank_generator.get_ranks('ep', independent_ep=True):
+            group = torch.distributed.new_group(
+                ranks, pg_options=get_nccl_options('exp', nccl_comm_cfgs)
+            )
+            if rank in ranks:
+                ps._EXPERT_MODEL_PARALLEL_GROUP = group
+
+        for ranks in rank_generator.get_ranks('dp', independent_ep=True):
+            group = torch.distributed.new_group(
+                ranks, timeout=timeout, pg_options=get_nccl_options('dp_modulo_exp', nccl_comm_cfgs)
+            )
+            group_gloo = torch.distributed.new_group(ranks, backend="gloo")
+            if rank in ranks:
+                ps._DATA_MODULO_EXPERT_PARALLEL_GROUP = group
+                ps._DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO = group_gloo
+
+        for ranks in rank_generator.get_ranks('dp-cp', independent_ep=True):
+            # Lazy initialization of the group
+            if ps._CONTEXT_PARALLEL_GROUP and get_context_parallel_world_size() > 1:
+                group = torch.distributed.new_group(
+                    ranks,
+                    timeout=timeout,
+                    pg_options=get_nccl_options('dp_modulo_exp_cp', nccl_comm_cfgs),
+                )
+                group_gloo = torch.distributed.new_group(ranks, backend="gloo")
+            else:
+                group = ps._DATA_MODULO_EXPERT_PARALLEL_GROUP
+                group_gloo = ps._DATA_MODULO_EXPERT_PARALLEL_GROUP_GLOO
+            if rank in ranks:
+                ps._DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP = group
+                ps._DATA_MODULO_EXPERT_PARALLEL_GROUP_WITH_CP_GLOO = group_gloo
+
+        # Initialize global memory buffer
+        # This isn't really "parallel state" but there isn't another good place to
+        # put this. If we end up with a more generic initialization of megatron-core
+        # we could stick it there
+        _set_global_memory_buffer()
+
+        print(f"_PIPELINE_MODEL_PARALLEL_GROUP: {ps._PIPELINE_MODEL_PARALLEL_GROUP}")
+
+    return wrapper
+
+def sequence_parallel_is_initialized():
+    """Check if sequence and data parallel groups are initialized."""
+    if _SEQUENCE_PARALLEL_GROUP is None or \
+        ps._DATA_PARALLEL_GROUP is None:
+        return False
+    return True
+
+def sequence_data_parallel_is_initialized():
+    """Check if sequence data parallel groups are initialized."""
+    if _SEQUENCE_DATA_PARALLEL_GROUP is None:
+        return False
+    return True
+
+def get_sequence_parallel_group():
+    """Get the sequence parallel group the caller rank belongs to."""
+    assert _SEQUENCE_PARALLEL_GROUP is not None, \
+        'sequence parallel group is not initialized'
+    return _SEQUENCE_PARALLEL_GROUP
+
+
+def get_sequence_data_parallel_group():
+    """Get the sequence parallel group the caller rank belongs to."""
+    assert _SEQUENCE_DATA_PARALLEL_GROUP is not None, \
+        'sequence data parallel group is not initialized'
+    return _SEQUENCE_DATA_PARALLEL_GROUP
+
+def set_sequence_parallel_world_size(world_size):
+    """Set the sequence  parallel size"""
+    global _SEQUENCE_PARALLEL_WORLD_SIZE
+    _SEQUENCE_PARALLEL_WORLD_SIZE = world_size
+
+def set_sequence_data_parallel_world_size(world_size):
+    """Set the sequence  parallel size"""
+    global _SEQUENCE_DATA_PARALLEL_WORLD_SIZE
+    _SEQUENCE_DATA_PARALLEL_WORLD_SIZE = world_size
+
+def get_model_parallel_world_size():
+    assert get_pipeline_model_parallel_world_size() == 1, "legacy get_model_parallel_world_size is only supported if PP is disabled"
+    return get_tensor_model_parallel_world_size()
+
+def get_sequence_parallel_world_size():
+    """Return world size for the sequence parallel group."""
+    global _SEQUENCE_PARALLEL_WORLD_SIZE
+    if _SEQUENCE_PARALLEL_WORLD_SIZE is not None:
+        return _SEQUENCE_PARALLEL_WORLD_SIZE
+    return torch.distributed.get_world_size(group=get_sequence_parallel_group())
+
+def get_sequence_data_parallel_world_size():
+    """Return world size for the sequence parallel group."""
+    global _SEQUENCE_DATA_PARALLEL_WORLD_SIZE
+    if _SEQUENCE_DATA_PARALLEL_WORLD_SIZE is not None:
+        return _SEQUENCE_DATA_PARALLEL_WORLD_SIZE
+    return torch.distributed.get_world_size(group=get_sequence_data_parallel_group())
+
+def get_model_parallel_rank():
+    assert get_pipeline_model_parallel_world_size() == 1, "legacy get_model_parallel_rank is only supported if PP is disabled"
+    return get_tensor_model_parallel_rank()
+
+
+def set_sequence_parallel_rank(rank):
+    """Set sequence parallel rank."""
+    global _SEQUENCE_PARALLEL_RANK
+    _SEQUENCE_PARALLEL_RANK = rank
+
+
+def set_sequence_data_parallel_rank(rank):
+    """Set sequence parallel rank."""
+    global _SEQUENCE_DATA_PARALLEL_RANK
+    _SEQUENCE_DATA_PARALLEL_RANK = rank
+
+
+def get_sequence_parallel_rank():
+    """Return my rank for the sequence parallel group."""
+    global _SEQUENCE_PARALLEL_RANK
+    if _SEQUENCE_PARALLEL_RANK is not None:
+        return _SEQUENCE_PARALLEL_RANK
+    return torch.distributed.get_rank(group=get_sequence_parallel_group())
+
+
+def get_sequence_data_parallel_rank():
+    """Return my rank for the sequence data parallel group."""
+    global _SEQUENCE_DATA_PARALLEL_RANK
+    if _SEQUENCE_DATA_PARALLEL_RANK is not None:
+        return _SEQUENCE_DATA_PARALLEL_RANK
+    return torch.distributed.get_rank(group=get_sequence_data_parallel_group())
+
+
+def get_sequence_parallel_src_rank():
+    """Calculate the global rank corresponding to the first local rank
+    in the sequence parallel group."""
+    global_rank = torch.distributed.get_rank()
+    local_world_size = get_sequence_parallel_world_size()
+    return (global_rank // local_world_size) * local_world_size
+
+def destroy_model_parallel_wrapper(destroy_model_parallel):
+    @wraps(destroy_model_parallel)
+    def wrapper():
+        destroy_model_parallel()
+        global _SEQUENCE_PARALLEL_GROUP
+        _SEQUENCE_PARALLEL_GROUP = None
+        global _SEQUENCE_DATA_PARALLEL_GROUP
+        _SEQUENCE_DATA_PARALLEL_GROUP = None
+
+    return wrapper
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/__init__.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/pipeline_parallel/__init__.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/core/models/__init__.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/core/pipeline_parallel/__init__.py
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/core/pipeline_parallel/deepspeed_zbh1_engine.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/pipeline_parallel/deepspeed_zbh1_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2e6697eb7a703c017578877a45300219f2437dd
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/core/pipeline_parallel/deepspeed_zbh1_engine.py
@@ -0,0 +1,110 @@
+from deepspeed.runtime.pipe.engine import PipelineEngine
+from deepspeed.utils.timer import BACKWARD_MICRO_TIMER, \
+    BACKWARD_GLOBAL_TIMER, BACKWARD_INNER_MICRO_TIMER, BACKWARD_INNER_GLOBAL_TIMER
+from deepspeed.runtime.utils import PartitionedTensor
+from deepspeed.accelerator import get_accelerator
+
+import torch
+from torch.cuda.amp import custom_bwd
+from packaging import version
+
+
+from megatron.core.parallel_state import (
+    get_tensor_model_parallel_world_size,
+    get_tensor_model_parallel_group,
+    get_global_memory_buffer,
+)
+
+from megatronspeed.core.tensor_parallel.weight_grad_store import WeightGradStore
+
+def _exec_backward_only_pass(self, buffer_id):
+    assert self.optimizer is not None, "must provide optimizer during " \
+                                        "init in order to use backward"
+
+    self.mem_status('BEFORE BWD ONLY', reset_max=True)
+    from megatron.core.tensor_parallel.layers import LinearWithGradAccumulationAndAsyncCommunication
+    WeightGradStore.set_combine_bw(False)
+    # The last stage just runs backward on the loss using DeepSpeed's typical
+    # mechanisms.
+    if self.is_last_stage():
+        super(PipelineEngine, self).backward(self.loss)
+        WeightGradStore.flush()
+        self.mem_status('AFTER BWD ONLY')
+
+        WeightGradStore.set_combine_bw(True)
+        return
+
+    outputs = self.pipe_buffers['outputs'][buffer_id]
+
+    if self.wall_clock_breakdown():
+        self.timers(BACKWARD_MICRO_TIMER).start()
+        self.timers(BACKWARD_GLOBAL_TIMER).start()
+        self.timers(BACKWARD_INNER_MICRO_TIMER).start()
+        self.timers(BACKWARD_INNER_GLOBAL_TIMER).start()
+
+    # Reconstruct if we previously partitioned the output. We must be
+    # careful to also restore the computational graph of the tensors we partitioned.
+    if self.is_pipe_partitioned:
+        if self.is_grad_partitioned:
+            if self.pipe_partition_output_meta_cache is None:
+                self.pipe_partition_output_meta_cache = outputs[0].to('cpu')
+            part_output = PartitionedTensor.from_meta(meta=self.pipe_partition_output_meta_cache,
+                                                        local_part=outputs[1],
+                                                        group=self.grid.get_slice_parallel_group())
+            self.pipe_buffers['output_tensors'][buffer_id].data = part_output.full()
+            outputs = (self.pipe_buffers['output_tensors'][buffer_id], *outputs[2:])
+        else:
+            # Already restored from partition
+            self.pipe_buffers['output_tensors'][buffer_id].data = outputs[0]
+            outputs = (self.pipe_buffers['output_tensors'][buffer_id], *outputs[1:])
+
+    grad_tensors = self.grad_layer
+    if self.is_grad_partitioned:
+        if self.grad_partition_grad_layer_meta_cache is None:
+            self.grad_partition_grad_layer_meta_cache = self.grad_layer[0].to('cpu')
+        part_grad = PartitionedTensor.from_meta(meta=self.grad_partition_grad_layer_meta_cache,
+                                                local_part=self.grad_layer[1],
+                                                group=self.grid.get_slice_parallel_group())
+        grad_tensors = (part_grad.full(), *grad_tensors[2:])
+        part_grad = None
+
+    if self.using_bf16_optimizer and not self.is_last_stage():
+        # manually call because we don't call optimizer.backward()
+        self.optimizer.clear_lp_grads()
+
+    # This handles either a single tensor or tuple of tensors.
+    
+    if isinstance(outputs, tuple):
+        out_tensors = [t for t in outputs if t.is_floating_point()]
+        assert len(out_tensors) == len(grad_tensors)
+        torch.autograd.backward(tensors=out_tensors, grad_tensors=grad_tensors)
+    else:
+        torch.autograd.backward(tensors=(outputs, ), grad_tensors=(grad_tensors, ))
+    
+
+    WeightGradStore.flush()
+
+    if self.using_bf16_optimizer and not self.is_last_stage():
+        # manually call because we don't call optimizer.backward()
+        self.optimizer.update_hp_grads(clear_lp_grads=False)
+
+    # Free up the memory from the output of forward()
+    self.pipe_buffers['output_tensors'][buffer_id] = None
+    self.pipe_buffers['outputs'][buffer_id] = None
+    grad_tensors = None
+    
+    WeightGradStore.set_combine_bw(True)
+
+    if self.wall_clock_breakdown():
+        self.timers(BACKWARD_INNER_MICRO_TIMER).stop()
+        self.timers(BACKWARD_INNER_GLOBAL_TIMER).stop()
+        self.timers(BACKWARD_MICRO_TIMER).stop()
+        self.timers(BACKWARD_GLOBAL_TIMER).stop()
+
+def _exec_weight_pass(self):
+    if self.using_bf16_optimizer:
+        # manually call because we don't call optimizer.backward()
+        self.optimizer.clear_lp_grads()
+    WeightGradStore.pop()
+    if self.using_bf16_optimizer:
+        self.optimizer.update_hp_grads(clear_lp_grads=False)
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/core/pipeline_parallel/deepspeed_zbh1_schedule.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/pipeline_parallel/deepspeed_zbh1_schedule.py
new file mode 100644
index 0000000000000000000000000000000000000000..b44954a44a025927754695a413b2136f31a0cd20
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/core/pipeline_parallel/deepspeed_zbh1_schedule.py
@@ -0,0 +1,148 @@
+from deepspeed.runtime.pipe.schedule import PipeSchedule, PipeInstruction, BufferOpInstruction, \
+    LoadMicroBatch, RecvActivation, SendActivation, RecvGrad, SendGrad, \
+    ForwardPass, BackwardPass, ReduceGrads, ReduceTiedGrads, OptimizerStep
+from megatron.training import get_args
+
+class ZeroBubbleH1Pipeline(PipeSchedule):
+    """A schedule for training a batch using hybrid parallelism.
+
+    Pipeline parallelism is extracted through gradient accumulation and thus
+    convergence follows that of a data parallel approach with the same batch
+    size.
+    """
+
+    def steps(self):
+        num_warmup_microbatches = self.stages - self.stage_id
+
+        forward = 0
+        backward = 0
+        weight = 0
+
+        # F section
+        for _ in range(num_warmup_microbatches - 1):
+            if forward == self.micro_batches:
+                continue
+            forward_id = self.get_buffer_id(forward)
+            forward += 1
+
+            cmds = []
+            if not self.is_first_stage:
+                cmds.append(RecvActivation(forward_id))
+            if self.is_first_stage or self.is_last_stage:
+                cmds.append(LoadMicroBatch(forward_id))
+            cmds.append(ForwardPass(forward_id))
+            if not self.is_last_stage:
+                cmds.append(SendActivation(forward_id))
+            yield cmds
+
+        # FB section
+        for _ in range(self.stage_id):
+            if forward == self.micro_batches:
+                continue
+            forward_id = self.get_buffer_id(forward)
+            backward_id = self.get_buffer_id(backward)
+            forward += 1
+            backward += 1
+
+            cmds = []
+            if not self.is_first_stage:
+                cmds.append(RecvActivation(forward_id))
+            if self.is_first_stage or self.is_last_stage:
+                cmds.append(LoadMicroBatch(forward_id))
+            cmds.append(ForwardPass(forward_id))
+            if not self.is_last_stage:
+                cmds.append(RecvGrad(backward_id))
+                cmds.append(SendActivation(forward_id))
+            cmds.append(BackwardOnlyPass(backward_id))
+            if not self.is_first_stage:
+                cmds.append(SendGrad(backward_id))
+            yield cmds
+        
+        # FBW section
+        while forward < self.micro_batches:
+            forward_id = self.get_buffer_id(forward)
+            backward_id = self.get_buffer_id(backward)
+            forward += 1
+            backward += 1
+            weight += 1
+
+            cmds = []
+            if not self.is_first_stage:
+                cmds.append(RecvActivation(forward_id))
+            if self.is_first_stage or self.is_last_stage:
+                cmds.append(LoadMicroBatch(forward_id))
+            cmds.append(ForwardPass(forward_id))
+            if not self.is_last_stage:
+                cmds.append(RecvGrad(backward_id))
+                cmds.append(SendActivation(forward_id))
+            if self.is_first_stage:
+                cmds.append(BackwardPass(backward_id))
+            elif forward == self.micro_batches:
+                cmds.append(BackwardOnlyPass(backward_id))
+                cmds.append(SendGrad(backward_id))
+                cmds.append(WeightPass())
+            else:
+                if get_args().enable_zbh1_exact_semantics:
+                    cmds.append(BackwardOnlyPass(backward_id))
+                    cmds.append(SendGrad(backward_id)) 
+                    cmds.append(WeightPass())
+                else:
+                    cmds.append(BackwardPass(backward_id))
+                    cmds.append(SendGrad(backward_id))
+            yield cmds
+
+        #BW section
+        while backward < self.micro_batches:
+            backward_id = self.get_buffer_id(backward)
+            backward += 1
+            weight += 1
+
+            cmds = []
+            if not self.is_last_stage:
+                cmds.append(RecvGrad(backward_id))
+            if self.is_first_stage:
+                cmds.append(BackwardPass(backward_id))
+            else:
+                cmds.append(BackwardOnlyPass(backward_id))
+                cmds.append(SendGrad(backward_id))
+                cmds.append(WeightPass())
+            yield cmds
+        
+        #W section
+        while weight < self.micro_batches:
+            weight += 1
+            yield [WeightPass()]
+        
+        yield [ReduceTiedGrads(), ReduceGrads(), OptimizerStep()]
+
+    def get_buffer_id(self, microbatch_id):
+        num_warmup_microbatches = self.stages - self.stage_id
+        return microbatch_id % num_warmup_microbatches
+
+
+##Additional Instruction classes
+class BackwardOnlyPass(BufferOpInstruction):
+    """Compute a backward pass and accumulate gradients.
+
+    Roughly:
+
+    .. code-block:: python
+
+        outputs = buffers['outputs'][buffer_id]
+        gradients = buffers['gradients'][buffer_id]
+        torch.autograd.backward(tensors=outputs,
+                                grad_tensors=gradients, inputs = input_tensor)
+    """
+    pass
+
+class WeightPass(PipeInstruction):
+    """Compute a weight pass and accumulate gradients.
+
+    Roughly:
+
+    .. code-block:: python
+
+        torch.autograd.backward(tensors=outputs,
+                                grad_tensors=gradients, inputs = model.parameters())
+    """
+    pass
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/core/pipeline_parallel/schedules.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/pipeline_parallel/schedules.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8bff628d4fbf0c7d874a41faf81bfa966156f3e
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/core/pipeline_parallel/schedules.py
@@ -0,0 +1,486 @@
+import contextlib
+from typing import Callable, Iterator, List, Optional, Union
+
+import torch
+from torch.autograd.variable import Variable
+
+from megatron.core import parallel_state
+from megatron.core.enums import ModelType
+from megatron.core.pipeline_parallel import p2p_communication
+from megatron.core.transformer.moe.router import MoEAuxLossAutoScaler
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.utils import (
+    drain_embedding_wgrad_compute,
+    get_attr_wrapped_model,
+    get_model_config,
+    get_model_type,
+)
+from megatron.training.global_vars import get_args
+from megatron.core.pipeline_parallel.schedules import (
+    custom_backward,
+    forward_step,
+    check_first_val_step,
+    clear_embedding_activation_buffer,
+    get_tensor_shapes,
+    recv_forward,
+    send_forward,
+    deallocate_output_tensor,
+    send_forward_recv_backward,
+    send_backward,
+    send_backward_recv_forward,
+    recv_backward,
+    finish_embedding_wgrad_compute,
+)
+
+# Types
+Shape = Union[List[int], torch.Size]
+
+
+def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config, model=None):
+    """Backward step through passed-in output tensor.
+
+    If last stage, output_tensor_grad is None, otherwise gradient of loss
+    with respect to stage's output tensor.
+
+    Returns gradient of loss with respect to input tensor (None if first
+    stage)."""
+
+    # NOTE: This code currently can handle at most one skip connection. It
+    # needs to be modified slightly to support arbitrary numbers of skip
+    # connections.
+
+    args = get_args()
+    if args.deepspeed:
+        assert model is not None
+
+    if config.timers is not None:
+        config.timers('backward-compute', log_level=2).start()
+
+    # Retain the grad on the input_tensor.
+    unwrap_input_tensor_grad = False
+    if not isinstance(input_tensor, list):
+        input_tensor = [input_tensor]
+        unwrap_input_tensor_grad = True
+    for x in input_tensor:
+        if x is not None:
+            x.retain_grad()
+
+    if not isinstance(output_tensor, list):
+        output_tensor = [output_tensor]
+    if not isinstance(output_tensor_grad, list):
+        output_tensor_grad = [output_tensor_grad]
+
+    # Backward pass.
+    if args.deepspeed:
+        model.backward(output_tensor[0])
+    else:
+        if output_tensor_grad[0] is None and config.grad_scale_func is not None:
+            output_tensor[0] = config.grad_scale_func(output_tensor[0])
+
+        if config.deallocate_pipeline_outputs:
+            custom_backward(output_tensor[0], output_tensor_grad[0])
+        else:
+            torch.autograd.backward(output_tensor[0], grad_tensors=output_tensor_grad[0])
+
+    # Collect the grad of the input_tensor.
+    input_tensor_grad = [None]
+    if input_tensor is not None:
+        input_tensor_grad = []
+        for x in input_tensor:
+            if x is None:
+                input_tensor_grad.append(None)
+            else:
+                input_tensor_grad.append(x.grad)
+
+    # Handle single skip connection if it exists (encoder_hidden_state in
+    # model with encoder and decoder).
+    if (
+        parallel_state.get_pipeline_model_parallel_world_size() > 1
+        and parallel_state.is_pipeline_stage_after_split()
+        and model_type == ModelType.encoder_and_decoder
+    ):
+        if output_tensor_grad[1] is not None:
+            input_tensor_grad[-1].add_(output_tensor_grad[1])
+    if unwrap_input_tensor_grad:
+        input_tensor_grad = input_tensor_grad[0]
+
+    if config.timers is not None:
+        config.timers('backward-compute').stop()
+
+    return input_tensor_grad
+
+def forward_backward_no_pipelining(
+    *,
+    forward_step_func,
+    data_iterator: Union[Iterator, List[Iterator]],
+    model: Union[torch.nn.Module, List[torch.nn.Module]],
+    num_microbatches: int,
+    seq_length: int,  # unused
+    micro_batch_size: int,  # unused
+    decoder_seq_length: int = None,  # unused
+    forward_only: bool = False,
+    collect_non_loss_data: bool = False,
+    first_val_step: bool = None,
+    config: TransformerConfig = None,
+):
+    """Run forward and backward passes with no pipeline parallelism
+    (no inter-stage communication).
+
+    Returns dictionary with losses.
+
+
+    See get_forward_backward_func() for argument details
+    """
+
+    if isinstance(model, list):
+        assert len(model) == 1, "non-pipeline-parallel schedule does not support model chunking"
+        model = model[0]
+    if isinstance(data_iterator, list):
+        assert (
+            len(data_iterator) == 1
+        ), "non-pipeline-parallel schedule does not support model chunking"
+        data_iterator = data_iterator[0]
+
+    if config is None:
+        config = get_model_config(model)
+
+    if config.timers is not None:
+        config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time)
+
+    no_sync_func = config.no_sync_func
+    if no_sync_func is None:
+        no_sync_func = contextlib.nullcontext
+
+    args = get_args()
+    if args.deepspeed:
+        model.set_gradient_accumulation_boundary(False)
+
+    model_type = get_model_type(model)
+
+    forward_data_store = []
+    input_tensor, output_tensor_grad = None, None
+    total_num_tokens = torch.zeros([], dtype=torch.int, device="cuda")
+    with no_sync_func():
+        for i in range(num_microbatches - 1):
+            output_tensor, num_tokens = forward_step(
+                forward_step_func,
+                data_iterator,
+                model,
+                num_microbatches,
+                input_tensor,
+                forward_data_store,
+                config,
+                collect_non_loss_data,
+                is_first_microbatch=check_first_val_step(first_val_step, forward_only, i == 0),
+                current_microbatch=i,
+            )
+            total_num_tokens += num_tokens.item()
+            if not forward_only:
+                backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config, model)
+    if args.deepspeed:
+        model.set_gradient_accumulation_boundary(True)
+
+    # Run computation for last microbatch out of context handler (want to
+    # synchronize gradients).
+    output_tensor, num_tokens = forward_step(
+        forward_step_func,
+        data_iterator,
+        model,
+        num_microbatches,
+        input_tensor,
+        forward_data_store,
+        config,
+        collect_non_loss_data,
+        is_first_microbatch=check_first_val_step(
+            first_val_step, forward_only, num_microbatches == 1
+        ),
+        current_microbatch=num_microbatches - 1,
+    )
+    total_num_tokens += num_tokens.item()
+
+    if not forward_only:
+        backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config, model)
+
+    if config.finalize_model_grads_func is not None and not forward_only:
+        # Finalize model grads (perform full grad all-reduce / reduce-scatter for
+        # data parallelism and layernorm all-reduce for sequence parallelism).
+        config.finalize_model_grads_func(
+            [model], total_num_tokens if config.calculate_per_token_loss else None
+        )
+
+    if config.timers is not None:
+        config.timers('forward-backward').stop()
+
+    return forward_data_store
+
+def forward_backward_pipelining_without_interleaving(
+    *,
+    forward_step_func,
+    data_iterator: Union[Iterator, List[Iterator]],
+    model: Union[torch.nn.Module, List[torch.nn.Module]],
+    num_microbatches: int,
+    seq_length: int,
+    micro_batch_size: int,
+    decoder_seq_length: int = None,
+    forward_only: bool = False,
+    collect_non_loss_data: bool = False,
+    first_val_step: bool = None,
+    config: TransformerConfig = None,
+):
+    """Run non-interleaved 1F1B schedule, with communication between pipeline
+    stages.
+
+    Returns dictionary with losses if the last stage, empty dict otherwise."""
+
+    if isinstance(model, list):
+        assert (
+            len(model) == 1
+        ), "non-interleaved pipeline parallelism does not support model chunking"
+        model = model[0]
+    if isinstance(data_iterator, list):
+        assert (
+            len(data_iterator) == 1
+        ), "non-pipeline-parallel schedule does not support model chunking"
+        data_iterator = data_iterator[0]
+
+    if config is None:
+        config = get_model_config(model)
+    if config.overlap_p2p_comm:
+        raise ValueError(
+            "Non-interleaved pipeline parallelism does not support overlapping p2p communication"
+        )
+
+    # Needed only when gradients are finalized in M-Core
+    if config.finalize_model_grads_func is not None and not forward_only:
+        embedding_module = clear_embedding_activation_buffer(config, model)
+
+    if config.timers is not None:
+        config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time)
+
+    # Disable async grad reductions
+    no_sync_func = config.no_sync_func
+    if no_sync_func is None:
+        no_sync_func = contextlib.nullcontext
+    no_sync_context = None
+
+    def disable_grad_sync():
+        """Disable asynchronous grad reductions"""
+        nonlocal no_sync_context
+        if no_sync_context is None:
+            no_sync_context = no_sync_func()
+            no_sync_context.__enter__()
+
+    def enable_grad_sync():
+        """Enable asynchronous grad reductions"""
+        nonlocal no_sync_context
+        if no_sync_context is not None:
+            no_sync_context.__exit__(None, None, None)
+            no_sync_context = None
+
+    disable_grad_sync()
+
+    # Compute number of warmup microbatches.
+    num_warmup_microbatches = (
+        parallel_state.get_pipeline_model_parallel_world_size()
+        - parallel_state.get_pipeline_model_parallel_rank()
+        - 1
+    )
+    num_warmup_microbatches = min(num_warmup_microbatches, num_microbatches)
+    num_microbatches_remaining = num_microbatches - num_warmup_microbatches
+
+    # Checkpoint the activations of partial Transformer layers in a number of micro-batches
+    # within the maximum outstanding micro-batch backpropagations.
+    # Micro-batches with the ids less than 'num_microbatches_with_partial_activation_checkpoints'
+    # checkpoint partial Transformer layers (or skip checkpointing) and
+    # the rest of micro-batches within a window of micro-batches checkpoint
+    # all Transformer layers. The window of micro-batches is set by the maximum
+    # outstanding backpropagations and becomes smaller at later pipeline stages.
+    # Please refer the appendix C in https://arxiv.org/pdf/2205.05198.pdf
+    max_outstanding_backprops = None
+    if config.num_microbatches_with_partial_activation_checkpoints is not None:
+        max_outstanding_backprops = num_warmup_microbatches + 1
+
+    model_type = get_model_type(model)
+
+    rank = parallel_state.get_pipeline_model_parallel_rank()
+    recv_tensor_shapes = get_tensor_shapes(
+        rank=rank - 1,
+        model_type=model_type,
+        seq_length=seq_length,
+        micro_batch_size=micro_batch_size,
+        decoder_seq_length=decoder_seq_length,
+        config=config,
+    )
+    send_tensor_shapes = get_tensor_shapes(
+        rank=rank,
+        model_type=model_type,
+        seq_length=seq_length,
+        micro_batch_size=micro_batch_size,
+        decoder_seq_length=decoder_seq_length,
+        config=config,
+    )
+
+    # Input, output tensors only need to be saved when doing backward passes
+    input_tensors = None
+    output_tensors = None
+    total_num_tokens = torch.tensor(0, dtype=torch.int).cuda()
+
+    if not forward_only:
+        input_tensors = []
+        output_tensors = []
+    forward_data_store = []
+
+    # Run warmup forward passes.
+    for i in range(num_warmup_microbatches):
+        # Decide to checkpoint all layers' activations of the current micro-batch
+        if max_outstanding_backprops is not None:
+            checkpoint_activations_microbatch = (
+                i % max_outstanding_backprops
+                >= config.num_microbatches_with_partial_activation_checkpoints
+            )
+        else:
+            checkpoint_activations_microbatch = None
+
+        input_tensor = recv_forward(recv_tensor_shapes, config)
+        output_tensor, num_tokens = forward_step(
+            forward_step_func,
+            data_iterator,
+            model,
+            num_microbatches,
+            input_tensor,
+            forward_data_store,
+            config,
+            collect_non_loss_data,
+            checkpoint_activations_microbatch,
+            check_first_val_step(first_val_step, forward_only, i == 0),
+            current_microbatch=i,
+        )
+        send_forward(output_tensor, send_tensor_shapes, config)
+        total_num_tokens += num_tokens.item()
+
+        if not forward_only:
+            input_tensors.append(input_tensor)
+            output_tensors.append(output_tensor)
+            deallocate_output_tensor(output_tensor[0], config.deallocate_pipeline_outputs)
+
+    # Before running 1F1B, need to receive first forward tensor.
+    # If all microbatches are run in warmup / cooldown phase, then no need to
+    # receive this tensor here.
+    if num_microbatches_remaining > 0:
+        input_tensor = recv_forward(recv_tensor_shapes, config)
+
+    # Run 1F1B in steady state.
+    for i in range(num_microbatches_remaining):
+        last_iteration = i == (num_microbatches_remaining - 1)
+
+        # Decide to checkpoint all layers' activations of the current micro-batch
+        if max_outstanding_backprops is not None:
+            checkpoint_activations_microbatch = (
+                (i + num_warmup_microbatches) % max_outstanding_backprops
+            ) >= config.num_microbatches_with_partial_activation_checkpoints
+        else:
+            checkpoint_activations_microbatch = None
+
+        output_tensor, num_tokens = forward_step(
+            forward_step_func,
+            data_iterator,
+            model,
+            num_microbatches,
+            input_tensor,
+            forward_data_store,
+            config,
+            collect_non_loss_data,
+            checkpoint_activations_microbatch,
+            check_first_val_step(
+                first_val_step, forward_only, (i == 0) and (num_warmup_microbatches == 0)
+            ),
+            current_microbatch=i + num_warmup_microbatches,
+        )
+        total_num_tokens += num_tokens.item()
+
+        if forward_only:
+            send_forward(output_tensor, send_tensor_shapes, config)
+
+            if not last_iteration:
+                input_tensor = recv_forward(recv_tensor_shapes, config)
+
+        else:
+            output_tensor_grad = send_forward_recv_backward(
+                output_tensor, send_tensor_shapes, config
+            )
+
+            # Add input_tensor and output_tensor to end of list.
+            input_tensors.append(input_tensor)
+            output_tensors.append(output_tensor)
+            deallocate_output_tensor(output_tensor[0], config.deallocate_pipeline_outputs)
+
+            # Pop input_tensor and output_tensor from the start of the list for
+            # the backward pass.
+            input_tensor = input_tensors.pop(0)
+            output_tensor = output_tensors.pop(0)
+
+            # Enable grad sync for the last microbatch in the batch if the full
+            # backward pass completes in the 1F1B stage.
+            if num_warmup_microbatches == 0 and last_iteration:
+                if config.grad_sync_func is None or rank == 0:
+                    enable_grad_sync()
+
+            input_tensor_grad = backward_step(
+                input_tensor, output_tensor, output_tensor_grad, model_type, config, model
+            )
+
+            if last_iteration:
+                input_tensor = None
+                send_backward(input_tensor_grad, recv_tensor_shapes, config)
+            else:
+                input_tensor = send_backward_recv_forward(
+                    input_tensor_grad, recv_tensor_shapes, config
+                )
+
+    # Run cooldown backward passes.
+    if not forward_only:
+        for i in range(num_warmup_microbatches):
+
+            # Enable async grad reduction in the last backward pass
+            # Note: If grad sync function is provided, only enable
+            # async grad reduction in first pipeline stage. Other
+            # pipeline stages do grad reduction during pipeline
+            # bubble.
+            if i == num_warmup_microbatches - 1:
+                if config.grad_sync_func is None or rank == 0:
+                    enable_grad_sync()
+
+            input_tensor = input_tensors.pop(0)
+            output_tensor = output_tensors.pop(0)
+
+            output_tensor_grad = recv_backward(send_tensor_shapes, config)
+
+            input_tensor_grad = backward_step(
+                input_tensor, output_tensor, output_tensor_grad, model_type, config, model
+            )
+
+            send_backward(input_tensor_grad, recv_tensor_shapes, config)
+
+        # Launch any remaining grad reductions.
+        if no_sync_context is not None:
+            enable_grad_sync()
+            if config.grad_sync_func is not None:
+                config.grad_sync_func(model.parameters())
+
+    if config.finalize_model_grads_func is not None and not forward_only:
+
+        # If defer_embedding_wgrad_compute is enabled we need to do the
+        # weight gradient GEMM's here.
+        finish_embedding_wgrad_compute(config, embedding_module)
+
+        # Finalize model grads (perform full grad all-reduce / reduce-scatter for
+        # data parallelism, layernorm all-reduce for sequence parallelism, and
+        # embedding all-reduce for pipeline parallelism).
+        config.finalize_model_grads_func(
+            [model], total_num_tokens if config.calculate_per_token_loss else None
+        )
+
+    if config.timers is not None:
+        config.timers('forward-backward').stop()
+
+    return forward_data_store
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/requirements.txt b/toolbox/Megatron-DeepSpeed/megatronspeed/core/requirements.txt
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/core/requirements.txt
rename to toolbox/Megatron-DeepSpeed/megatronspeed/core/requirements.txt
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/sequence_parallel/__init__.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/sequence_parallel/__init__.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/core/sequence_parallel/__init__.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/core/sequence_parallel/__init__.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/sequence_parallel/cross_entropy.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/sequence_parallel/cross_entropy.py
similarity index 97%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/core/sequence_parallel/cross_entropy.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/core/sequence_parallel/cross_entropy.py
index e65ca25a98ea265ae117092ddfa5452de72fb607..edba3b7a153c3d6b0ee18e2c0dbb18747ebebf3e 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/sequence_parallel/cross_entropy.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/core/sequence_parallel/cross_entropy.py
@@ -1,7 +1,7 @@
 import torch
 from packaging import version
 
-from megatron_ds.core.parallel_state import (
+from megatronspeed.core.parallel_state import (
     get_sequence_parallel_group,
     get_sequence_parallel_rank,
     get_sequence_parallel_world_size
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/bert/__init__.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/tensor_parallel/__init__.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/core/models/bert/__init__.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/core/tensor_parallel/__init__.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/data.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/tensor_parallel/data.py
similarity index 67%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/data.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/core/tensor_parallel/data.py
index 0208c22e81fd34ded450b81900c0532d8b613767..1049175d0e23d04a60e6fec8237221d4387f04f4 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/core/tensor_parallel/data.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/core/tensor_parallel/data.py
@@ -1,32 +1,36 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
 import torch
 
-from megatron_ds.core.parallel_state import (
+from megatron.core.parallel_state import (
     get_tensor_model_parallel_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_src_rank,
 )
+from megatron.core.tensor_parallel.data import (
+    _MAX_DATA_DIM,
+    _check_data_types
+)
+from megatronspeed.core.parallel_state import (
+    get_sequence_parallel_group,
+    get_sequence_parallel_world_size,
+    get_sequence_parallel_rank,
+    get_sequence_parallel_src_rank,
+)
 
-_MAX_DATA_DIM = 5
-
-
-def _check_data_types(keys, data, target_dtype):
-    """Check that all the keys have the same target data type."""
-    for key in keys:
-        assert data[key].dtype == target_dtype, (
-            '{} has data type {} which '
-            'is different than {}'.format(key, data[key].dtype, target_dtype)
-        )
 
+def _build_key_size_numel_dictionaries(keys, data, group=None, rank=-1, src_rank=-1):
+    if group is None:
+        group = get_tensor_model_parallel_group()
+    if src_rank < 0:
+        src_rank = get_tensor_model_parallel_src_rank()
+    if rank < 0:
+        rank = get_tensor_model_parallel_rank()
 
-def _build_key_size_numel_dictionaries(keys, data):
     """Build the size on rank 0 and broadcast."""
     max_dim = _MAX_DATA_DIM
     sizes = [0 for _ in range(max_dim) for _ in keys]
 
     # Pack the sizes on rank zero.
-    if get_tensor_model_parallel_rank() == 0:
+    if rank == 0:
         offset = 0
         for key in keys:
             assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM'
@@ -36,9 +40,9 @@ def _build_key_size_numel_dictionaries(keys, data):
             offset += max_dim
 
     # Move to GPU and broadcast.
-    sizes_cuda = torch.cuda.LongTensor(sizes)
+    sizes_cuda = torch.tensor(sizes, dtype=torch.long, device='cuda')
     torch.distributed.broadcast(
-        sizes_cuda, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group()
+        sizes_cuda, src_rank, group=group
     )
 
     # Move back to cpu and unpack.
@@ -63,12 +67,11 @@ def _build_key_size_numel_dictionaries(keys, data):
 
     return key_size, key_numel, total_numel
 
-
 def broadcast_data(keys, data, datatype):
     """Broadcast data from rank zero of each model parallel group to the
     members of the same model parallel group.
 
-    Arguments:
+    Args:
         keys: list of keys in the data disctionary to be broadcasted
         data: data dictionary of string keys and cpu tensor values.
         datatype: torch data type of all tensors in data associated
@@ -76,7 +79,16 @@ def broadcast_data(keys, data, datatype):
     """
     # Build (key, size) and (key, number of elements) dictionaries along
     # with the total number of elements on all ranks.
-    key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data)
+    if get_sequence_parallel_world_size() > 1:
+        rank = get_sequence_parallel_rank()
+        src_rank = get_sequence_parallel_src_rank()
+        group = get_sequence_parallel_group()
+    else:
+        rank = get_tensor_model_parallel_rank()
+        src_rank = get_tensor_model_parallel_src_rank()
+        group = get_tensor_model_parallel_group()
+
+    key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data, group=group, rank=rank, src_rank=src_rank)
 
     # Pack on rank zero.
     if get_tensor_model_parallel_rank() == 0:
@@ -89,7 +101,7 @@ def broadcast_data(keys, data, datatype):
 
     # Broadcast
     torch.distributed.broadcast(
-        flatten_data, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group()
+        flatten_data, src_rank, group=group
     )
 
     # Unpack
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/core/tensor_parallel/layers.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/tensor_parallel/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..378c8fc28ea2e51c44cb64b0f49bc64f7d6c9990
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/core/tensor_parallel/layers.py
@@ -0,0 +1,1540 @@
+# Parts of the code here are adapted from PyTorch
+# repo: https://github.com/pytorch/pytorch
+
+import io
+import math
+import os
+import warnings
+from typing import Any, Callable, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+import torch.nn.init as init
+from torch.cuda.amp import custom_bwd, custom_fwd
+from torch.nn.parameter import Parameter
+
+from megatron.training.global_vars import get_args
+from megatron.core.model_parallel_config import ModelParallelConfig
+from megatron.core.parallel_state import (
+    get_global_memory_buffer,
+    get_tensor_and_expert_parallel_rank,
+    get_tensor_and_expert_parallel_world_size,
+    get_tensor_model_parallel_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+
+from megatron.core import ixte_extensions
+
+from megatron.core.dist_checkpointing.mapping import ShardedStateDict
+from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
+from megatron.core.utils import make_tp_sharded_tensor_for_checkpoint, prepare_input_tensors_for_wgrad_compute
+from megatron.core.tensor_parallel.mappings import (
+    copy_to_tensor_model_parallel_region,
+    gather_from_sequence_parallel_region,
+    gather_from_tensor_model_parallel_region,
+    reduce_from_tensor_model_parallel_region,
+    reduce_scatter_to_sequence_parallel_region,
+    scatter_to_tensor_model_parallel_region,
+)
+from megatron.core.tensor_parallel.random import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name
+from megatron.core.tensor_parallel.utils import VocabUtility, divide, split_tensor_along_last_dim
+from megatron.core.tensor_parallel.layers import (
+    ColumnParallelLinear,
+    RowParallelLinear,
+    _grad_accum_fusion_available,
+    _initialize_affine_weight_cpu,
+    _initialize_affine_weight_gpu,
+    set_tensor_model_parallel_attributes,
+    linear_with_frozen_weight,
+    LinearWithGradAccumulationAndAsyncCommunication,
+)
+
+try:
+    import fused_weight_gradient_mlp_cuda
+except ImportError:
+    pass
+
+
+class SequenceParallelPositionEmbedding(torch.nn.Module):
+    """Embedding parallelized in the sequence dimension.
+
+    Arguments:
+        sequence_length: max sequence length.
+        embedding_dim: size of hidden state.
+    """
+
+    def __init__(self, sequence_length, embedding_dim):
+        super(SequenceParallelPositionEmbedding, self).__init__()
+        sequence_parallel_size = get_tensor_model_parallel_world_size()
+        assert sequence_length % sequence_parallel_size == 0
+        local_sequence_length = sequence_length // sequence_parallel_size
+        self.offset = local_sequence_length * get_tensor_model_parallel_rank()
+        self.local_embeddings = torch.nn.Embedding(
+            local_sequence_length, embedding_dim)
+
+    def forward(self, position_ids):
+        return self.local_embeddings(position_ids - self.offset)
+
+def gradientUpdateFunction(total_input, grad_output, weight):
+    if weight.grad == None:
+        weight.grad = grad_output.t().matmul(total_input)
+    else:
+        weight.grad += grad_output.t().matmul(total_input)
+
+def linear_with_grad_accumulation_and_async_allreduce_forward(
+    ctx,
+    input,
+    weight,
+    bias,
+    gradient_accumulation_fusion,
+    allreduce_dgrad,
+    sequence_parallel,
+    grad_output_buffer,
+    wgrad_deferral_limit,
+    inference_params=None,
+):
+    ctx.save_for_backward(input, weight)
+    ctx.use_bias = bias is not None
+    ctx.gradient_accumulation_fusion = gradient_accumulation_fusion
+    ctx.allreduce_dgrad = allreduce_dgrad
+    ctx.sequence_parallel = sequence_parallel
+    ctx.wgrad_deferral_limit = wgrad_deferral_limit
+    ctx.grad_output_buffer = grad_output_buffer
+
+    if sequence_parallel and not inference_params:
+        world_size = get_tensor_model_parallel_world_size()
+        dim_size = list(input.size())
+        dim_size[0] = dim_size[0] * world_size
+
+        all_gather_buffer = get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu")
+        torch.distributed._all_gather_base(
+            all_gather_buffer, input, group=get_tensor_model_parallel_group()
+        )
+        total_input = all_gather_buffer
+    else:
+        total_input = input
+
+    output = torch.matmul(total_input, weight.t())
+    if bias is not None:
+        output = output + bias
+    return output
+
+def linear_with_grad_accumulation_and_async_allreduce_backward(ctx, grad_output):
+    input, weight = ctx.saved_tensors
+    use_bias = ctx.use_bias
+    grad_output_buffer = ctx.grad_output_buffer
+    wgrad_deferral_limit = ctx.wgrad_deferral_limit
+
+    wgrad_compute = True
+    if grad_output_buffer is not None:
+        if wgrad_deferral_limit == 0 or len(grad_output_buffer) < wgrad_deferral_limit:
+            grad_output_buffer.append(grad_output)
+            wgrad_compute = False
+
+    if wgrad_compute:
+        if ctx.sequence_parallel:
+            world_size = get_tensor_model_parallel_world_size()
+            dim_size = list(input.size())
+            dim_size[0] = dim_size[0] * world_size
+
+            all_gather_buffer = get_global_memory_buffer().get_tensor(
+                dim_size, input.dtype, "mpu"
+            )
+            handle = torch.distributed._all_gather_base(
+                all_gather_buffer, input, group=get_tensor_model_parallel_group(), async_op=True
+            )
+
+            # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
+            # gather is scheduled before the input gradient computation
+            total_input = all_gather_buffer
+        else:
+            total_input = input
+    grad_input = grad_output.matmul(weight)
+
+    if ctx.sequence_parallel and wgrad_compute:
+        handle.wait()
+
+    if wgrad_compute:
+        # Doing gather + slicing during the NeMo forward pass can make this tensor
+        # not be contiguous. PyTorch only checks if the tensor is contiguous, and only
+        # clones it if it's not contiguous:
+        # https://github.com/pytorch/pytorch/blob/c47cf9bc7f9e02f649ab4ed53fe4d35732c92ab6/torch/_refs/__init__.py#L2761
+        grad_output = grad_output.contiguous()
+        # Convert the tensor shapes to 2D for execution compatibility
+        if grad_output.dim() == 3:
+            grad_output = grad_output.view(
+                grad_output.shape[0] * grad_output.shape[1], grad_output.shape[2]
+            )
+            total_input = total_input.view(
+                total_input.shape[0] * total_input.shape[1], total_input.shape[2]
+            )
+        else:
+            # Somehow when DeepSpeed MoE is used, grad_output could have 4 dimensions.
+            # TODO: May need further investigation
+            total_input = total_input.contiguous()
+            grad_output = grad_output.view(-1, grad_output.shape[-1])
+            total_input = total_input.view(-1, total_input.shape[-1])
+
+    if ctx.allreduce_dgrad:
+        # Asynchronous all-reduce
+        handle = torch.distributed.all_reduce(
+            grad_input, group=get_tensor_model_parallel_group(), async_op=True
+        )
+        # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
+        # all-reduce is scheduled before the weight gradient computation
+
+    if ctx.sequence_parallel:
+        assert not ctx.allreduce_dgrad
+        dim_size = list(input.size())
+        sub_grad_input = torch.empty(
+            dim_size, dtype=input.dtype, device=torch.cuda.current_device(), requires_grad=False
+        )
+        # reduce_scatter
+        handle = torch.distributed._reduce_scatter_base(
+            sub_grad_input, grad_input, group=get_tensor_model_parallel_group(), async_op=True
+        )
+        # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
+        # reduce scatter is scheduled before the weight gradient computation
+
+    if ctx.gradient_accumulation_fusion:
+        if wgrad_compute:
+            if weight.main_grad.dtype == torch.float32:
+                fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(
+                    total_input, grad_output, weight.main_grad
+                )
+            elif weight.main_grad.dtype in (torch.float16, torch.bfloat16):
+                fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(
+                    total_input, grad_output, weight.main_grad
+                )
+            else:
+                raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
+
+        if hasattr(weight, 'grad_added_to_main_grad'):
+            # When overlap_grad_reduce is True, need to ensure that backward hooks
+            # are all run on the main backprop thread to prevent deadlocks. Setup
+            # dummy grad_weight tensor to prevent backward hooks from being run
+            # in a background thread.
+            if getattr(weight, 'zero_out_wgrad', False):
+                grad_weight = torch.zeros(
+                    weight.main_grad.shape,
+                    dtype=input.dtype,
+                    device=torch.cuda.current_device(),
+                    requires_grad=False,
+                )
+            else:
+                grad_weight = torch.empty(
+                    weight.main_grad.shape,
+                    dtype=input.dtype,
+                    device=torch.cuda.current_device(),
+                    requires_grad=False,
+                )
+            weight.grad_added_to_main_grad = True
+        else:
+            grad_weight = None
+    else:
+        grad_weight = grad_output.t().matmul(total_input)
+    # from megatronspeed.core.tensor_parallel.weight_grad_store import WeightGradStore
+    # WeightGradStore.put(total_input, grad_output, weight, gradientUpdateFunction)
+    # grad_weight = None
+    grad_bias = grad_output.sum(dim=0) if use_bias else None
+
+    if ctx.sequence_parallel:
+        handle.wait()
+        # Need to return None's as gradient has to flow for all the input arguments
+        # provided during forward
+        return sub_grad_input, grad_weight, grad_bias, None, None, None, None, None, None
+
+    if ctx.allreduce_dgrad:
+        handle.wait()
+
+    return grad_input, grad_weight, grad_bias, None, None, None, None, None, None
+
+# class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function):
+#     """See linear_with_grad_accumulation_and_async_allreduce"""
+
+#     @staticmethod
+#     @custom_fwd
+#     def forward(
+#         ctx,
+#         input,
+#         weight,
+#         bias,
+#         gradient_accumulation_fusion,
+#         allreduce_dgrad,
+#         sequence_parallel,
+#         grad_output_buffer,
+#         wgrad_deferral_limit,
+#     ):
+#         ctx.save_for_backward(input, weight)
+#         ctx.use_bias = bias is not None
+#         ctx.gradient_accumulation_fusion = gradient_accumulation_fusion
+#         ctx.allreduce_dgrad = allreduce_dgrad
+#         ctx.sequence_parallel = sequence_parallel
+#         ctx.wgrad_deferral_limit = wgrad_deferral_limit
+#         ctx.grad_output_buffer = grad_output_buffer
+
+#         if sequence_parallel:
+#             world_size = get_tensor_model_parallel_world_size()
+#             dim_size = list(input.size())
+#             dim_size[0] = dim_size[0] * world_size
+
+#             all_gather_buffer = get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu")
+#             torch.distributed._all_gather_base(
+#                 all_gather_buffer, input, group=get_tensor_model_parallel_group()
+#             )
+#             total_input = all_gather_buffer
+#         else:
+#             total_input = input
+
+#         output = torch.matmul(total_input, weight.t())
+#         if bias is not None:
+#             output = output + bias
+#         return output
+
+#     @staticmethod
+#     @custom_bwd
+#     def backward(ctx, grad_output):
+#         input, weight = ctx.saved_tensors
+#         use_bias = ctx.use_bias
+#         grad_output_buffer = ctx.grad_output_buffer
+#         wgrad_deferral_limit = ctx.wgrad_deferral_limit
+
+#         wgrad_compute = True
+#         if grad_output_buffer is not None:
+#             if wgrad_deferral_limit == 0 or len(grad_output_buffer) < wgrad_deferral_limit:
+#                 grad_output_buffer.append(grad_output)
+#                 wgrad_compute = False
+
+#         if wgrad_compute:
+#             if ctx.sequence_parallel:
+#                 world_size = get_tensor_model_parallel_world_size()
+#                 dim_size = list(input.size())
+#                 dim_size[0] = dim_size[0] * world_size
+
+#                 all_gather_buffer = get_global_memory_buffer().get_tensor(
+#                     dim_size, input.dtype, "mpu"
+#                 )
+#                 handle = torch.distributed._all_gather_base(
+#                     all_gather_buffer, input, group=get_tensor_model_parallel_group(), async_op=True
+#                 )
+
+#                 # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
+#                 # gather is scheduled before the input gradient computation
+#                 total_input = all_gather_buffer
+#             else:
+#                 total_input = input
+#         grad_input = grad_output.matmul(weight)
+
+#         if ctx.sequence_parallel and wgrad_compute:
+#             handle.wait()
+
+#         if wgrad_compute:
+#             # Doing gather + slicing during the NeMo forward pass can make this tensor
+#             # not be contiguous. PyTorch only checks if the tensor is contiguous, and only
+#             # clones it if it's not contiguous:
+#             # https://github.com/pytorch/pytorch/blob/c47cf9bc7f9e02f649ab4ed53fe4d35732c92ab6/torch/_refs/__init__.py#L2761
+#             grad_output = grad_output.contiguous()
+#             # Convert the tensor shapes to 2D for execution compatibility
+#             if grad_output.dim() == 3:
+#                 grad_output = grad_output.view(
+#                     grad_output.shape[0] * grad_output.shape[1], grad_output.shape[2]
+#                 )
+#                 total_input = total_input.view(
+#                     total_input.shape[0] * total_input.shape[1], total_input.shape[2]
+#                 )
+#             else:
+#                 # Somehow when DeepSpeed MoE is used, grad_output could have 4 dimensions.
+#                 # TODO: May need further investigation
+#                 total_input = total_input.contiguous()
+#                 grad_output = grad_output.view(-1, grad_output.shape[-1])
+#                 total_input = total_input.view(-1, total_input.shape[-1])
+
+#         if ctx.allreduce_dgrad:
+#             # Asynchronous all-reduce
+#             handle = torch.distributed.all_reduce(
+#                 grad_input, group=get_tensor_model_parallel_group(), async_op=True
+#             )
+#             # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
+#             # all-reduce is scheduled before the weight gradient computation
+
+#         if ctx.sequence_parallel:
+#             assert not ctx.allreduce_dgrad
+#             dim_size = list(input.size())
+#             sub_grad_input = torch.empty(
+#                 dim_size, dtype=input.dtype, device=torch.cuda.current_device(), requires_grad=False
+#             )
+#             # reduce_scatter
+#             handle = torch.distributed._reduce_scatter_base(
+#                 sub_grad_input, grad_input, group=get_tensor_model_parallel_group(), async_op=True
+#             )
+#             # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
+#             # reduce scatter is scheduled before the weight gradient computation
+
+#         if ctx.gradient_accumulation_fusion:
+#             if wgrad_compute:
+#                 if weight.main_grad.dtype == torch.float32:
+#                     fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(
+#                         total_input, grad_output, weight.main_grad
+#                     )
+#                 elif weight.main_grad.dtype in (torch.float16, torch.bfloat16):
+#                     fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(
+#                         total_input, grad_output, weight.main_grad
+#                     )
+#                 else:
+#                     raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
+
+#             if hasattr(weight, 'grad_added_to_main_grad'):
+#                 # When overlap_grad_reduce is True, need to ensure that backward hooks
+#                 # are all run on the main backprop thread to prevent deadlocks. Setup
+#                 # dummy grad_weight tensor to prevent backward hooks from being run
+#                 # in a background thread.
+#                 if getattr(weight, 'zero_out_wgrad', False):
+#                     grad_weight = torch.zeros(
+#                         weight.main_grad.shape,
+#                         dtype=input.dtype,
+#                         device=torch.cuda.current_device(),
+#                         requires_grad=False,
+#                     )
+#                 else:
+#                     grad_weight = torch.empty(
+#                         weight.main_grad.shape,
+#                         dtype=input.dtype,
+#                         device=torch.cuda.current_device(),
+#                         requires_grad=False,
+#                     )
+#                 weight.grad_added_to_main_grad = True
+#             else:
+#                 grad_weight = None
+#         else:
+#             grad_weight = grad_output.t().matmul(total_input)
+#         from megatronspeed.core.tensor_parallel.weight_grad_store import WeightGradStore
+#         WeightGradStore.put(total_input, grad_output, weight, gradientUpdateFunction)
+#         grad_bias = grad_output.sum(dim=0) if use_bias else None
+
+#         if ctx.sequence_parallel:
+#             handle.wait()
+#             # Need to return None's as gradient has to flow for all the input arguments
+#             # provided during forward
+#             return sub_grad_input, grad_weight, grad_bias, None, None, None, None, None
+
+#         if ctx.allreduce_dgrad:
+#             handle.wait()
+
+#         return grad_input, grad_weight, grad_bias, None, None, None, None, None
+
+def linear_with_grad_accumulation_and_async_allreduce(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    gradient_accumulation_fusion: bool,
+    async_grad_allreduce: bool,
+    sequence_parallel: bool,
+    grad_output_buffer: Optional[List[torch.Tensor]] = None,
+    wgrad_deferral_limit: Optional[int] = 0,
+    allreduce_dgrad: bool = None,
+    inference_params = None,
+) -> torch.Tensor:
+    """Linear layer execution with asynchronous communication and
+    gradient accumulation fusion in backprop.
+
+    This has the option to accumulate the result of backprop
+    calculation into an existing gradient buffer, preventing the need
+    to do an additional addition kernel after the gradient
+    calculation.
+
+    Additionally, the tensor parallel all reduce of the input
+    gradients can be done asynchronously with the calculation of
+    the weight gradients.
+
+    In the case of sequence parallelism, the reduce scatter of the
+    input gradients is done asynchronously with the calcluation of the
+    weight gradients.
+
+    Use of this module requires that the environment variable
+    CUDA_DEVICE_MAX_CONNECTIONS=1. There are a few collective
+    operations, noted in the code, that should be scheduled before
+    compute kernels to overlap the communication with the computation,
+    which is necessary for a speedup but not for correctness so that
+    ordering isn't imposed by the scheduler. Setting
+    CUDA_DEVICE_MAX_CONNECTIONS=1 forces the kernels to be scheduled
+    in the order they are called.
+
+    Args:
+        input (torch.Tensor required): input like torch.nn.functional.linear
+
+        weight (torch.Tensor required): weight like torch.nn.functional.linear
+
+        bias (torch.Tensor optional): bias like torch.nn.functional.linear
+
+        gradient_accumulation_fusion (bool required): Perform the gradient
+            accumulation fusion, requires the custom CUDA extension
+            fused_weight_gradient_mlp_cuda module. To use
+            gradient_accumulation_fusion you must install APEX with
+            --cpp_ext and --cuda_ext. For example: "pip install
+            --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\"
+            " Note that the extension requires CUDA>=11. Otherwise, you
+            must turn off gradient accumulation fusion."
+
+
+        async_grad_allreduce (bool required): Do the allreduce of input
+            gradients asyncronously with the computation of weight
+            gradients. If sequence_parallel is True, this must be
+            False, as no all reduce is performed.
+
+
+        sequence_parallel (bool required): Indicates that sequence
+            parallelism is used and thus in the forward pass the input is
+            all gathered, and the backward pass the input gradients are
+            reduce scattered.
+
+        grad_output_buffer (List[torch.Tensor] optional): Buffer used to save
+            output gradients when embedding table wgrad compute is deferred.
+            Defaults to None.
+
+        wgrad_deferral_limit (int optional): Limit on the number of
+            micro-batches for which embedding weight gradient GEMM should be
+            deferred. Defaults to 0.
+
+        allreduce_dgrad (bool): Do the allreduce of input gradients.
+            The allreduce is done asynchronously with the computation of weight
+            gradients. If sequence_parallel is True, this must be
+            False, as no all reduce is performed.
+    """
+    if allreduce_dgrad is None:
+        warnings.warn(
+            "async_grad_allreduce is deprecated and will be removed in a future release. use allreduce_dgrad instead."
+        )
+        allreduce_dgrad = async_grad_allreduce
+
+    args = [
+        input,
+        weight,
+        bias,
+        gradient_accumulation_fusion,
+        allreduce_dgrad,
+        sequence_parallel,
+        grad_output_buffer,
+        wgrad_deferral_limit,
+        inference_params,
+    ]
+
+    if not linear_with_grad_accumulation_and_async_allreduce.warned:
+        if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
+            if sequence_parallel:
+                warnings.warn(
+                    "When using sequence parallelism it is recommended to set the "
+                    "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for "
+                    "maximum speedup"
+                )
+                linear_with_grad_accumulation_and_async_allreduce.warned = True
+
+            if allreduce_dgrad:
+                warnings.warn(
+                    "When using async grad allreduce it is recommended to set the "
+                    "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for "
+                    "maximum speedup"
+                )
+                linear_with_grad_accumulation_and_async_allreduce.warned = True
+
+    return LinearWithGradAccumulationAndAsyncCommunication.apply(*args)
+
+linear_with_grad_accumulation_and_async_allreduce.warned = False
+
+def column_parallel_linear_init(self,
+    input_size,
+    output_size,
+    *,
+    config: ModelParallelConfig,
+    init_method: Callable,
+    bias=True,
+    gather_output=False,
+    stride=1,
+    keep_master_weight_for_test=False,
+    skip_bias_add=False,
+    skip_weight_param_allocation: bool = False,
+    embedding_activation_buffer: Optional[List[torch.Tensor]] = None,
+    grad_output_buffer: Optional[List[torch.Tensor]] = None,
+    is_expert: bool = False,
+    tp_comm_buffer_name: str = None,  # Not used
+    disable_grad_reduce: bool = False,
+    is_logits_gemm: bool = False,
+    moe=False, enable_expert_tensor_parallelism=False,
+):
+    super(ColumnParallelLinear, self).__init__()
+
+    # Keep input parameters
+    self.input_size = input_size
+    self.output_size = output_size
+    self.gather_output = gather_output
+    # Divide the weight matrix along the last dimension.
+    self.skip_bias_add = skip_bias_add
+    self.is_expert = is_expert
+    self.expert_parallel = config.expert_model_parallel_size > 1
+    self.embedding_activation_buffer = embedding_activation_buffer
+    self.grad_output_buffer = grad_output_buffer
+    self.config = config
+    self.disable_grad_reduce = disable_grad_reduce
+
+    args = get_args()
+    self.deepspeed = args.deepspeed
+    self.explicit_expert_comm = False
+    rank = get_tensor_model_parallel_rank()
+    if not args.deepspeed:
+        self.explicit_expert_comm = self.is_expert and (
+            config.tensor_model_parallel_size > 1 or self.expert_parallel
+        )
+        if self.explicit_expert_comm and config.moe_extended_tp:
+            world_size = get_tensor_and_expert_parallel_world_size()
+            rank = get_tensor_and_expert_parallel_rank()
+        else:
+            world_size = get_tensor_model_parallel_world_size()
+            rank = get_tensor_model_parallel_rank()
+    else:
+        if moe and (not enable_expert_tensor_parallelism):
+            world_size = 1
+            self.is_expert_without_slicing = True
+        else:
+            world_size = get_tensor_model_parallel_world_size()
+            self.is_expert_without_slicing = False
+
+    self.output_size_per_partition = divide(output_size, world_size)
+
+    # Parameters.
+    # Note: torch.nn.functional.linear performs XA^T + b and as a result
+    # we allocate the transpose.
+    # Initialize weight.
+    if not skip_weight_param_allocation:
+        if config.use_cpu_initialization:
+            self.weight = Parameter(
+                torch.empty(
+                    self.output_size_per_partition, self.input_size, dtype=config.params_dtype
+                )
+            )
+            if config.perform_initialization:
+                self.master_weight = _initialize_affine_weight_cpu(
+                    self.weight,
+                    self.output_size,
+                    self.input_size,
+                    self.output_size_per_partition,
+                    0,
+                    init_method,
+                    stride=stride,
+                    return_master_weight=keep_master_weight_for_test,
+                    rank=rank,
+                    world_size=world_size,
+                )
+        else:
+            self.weight = Parameter(
+                torch.empty(
+                    self.output_size_per_partition,
+                    self.input_size,
+                    device=torch.cuda.current_device(),
+                    dtype=config.params_dtype,
+                )
+            )
+            if config.perform_initialization:
+                _initialize_affine_weight_gpu(
+                    self.weight,
+                    init_method,
+                    partition_dim=0,
+                    stride=stride,
+                    expert_parallel=(self.is_expert and self.expert_parallel),
+                )
+
+        setattr(self.weight, 'allreduce', not (self.is_expert and self.expert_parallel))
+    else:
+        self.weight = None
+
+    if bias:
+        if config.use_cpu_initialization:
+            self.bias = Parameter(
+                torch.empty(self.output_size_per_partition, dtype=config.params_dtype)
+            )
+        else:
+            self.bias = Parameter(
+                torch.empty(
+                    self.output_size_per_partition,
+                    device=torch.cuda.current_device(),
+                    dtype=config.params_dtype,
+                )
+            )
+        set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
+        if config.perform_initialization:
+            # Always initialize bias to zero.
+            with torch.no_grad():
+                self.bias.zero_()
+        setattr(self.bias, 'allreduce', not (self.is_expert and self.expert_parallel))
+    else:
+        self.register_parameter('bias', None)
+
+    self.sequence_parallel = config.sequence_parallel
+    if self.sequence_parallel and world_size <= 1:
+        warnings.warn(
+            f"`sequence_parallel` is set to `True`, but tensor model parallel size is {world_size}. "
+            f"Disabling sequence parallel."
+        )
+        self.sequence_parallel = False
+
+    self.allreduce_dgrad = (
+        world_size > 1 and not self.sequence_parallel and not self.disable_grad_reduce
+    )
+
+    if args.deepspeed:
+        self.allreduce_dgrad = False
+
+    if config.gradient_accumulation_fusion and not _grad_accum_fusion_available:
+        raise RuntimeError(
+            "ColumnParallelLinear was called with gradient_accumulation_fusion set "
+            "to True but the custom CUDA extension fused_weight_gradient_mlp_cuda "
+            "module is not found. To use gradient_accumulation_fusion you must "
+            "install APEX with --cpp_ext and --cuda_ext. For example: "
+            "pip install --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" "
+            "Note that the extension requires CUDA>=11. Otherwise, you must turn off "
+            "gradient accumulation fusion."
+        )
+    self.gradient_accumulation_fusion = config.gradient_accumulation_fusion
+
+    if self.allreduce_dgrad and self.sequence_parallel:
+        raise RuntimeError(
+            "`allreduce_dgrad` and `sequence_parallel` cannot be enabled at the same time."
+        )
+
+    self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
+
+    # Hook adding a default empty _extra_state for state dict
+    self._register_load_state_dict_pre_hook(
+        lambda state_dict, prefix, *args, **kwargs: state_dict.setdefault(
+            f'{prefix}_extra_state'
+        )
+    )
+    self.use_ixte = False
+    if is_logits_gemm and config.sequence_parallel and ixte_extensions._USE_IXTE and config.transformer_impl == "transformer_engine":
+        self.use_ixte = True
+
+def column_parallel_linear_forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None, inference_params=None):
+    """Forward of ColumnParallelLinear
+
+    Args:
+        input_: 3D tensor whose order of dimension is [sequence, batch, hidden]
+
+        weight (optional): weight tensor to use, compulsory when
+            skip_weight_param_allocation is True.
+
+    Returns:
+        - output
+        - bias
+
+    """
+    if weight is None:
+        if self.weight is None:
+            raise RuntimeError(
+                "weight was not supplied to ColumnParallelLinear forward pass "
+                "and skip_weight_param_allocation is True."
+            )
+        weight = self.weight
+    else:
+        # Check the weight passed in is the correct shape
+        expected_shape = (self.output_size_per_partition, self.input_size)
+        if weight.shape != expected_shape:
+            raise RuntimeError(
+                f"supplied weight's shape is {tuple(weight.shape)}, "
+                f"not {expected_shape} as expected"
+            )
+
+    if self.config._cpu_offloading_context is not None:
+        if self.config._cpu_offloading_context.inside_context == True:
+            assert (
+                self.config.cpu_offloading == False
+            ), "CPU Offloading cannot be enabled while using non-TE modules"
+
+    bias = self.bias if not self.skip_bias_add else None
+
+    if self.use_ixte:
+        output = ixte_extensions.get_logits_linear_func()(
+            input=input_,
+            weight=weight,
+            sequence_parallel=self.sequence_parallel,
+            gradient_accumulation_fusion=self.gradient_accumulation_fusion,
+            tp_group=get_tensor_model_parallel_group(),
+        )
+        output_bias = self.bias if self.skip_bias_add else None
+        return output, output_bias
+    if (
+        self.allreduce_dgrad
+        or self.sequence_parallel
+        or self.explicit_expert_comm
+        or self.disable_grad_reduce
+        or (self.deepspeed and self.is_expert_without_slicing)
+    ):
+        input_parallel = input_
+    else:
+        input_parallel = copy_to_tensor_model_parallel_region(input_)
+
+    if self.config.defer_embedding_wgrad_compute:
+        if (
+            self.config.wgrad_deferral_limit == 0
+            or len(self.embedding_activation_buffer) < self.config.wgrad_deferral_limit
+        ):
+            self.embedding_activation_buffer.append(input_parallel)
+
+    # Matrix multiply.
+    if not weight.requires_grad:
+        self._forward_impl = linear_with_frozen_weight
+    else:
+        self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
+
+    allreduce_dgrad = False if self.explicit_expert_comm else self.allreduce_dgrad
+
+    output_parallel = self._forward_impl(
+        input=input_parallel,
+        weight=weight,
+        bias=bias,
+        gradient_accumulation_fusion=self.gradient_accumulation_fusion,
+        async_grad_allreduce=allreduce_dgrad,
+        sequence_parallel=False if self.explicit_expert_comm else self.sequence_parallel,
+        grad_output_buffer=(
+            self.grad_output_buffer if self.config.defer_embedding_wgrad_compute else None
+        ),
+        wgrad_deferral_limit=(
+            self.config.wgrad_deferral_limit
+            if self.config.defer_embedding_wgrad_compute
+            else None
+        ),
+        allreduce_dgrad=allreduce_dgrad,
+        inference_params=inference_params,
+    )
+    if (self.gather_output and not self.deepspeed) or \
+        (self.deepspeed and self.gather_output and not self.is_expert_without_slicing):
+        # All-gather across the partitions.
+        assert not self.sequence_parallel
+        output = gather_from_tensor_model_parallel_region(output_parallel)
+    else:
+        output = output_parallel
+    output_bias = self.bias if self.skip_bias_add else None
+    return output, output_bias
+
+# class ColumnParallelLinear(torch.nn.Module):
+#     """Linear layer with column parallelism.
+
+#     The linear layer is defined as Y = XA + b. A is parallelized along
+#     its second dimension as A = [A_1, ..., A_p].
+
+#     Args:
+#         input_size: first dimension of matrix A.
+#         output_size: second dimension of matrix A.
+#         bias: If true, add bias
+#         gather_output: If true, call all-gather on output and make Y available to all GPUs, otherwise, every GPU will have its output which is Y_i = XA_i
+#         init_method: method to initialize weights. Note that bias is always set to zero.
+#         stride: For the strided linear layers.
+#         keep_master_weight_for_test: This was added for testing and should be set to False. It returns the master weights used for initialization.
+#         skip_bias_add: If True, do not add the bias term, instead return it to be added by the caller. This enables performance optimations where bias can be fused with other elementwise operations.
+#         skip_weight_param_allocation: If True, weight parameter is not allocated and must be passed as a keyword argument `weight` during the forward pass. Note that this does not affect bias, which will be allocated if bias is True. Defaults to False.
+#         embedding_activation_buffer: This buffer holds the input activations of the final embedding linear layer on the last pipeline stage when defer_embedding_wgrad_compute is enabled.
+#         grad_output_buffer: This buffer holds the gradient outputs of the final embedding linear layer on the last pipeline stage when defer_embedding_wgrad_compute is enabled.
+#         is_expert: If True, the layer is treated as an MoE expert layer.
+#         config: ModelParallelConfig object
+#         tp_comm_buffer_name: Communication buffer name is not used in non-Transformer-Engine modules.
+#         disable_grad_reduce: If True, reduction of output gradients across tensor-parallel ranks will be disabled. Defaults to False. This feature is used by Lora Adapter in Nemo to delay and fuse reduction along with other gradients for performance optimization.
+#     """
+
+#     def __init__(
+#         self,
+#         input_size,
+#         output_size,
+#         *,
+#         config: ModelParallelConfig,
+#         init_method: Callable,
+#         bias=True,
+#         gather_output=False,
+#         stride=1,
+#         keep_master_weight_for_test=False,
+#         skip_bias_add=False,
+#         skip_weight_param_allocation: bool = False,
+#         embedding_activation_buffer: Optional[List[torch.Tensor]] = None,
+#         grad_output_buffer: Optional[List[torch.Tensor]] = None,
+#         is_expert: bool = False,
+#         tp_comm_buffer_name: str = None,  # Not used
+#         disable_grad_reduce: bool = False,
+#         is_logits_gemm: bool = False,
+#         moe=False, enable_expert_tensor_parallelism=False,
+#     ):
+#         super(ColumnParallelLinear, self).__init__()
+
+#         # Keep input parameters
+#         self.input_size = input_size
+#         self.output_size = output_size
+#         self.gather_output = gather_output
+#         # Divide the weight matrix along the last dimension.
+#         self.skip_bias_add = skip_bias_add
+#         self.is_expert = is_expert
+#         self.expert_parallel = config.expert_model_parallel_size > 1
+#         self.embedding_activation_buffer = embedding_activation_buffer
+#         self.grad_output_buffer = grad_output_buffer
+#         self.config = config
+#         self.disable_grad_reduce = disable_grad_reduce
+
+#         args = get_args()
+#         self.deepspeed = args.deepspeed
+#         self.explicit_expert_comm = False
+#         rank = get_tensor_model_parallel_rank()
+#         if not args.deepspeed:
+#             self.explicit_expert_comm = self.is_expert and (
+#                 config.tensor_model_parallel_size > 1 or self.expert_parallel
+#             )
+#             if self.explicit_expert_comm and config.moe_extended_tp:
+#                 world_size = get_tensor_and_expert_parallel_world_size()
+#                 rank = get_tensor_and_expert_parallel_rank()
+#             else:
+#                 world_size = get_tensor_model_parallel_world_size()
+#                 rank = get_tensor_model_parallel_rank()
+#         else:
+#             if moe and (not enable_expert_tensor_parallelism):
+#                 world_size = 1
+#                 self.is_expert_without_slicing = True
+#             else:
+#                 world_size = get_tensor_model_parallel_world_size()
+#                 self.is_expert_without_slicing = False
+
+#         self.output_size_per_partition = divide(output_size, world_size)
+
+#         # Parameters.
+#         # Note: torch.nn.functional.linear performs XA^T + b and as a result
+#         # we allocate the transpose.
+#         # Initialize weight.
+#         if not skip_weight_param_allocation:
+#             if config.use_cpu_initialization:
+#                 self.weight = Parameter(
+#                     torch.empty(
+#                         self.output_size_per_partition, self.input_size, dtype=config.params_dtype
+#                     )
+#                 )
+#                 if config.perform_initialization:
+#                     self.master_weight = _initialize_affine_weight_cpu(
+#                         self.weight,
+#                         self.output_size,
+#                         self.input_size,
+#                         self.output_size_per_partition,
+#                         0,
+#                         init_method,
+#                         stride=stride,
+#                         return_master_weight=keep_master_weight_for_test,
+#                         rank=rank,
+#                         world_size=world_size,
+#                     )
+#             else:
+#                 self.weight = Parameter(
+#                     torch.empty(
+#                         self.output_size_per_partition,
+#                         self.input_size,
+#                         device=torch.cuda.current_device(),
+#                         dtype=config.params_dtype,
+#                     )
+#                 )
+#                 if config.perform_initialization:
+#                     _initialize_affine_weight_gpu(
+#                         self.weight,
+#                         init_method,
+#                         partition_dim=0,
+#                         stride=stride,
+#                         expert_parallel=(self.is_expert and self.expert_parallel),
+#                     )
+
+#             setattr(self.weight, 'allreduce', not (self.is_expert and self.expert_parallel))
+#         else:
+#             self.weight = None
+
+#         if bias:
+#             if config.use_cpu_initialization:
+#                 self.bias = Parameter(
+#                     torch.empty(self.output_size_per_partition, dtype=config.params_dtype)
+#                 )
+#             else:
+#                 self.bias = Parameter(
+#                     torch.empty(
+#                         self.output_size_per_partition,
+#                         device=torch.cuda.current_device(),
+#                         dtype=config.params_dtype,
+#                     )
+#                 )
+#             set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
+#             if config.perform_initialization:
+#                 # Always initialize bias to zero.
+#                 with torch.no_grad():
+#                     self.bias.zero_()
+#             setattr(self.bias, 'allreduce', not (self.is_expert and self.expert_parallel))
+#         else:
+#             self.register_parameter('bias', None)
+
+#         self.sequence_parallel = config.sequence_parallel
+#         if self.sequence_parallel and world_size <= 1:
+#             warnings.warn(
+#                 f"`sequence_parallel` is set to `True`, but tensor model parallel size is {world_size}. "
+#                 f"Disabling sequence parallel."
+#             )
+#             self.sequence_parallel = False
+
+#         self.allreduce_dgrad = (
+#             world_size > 1 and not self.sequence_parallel and not self.disable_grad_reduce
+#         )
+
+#         if args.deepspeed:
+#             self.allreduce_dgrad = False
+
+#         if config.gradient_accumulation_fusion and not _grad_accum_fusion_available:
+#             raise RuntimeError(
+#                 "ColumnParallelLinear was called with gradient_accumulation_fusion set "
+#                 "to True but the custom CUDA extension fused_weight_gradient_mlp_cuda "
+#                 "module is not found. To use gradient_accumulation_fusion you must "
+#                 "install APEX with --cpp_ext and --cuda_ext. For example: "
+#                 "pip install --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" "
+#                 "Note that the extension requires CUDA>=11. Otherwise, you must turn off "
+#                 "gradient accumulation fusion."
+#             )
+#         self.gradient_accumulation_fusion = config.gradient_accumulation_fusion
+
+#         if self.allreduce_dgrad and self.sequence_parallel:
+#             raise RuntimeError(
+#                 "`allreduce_dgrad` and `sequence_parallel` cannot be enabled at the same time."
+#             )
+
+#         self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
+
+#         # Hook adding a default empty _extra_state for state dict
+#         self._register_load_state_dict_pre_hook(
+#             lambda state_dict, prefix, *args, **kwargs: state_dict.setdefault(
+#                 f'{prefix}_extra_state'
+#             )
+#         )
+#         self.use_ixte = False
+#         if is_logits_gemm and config.sequence_parallel and ixte_extensions._USE_IXTE and config.transformer_impl == "transformer_engine":
+#             self.use_ixte = True
+
+#     def forward(self, input_: torch.Tensor, weight: Optional[torch.Tensor] = None, recompute_fwd = False):
+#         """Forward of ColumnParallelLinear
+
+#         Args:
+#             input_: 3D tensor whose order of dimension is [sequence, batch, hidden]
+
+#             weight (optional): weight tensor to use, compulsory when
+#                 skip_weight_param_allocation is True.
+
+#         Returns:
+#             - output
+#             - bias
+
+#         """
+#         if weight is None:
+#             if self.weight is None:
+#                 raise RuntimeError(
+#                     "weight was not supplied to ColumnParallelLinear forward pass "
+#                     "and skip_weight_param_allocation is True."
+#                 )
+#             weight = self.weight
+#         else:
+#             # Check the weight passed in is the correct shape
+#             expected_shape = (self.output_size_per_partition, self.input_size)
+#             if weight.shape != expected_shape:
+#                 raise RuntimeError(
+#                     f"supplied weight's shape is {tuple(weight.shape)}, "
+#                     f"not {expected_shape} as expected"
+#                 )
+
+#         if self.config._cpu_offloading_context is not None:
+#             if self.config._cpu_offloading_context.inside_context == True:
+#                 assert (
+#                     self.config.cpu_offloading == False
+#                 ), "CPU Offloading cannot be enabled while using non-TE modules"
+
+#         bias = self.bias if not self.skip_bias_add else None
+
+#         if self.use_ixte:
+#             output = ixte_extensions.get_logits_linear_func()(
+#                 input=input_,
+#                 weight=weight,
+#                 sequence_parallel=self.sequence_parallel,
+#                 gradient_accumulation_fusion=self.gradient_accumulation_fusion,
+#                 tp_group=get_tensor_model_parallel_group(),
+#             )
+#             output_bias = self.bias if self.skip_bias_add else None
+#             return output, output_bias
+#         if (
+#             self.allreduce_dgrad
+#             or self.sequence_parallel
+#             or self.explicit_expert_comm
+#             or self.disable_grad_reduce
+#             or (self.deepspeed and self.is_expert_without_slicing)
+#         ):
+#             input_parallel = input_
+#         else:
+#             input_parallel = copy_to_tensor_model_parallel_region(input_)
+
+#         if self.config.defer_embedding_wgrad_compute:
+#             if (
+#                 self.config.wgrad_deferral_limit == 0
+#                 or len(self.embedding_activation_buffer) < self.config.wgrad_deferral_limit
+#             ):
+#                 self.embedding_activation_buffer.append(input_parallel)
+
+#         # Matrix multiply.
+#         if not weight.requires_grad:
+#             self._forward_impl = linear_with_frozen_weight
+#         else:
+#             self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
+
+#         allreduce_dgrad = False if self.explicit_expert_comm else self.allreduce_dgrad
+
+#         output_parallel = self._forward_impl(
+#             input=input_parallel,
+#             weight=weight,
+#             bias=bias,
+#             gradient_accumulation_fusion=self.gradient_accumulation_fusion,
+#             async_grad_allreduce=allreduce_dgrad,
+#             sequence_parallel=False if self.explicit_expert_comm else self.sequence_parallel,
+#             grad_output_buffer=(
+#                 self.grad_output_buffer if self.config.defer_embedding_wgrad_compute else None
+#             ),
+#             wgrad_deferral_limit=(
+#                 self.config.wgrad_deferral_limit
+#                 if self.config.defer_embedding_wgrad_compute
+#                 else None
+#             ),
+#             allreduce_dgrad=allreduce_dgrad,
+#         )
+#         if (self.gather_output and not self.deepspeed) or \
+#             (self.deepspeed and self.gather_output and not self.is_expert_without_slicing):
+#             # All-gather across the partitions.
+#             assert not self.sequence_parallel
+#             output = gather_from_tensor_model_parallel_region(output_parallel)
+#         else:
+#             output = output_parallel
+#         output_bias = self.bias if self.skip_bias_add else None
+#         return output, output_bias
+
+#     def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None):
+#         """Sharding along axis 0, bias sharded"""
+#         state_dict = self.state_dict(prefix='', keep_vars=True)
+#         return make_sharded_tensors_for_checkpoint(
+#             state_dict, prefix, {'weight': 0, 'bias': 0}, sharded_offsets
+#         )
+
+#     def set_extra_state(self, state: Any):
+#         """Extra state is ignored"""
+
+#     def get_extra_state(self) -> None:
+#         """Keep compatibility with TE state dict."""
+#         return None
+
+def row_parallel_linear_init(
+    self,
+    input_size: int,
+    output_size: int,
+    *,
+    config: ModelParallelConfig,
+    init_method: Callable,
+    bias: bool,
+    input_is_parallel: bool,
+    skip_bias_add: bool,
+    stride: int = 1,
+    keep_master_weight_for_test: bool = False,
+    is_expert: bool = False,
+    tp_comm_buffer_name: str = None,  # Not used
+    moe=False, enable_expert_tensor_parallelism=False,
+):
+    super(RowParallelLinear, self).__init__()
+
+    # Keep input parameters
+    self.input_size = input_size
+    self.output_size = output_size
+    self.input_is_parallel = input_is_parallel
+    self.skip_bias_add = skip_bias_add
+    self.config = config
+    self.is_expert = is_expert
+    self.expert_parallel = config.expert_model_parallel_size > 1
+    self.gradient_accumulation_fusion = config.gradient_accumulation_fusion
+    self.sequence_parallel = config.sequence_parallel
+    if self.sequence_parallel and not self.input_is_parallel:
+        raise RuntimeError("To enable `sequence_parallel`, `input_is_parallel` must be `True`")
+
+    args = get_args()
+    self.deepspeed = args.deepspeed
+    self.explicit_expert_comm = False
+    rank = get_tensor_model_parallel_rank()
+    if not args.deepspeed:
+        self.explicit_expert_comm = self.is_expert and (
+            config.tensor_model_parallel_size > 1 or self.expert_parallel
+        )
+
+        # Divide the weight matrix along the last dimension.
+        if self.explicit_expert_comm and config.moe_extended_tp:
+            world_size = get_tensor_and_expert_parallel_world_size()
+            rank = get_tensor_and_expert_parallel_rank()
+        else:
+            world_size = get_tensor_model_parallel_world_size()
+            rank = get_tensor_model_parallel_rank()
+    else:
+        if moe and (not enable_expert_tensor_parallelism):
+            world_size = 1
+        else:
+            world_size = get_tensor_model_parallel_world_size()
+        self.is_expert_without_slicing = moe and world_size==1
+
+    self.input_size_per_partition = divide(input_size, world_size)
+
+    # Parameters.
+    # Note: torch.nn.functional.linear performs XA^T + b and as a result
+    # we allocate the transpose.
+    # Initialize weight.
+    if config.use_cpu_initialization:
+        self.weight = Parameter(
+            torch.empty(
+                self.output_size, self.input_size_per_partition, dtype=config.params_dtype
+            )
+        )
+        if config.perform_initialization:
+            self.master_weight = _initialize_affine_weight_cpu(
+                self.weight,
+                self.output_size,
+                self.input_size,
+                self.input_size_per_partition,
+                1,
+                init_method,
+                stride=stride,
+                return_master_weight=keep_master_weight_for_test,
+                params_dtype=config.params_dtype,
+                rank=rank,
+                world_size=world_size,
+            )
+    else:
+        self.weight = Parameter(
+            torch.empty(
+                self.output_size,
+                self.input_size_per_partition,
+                device=torch.cuda.current_device(),
+                dtype=config.params_dtype,
+            )
+        )
+        if config.perform_initialization:
+            _initialize_affine_weight_gpu(
+                self.weight,
+                init_method,
+                partition_dim=1,
+                stride=stride,
+                expert_parallel=(self.is_expert and self.expert_parallel),
+            )
+    setattr(self.weight, 'allreduce', not (self.is_expert and self.expert_parallel))
+
+    if bias:
+        if config.use_cpu_initialization:
+            self.bias = Parameter(torch.empty(self.output_size, dtype=config.params_dtype))
+        else:
+            self.bias = Parameter(
+                torch.empty(
+                    self.output_size,
+                    device=torch.cuda.current_device(),
+                    dtype=config.params_dtype,
+                )
+            )
+
+        if config.perform_initialization:
+            # Always initialize bias to zero.
+            with torch.no_grad():
+                self.bias.zero_()
+        setattr(self.bias, 'allreduce', not (self.is_expert and self.expert_parallel))
+        setattr(self.bias, 'sequence_parallel', self.sequence_parallel)
+    else:
+        self.register_parameter('bias', None)
+
+    self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
+
+    # Hook adding a default empty _extra_state for state dict
+    self._register_load_state_dict_pre_hook(
+        lambda state_dict, prefix, *args, **kwargs: state_dict.setdefault(
+            f'{prefix}_extra_state'
+        )
+    )
+
+def row_parallel_linear_forward(self, input_, inference_params=None):
+    """Forward of RowParallelLinear
+
+    Args:
+        input_: 3D tensor whose order of dimension is [sequence, batch, hidden]
+
+    Returns:
+        - output
+        - bias
+    """
+
+    if self.config._cpu_offloading_context is not None:
+        if self.config._cpu_offloading_context.inside_context == True:
+            assert (
+                self.config.cpu_offloading == False
+            ), "CPU Offloading cannot be enabled while using non-TE modules"
+
+    # Set up backprop all-reduce.
+    if self.input_is_parallel or (self.deepspeed and self.is_expert_without_slicing):
+        input_parallel = input_
+    else:
+        assert not self.sequence_parallel
+        input_parallel = scatter_to_tensor_model_parallel_region(input_)
+    # Matrix multiply.
+    if not self.weight.requires_grad:
+        self._forward_impl = linear_with_frozen_weight
+    else:
+        self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
+
+    allreduce_dgrad = False
+
+    output_parallel = self._forward_impl(
+        input=input_parallel,
+        weight=self.weight,
+        bias=None,
+        gradient_accumulation_fusion=self.gradient_accumulation_fusion,
+        async_grad_allreduce=allreduce_dgrad,
+        sequence_parallel=False,
+        grad_output_buffer=None,
+        allreduce_dgrad=allreduce_dgrad,
+        inference_params=inference_params,
+    )
+
+    # All-reduce across all the partitions.
+    if self.explicit_expert_comm:
+        assert self.skip_bias_add
+        output_ = output_parallel
+    elif self.deepspeed and self.is_expert_without_slicing: # non-expert only tensor-parallelism
+        output_ = output_parallel
+    elif self.sequence_parallel and not inference_params:
+        output_ = reduce_scatter_to_sequence_parallel_region(output_parallel)
+    else:
+        output_ = reduce_from_tensor_model_parallel_region(output_parallel)
+    if not self.skip_bias_add:
+        output = (output_ + self.bias) if self.bias is not None else output_
+        output_bias = None
+    else:
+        output = output_
+        output_bias = self.bias
+    return output, output_bias
+
+# class RowParallelLinear(torch.nn.Module):
+#     """Linear layer with row parallelism.
+
+#     The linear layer is defined as Y = XA + b. A is parallelized along its first dimension and X along its second dimension. A = transpose([A_1 .. A_p]) X = [X_1, ..., X_p]
+
+#     Args:
+#         input_size: first dimension of matrix A.
+#         output_size: second dimension of matrix A.
+#         bias: If true, add bias. Note that bias is not parallelized.
+#         input_is_parallel: If true, we assume that the input is already split across the GPUs and we do not split again.
+#         init_method: method to initialize weights. Note that bias is always set to zero.
+#         stride: For the strided linear layers.
+#         keep_master_weight_for_test: This was added for testing and should be set to False. It returns the master weights used for initialization.
+#         skip_bias_add: If True, do not add the bias term, instead return it to be added by the caller. This enables performance optimations where bias can be fused with other elementwise operations.
+#         is_expert: If True, the layer is treated as an MoE expert layer
+#         tp_comm_buffer_name: Communication buffer name. Not used in
+#                              non-Transformer-Engine modules.
+#         config: ModelParallelConfig object
+
+#     """
+
+#     def __init__(
+#         self,
+#         input_size: int,
+#         output_size: int,
+#         *,
+#         config: ModelParallelConfig,
+#         init_method: Callable,
+#         bias: bool,
+#         input_is_parallel: bool,
+#         skip_bias_add: bool,
+#         stride: int = 1,
+#         keep_master_weight_for_test: bool = False,
+#         is_expert: bool = False,
+#         tp_comm_buffer_name: str = None,  # Not used
+#         moe=False, enable_expert_tensor_parallelism=False,
+#     ):
+#         super(RowParallelLinear, self).__init__()
+
+#         # Keep input parameters
+#         self.input_size = input_size
+#         self.output_size = output_size
+#         self.input_is_parallel = input_is_parallel
+#         self.skip_bias_add = skip_bias_add
+#         self.config = config
+#         self.is_expert = is_expert
+#         self.expert_parallel = config.expert_model_parallel_size > 1
+#         self.gradient_accumulation_fusion = config.gradient_accumulation_fusion
+#         self.sequence_parallel = config.sequence_parallel
+#         if self.sequence_parallel and not self.input_is_parallel:
+#             raise RuntimeError("To enable `sequence_parallel`, `input_is_parallel` must be `True`")
+
+#         args = get_args()
+#         self.deepspeed = args.deepspeed
+#         self.explicit_expert_comm = False
+#         rank = get_tensor_model_parallel_rank()
+#         if not args.deepspeed:
+#             self.explicit_expert_comm = self.is_expert and (
+#                 config.tensor_model_parallel_size > 1 or self.expert_parallel
+#             )
+
+#             # Divide the weight matrix along the last dimension.
+#             if self.explicit_expert_comm and config.moe_extended_tp:
+#                 world_size = get_tensor_and_expert_parallel_world_size()
+#                 rank = get_tensor_and_expert_parallel_rank()
+#             else:
+#                 world_size = get_tensor_model_parallel_world_size()
+#                 rank = get_tensor_model_parallel_rank()
+#         else:
+#             if moe and (not enable_expert_tensor_parallelism):
+#                 world_size = 1
+#             else:
+#                 world_size = get_tensor_model_parallel_world_size()
+#             self.is_expert_without_slicing = moe and world_size==1
+
+#         self.input_size_per_partition = divide(input_size, world_size)
+
+#         # Parameters.
+#         # Note: torch.nn.functional.linear performs XA^T + b and as a result
+#         # we allocate the transpose.
+#         # Initialize weight.
+#         if config.use_cpu_initialization:
+#             self.weight = Parameter(
+#                 torch.empty(
+#                     self.output_size, self.input_size_per_partition, dtype=config.params_dtype
+#                 )
+#             )
+#             if config.perform_initialization:
+#                 self.master_weight = _initialize_affine_weight_cpu(
+#                     self.weight,
+#                     self.output_size,
+#                     self.input_size,
+#                     self.input_size_per_partition,
+#                     1,
+#                     init_method,
+#                     stride=stride,
+#                     return_master_weight=keep_master_weight_for_test,
+#                     params_dtype=config.params_dtype,
+#                     rank=rank,
+#                     world_size=world_size,
+#                 )
+#         else:
+#             self.weight = Parameter(
+#                 torch.empty(
+#                     self.output_size,
+#                     self.input_size_per_partition,
+#                     device=torch.cuda.current_device(),
+#                     dtype=config.params_dtype,
+#                 )
+#             )
+#             if config.perform_initialization:
+#                 _initialize_affine_weight_gpu(
+#                     self.weight,
+#                     init_method,
+#                     partition_dim=1,
+#                     stride=stride,
+#                     expert_parallel=(self.is_expert and self.expert_parallel),
+#                 )
+#         setattr(self.weight, 'allreduce', not (self.is_expert and self.expert_parallel))
+
+#         if bias:
+#             if config.use_cpu_initialization:
+#                 self.bias = Parameter(torch.empty(self.output_size, dtype=config.params_dtype))
+#             else:
+#                 self.bias = Parameter(
+#                     torch.empty(
+#                         self.output_size,
+#                         device=torch.cuda.current_device(),
+#                         dtype=config.params_dtype,
+#                     )
+#                 )
+
+#             if config.perform_initialization:
+#                 # Always initialize bias to zero.
+#                 with torch.no_grad():
+#                     self.bias.zero_()
+#             setattr(self.bias, 'allreduce', not (self.is_expert and self.expert_parallel))
+#             setattr(self.bias, 'sequence_parallel', self.sequence_parallel)
+#         else:
+#             self.register_parameter('bias', None)
+
+#         self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
+
+#         # Hook adding a default empty _extra_state for state dict
+#         self._register_load_state_dict_pre_hook(
+#             lambda state_dict, prefix, *args, **kwargs: state_dict.setdefault(
+#                 f'{prefix}_extra_state'
+#             )
+#         )
+
+#     def forward(self, input_, ignore_forward=False, recompute_fwd=False):
+#         """Forward of RowParallelLinear
+
+#         Args:
+#             input_: 3D tensor whose order of dimension is [sequence, batch, hidden]
+
+#         Returns:
+#             - output
+#             - bias
+#         """
+
+#         if self.config._cpu_offloading_context is not None:
+#             if self.config._cpu_offloading_context.inside_context == True:
+#                 assert (
+#                     self.config.cpu_offloading == False
+#                 ), "CPU Offloading cannot be enabled while using non-TE modules"
+
+#         # Set up backprop all-reduce.
+#         if self.input_is_parallel or (self.deepspeed and self.is_expert_without_slicing):
+#             input_parallel = input_
+#         else:
+#             assert not self.sequence_parallel
+#             input_parallel = scatter_to_tensor_model_parallel_region(input_)
+#         # Matrix multiply.
+#         if not self.weight.requires_grad:
+#             self._forward_impl = linear_with_frozen_weight
+#         else:
+#             self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
+
+#         allreduce_dgrad = False
+
+#         output_parallel = self._forward_impl(
+#             input=input_parallel,
+#             weight=self.weight,
+#             bias=None,
+#             gradient_accumulation_fusion=self.gradient_accumulation_fusion,
+#             async_grad_allreduce=allreduce_dgrad,
+#             sequence_parallel=False,
+#             grad_output_buffer=None,
+#             allreduce_dgrad=allreduce_dgrad,
+#         )
+
+#         # All-reduce across all the partitions.
+#         if self.explicit_expert_comm:
+#             assert self.skip_bias_add
+#             output_ = output_parallel
+#         elif self.deepspeed and self.is_expert_without_slicing: # non-expert only tensor-parallelism
+#             output_ = output_parallel
+#         elif self.sequence_parallel:
+#             output_ = reduce_scatter_to_sequence_parallel_region(output_parallel)
+#         else:
+#             output_ = reduce_from_tensor_model_parallel_region(output_parallel)
+#         if not self.skip_bias_add:
+#             output = (output_ + self.bias) if self.bias is not None else output_
+#             output_bias = None
+#         else:
+#             output = output_
+#             output_bias = self.bias
+#         return output, output_bias
+
+#     def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None):
+#         """Sharding along axis 1, bias not sharded"""
+#         state_dict = self.state_dict(prefix='', keep_vars=True)
+#         return make_sharded_tensors_for_checkpoint(
+#             state_dict, prefix, {'weight': 1}, sharded_offsets
+#         )
+
+#     def set_extra_state(self, state: Any):
+#         """Extra state is ignored"""
+
+#     def get_extra_state(self) -> None:
+#         """Keep compatibility with TE state dict."""
+#         return None
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/core/tensor_parallel/random.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/tensor_parallel/random.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a3edcf29d5fbcb2686f48caa4d0f97c93e01f48
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/core/tensor_parallel/random.py
@@ -0,0 +1,315 @@
+import contextlib
+from importlib.metadata import version
+
+import torch
+from pkg_resources import packaging
+from torch import _C
+from torch.cuda import _lazy_call
+from torch.cuda import device as device_ctx_manager
+from torch.utils.checkpoint import detach_variable
+
+from megatron.training.global_vars import get_args
+from megatron.core.parallel_state import (
+    get_data_parallel_rank,
+    get_expert_model_parallel_rank,
+    get_tensor_model_parallel_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from megatron.core.utils import safely_set_viewless_tensor_data
+import megatron.core.tensor_parallel.random as tp_random
+from megatron.core.tensor_parallel.random import (
+    _set_cuda_rng_state,
+    initialize_rng_tracker,
+    CheckpointFunction
+)
+from megatron.core.tensor_parallel.utils import gather_split_1d_tensor, split_tensor_into_1d_equal_chunks
+
+from megatronspeed.training.memory import allocate_mem_buff
+
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+# Whether apply model parallelsim to checkpointed hidden states.
+_CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None
+
+
+def init_checkpointed_activations_memory_buffer():
+    """Initializ the memory buffer for the checkpointed activations."""
+    args = get_args()
+
+    per_layer = args.micro_batch_size * args.max_position_embeddings * \
+                args.hidden_size // args.tensor_model_parallel_size
+    assert args.num_layers % args.checkpoint_num_layers == 0, \
+        'number of layers is not divisible by checkpoint-num-layers'
+    num_checkpointer_layers = args.num_layers // args.checkpoint_num_layers
+    numel = per_layer * num_checkpointer_layers
+    dtype = torch.half
+    if not args.fp16:
+        dtype = torch.float
+
+    global _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER
+    assert _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is None, \
+        'checkpointed activations memory buffer is already allocated.'
+    _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = allocate_mem_buff(
+        'checkpointed activations', numel, dtype, track_usage=False)
+
+
+def reset_checkpointed_activations_memory_buffer():
+    """Reset the memory used for checkpointing."""
+    if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
+        _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.reset()
+
+
+def get_cuda_rng_tracker():
+    """Get cuda rng tracker."""
+    if deepspeed.checkpointing.is_configured():
+        return deepspeed.checkpointing.get_cuda_rng_tracker()
+    
+    initialize_rng_tracker()
+    return tp_random._CUDA_RNG_STATE_TRACKER
+
+def model_parallel_cuda_manual_seed(seed):
+    """Initialize model parallel cuda seed.
+
+    This function should be called after the model parallel is
+    initialized. Also, no torch.cuda.manual_seed should be called
+    after this function. Basically, this is replacement for that
+    function.
+    Two set of RNG states are tracked:
+        default state: This is for data parallelism and is the same among a
+                       set of model parallel GPUs but different across
+                       different model paralle groups. This is used for
+                       example for dropout in the non-tensor-model-parallel regions.
+        tensor-model-parallel state: This state is different among a set of model
+                              parallel GPUs, but the same across data parallel
+                              groups. This is used for example for dropout in
+                              model parallel regions.
+    """
+    if deepspeed.checkpointing.is_configured():
+        return deepspeed.checkpointing.model_parallel_cuda_manual_seed(seed)
+    
+    # 2718 is just for fun and any POSITIVE value will work.
+    offset = seed + 2718
+    tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank()
+    # Data parallel gets the original seed.
+    data_parallel_seed = seed
+    if torch.distributed.get_rank() == 0:
+        print('> initializing model parallel cuda seeds on global rank {}, '
+              'model parallel rank {}, and data parallel rank {} with '
+              'model parallel seed: {} and data parallel seed: {}'.format(
+                  torch.distributed.get_rank(), get_tensor_model_parallel_rank(),
+                  get_data_parallel_rank(), tensor_model_parallel_seed,
+                  data_parallel_seed), flush=True)
+
+    initialize_rng_tracker()
+    tp_random._CUDA_RNG_STATE_TRACKER.reset()
+    # Set the default state.
+    torch.cuda.manual_seed(data_parallel_seed)
+    tp_random._CUDA_RNG_STATE_TRACKER.add(tp_random._DATA_PARALLEL_RNG_TRACKER_NAME, data_parallel_seed)
+
+    # and model parallel state.
+    tp_random._CUDA_RNG_STATE_TRACKER.add(tp_random._MODEL_PARALLEL_RNG_TRACKER_NAME,
+                                          tensor_model_parallel_seed)
+
+    expert_parallel_seed = (
+        seed + 1024 + 100 * get_expert_model_parallel_rank() + get_tensor_model_parallel_rank()
+    )
+    tp_random._CUDA_RNG_STATE_TRACKER.add(tp_random._EXPERT_PARALLEL_RNG_TRACKER_NAME, expert_parallel_seed)
+
+
+def model_parallel_reconfigure_tp_seed(seed):
+    if deepspeed.checkpointing.is_configured():
+        return deepspeed.checkpointing.model_parallel_reconfigure_tp_seed(seed)
+
+    model_parallel_seed = seed + 2718 + get_tensor_model_parallel_rank()
+    with tp_random._CUDA_RNG_STATE_TRACKER.fork():
+        get_accelerator().manual_seed(model_parallel_seed)
+
+def checkpoint_function_forward(ctx, run_function, distribute_saved_activations, *args):
+    ctx.run_function = run_function
+    ctx.distribute_saved_activations \
+        = distribute_saved_activations
+
+    # Copy the rng states.
+    ctx.fwd_cpu_rng_state = torch.get_rng_state()
+    ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
+    ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+
+    with torch.no_grad():
+        outputs = run_function(*args, is_recompute_forward=False)
+
+    # Divide hidden states across model parallel group and only keep
+    # the chunk corresponding to the current rank.
+    if distribute_saved_activations:
+        ctx.input_0_shape = args[0].data.shape
+        safely_set_viewless_tensor_data(
+            args[0],
+            split_tensor_into_1d_equal_chunks(args[0].data, new_buffer=True))
+
+    # HACK: currently when DeepSpeed is used, we always set
+    # distribute_saved_activations to false, and use the following older
+    # activation checkpointing mechanisms
+    if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
+        ctx.input_0_shape = args[0].data.shape
+        args[0].data = split_tensor_into_1d_equal_chunks(args[0].data)
+        args[0].data = _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.add(
+            args[0].data)
+
+    # Store everything.
+    ctx.save_for_backward(*args)
+
+    return outputs
+
+def checkpoint_function_backward(ctx, *args):
+    if not torch.autograd._is_checkpoint_valid():
+        raise RuntimeError("Checkpointing is not compatible with .grad(), "
+                            "please use .backward() if possible")
+    inputs = ctx.saved_tensors
+    if ctx.distribute_saved_activations:
+        safely_set_viewless_tensor_data(
+            inputs[0],
+            gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape))
+    # HACK: currently when DeepSpeed is used, we always set
+    # distribute_saved_activations to false, and use the following older
+    # activation checkpointing mechanisms
+    if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
+        inputs[0].data = gather_split_1d_tensor(inputs[0].data)
+        inputs[0].data = inputs[0].data.view(ctx.input_0_shape)
+
+    # Store the current states.
+    bwd_cpu_rng_state = torch.get_rng_state()
+    bwd_cuda_rng_state = torch.cuda.get_rng_state()
+    bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+
+    # Set the states to what it used to be before the forward pass.
+    torch.set_rng_state(ctx.fwd_cpu_rng_state)
+    _set_cuda_rng_state(ctx.fwd_cuda_rng_state)
+    get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)
+
+    # Compute the forward pass.
+    detached_inputs = detach_variable(inputs)
+    with torch.enable_grad():
+        outputs = ctx.run_function(*detached_inputs, is_recompute_forward=True)
+
+    # Set the states back to what it was at the start of this function.
+    torch.set_rng_state(bwd_cpu_rng_state)
+    _set_cuda_rng_state(bwd_cuda_rng_state)
+    get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
+
+    if isinstance(outputs, torch.Tensor):
+        outputs = (outputs,)
+    elif len(outputs) == 2 and isinstance(outputs[1], torch.Tensor) and \
+            torch.equal(outputs[1], torch.tensor(0).to(torch.cuda.current_device())):
+        # a hacky solution to overcome issue when running old script examples/pretrain_gpt_distributed.sh
+        outputs = (outputs[0],)
+    # filter out non tensor outputs for backward pass
+    outputs, args = zip(*filter(lambda x: torch.is_tensor(x[0]), zip(outputs, args)))
+    torch.autograd.backward(outputs, args)
+    grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp
+                    for inp in detached_inputs)
+    return (None, None) + grads
+
+# class CheckpointFunction(torch.autograd.Function):
+#     """Checkpoint Function 
+
+#     This function is adapted from torch.utils.checkpoint with two main changes:
+#     1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
+#     2) the states in the model parallel tracker are also properly tracked/set/reset.
+#     """
+
+#     @staticmethod
+#     def forward(ctx, run_function, distribute_saved_activations, *args):
+#         ctx.run_function = run_function
+#         ctx.distribute_saved_activations \
+#             = distribute_saved_activations
+
+#         # Copy the rng states.
+#         ctx.fwd_cpu_rng_state = torch.get_rng_state()
+#         ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
+#         ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+
+#         with torch.no_grad():
+#             outputs = run_function(*args, is_recompute_forward=False)
+
+#         # Divide hidden states across model parallel group and only keep
+#         # the chunk corresponding to the current rank.
+#         if distribute_saved_activations:
+#             ctx.input_0_shape = args[0].data.shape
+#             safely_set_viewless_tensor_data(
+#                 args[0],
+#                 split_tensor_into_1d_equal_chunks(args[0].data, new_buffer=True))
+
+#         # HACK: currently when DeepSpeed is used, we always set
+#         # distribute_saved_activations to false, and use the following older
+#         # activation checkpointing mechanisms
+#         if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
+#             ctx.input_0_shape = args[0].data.shape
+#             args[0].data = split_tensor_into_1d_equal_chunks(args[0].data)
+#             args[0].data = _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.add(
+#                 args[0].data)
+
+#         # Store everything.
+#         ctx.save_for_backward(*args)
+
+#         return outputs
+
+#     @staticmethod
+#     def backward(ctx, *args):
+#         if not torch.autograd._is_checkpoint_valid():
+#             raise RuntimeError("Checkpointing is not compatible with .grad(), "
+#                                "please use .backward() if possible")
+#         inputs = ctx.saved_tensors
+#         if ctx.distribute_saved_activations:
+#             safely_set_viewless_tensor_data(
+#                 inputs[0],
+#                 gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape))
+#         # HACK: currently when DeepSpeed is used, we always set
+#         # distribute_saved_activations to false, and use the following older
+#         # activation checkpointing mechanisms
+#         if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
+#             inputs[0].data = gather_split_1d_tensor(inputs[0].data)
+#             inputs[0].data = inputs[0].data.view(ctx.input_0_shape)
+
+#         # Store the current states.
+#         bwd_cpu_rng_state = torch.get_rng_state()
+#         bwd_cuda_rng_state = torch.cuda.get_rng_state()
+#         bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+
+#         # Set the states to what it used to be before the forward pass.
+#         torch.set_rng_state(ctx.fwd_cpu_rng_state)
+#         _set_cuda_rng_state(ctx.fwd_cuda_rng_state)
+#         get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)
+
+#         # Compute the forward pass.
+#         detached_inputs = detach_variable(inputs)
+#         with torch.enable_grad():
+#             outputs = ctx.run_function(*detached_inputs, is_recompute_forward=True)
+
+#         # Set the states back to what it was at the start of this function.
+#         torch.set_rng_state(bwd_cpu_rng_state)
+#         _set_cuda_rng_state(bwd_cuda_rng_state)
+#         get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
+
+#         if isinstance(outputs, torch.Tensor):
+#             outputs = (outputs,)
+#         elif len(outputs) == 2 and isinstance(outputs[1], torch.Tensor) and \
+#                 torch.equal(outputs[1], torch.tensor(0).to(torch.cuda.current_device())):
+#             # a hacky solution to overcome issue when running old script examples/pretrain_gpt_distributed.sh
+#             outputs = (outputs[0],)
+#         # filter out non tensor outputs for backward pass
+#         outputs, args = zip(*filter(lambda x: torch.is_tensor(x[0]), zip(outputs, args)))
+#         torch.autograd.backward(outputs, args)
+#         grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp
+#                       for inp in detached_inputs)
+#         return (None, None) + grads
+
+
+def checkpoint(function, distribute_saved_activations, *args):
+    """Checkpoint a model or part of the model.
+    This has been directly copied from torch.utils.checkpoint."""
+    if deepspeed.checkpointing.is_configured():
+        return deepspeed.checkpointing.checkpoint(function, *args)
+    
+    return CheckpointFunction.apply(function,
+                                    distribute_saved_activations, *args)
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/core/tensor_parallel/weight_grad_store.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/tensor_parallel/weight_grad_store.py
new file mode 100644
index 0000000000000000000000000000000000000000..63edc3d85acde8145634b19358eb8a1f8772eab1
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/core/tensor_parallel/weight_grad_store.py
@@ -0,0 +1,34 @@
+import queue
+
+class WeightGradStore:
+
+    cache = []
+    weight_grad_queue = queue.Queue()
+    combine_bw = True
+
+    @classmethod
+    def set_combine_bw(cls, combine_bw):
+        # For the following backward pass, combine W with B and skip next W.
+        cls.combine_bw = combine_bw
+
+    @classmethod
+    def put(cls, total_input, grad_output, weight, func):
+        if cls.combine_bw == True:
+            func(total_input, grad_output, weight)
+            return
+        # Store the weight gradient computation of linear layers.
+        cls.cache.append((total_input, grad_output, weight, func))
+
+    @classmethod
+    def flush(cls):
+        # Collect all stored computations during backward as a W.
+        cls.weight_grad_queue.put(cls.cache)
+        cls.cache = []
+
+    @classmethod
+    def pop(cls):
+        # Execute a single W.
+        assert cls.weight_grad_queue.qsize() > 0
+        stored_grads = cls.weight_grad_queue.get()
+        for total_input, grad_output, weight, func in stored_grads:
+            func(total_input, grad_output, weight)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/__init__.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/transformer/__init__.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/__init__.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/core/transformer/__init__.py
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/core/transformer/utils.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/transformer/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..be6c52ba42d2b285e591ea8c303a3cc6d52ca48f
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/core/transformer/utils.py
@@ -0,0 +1,16 @@
+import torch
+from megatron.training.global_vars import get_args
+
+from deepspeed.runtime.zero import GatheredParameters
+
+
+def get_linear_layer(rows, columns, init_method, perform_initialization=True, gather_params_on_init=False):
+    """Simple linear layer with weight initialization."""
+    layer = torch.nn.Linear(rows, columns)
+    if perform_initialization:  # Take from modelparallel config
+        with GatheredParameters(layer.weight, modifier_rank=0, enable=gather_params_on_init):
+            init_method(layer.weight)
+    with torch.no_grad():
+        with GatheredParameters(layer.weight, modifier_rank=0, enable=gather_params_on_init):
+            layer.bias.zero_()
+    return layer
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/core/utils.py b/toolbox/Megatron-DeepSpeed/megatronspeed/core/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dec4cf2b3af58797e85611eedd203f9a0cb0709
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/core/utils.py
@@ -0,0 +1,8 @@
+from megatron.training.global_vars import get_args
+from megatron.core.utils import get_attr_wrapped_model
+
+def get_model_config(model):
+    args = get_args()
+    if args.deepspeed and hasattr(model, 'module'):
+        return get_attr_wrapped_model(model.module, 'config', allow_none=False)
+    return get_attr_wrapped_model(model, 'config', allow_none=False)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/embeddings/__init__.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/__init__.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/embeddings/__init__.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/__init__.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/Makefile b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/Makefile
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/Makefile
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/Makefile
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/language_module/__init__.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/__init__.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/core/models/common/language_module/__init__.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/__init__.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/autoaugment.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/autoaugment.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/autoaugment.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/autoaugment.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/bert_dataset.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/bert_dataset.py
similarity index 98%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/bert_dataset.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/bert_dataset.py
index c5a4c28fc1580e3aa43bea00f499c97524293bd3..d70bb0a65a2cd2ee6c081fd9da19553129ce637c 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/data/bert_dataset.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/bert_dataset.py
@@ -5,13 +5,13 @@
 import numpy as np
 import torch
 
-from megatron_ds import (
+from megatron import (
     get_args,
     get_tokenizer,
     mpu,
     print_rank_0
 )
-from megatron_ds.data.dataset_utils import (
+from megatronspeed.legacy.data.dataset_utils import (
     get_samples_mapping,
     get_a_and_b_segments,
     truncate_segments,
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/biencoder_dataset_utils.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/biencoder_dataset_utils.py
similarity index 95%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/biencoder_dataset_utils.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/biencoder_dataset_utils.py
index 8451a3ada79afb646d83b23a2e24cf6bef870115..32ddab069bbf13ce079fb6689ecf3de22b2529df 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/data/biencoder_dataset_utils.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/biencoder_dataset_utils.py
@@ -4,11 +4,11 @@ import time
 import numpy as np
 import torch
 
-from megatron_ds import get_args, get_tokenizer, print_rank_0
-from megatron_ds.core import mpu, tensor_parallel
-from megatron_ds.data.dataset_utils import create_masked_lm_predictions, \
+from megatron.training import get_args, get_tokenizer, print_rank_0
+from megatron.core import mpu, tensor_parallel
+from megatronspeed.legacy.data.dataset_utils import create_masked_lm_predictions, \
                                             pad_and_convert_to_numpy
-from megatron_ds.data.data_samplers import MegatronPretrainingSampler
+from megatronspeed.legacy.data.data_samplers import MegatronPretrainingSampler
 
 def make_attention_mask(source_block, target_block):
     """
@@ -163,7 +163,7 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo
         print_rank_0(' > building samples index mapping for {} ...'.format(
             name))
 
-        from megatron_ds.core.datasets import helpers
+        from megatron.core.datasets import helpers
         mapping_array = helpers.build_blocks_mapping(
             block_dataset.document_indices,
             block_dataset.sequence_lengths,
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/blendable_dataset.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/blendable_dataset.py
similarity index 94%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/blendable_dataset.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/blendable_dataset.py
index 55a61c3f9a62cc7342972f82f08907d4751353cd..33391c5da3ceed969ddd2a5a5e48f4c17fa5b7b8 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/data/blendable_dataset.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/blendable_dataset.py
@@ -9,8 +9,9 @@ import time
 import numpy as np
 import torch
 from deepspeed.accelerator import get_accelerator
-from megatron_ds import print_rank_0
-from megatron_ds.core import mpu
+from megatron.training import print_rank_0
+from megatron.core import mpu
+from megatronspeed.core import parallel_state
 
 class BlendableDataset(torch.utils.data.Dataset):
 
@@ -37,7 +38,7 @@ class BlendableDataset(torch.utils.data.Dataset):
             dataset_index = np.zeros(self.size, dtype=np.uint8)
             dataset_sample_index = np.zeros(self.size, dtype=np.int64)
 
-            from megatron_ds.data import helpers
+            from megatronspeed.legacy.data import helpers
             helpers.build_blending_indices(dataset_index, dataset_sample_index,
                                            weights, num_datasets, self.size,
                                            torch.distributed.get_rank() == 0)
@@ -85,7 +86,7 @@ class BlendableDataset(torch.utils.data.Dataset):
             if counts[0].item() != (
                 torch.distributed.get_world_size() //
                 torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) //
-                torch.distributed.get_world_size(group=mpu.get_sequence_parallel_group())):
+                torch.distributed.get_world_size(group=parallel_state.get_sequence_parallel_group())):
                 print_rank_0("Data index creation unsuccessful, exiting.")
                 exit()
 
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/data_samplers.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/data_samplers.py
similarity index 94%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/data_samplers.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/data_samplers.py
index 043f726243afaddf4ecd312d1fe53365af13e561..ec05f14885029b227a843c3d3a814faef51441fc 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/data/data_samplers.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/data_samplers.py
@@ -7,8 +7,8 @@ import random
 import torch
 import numpy as np
 from torch.utils.data import Dataset
-from megatron_ds import get_args
-from megatron_ds.core import mpu
+from megatron.training import get_args
+from megatron.core import mpu
 from deepspeed.runtime.dataloader import RepeatingLoader
 
 def build_pretraining_data_loader(dataset, consumed_samples):
@@ -39,11 +39,20 @@ def build_pretraining_data_loader(dataset, consumed_samples):
         raise Exception('{} dataloader type is not supported.'.format(
                 args.dataloader_type))
 
+    def worker_init_fn(worker_id):
+        random.seed(args.seed + worker_id)
+        np.random.seed(args.seed + worker_id)
+
+    g = torch.Generator()
+    g.manual_seed(args.seed)
+
     # Torch dataloader.
     loader = torch.utils.data.DataLoader(dataset,
                                        batch_sampler=batch_sampler,
                                        num_workers=args.num_workers,
-                                       pin_memory=True)
+                                       pin_memory=True,
+                                       worker_init_fn=worker_init_fn,
+                                       generator=g)
     if args.repeated_dataloader:
         loader=RepeatingLoader(loader)
     return loader
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/dataset_utils.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/dataset_utils.py
similarity index 98%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/dataset_utils.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/dataset_utils.py
index 4bee3faac6a67a330f88aaf1834c997e89aced2b..64793d6aac4dd7bf1de45d5c1496b8854d586c7b 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/data/dataset_utils.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/dataset_utils.py
@@ -26,12 +26,12 @@ import collections
 import numpy as np
 import torch
 
-from megatron_ds import (
+from megatron.training import (
     get_args,
     print_rank_0
 )
-from megatron_ds.core import mpu
-from megatron_ds.core.datasets.indexed_dataset import MMapIndexedDataset
+from megatron.core import mpu
+from megatronspeed.legacy.data.indexed_dataset import MMapIndexedDataset
 
 
 DSET_TYPE_BERT = 'standard_bert'
@@ -548,10 +548,10 @@ def build_dataset(name, data_prefix, max_num_samples,
                   max_seq_length_dec, dataset_type='standard_bert',
                   indexed_dataset=None):
 
-    from megatron_ds.data.bert_dataset import BertDataset
-    from megatron_ds.data.ict_dataset import ICTDataset
-    from megatron_ds.data.t5_dataset import T5Dataset
-    from megatron_ds.data.multimodal_dataset import MultiModalDataset
+    from megatronspeed.legacy.data.bert_dataset import BertDataset
+    from megatronspeed.legacy.data.ict_dataset import ICTDataset
+    from megatronspeed.legacy.data.t5_dataset import T5Dataset
+    from megatronspeed.legacy.data.multimodal_dataset import MultiModalDataset
 
     if dataset_type not in DSET_TYPES:
         raise ValueError("Invalid dataset_type: ", dataset_type)
@@ -714,7 +714,7 @@ def get_samples_mapping(indexed_dataset,
         print_rank_0(' > building samples index mapping for {} ...'.format(
             name))
         # First compile and then import.
-        from megatron_ds.core.datasets import helpers
+        from megatron.core.datasets import helpers
         samples_mapping = helpers.build_mapping(
             indexed_dataset.document_indices,
             indexed_dataset.sequence_lengths,
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/gpt_dataset.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/gpt_dataset.py
similarity index 97%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/gpt_dataset.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/gpt_dataset.py
index 457c2a660cd18a374ef3cc1bc82b0dd1dc3e5dd4..481c46b2c6ff45d562062865b0d6c18afc316ddd 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/data/gpt_dataset.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/gpt_dataset.py
@@ -9,12 +9,13 @@ import time
 import numpy as np
 import torch
 from deepspeed.accelerator import get_accelerator
-from megatron_ds import print_rank_0, is_rank_0, get_args
-from megatron_ds.core import mpu
-from megatron_ds.data.blendable_dataset import BlendableDataset
-from megatron_ds.data.dataset_utils import get_datasets_weights_and_num_samples
-from megatron_ds.data.dataset_utils import get_train_valid_test_split_
-from megatron_ds.data.indexed_dataset import make_dataset as make_indexed_dataset
+from megatron.training import print_rank_0, get_args
+from megatron.core import mpu
+from megatron.legacy.data.blendable_dataset import BlendableDataset
+from megatron.legacy.data.dataset_utils import get_datasets_weights_and_num_samples
+from megatron.legacy.data.dataset_utils import get_train_valid_test_split_
+from megatron.legacy.data.indexed_dataset import make_dataset as make_indexed_dataset
+from megatronspeed.core import parallel_state
 
 
 def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
@@ -389,7 +390,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
     data_cache_success = True
 
     # Build the indexed mapping if not exist.
-    if build_indices and is_rank_0():
+    if build_indices and torch.distributed.get_rank() == 0:
         print_rank_0(' > WARNING: could not find index map files, building '
                      'the indices on rank 0 ...')
 
@@ -450,7 +451,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
             start_time = time.time()
             # Use C++ implementation for speed.
             # First compile and then import.
-            from megatron_ds.data import helpers
+            from megatronspeed.legacy.data import helpers
             assert doc_idx.dtype == np.int32
             assert sizes.dtype == np.int32
             sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
@@ -488,7 +489,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
     if counts[0].item() != (
         torch.distributed.get_world_size() //
         torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()) //
-        torch.distributed.get_world_size(group=mpu.get_sequence_parallel_group())):
+        torch.distributed.get_world_size(group=parallel_state.get_sequence_parallel_group())):
         print_rank_0("Data index creation unsuccessful, exiting.")
         exit()
 
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/helpers.cpp b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/helpers.cpp
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/helpers.cpp
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/helpers.cpp
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/ict_dataset.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/ict_dataset.py
similarity index 96%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/ict_dataset.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/ict_dataset.py
index ee6c5e931ffdea5f9c1b6ceacece6c31a9ab051e..e7d6f64c2e6b6cf9c94edd0166a816e52d85f079 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/data/ict_dataset.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/ict_dataset.py
@@ -4,10 +4,10 @@ import random
 import numpy as np
 from torch.utils.data import Dataset
 
-from megatron_ds import get_tokenizer
-from megatron_ds import get_args
-from megatron_ds.data.dataset_utils import get_indexed_dataset_
-from megatron_ds.data.realm_dataset_utils import get_block_samples_mapping
+from megatron.training import get_tokenizer
+from megatron.training import get_args
+from megatronspeed.legacy.data.dataset_utils import get_indexed_dataset_
+from megatronspeed.legacy.data.realm_dataset_utils import get_block_samples_mapping
 
 def make_attention_mask(source_block, target_block):
     """
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/image_folder.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/image_folder.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/image_folder.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/image_folder.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/indexed_dataset.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/indexed_dataset.py
similarity index 99%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/indexed_dataset.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/indexed_dataset.py
index 08844e77592160cc1bfafcd48386dc74eedcdf80..fdbc3dbf0617d2ae5f67815218d7c29a20b04b19 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/data/indexed_dataset.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/indexed_dataset.py
@@ -21,7 +21,7 @@ from itertools import accumulate
 
 import numpy as np
 import torch
-from megatron_ds import print_rank_0
+from megatron.training import print_rank_0
 
 
 def __best_fitting_dtype(vocab_size=None):
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/multimodal_dataset.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/multimodal_dataset.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/multimodal_dataset.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/multimodal_dataset.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/orqa_wiki_dataset.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/orqa_wiki_dataset.py
similarity index 97%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/orqa_wiki_dataset.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/orqa_wiki_dataset.py
index 7bf0ae8e7b9d9fcab8e1b041085018a0f1193454..663f260b43ffac07a0c1e084f87c1e0521c15dc3 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/data/orqa_wiki_dataset.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/orqa_wiki_dataset.py
@@ -9,9 +9,9 @@ import random
 import torch
 from torch.utils.data import Dataset
 
-from megatron_ds import print_rank_0, get_args, get_tokenizer
-from megatron_ds.core import tensor_parallel
-from megatron_ds.data.biencoder_dataset_utils import make_attention_mask
+from megatron.training import print_rank_0, get_args, get_tokenizer
+from megatron.core import tensor_parallel
+from megatronspeed.legacy.data.biencoder_dataset_utils import make_attention_mask
 
 def get_open_retrieval_wiki_dataset():
     args = get_args()
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/realm_dataset_utils.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/realm_dataset_utils.py
similarity index 94%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/realm_dataset_utils.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/realm_dataset_utils.py
index c89de76dbba78fdde1e755c6a731b36ee88ff4fa..fee26697b7e2eed863e364d2d963b4c9f8c3455b 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/data/realm_dataset_utils.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/realm_dataset_utils.py
@@ -4,10 +4,10 @@ import time
 import numpy as np
 import torch
 
-from megatron_ds import print_rank_0
-from megatron_ds.core import mpu, tensor_parallel
-from megatron_ds.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy
-from megatron_ds import get_args, get_tokenizer, print_rank_0
+from megatron.training import print_rank_0
+from megatron.core import mpu, tensor_parallel
+from megatronspeed.legacy.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy
+from megatron.training import get_args, get_tokenizer, print_rank_0
 
 
 def get_one_epoch_dataloader(dataset, micro_batch_size=None):
@@ -24,7 +24,7 @@ def get_one_epoch_dataloader(dataset, micro_batch_size=None):
     sampler = torch.utils.data.SequentialSampler(dataset)
     # importantly, drop_last must be False to get all the data.
     assert False, 'DistributedBatchSampler deprecated, change the implementation'
-    from megatron_ds.data.samplers import DistributedBatchSampler
+    from megatron.legacy.data.samplers import DistributedBatchSampler
     batch_sampler = DistributedBatchSampler(sampler,
                                             batch_size=global_batch_size,
                                             drop_last=False,
@@ -153,7 +153,7 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo
         print_rank_0(' > building samples index mapping for {} ...'.format(
             name))
 
-        from megatron_ds.core.datasets import helpers
+        from megatron.core.datasets import helpers
         mapping_array = helpers.build_blocks_mapping(
             block_dataset.document_indices,
             block_dataset.sequence_lengths,
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/realm_index.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/realm_index.py
similarity index 99%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/realm_index.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/realm_index.py
index 2a14d74637914f84fc437460f282d10ab67f1471..2575af7ff0f1b8ba73cb5cc789c538a334f81e22 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/data/realm_index.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/realm_index.py
@@ -6,8 +6,8 @@ import shutil
 import numpy as np
 import torch
 
-from megatron_ds import get_args
-from megatron_ds.core import mpu
+from megatron.training import get_args
+from megatron.core import mpu
 
 
 def detach(tensor):
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/t5_dataset.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/t5_dataset.py
similarity index 99%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/t5_dataset.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/t5_dataset.py
index 1490b2141a7a4fa46badc8fea65c5768d59d0811..7f8bd77309ab1b8ed5a62f83e2e4d67b1de3ed61 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/data/t5_dataset.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/t5_dataset.py
@@ -7,8 +7,8 @@ import collections
 import numpy as np
 import torch
 
-from megatron_ds import get_tokenizer
-from megatron_ds.data.dataset_utils import (
+from megatron.training import get_tokenizer
+from megatronspeed.legacy.data.dataset_utils import (
     create_masked_lm_predictions,
     get_samples_mapping
 )
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/test/test_indexed_dataset.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/test/test_indexed_dataset.py
similarity index 97%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/test/test_indexed_dataset.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/test/test_indexed_dataset.py
index 43a9a2c56752632372d91fbe0b554c22e9c49152..0cb5ce14f23f7c52c3a5cde72032166e71657c8a 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/data/test/test_indexed_dataset.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/test/test_indexed_dataset.py
@@ -2,8 +2,8 @@
 # put some code used during development and manual testing of
 # indexed_dataset.
 
-from megatron_ds.data import indexed_dataset
-from megatron_ds.tokenizer import build_tokenizer
+from megatronspeed.legacy.data import indexed_dataset
+from megatron.training.tokenizer import build_tokenizer
 import argparse
 import os
 import sys
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/test/test_preprocess_data.sh b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/test/test_preprocess_data.sh
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/test/test_preprocess_data.sh
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/test/test_preprocess_data.sh
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/vit_dataset.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/vit_dataset.py
similarity index 97%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/vit_dataset.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/vit_dataset.py
index 8da5b38e477dec16cd6251514a2787be736d19b2..b6688ea029b508cba50a7b3e66c724b630d4b0f2 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/data/vit_dataset.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/data/vit_dataset.py
@@ -5,10 +5,10 @@ import numpy as np
 import torch
 import torchvision.transforms as T
 from torchvision import datasets
-from megatron_ds import get_args
-from megatron_ds.data.image_folder import ImageFolder
-from megatron_ds.data.autoaugment import ImageNetPolicy
-from megatron_ds.data.data_samplers import RandomSeedDataset
+from megatron.training import get_args
+from megatronspeed.legacy.data.image_folder import ImageFolder
+from megatronspeed.legacy.data.autoaugment import ImageNetPolicy
+from megatronspeed.legacy.data.data_samplers import RandomSeedDataset
 from PIL import Image, ImageFilter, ImageOps
 
 
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/custom_layers/__init__.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/__init__.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/core/transformer/custom_layers/__init__.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/__init__.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/bert_model.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/bert_model.py
similarity index 68%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/model/bert_model.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/bert_model.py
index ee14a433c07180b505d9e99e2071ec7925b913a7..42b098449f0d4c8d013b13c66cd6f633c3b7db48 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/bert_model.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/bert_model.py
@@ -4,49 +4,28 @@
 
 import torch
 
-from megatron_ds import get_args
-from megatron_ds.core import tensor_parallel
-from megatron_ds.model.enums import AttnMaskType
-from megatron_ds.model.language_model import parallel_lm_logits
-from megatron_ds.model.language_model import get_language_model
-from megatron_ds.model.utils import get_norm
-from megatron_ds.model.utils import openai_gelu, erf_gelu
-from megatron_ds.model.utils import get_linear_layer
-from megatron_ds.model.utils import init_method_normal
-from megatron_ds.model.utils import scaled_init_method_normal
-from .module import MegatronModule
-
-
-def bert_extended_attention_mask(attention_mask):
-    # We create a 3D attention mask from a 2D tensor mask.
-    # [b, 1, s]
-    attention_mask_b1s = attention_mask.unsqueeze(1)
-    # [b, s, 1]
-    attention_mask_bs1 = attention_mask.unsqueeze(2)
-    # [b, s, s]
-    attention_mask_bss = attention_mask_b1s * attention_mask_bs1
-    # [b, 1, s, s]
-    extended_attention_mask = attention_mask_bss.unsqueeze(1)
-
-    # Convert attention mask to binary:
-    extended_attention_mask = (extended_attention_mask < 0.5)
-
-    return extended_attention_mask
-
-def bert_position_ids(token_ids):
-    # Create position ids
-    seq_length = token_ids.size(1)
-    position_ids = torch.arange(seq_length, dtype=torch.long,
-                                device=token_ids.device)
-    position_ids = position_ids.unsqueeze(0).expand_as(token_ids)
-
-    return position_ids
+from megatron.training import get_args
+from megatron.core import tensor_parallel
+from megatron.legacy.model.enums import AttnMaskType
+from megatron.legacy.model.language_model import parallel_lm_logits
+from megatron.legacy.model.language_model import get_language_model
+from megatron.legacy.model.utils import get_norm
+from megatron.legacy.model.utils import openai_gelu, erf_gelu
+from megatron.legacy.model.utils import get_linear_layer
+from megatron.legacy.model.utils import init_method_normal
+from megatron.legacy.model.utils import scaled_init_method_normal
+from megatron.legacy.model.module import MegatronModule
+from megatron.legacy.model.bert_model import (
+    bert_extended_attention_mask,
+    bert_position_ids,
+    post_language_model_processing
+)
 
 
 class BertLMHead(MegatronModule):
     """Masked LM head for Bert
 
-    Arguments:
+    Args:
         config: TransformerConfig object
         mpu_vocab_size: model parallel size of vocabulary.
         parallel_output: whether output logits being distributed or not.
@@ -60,7 +39,7 @@ class BertLMHead(MegatronModule):
         tensor_parallel.set_tensor_model_parallel_attributes(self.bias, True, 0, 1)
         self.parallel_output = parallel_output
 
-        self.dense = get_linear_layer(config.hidden_size, config.hidden_size, config.init_method)
+        self.dense = get_linear_layer(config.hidden_size, config.hidden_size, config.init_method, gather_params_on_init=args.zero_stage == 3)
         setattr(self.dense.weight, 'sequence_parallel', config.sequence_parallel)
         setattr(self.dense.bias, 'sequence_parallel', config.sequence_parallel)
 
@@ -93,37 +72,6 @@ class BertLMHead(MegatronModule):
         super().load_state_dict(state_dict_, strict)
 
 
-def post_language_model_processing(lm_output, pooled_output,
-                                   lm_head, binary_head,
-                                   lm_labels,
-                                   logit_weights,
-                                   fp16_lm_cross_entropy):
-    # Output.
-    lm_logits = lm_head(
-        lm_output, logit_weights)
-
-    binary_logits = None
-    if binary_head is not None:
-        binary_logits = binary_head(pooled_output)
-
-    if lm_labels is None:
-        # [s b h] => [b s h]
-        return lm_logits.transpose(0,1).contiguous(), binary_logits
-    else:
-        # [b s] => [s b]
-        lm_labels = lm_labels.transpose(0,1).contiguous()
-        # lm_logits : [s, b, h] and lm_labels: [s, b]
-        if fp16_lm_cross_entropy:
-            assert lm_logits.dtype == torch.half
-            lm_loss = tensor_parallel.vocab_parallel_cross_entropy(lm_logits, lm_labels)
-        else:
-            lm_loss = tensor_parallel.vocab_parallel_cross_entropy(lm_logits.float(),
-                                                        lm_labels)
-        # [s, b] => [b s]
-        lm_loss = lm_loss.transpose(0,1).contiguous()
-        return lm_loss, binary_logits
-
-
 class BertModel(MegatronModule):
     """Bert Language model."""
 
@@ -133,7 +81,8 @@ class BertModel(MegatronModule):
                  add_binary_head=True,
                  parallel_output=True,
                  pre_process=True,
-                 post_process=True):
+                 post_process=True,
+                 return_moe_loss=False):
         super().__init__(config=config)
         args = get_args()
 
@@ -145,6 +94,7 @@ class BertModel(MegatronModule):
         self.parallel_output = parallel_output
         self.pre_process = pre_process
         self.post_process = post_process
+        self.return_moe_loss = return_moe_loss
 
         self.return_embeddings = args.output_bert_embeddings
         if self.return_embeddings:
@@ -165,11 +115,12 @@ class BertModel(MegatronModule):
             self.binary_head = None
             if self.add_binary_head:
                 self.binary_head = get_linear_layer(config.hidden_size, 2,
-                                                    config.init_method)
+                                                    config.init_method,
+                                                    args.zero_stage == 3)
                 self._binary_head_key = 'binary_head'
 
     def set_input_tensor(self, input_tensor):
-        """See megatron_ds.model.transformer.set_input_tensor()"""
+        """See megatron.legacy.model.transformer.set_input_tensor()"""
         self.language_model.set_input_tensor(input_tensor)
 
     def forward(self, bert_model_input, attention_mask,
@@ -187,7 +138,7 @@ class BertModel(MegatronModule):
         )
 
         if self.post_process and self.add_binary_head:
-            lm_output, pooled_output = lm_output
+            lm_output, pooled_output, moe_losses = lm_output
 
             # Return pooled output (e.g., when computing Bert embeddings).
             if self.return_embeddings:
@@ -210,11 +161,14 @@ class BertModel(MegatronModule):
             pooled_output = None
 
         if self.post_process:
-            return post_language_model_processing(lm_output, pooled_output,
-                                                  self.lm_head, self.binary_head,
-                                                  lm_labels,
-                                                  self.shared_embedding_or_output_weight(),
-                                                  self.fp16_lm_cross_entropy)
+            if not self.add_binary_head:
+                lm_output, moe_losses = lm_output
+            lm_output = post_language_model_processing(lm_output, pooled_output,
+                                                       self.lm_head, self.binary_head,
+                                                       lm_labels,
+                                                       self.shared_embedding_or_output_weight(),
+                                                       self.fp16_lm_cross_entropy)
+            return *lm_output, moe_losses if self.return_moe_loss else lm_output
         else:
             return lm_output
 
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/biencoder_model.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/biencoder_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..775d88b14166830f087416405866b22f4130c70c
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/biencoder_model.py
@@ -0,0 +1,103 @@
+import os
+import torch
+import sys
+
+from megatron.training import get_args, print_rank_0, get_tokenizer
+from megatron.core import mpu
+from megatron.training.checkpointing import fix_query_key_value_ordering
+from megatron.training.checkpointing import get_checkpoint_tracker_filename
+from megatron.training.checkpointing import get_checkpoint_name
+from megatron.legacy.model.bert_model import bert_position_ids
+from megatron.legacy.model.enums import AttnMaskType
+from megatron.legacy.model.language_model import get_language_model
+from megatron.legacy.model.utils import get_linear_layer
+from megatron.legacy.model.utils import init_method_normal
+from megatron.legacy.model.utils import scaled_init_method_normal
+from megatron.legacy.model.module import MegatronModule
+
+
+class PretrainedBertModel(MegatronModule):
+    """BERT-based encoder for queries or contexts used for
+    learned information retrieval."""
+
+    def __init__(self, num_tokentypes=2,
+            parallel_output=True, pre_process=True, post_process=True):
+        super(PretrainedBertModel, self).__init__()
+
+        args = get_args()
+        tokenizer = get_tokenizer()
+        self.pad_id = tokenizer.pad
+        self.biencoder_projection_dim = args.biencoder_projection_dim
+        self.parallel_output = parallel_output
+        self.pre_process = pre_process
+        self.post_process = post_process
+        init_method = init_method_normal(args.init_method_std)
+        scaled_init_method = scaled_init_method_normal(
+            args.init_method_std, args.num_layers)
+
+        self.language_model, self._language_model_key = get_language_model(
+            num_tokentypes=num_tokentypes,
+            add_pooler=False,
+            encoder_attn_mask_type=AttnMaskType.padding,
+            init_method=init_method,
+            scaled_init_method=scaled_init_method,
+            pre_process=self.pre_process,
+            post_process=self.post_process)
+
+        if args.biencoder_projection_dim > 0:
+            self.projection_enc = get_linear_layer(args.hidden_size,
+                                                   args.biencoder_projection_dim,
+                                                   init_method,
+                                                   gather_params_on_init=args.zero_stage == 3)
+            self._projection_enc_key = 'projection_enc'
+
+    def forward(self, input_ids, attention_mask, tokentype_ids=None):
+        extended_attention_mask = attention_mask.unsqueeze(1)
+        #extended_attention_mask = bert_extended_attention_mask(attention_mask)
+        position_ids = bert_position_ids(input_ids)
+
+        lm_output = self.language_model(input_ids,
+                                        position_ids,
+                                        extended_attention_mask,
+                                        tokentype_ids=tokentype_ids)
+        # This mask will be used in average-pooling and max-pooling
+        pool_mask = (input_ids == self.pad_id).unsqueeze(2)
+
+        # Taking the representation of the [CLS] token of BERT
+        pooled_output = lm_output[0, :, :]
+
+        # Converting to float16 dtype
+        pooled_output = pooled_output.to(lm_output.dtype)
+
+        # Output.
+        if self.biencoder_projection_dim:
+            pooled_output = self.projection_enc(pooled_output)
+
+        return pooled_output
+
+    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
+        """For easy load when model is combined with other heads,
+        add an extra key."""
+
+        state_dict_ = {}
+        state_dict_[self._language_model_key] \
+            = self.language_model.state_dict_for_save_checkpoint(
+                prefix=prefix, keep_vars=keep_vars)
+
+        if self.biencoder_projection_dim > 0:
+            state_dict_[self._projection_enc_key] = \
+                self.projection_enc.state_dict(prefix=prefix,
+                                               keep_vars=keep_vars)
+
+        return state_dict_
+
+    def load_state_dict(self, state_dict, strict=True):
+        """Customized load."""
+        print_rank_0("loading pretrained weights")
+        self.language_model.load_state_dict(
+            state_dict[self._language_model_key], strict=strict)
+
+        if self.biencoder_projection_dim > 0:
+            print_rank_0("loading projection head weights")
+            self.projection_enc.load_state_dict(
+                state_dict[self._projection_enc_key], strict=strict)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/classification.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/classification.py
similarity index 80%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/model/classification.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/classification.py
index 2b1588679aabf69e7225c7ef15259cd4e45e9208..9a024573b092dae2ce5ebf32f4664061b1b72bb2 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/classification.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/classification.py
@@ -1,17 +1,15 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
 """Classification model."""
 
 import torch
 
-from megatron_ds import get_args, print_rank_last
-from megatron_ds.model.enums import AttnMaskType
-from megatron_ds.model.bert_model import bert_extended_attention_mask, bert_position_ids
-from megatron_ds.model.language_model import get_language_model
-from megatron_ds.model.utils import get_linear_layer
-from megatron_ds.model.utils import init_method_normal
-from megatron_ds.model.utils import scaled_init_method_normal
-from .module import MegatronModule
+from megatron.training import get_args, print_rank_last
+from megatron.legacy.model.enums import AttnMaskType
+from megatron.legacy.model.bert_model import bert_extended_attention_mask, bert_position_ids
+from megatron.legacy.model.language_model import get_language_model
+from megatron.legacy.model.utils import get_linear_layer
+from megatron.legacy.model.utils import init_method_normal
+from megatron.legacy.model.utils import scaled_init_method_normal
+from megatron.legacy.model.module import MegatronModule
 
 
 class Classification(MegatronModule):
@@ -42,11 +40,12 @@ class Classification(MegatronModule):
             self.classification_dropout = torch.nn.Dropout(args.hidden_dropout)
             self.classification_head = get_linear_layer(args.hidden_size,
                                                         self.num_classes,
-                                                        init_method)
+                                                        config.init_method,
+                                                        gather_params_on_init=args.zero_stage == 3)
             self._classification_head_key = 'classification_head'
 
     def set_input_tensor(self, input_tensor):
-        """See megatron_ds.model.transformer.set_input_tensor()"""
+        """See megatron.legacy.model.transformer.set_input_tensor()"""
         self.language_model.set_input_tensor(input_tensor)
 
     def forward(self, model_input, attention_mask, tokentype_ids=None):
@@ -63,7 +62,7 @@ class Classification(MegatronModule):
         )
 
         if self.post_process:
-            _, pooled_output = lm_output
+            _, pooled_output = lm_output[0], lm_output[1]
             classification_output = self.classification_dropout(pooled_output)
             classification_logits = self.classification_head(classification_output)
 
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/distributed.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/distributed.py
similarity index 98%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/model/distributed.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/distributed.py
index 67746b51e430f25c7b29501bedefbe7adc1e867e..6c67756a7c7d670ce743a131a77c4793384e028f 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/distributed.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/distributed.py
@@ -7,9 +7,9 @@ import math
 import torch
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 
-from megatron_ds import get_args
-from megatron_ds.core import mpu
-from .module import MegatronModule
+from megatron.training import get_args
+from megatron.core import mpu
+from megatron.legacy.model.module import MegatronModule
 from deepspeed.accelerator import get_accelerator
 
 class MemoryBuffer:
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/gpt_model.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/gpt_model.py
similarity index 37%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/model/gpt_model.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/gpt_model.py
index d6ba087f4d34caa0a3fc82068039ccad6ccab1aa..17ccac86bb67863035d27f1661880ef5981e02e5 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/gpt_model.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/gpt_model.py
@@ -1,25 +1,28 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
 """GPT-2 model."""
 
 import torch
-
-from megatron_ds import get_args, get_rlhf_args
-from megatron_ds.core import mpu, tensor_parallel, sequence_parallel
-from .module import MegatronModule, fp32_to_float16, float16_to_fp32
-
-from .enums import AttnMaskType
-from .language_model import parallel_lm_logits
-from .language_model import get_language_model
-from .utils import init_method_normal
-from .utils import scaled_init_method_normal
-
-from megatron_ds.model import LayerNorm,RMSNorm
-from .language_model import EmbeddingPipe
-from .transformer import ParallelTransformerLayerPipe, LMHeadPipe
+from collections import OrderedDict
+from functools import wraps
+
+from megatron.training import get_args, get_rlhf_args
+from megatron.core import tensor_parallel, mpu
+from megatron.legacy.model.module import MegatronModule, fp32_to_float16, float16_to_fp32
+from megatron.core import ixte_extensions
+
+from megatron.legacy.model.enums import AttnMaskType
+from megatron.legacy.model.language_model import parallel_lm_logits
+from megatron.legacy.model.language_model import get_language_model
+
+from megatron.legacy.model.utils import init_method_normal
+from megatron.legacy.model.utils import scaled_init_method_normal
+
+from megatron.legacy.model import LayerNorm, RMSNorm
+from megatronspeed.core import sequence_parallel
+from megatronspeed.core import parallel_state
+from megatronspeed.legacy.model.language_model import EmbeddingPipe
+from megatronspeed.legacy.model.transformer import ParallelTransformerLayerPipe, LMHeadPipe, get_num_experts_per_layer
 from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec
-import ixformer.train.functions as IXF
-from typing import List, Sequence
+
 
 try:         
     from deepspeed.checkpoint import (
@@ -34,16 +37,11 @@ except ImportError:
     DS_UNIVERSAL_CHECKPOINT_INFO = False  
 
 
-def vocab_range_from_per_partition_vocab_size(
-        per_partition_vocab_size: int, rank, world_size: int
-    ) -> Sequence[int]:
-        index_f = rank * per_partition_vocab_size
-        index_l = index_f + per_partition_vocab_size
-        return index_f, index_l
-
 def post_language_model_processing(lm_output, labels, logit_weights,
                                    parallel_output,
-                                   fp16_lm_cross_entropy, inference_params=None):
+                                   fp16_lm_cross_entropy,
+                                   inference_params=None):
+
     # Output. Format [s b h]
     output = parallel_lm_logits(
         lm_output,
@@ -57,84 +55,207 @@ def post_language_model_processing(lm_output, labels, logit_weights,
     else:
         # [b s] => [s b]
         labels = labels.transpose(0,1).contiguous()
-        cross_entropy = sequence_parallel.vocab_sequence_parallel_cross_entropy if mpu.get_sequence_parallel_world_size() > 1 \
+        cross_entropy = sequence_parallel.vocab_sequence_parallel_cross_entropy if parallel_state.get_sequence_parallel_world_size() > 1 \
             else tensor_parallel.vocab_parallel_cross_entropy
+
         if fp16_lm_cross_entropy:
             assert output.dtype == torch.half
             loss = cross_entropy(output, labels)
         else:
-            # loss = cross_entropy(output.float(), labels)
-            get_vocab_range = vocab_range_from_per_partition_vocab_size
-            partition_vocab_size = output.size()[-1]
-            rank = mpu.get_tensor_model_parallel_rank()
-            world_size = mpu.get_tensor_model_parallel_world_size()
-            group = mpu.get_tensor_model_parallel_group()
-            vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size)
-            loss = IXF.vocab_parallel_cross_entropy(
-                output,
-                labels,
-                0.0,
-                world_size,
-                vocab_start_index,
-                vocab_end_index,
-                group
+            if ixte_extensions._USE_IXTE:
+                from megatron.core.tensor_parallel.utils import VocabUtility
+                from megatron.core.parallel_state import (
+                    get_tensor_model_parallel_group,
+                    get_tensor_model_parallel_rank,
+                    get_tensor_model_parallel_world_size,
             )
+                get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
+                partition_vocab_size = output.size()[-1]
+                rank = get_tensor_model_parallel_rank()
+                world_size = get_tensor_model_parallel_world_size()
+                group = get_tensor_model_parallel_group()
+                vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size)
+                loss = ixte_extensions.vocab_parallel_cross_entropy(
+                    output,
+                    labels,
+                    0.0,
+                    world_size,
+                    vocab_start_index,
+                    vocab_end_index,
+                    group
+                )
+            else:
+                loss = cross_entropy(output.float(), labels)
+        
         # [s b] => [b, s]
         loss = loss.transpose(0,1).contiguous()
         return loss
 
 
-class GPTModel(MegatronModule):
-    """GPT-2 Language model."""
+class UniversalCheckpointInfo:
+    def __init__(self, using_model_pipe: bool):
+        self.using_model_pipe = using_model_pipe
+        self.args = get_args()
+        self.info = self._build_universal_checkpoint_info()
 
-    def __init__(self,
-                 config,
-                 num_tokentypes=0,
-                 parallel_output=True,
-                 pre_process=True,
-                 post_process=True,
-                 return_moe_loss=True,
-                 rlhf_training=False):
-        self.rlhf_training = rlhf_training
-        if rlhf_training:
-            args = get_rlhf_args()
+    def get(self):
+        return self.info
+
+    def _build_universal_checkpoint_info(self):
+        info = dict()
+        if DS_UNIVERSAL_CHECKPOINT_INFO:
+            # Vocabulary parameters (embeddings) that require special handling due to padding.
+            info[VOCABULARY_PARAMETER_PATTERNS] = self._get_vocab_param_patterns()
+
+            if self.using_model_pipe:
+                # Replicated (shared) parameters on the pipeline dimension
+                info[PIPELINE_REPLICATED_PARAMETER_PATTERNS] = self._get_pp_replicated_param_patterns()
+
+            if self.args.tensor_model_parallel_size > 1:
+                # Parameter slices that should be averaged not concatenated.
+                info[TP_REPLICATED_PARAMETER_PATTERNS] = self._get_tp_replicated_param_patterns()
+
+                # Parameter that are sliced on the row dimension
+                info[PARAMETER_WITH_ROW_PARALLELISM_PATTERNS] = self._get_row_parallel_param_patterns()
+
+            # SWIGLU parameters are first sliced on dim=0 to tp slices
+            # Then, each tp slice is chunked into 2 to create the linear layers L1, L2 used for silu(L1(x)) * L2(x))
+            info[PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0] = self._get_swiglu_col_parallel_param_patterns()
+        return info
+
+    def _get_vocab_param_patterns(self):
+        if self.using_model_pipe:
+            if self.args.untie_embeddings_and_output_weights:
+                patterns = [
+                    r"\d+.word_embeddings.weight",
+                    r"\d+.lm_head.weight"
+                ]
+            else:
+                patterns = [
+                    r"tied_modules.embed.word_embeddings.weight"
+                ]
         else:
-            args = get_args()
+            patterns = [
+                "language_model.embedding.word_embeddings.weight"
+            ]
+            if self.args.untie_embeddings_and_output_weights:
+                patterns.append("language_model.output_layer.weight")
+        return patterns
 
-        super().__init__(config=config, share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights)
+    def _get_pp_replicated_param_patterns(self):
+        if self.args.untie_embeddings_and_output_weights:
+            return []
+        patterns = self._get_vocab_param_patterns()
+        if self.args.add_position_embedding:
+            patterns.append(r"tied_modules.embed.position_embeddings.weight")
+        return patterns
 
-        self.parallel_output = parallel_output
-        self.pre_process = pre_process
-        self.post_process = post_process
-        self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
-        self.return_moe_loss = return_moe_loss
-        self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights
-
-        self.language_model, self._language_model_key = get_language_model(
-            config=config,
-            num_tokentypes=num_tokentypes,
-            add_pooler=False,
-            encoder_attn_mask_type=AttnMaskType.causal,
-            pre_process=self.pre_process,
-            post_process=self.post_process,
-            num_experts=args.num_experts,
-            rlhf_training=rlhf_training)
-
-        if not args.untie_embeddings_and_output_weights:
-            self.initialize_word_embeddings()
-
-    def set_input_tensor(self, input_tensor):
-        """See megatron_ds.model.transformer.set_input_tensor()"""
-        self.language_model.set_input_tensor(input_tensor)
-
-    def forward(self, input_ids, position_ids, attention_mask,
+    def _layers_prefix(self):
+        return "" if self.using_model_pipe else "language_model.encoder.layers."
+
+    def _get_tp_replicated_param_patterns(self):
+        layers_prefix = self._layers_prefix()
+        patterns = [
+            layers_prefix + r"\d+.input_layernorm.weight",
+            layers_prefix + r"\d+.post_attention_layernorm.weight",
+        ]
+        # Add final normalization layer
+        final_norm_w_pattern = r"\d+.weight" if self.using_model_pipe \
+            else "language_model.encoder.final_layernorm.weight"
+        patterns.append(final_norm_w_pattern)
+        if self.args.normalization == 'layernorm':
+            final_norm_b_pattern = r"\d+.bias" if self.using_model_pipe \
+                else "language_model.encoder.final_layernorm.bias"
+            patterns.append(final_norm_b_pattern)
+        # add Positional Embedding
+        if self.args.add_position_embedding:
+            pos_emb_pattern = "tied_modules.embed.position_embeddings.weight" if self.using_model_pipe \
+                else "language_model.embedding.position_embeddings.weight"
+            patterns.append(pos_emb_pattern)
+        # add Linear bias
+        if self.args.add_bias_linear:
+            patterns.extend([
+                layers_prefix + r"\d+.self_attention.dense.bias",
+                layers_prefix + r"\d+.mlp.dense_4h_to_h.bias",
+            ])
+        # add LN bias
+        if self.args.normalization == 'layernorm':
+            patterns.extend([
+                layers_prefix + r"\d+.input_layernorm.bias",
+                layers_prefix + r"\d+.post_attention_layernorm.bias",
+            ])
+        return patterns
+
+    def _get_row_parallel_param_patterns(self):
+        layers_prefix = self._layers_prefix()
+        return [
+            layers_prefix + r"\d+.mlp.dense_4h_to_h.weight",
+            layers_prefix + r"\d+.self_attention.dense.weight",
+        ]
+
+    def _get_swiglu_col_parallel_param_patterns(self):
+        if not self.args.swiglu:
+            return []
+        layers_prefix = self._layers_prefix()
+        patterns = [
+            layers_prefix + r"\d+.mlp.dense_h_to_4h.weight",
+        ]
+        if self.args.add_bias_linear:
+            patterns.append(layers_prefix + r"\d+.mlp.dense_h_to_4h.bias")
+        return patterns
+
+def gpt_model_init(self,
+                   config,
+                   num_tokentypes=0,
+                   parallel_output=True,
+                   pre_process=True,
+                   post_process=True,
+                   return_moe_loss=True,
+                   rlhf_training=False):
+    self.rlhf_training = rlhf_training
+    if rlhf_training:
+        args = get_rlhf_args()
+    else:
+        args = get_args()
+    super(self.__class__, self).__init__(config=config, share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights)
+
+    self.parallel_output = parallel_output
+    self.pre_process = pre_process
+    self.post_process = post_process
+    self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
+    self.return_moe_loss = return_moe_loss
+    self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights
+
+    self.language_model, self._language_model_key = get_language_model(
+        config=config,
+        num_tokentypes=num_tokentypes,
+        add_pooler=False,
+        encoder_attn_mask_type=AttnMaskType.causal,
+        pre_process=self.pre_process,
+        post_process=self.post_process,
+        rlhf_training=rlhf_training)
+    
+    if not args.untie_embeddings_and_output_weights:
+        self.initialize_word_embeddings()
+
+def gpt_model_forward_wrapper(fn):
+    @wraps(fn)
+    def wrapper(self, input_ids, position_ids, attention_mask,
                 retriever_input_ids=None,
                 retriever_position_ids=None,
                 retriever_attn_mask=None,
-                labels=None, tokentype_ids=None, inference_params=None,
+                labels=None, tokentype_ids=None, inference_params=None, config=None,
                 curriculum_seqlen=None, parallel_output=None):
         args = get_args()
 
+        # if not args.deepspeed:
+        #     return fn(self, input_ids, position_ids, attention_mask,
+        #               retriever_input_ids=retriever_input_ids,
+        #               retriever_position_ids=retriever_position_ids,
+        #               retriever_attn_mask=retriever_attn_mask,
+        #               labels=labels, tokentype_ids=tokentype_ids,
+        #               inference_params=inference_params, config=config)
+
         if curriculum_seqlen is not None:
             args.curriculum_seqlen = curriculum_seqlen
             if curriculum_seqlen < input_ids.size()[1]:
@@ -152,14 +273,20 @@ class GPTModel(MegatronModule):
                 # If got a None input, need to reset curriculum_seqlen on user side
                 args.curriculum_seqlen = args.seq_length
 
-        lm_output = self.language_model(
+        output = self.language_model(
             input_ids,
             position_ids,
             attention_mask,
             retriever_input_ids=retriever_input_ids,
             retriever_position_ids=retriever_position_ids,
             retriever_attn_mask=retriever_attn_mask,
-            inference_params=inference_params) # [s, b, h]
+            inference_params=inference_params,
+            config=config)
+        
+        if args.deepspeed and self.return_moe_loss:
+            lm_output, moe_losses = output
+        else:
+            lm_output = output
 
         if self.post_process:
             if self.rlhf_training and self.untie_embeddings_and_output_weights:
@@ -189,75 +316,166 @@ class GPTModel(MegatronModule):
                         self.fp16_lm_cross_entropy,
                         inference_params=inference_params)
 
-        return lm_output
-
-    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
-
-        state_dict_ = {}
-        language_model_state_dict = self.language_model.state_dict_for_save_checkpoint(
-                prefix=prefix, keep_vars=keep_vars)
-        # MoE states need to be handled separately by DeepSpeed engine, thus
-        # moving them to the top level dictionary
-        if "moe_state_dict" in language_model_state_dict:
-            for key in list(language_model_state_dict["moe_state_dict"].keys()):
-                state_dict_[key] = language_model_state_dict["moe_state_dict"].pop(key)
-            del language_model_state_dict["moe_state_dict"]
-        state_dict_[self._language_model_key] = language_model_state_dict
-        # Save word_embeddings.
-        if self.post_process and not self.pre_process and not self.untie_embeddings_and_output_weights:
-            state_dict_[self._word_embeddings_for_head_key] \
-                = self.word_embeddings.state_dict(prefix=prefix,
-                                                  keep_vars=keep_vars)
-        return state_dict_
-
-    def load_state_dict(self, state_dict, strict=True):
-        """Customized load."""
-
-        # Load word_embeddings.
-        if self.post_process and not self.pre_process and not self.untie_embeddings_and_output_weights:
-            self.word_embeddings.load_state_dict(
-                state_dict[self._word_embeddings_for_head_key], strict=strict)
-        # Gather MoE states and move under language model
-        moe_state_dict = {}
-        for key in list(state_dict.keys()):
-            if 'expert' in key and 'moe.gate.wg.weight' not in key:
-                moe_state_dict[key] = state_dict.pop(key)
-        if self._language_model_key in state_dict:
-            state_dict = state_dict[self._language_model_key]
-        if len(moe_state_dict) > 0:
-            state_dict["moe_state_dict"] = moe_state_dict
-        self.language_model.load_state_dict(state_dict, strict=strict)
+        return (lm_output, moe_losses) if args.deepspeed and self.return_moe_loss else lm_output
+    return wrapper
+
+def gpt_model_state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
+    state_dict_ = {}
+    language_model_state_dict = self.language_model.state_dict_for_save_checkpoint(
+            prefix=prefix, keep_vars=keep_vars)
+    # MoE states need to be handled separately by DeepSpeed engine, thus
+    # moving them to the top level dictionary
+    if "moe_state_dict" in language_model_state_dict:
+        for key in list(language_model_state_dict["moe_state_dict"].keys()):
+            state_dict_[key] = language_model_state_dict["moe_state_dict"].pop(key)
+        del language_model_state_dict["moe_state_dict"]
+    state_dict_[self._language_model_key] = language_model_state_dict
+    # Save word_embeddings.
+    if self.post_process and not self.pre_process and not self.untie_embeddings_and_output_weights:
+        state_dict_[self._word_embeddings_for_head_key] \
+            = self.word_embeddings.state_dict(prefix=prefix,
+                                                keep_vars=keep_vars)
+    return state_dict_
+
+def gpt_model_load_state_dict(self, state_dict, strict=True):
+    """Customized load."""
+
+    # Load word_embeddings.
+    if self.post_process and not self.pre_process and not self.untie_embeddings_and_output_weights:
+        self.word_embeddings.load_state_dict(
+            state_dict[self._word_embeddings_for_head_key], strict=strict)
+    # Gather MoE states and move under language model
+    moe_state_dict = {}
+    for key in list(state_dict.keys()):
+        if 'expert' in key and 'moe.gate.wg.weight' not in key:
+            moe_state_dict[key] = state_dict.pop(key)
+    if self._language_model_key in state_dict:
+        state_dict = state_dict[self._language_model_key]
+    if len(moe_state_dict) > 0:
+        state_dict["moe_state_dict"] = moe_state_dict
+    self.language_model.load_state_dict(state_dict, strict=strict)
+
+def gpt_model_universal_checkpoint_info(self):
+    return UniversalCheckpointInfo(using_model_pipe=False).get()
+
+
+# class GPTModel(MegatronModule):
+#     """GPT-2 Language model."""
+
+#     def __init__(self,
+#                  config,
+#                  num_tokentypes=0,
+#                  parallel_output=True,
+#                  pre_process=True,
+#                  post_process=True,
+#                  return_moe_loss=True):
+#         args = get_args()
+#         super().__init__(config=config, share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights)
+
+#         self.parallel_output = parallel_output
+#         self.pre_process = pre_process
+#         self.post_process = post_process
+#         self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
+#         self.return_moe_loss = return_moe_loss
+#         self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights
+
+#         self.language_model, self._language_model_key = get_language_model(
+#             config=config,
+#             num_tokentypes=num_tokentypes,
+#             add_pooler=False,
+#             encoder_attn_mask_type=AttnMaskType.causal,
+#             pre_process=self.pre_process,
+#             post_process=self.post_process)
+        
+#         if not args.untie_embeddings_and_output_weights:
+#             self.initialize_word_embeddings()
+
+#     def set_input_tensor(self, input_tensor):
+#         """See megatron.legacy.model.transformer.set_input_tensor()"""
+#         self.language_model.set_input_tensor(input_tensor)
+
+#     def forward(self, input_ids, position_ids, attention_mask,
+#                 retriever_input_ids=None,
+#                 retriever_position_ids=None,
+#                 retriever_attn_mask=None,
+#                 labels=None, tokentype_ids=None, inference_params=None,
+#                 curriculum_seqlen=None):
+#         args = get_args()
+#         if curriculum_seqlen is not None:
+#             args.curriculum_seqlen = curriculum_seqlen
+#             if curriculum_seqlen < input_ids.size()[1]:
+#                 # seqlen-based curriculum learning
+#                 # input_ids, position_ids, labels have size [batch size, seqlen]
+#                 input_ids = input_ids[:, :curriculum_seqlen].contiguous()
+#                 position_ids = position_ids[:, :curriculum_seqlen].contiguous()
+#                 if labels is not None:
+#                     labels = labels[:, :curriculum_seqlen].contiguous()
+
+#                 # attention_mask has size [1, 1, seqlen, seqlen]
+#                 attention_mask = attention_mask[:, :, :curriculum_seqlen, :curriculum_seqlen].contiguous()
+#         else:
+#             if args.curriculum_learning_legacy:
+#                 # If got a None input, need to reset curriculum_seqlen on user side
+#                 args.curriculum_seqlen = args.seq_length
+
+#         lm_output, moe_losses = self.language_model(
+#             input_ids,
+#             position_ids,
+#             attention_mask,
+#             retriever_input_ids=retriever_input_ids,
+#             retriever_position_ids=retriever_position_ids,
+#             retriever_attn_mask=retriever_attn_mask,
+#             inference_params=inference_params)
+
+#         if self.post_process:
+#             lm_output = post_language_model_processing(
+#                 lm_output, labels,
+#                 self.language_model.output_layer.weight if self.untie_embeddings_and_output_weights else self.shared_embedding_or_output_weight(),
+#                 self.parallel_output,
+#                 self.fp16_lm_cross_entropy)
+
+#         return lm_output, moe_losses if self.return_moe_loss else lm_output
+
+#     def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
+
+#         state_dict_ = {}
+#         language_model_state_dict = self.language_model.state_dict_for_save_checkpoint(
+#                 prefix=prefix, keep_vars=keep_vars)
+#         # MoE states need to be handled separately by DeepSpeed engine, thus
+#         # moving them to the top level dictionary
+#         if "moe_state_dict" in language_model_state_dict:
+#             for key in list(language_model_state_dict["moe_state_dict"].keys()):
+#                 state_dict_[key] = language_model_state_dict["moe_state_dict"].pop(key)
+#             del language_model_state_dict["moe_state_dict"]
+#         state_dict_[self._language_model_key] = language_model_state_dict
+#         # Save word_embeddings.
+#         if self.post_process and not self.pre_process and not self.untie_embeddings_and_output_weights:
+#             state_dict_[self._word_embeddings_for_head_key] \
+#                 = self.word_embeddings.state_dict(prefix=prefix,
+#                                                   keep_vars=keep_vars)
+#         return state_dict_
+
+#     def load_state_dict(self, state_dict, strict=True):
+#         """Customized load."""
+
+#         # Load word_embeddings.
+#         if self.post_process and not self.pre_process and not self.untie_embeddings_and_output_weights:
+#             self.word_embeddings.load_state_dict(
+#                 state_dict[self._word_embeddings_for_head_key], strict=strict)
+#         # Gather MoE states and move under language model
+#         moe_state_dict = {}
+#         for key in list(state_dict.keys()):
+#             if 'expert' in key and 'moe.gate.wg.weight' not in key:
+#                 moe_state_dict[key] = state_dict.pop(key)
+#         if self._language_model_key in state_dict:
+#             state_dict = state_dict[self._language_model_key]
+#         if len(moe_state_dict) > 0:
+#             state_dict["moe_state_dict"] = moe_state_dict
+#         self.language_model.load_state_dict(state_dict, strict=strict)
+
+#     def universal_checkpoint_info(self):
+#         return UniversalCheckpointInfo(using_model_pipe=False).get()
 
-    def _get_vocab_param_patterns(self):
-        args = get_args()
-        if args.untie_embeddings_and_output_weights:
-            patterns = [
-                r"\d+.word_embeddings.weight",
-                r"\d+.lm_head.weight"
-            ]
-        else:
-            patterns = [
-                r"tied_modules.embed.word_embeddings.weight"
-            ]
-        return patterns
-
-    def universal_checkpoint_info(self):
-        info = dict()
-        args = get_args()
-
-        if DS_UNIVERSAL_CHECKPOINT_INFO:
-            # Vocabulary parameters (embeddings) that require special handling due to padding.
-            info[VOCABULARY_PARAMETER_PATTERNS] = self._get_vocab_param_patterns()
-            
-            if args.tensor_model_parallel_size > 1:
-                # Parameter slices that should be averaged not concatenated.
-                info[TP_REPLICATED_PARAMETER_PATTERNS] = self._get_tp_replicated_param_patterns()
-
-                # Parameter that are sliced on the row dimension
-                info[PARAMETER_WITH_ROW_PARALLELISM_PATTERNS] = self._get_row_parallel_param_patterns()
-
-        return info
-    
 def CrossEntropy(output, labels):
     labels, loss_mask = labels[0], labels[1]
 
@@ -283,6 +501,7 @@ class GPTModelPipe(PipelineModule,MegatronModule):
                  partition_method='uniform',
                  custom_partition=None):
         args = get_args()
+
         self.parallel_output = parallel_output
 
         if config.init_method is None:
@@ -326,12 +545,33 @@ class GPTModelPipe(PipelineModule,MegatronModule):
                                             embedding_weights_in_fp32=args.embedding_weights_in_fp32,
                                             tied_weight_attr='word_embeddings_weight'))
 
+        experts_per_layer = get_num_experts_per_layer(args.ds_num_experts, args.num_layers, args.expert_interval)
+        self.is_moe_model = any(n_experts > 1 for n_experts in experts_per_layer)
+
+        # Currently PipelineEngine does not support more than 1 pipe and/or grad partitioned tensors that
+        # require grads.
+        # When using MoE, we have 2 tensors that are passed along pipeline stages and both require grads.
+        # Therefore, verify that both pipe_partitioned / grad_partitioned are not enabled
+        if self.is_moe_model and args.pipeline_model_parallel_size > 1 and args.tensor_model_parallel_size > 1:
+            pipe_partitioned_enabled = args.deepspeed_config_dict.get('pipeline', {}).get('pipe_partitioned', False)
+            grad_partitioned_enabled = args.deepspeed_config_dict.get('pipeline', {}).get('grad_partitioned', False)
+            assert not pipe_partitioned_enabled and not grad_partitioned_enabled, \
+                'Pipe and/or Grad partitioning are not supported for MoE model'
+
         for layer_idx in range(args.num_layers):
             self.specs.append(
                 LayerSpec(ParallelTransformerLayerPipe,
-                    config,
-                    layer_number=layer_idx,
-                    self_attn_mask_type=AttnMaskType.causal))
+                          config,
+                          layer_number=layer_idx,
+                          self_attn_mask_type=AttnMaskType.causal,
+                          num_experts=experts_per_layer[layer_idx],
+                          input_aggregated_moe_loss=(self.is_moe_model and layer_idx > 0),
+                          return_aggregated_moe_loss=self.is_moe_model))
+
+        # if model has experts, add a layer to get and cache the aggregated moe loss from the
+        # last transformer layer
+        if self.is_moe_model:
+            self.specs.append(self._calculate_moe_loss)
 
         # Final layernorm after transformer layers
         if args.normalization == 'layernorm':
@@ -370,6 +610,11 @@ class GPTModelPipe(PipelineModule,MegatronModule):
         if args.fp16 or args.bf16:
             self.specs.append(float16_to_fp32)
 
+        # Cache losses
+        self.moe_loss = None
+        self.last_lm_loss = None    # detached, for display only
+        self.last_moe_loss = None   # detached, for display only
+
         if args.checkpoint_activations:
             interval = args.checkpoint_num_layers
         elif args.recompute_granularity == "full" and args.recompute_method == 'uniform':
@@ -383,98 +628,40 @@ class GPTModelPipe(PipelineModule,MegatronModule):
                                              num_mp=mpu.get_tensor_model_parallel_world_size(),
                                              num_dp=mpu.get_data_parallel_world_size())
 
-        super().__init__(layers=self.specs,
-                         loss_fn=CrossEntropy,
-                         topology=topo,
-                         activation_checkpoint_interval=interval,
-                         partition_method=partition_method,
-                         custom_partition=custom_partition,
-                         custom_recompute_layers_per_stage=args.custom_recompute_layers_per_stage)
-
-    @staticmethod
-    def _get_vocab_param_patterns():
-        args = get_args()
-        if args.untie_embeddings_and_output_weights:
-            patterns = [
-                r"\d+.word_embeddings.weight",
-                r"\d+.lm_head.weight"
-            ]
-        else:
-            patterns = [
-                r"tied_modules.embed.word_embeddings.weight"
-            ]
-        return patterns
-
-    def _get_pp_replicated_param_patterns(self):
+        PipelineModule.__init__(self, layers=self.specs,
+                                loss_fn=self.loss_func,
+                                topology=topo,
+                                activation_checkpoint_interval=interval,
+                                partition_method='type:transformer',
+                                custom_partition=custom_partition,
+                                custom_recompute_layers_per_stage=args.custom_recompute_layers_per_stage)
+        # MegatronModule.__init__(self, config=config, share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights)
+        self.config = config
+
+    def _calculate_moe_loss(self, inputs):
+        """ Calculate MoE auxiliary loss """
+        assert isinstance(inputs, tuple) and len(inputs) == 2
+        hidden, aggregated_moe_loss = inputs[0], inputs[1]
         args = get_args()
-        if args.untie_embeddings_and_output_weights:
-            return []
-        patterns = self._get_vocab_param_patterns()
-        if args.add_position_embedding:
-            patterns.append(r"tied_modules.embed.position_embeddings.weight")
-        return patterns
-
-    @staticmethod
-    def _get_tp_replicated_param_patterns():
-        args = get_args()
-        patterns = [
-            r"\d+.input_layernorm.weight",
-            r"\d+.post_attention_layernorm.weight",
-            r"\d+.weight",
-        ]
-        if args.add_position_embedding:
-            patterns.append(r"tied_modules.embed.position_embeddings.weight")
-        if args.add_bias_linear:
-            patterns.extend([
-                r"\d+.self_attention.dense.bias",
-                r"\d+.mlp.dense_4h_to_h.bias",
-            ])
-        if args.normalization == 'layernorm':
-            patterns.extend([
-                r"\d+.input_layernorm.bias",
-                r"\d+.post_attention_layernorm.bias",
-                r"\d+.bias",
-            ])
-        return patterns
-
-    @staticmethod
-    def _get_row_parallel_param_patterns():
-        return [
-            r"\d+.mlp.dense_4h_to_h.weight",
-            r"\d+.self_attention.dense.weight",
-        ]
-
-    @staticmethod
-    def _get_swiglu_col_parallel_param_patterns():
-        args = get_args()
-        if not args.swiglu:
-            return []
-        patterns = [
-            r"\d+.mlp.dense_h_to_4h.weight",
-        ]
-        if args.add_bias_linear:
-            patterns.append(r"\d+.mlp.dense_h_to_4h.bias")
-        return patterns
-
+        self.moe_loss = aggregated_moe_loss * args.moe_loss_coeff
+        return hidden
+
+    def loss_func(self, output, labels):
+        loss = CrossEntropy(output, labels)
+        self.last_lm_loss = loss.clone().detach()
+        if self.moe_loss is not None:
+            loss += self.moe_loss
+            self.last_moe_loss = self.moe_loss.clone().detach()
+        return loss
 
     def universal_checkpoint_info(self):
-        info = dict()
-        if DS_UNIVERSAL_CHECKPOINT_INFO:
-            # Vocabulary parameters (embeddings) that require special handling due to padding.
-            info[VOCABULARY_PARAMETER_PATTERNS] = self._get_vocab_param_patterns()
-
-            # Replicated (shared) parameters on the pipeline dimension
-            info[PIPELINE_REPLICATED_PARAMETER_PATTERNS] = self._get_pp_replicated_param_patterns()
-
-            # Parameter slices that should be averaged not concatenated.
-            info[TP_REPLICATED_PARAMETER_PATTERNS] = self._get_tp_replicated_param_patterns()
+        return UniversalCheckpointInfo(using_model_pipe=True).get()
 
-            # Parameter that are sliced on the row dimension
-            info[PARAMETER_WITH_ROW_PARALLELISM_PATTERNS] = self._get_row_parallel_param_patterns()
+    def get_additional_losses(self):
+        if not self.is_moe_model:
+            return None
+        return OrderedDict({
+            'lm loss': self.last_lm_loss,
+            'moe loss': self.last_moe_loss
+        })
 
-            # SWIGLU parameters are first sliced on dim=0 to tp slices
-            # Then, each tp slice is chunked into 2 to create the linear layers L1, L2 used for silu(L1(x)) * L2(x))
-            info[PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0] = self._get_swiglu_col_parallel_param_patterns()
-        return info
-                         
-                        
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/language_model.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/language_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1db4b66a87c00b70ee26a775243c7342cb0a55e
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/language_model.py
@@ -0,0 +1,1205 @@
+"""Transformer based language model."""
+
+import torch
+from functools import wraps
+import torch.nn.functional as F
+
+from megatron.training import get_args, get_rlhf_args
+from megatron.core import mpu, tensor_parallel
+from megatron.core.enums import ModelType
+from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
+
+from megatron.legacy.model.enums import AttnMaskType, LayerType
+from megatron.legacy.model.module import MegatronModule
+from megatron.legacy.model.transformer import ParallelTransformer
+from megatron.legacy.model.language_model import Pooler, Embedding, TransformerLanguageModel
+from megatron.legacy.model.utils import get_linear_layer
+from megatron.legacy.model.utils import init_method_normal, scaled_init_method_normal
+from megatron.core import ixte_extensions
+from megatronspeed.legacy.model.utils import gather_and_init
+
+def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
+                       bias=None, inference_params=None):
+    """LM logits using word embedding weights."""
+    args = get_args()
+    if ixte_extensions._USE_IXTE and args.transformer_impl == 'transformer_engine':
+        assert bias is None
+        logits_parallel = ixte_extensions.get_logits_linear_func()(
+        input=input_,
+        weight=word_embeddings_weight,
+        sequence_parallel=args.sequence_parallel,
+        gradient_accumulation_fusion=args.gradient_accumulation_fusion,
+        tp_group=mpu.get_tensor_model_parallel_group()
+        )
+        if parallel_output:
+            return logits_parallel
+        return tensor_parallel.gather_from_tensor_model_parallel_region(logits_parallel)
+    # Parallel logits.
+    model_parallel = mpu.get_tensor_model_parallel_world_size() > 1
+    if model_parallel or args.sequence_parallel:
+        input_parallel = input_
+        allreduce_dgrad = model_parallel and not args.sequence_parallel
+    else:
+        input_parallel = tensor_parallel.copy_to_tensor_model_parallel_region(input_)
+        allreduce_dgrad = False
+
+    # Matrix multiply.
+    logits_parallel = tensor_parallel.linear_with_grad_accumulation_and_async_allreduce(
+        input=input_parallel,
+        weight=word_embeddings_weight,
+        bias=bias,
+        gradient_accumulation_fusion=args.gradient_accumulation_fusion,
+        async_grad_allreduce=allreduce_dgrad,
+        sequence_parallel=args.sequence_parallel,
+        grad_output_buffer=None,
+        allreduce_dgrad=allreduce_dgrad,
+        inference_params=inference_params
+    )
+    # Gather if needed.
+
+    if parallel_output:
+        return logits_parallel
+
+    if not args.RLHF:
+        return tensor_parallel.gather_from_tensor_model_parallel_region(logits_parallel)
+    else:
+        return logits_parallel
+
+def get_language_model(config, num_tokentypes, add_pooler,
+                       encoder_attn_mask_type,
+                       add_encoder=True,
+                       add_decoder=False,
+                       decoder_attn_mask_type=AttnMaskType.causal,
+                       pre_process=True, post_process=True,
+                       rlhf_training=False):
+    """Build language model and return along with the key to save."""
+    args = get_args()
+    if config.init_method is None:
+        config.init_method = init_method_normal(config.init_method_std)
+
+    if config.output_layer_init_method is None:
+        config.output_layer_init_method = scaled_init_method_normal(config.init_method_std,
+                                                                    config.num_layers)
+
+    # Language model.
+    language_model = TransformerLanguageModel(
+        config,
+        encoder_attn_mask_type,
+        num_tokentypes=num_tokentypes,
+        add_encoder=add_encoder,
+        add_decoder=add_decoder,
+        decoder_attn_mask_type=decoder_attn_mask_type,
+        add_pooler=add_pooler,
+        pre_process=pre_process,
+        post_process=post_process,
+        rlhf_training=rlhf_training
+    )
+    # key used for checkpoints.
+    language_model_key = 'language_model'
+
+    return language_model, language_model_key
+
+def pooler_init(self, hidden_size, init_method):
+    super(Pooler, self).__init__()
+    args = get_args()
+    self.dense = get_linear_layer(hidden_size, hidden_size, init_method, gather_params_on_init=args.zero_stage == 3)
+    self.sequence_parallel = args.sequence_parallel
+
+# class Pooler(MegatronModule):
+#     """Pooler layer.
+
+#     Pool hidden states of a specific token (for example start of the
+#     sequence) and add a linear transformation followed by a tanh.
+
+#     Args:
+#         hidden_size: hidden size
+#         init_method: weight initialization method for the linear layer.
+#             bias is set to zero.
+#     """
+
+#     def __init__(self, hidden_size, init_method):
+#         super(Pooler, self).__init__()
+#         args = get_args()
+#         self.dense = get_linear_layer(hidden_size, hidden_size, init_method, gather_params_on_init=args.zero_stage == 3)
+#         self.sequence_parallel = args.sequence_parallel
+
+
+#     def forward(self, hidden_states, sequence_index=0):
+#         # hidden_states: [s, b, h]
+#         # sequence_index: index of the token to pool.
+
+#         # gather data along sequence dimensions
+#         # same pooler is run on all tensor parallel nodes
+#         if self.sequence_parallel:
+#             hidden_states = tensor_parallel.gather_from_sequence_parallel_region(
+#                 hidden_states,
+#                 tensor_parallel_output_grad=False)
+
+#         pooled = hidden_states[sequence_index, :, :]
+#         pooled = self.dense(pooled)
+#         pooled = torch.tanh(pooled)
+#         return pooled
+
+def embedding_init(self,
+                   hidden_size,
+                   vocab_size,
+                   max_sequence_length,
+                   embedding_dropout_prob,
+                   config,
+                   num_tokentypes=0,
+                   embedding_weights_in_fp32=False,
+                   rlhf_training=False):
+    super(Embedding, self).__init__()
+
+    self.hidden_size = hidden_size
+    self.init_method = config.init_method
+    self.num_tokentypes = num_tokentypes
+
+    if rlhf_training:
+        args = get_rlhf_args()
+    else:
+        args = get_args()
+
+    # Word embeddings (parallel).
+    self.embedding_weights_in_fp32 = embedding_weights_in_fp32
+    self.params_dtype = args.params_dtype
+    self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
+        vocab_size, self.hidden_size, config=config, init_method=config.init_method)
+    self._word_embeddings_key = 'word_embeddings'
+
+    # Position embedding (serial).
+    self.add_position_embedding = args.position_embedding_type == 'learned_absolute'
+    if self.add_position_embedding:
+        self._position_embeddings_key = 'position_embeddings'
+        if args.sequence_parallel:
+            self.position_embeddings = tensor_parallel.layers.SequenceParallelPositionEmbedding(
+                max_sequence_length, self.hidden_size)
+            # Initialize the position embeddings.
+            self.init_method(self.position_embeddings.local_embeddings.weight)
+        else:
+            self.position_embeddings = torch.nn.Embedding(
+                max_sequence_length, self.hidden_size)
+            # Initialize the position embeddings.
+            if args.perform_initialization:
+                if args.zero_stage == 3:
+                    gather_and_init(self.position_embeddings.weight, self.init_method)
+                else:
+                    self.init_method(self.position_embeddings.weight)
+
+    # Token type embedding.
+    # Add this as an optional field that can be added through
+    # method call so we can load a pretrain model without
+    # token types and add them as needed.
+    self._tokentype_embeddings_key = 'tokentype_embeddings'
+    if self.num_tokentypes > 0:
+        self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes,
+                                                        self.hidden_size)
+        # Initialize the token-type embeddings.
+        if args.perform_initialization:
+            if args.zero_stage == 3:
+                gather_and_init(self.tokentype_embeddings.weight, self.init_method)
+            else:
+                self.init_method(self.tokentype_embeddings.weight)
+    else:
+        self.tokentype_embeddings = None
+
+    self.fp32_residual_connection = args.fp32_residual_connection
+    self.sequence_parallel = args.sequence_parallel
+    self.clone_scatter_output_in_embedding = args.clone_scatter_output_in_embedding
+    # Embeddings dropout
+    self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
+    self.use_ixte = ixte_extensions._USE_IXTE and args.transformer_impl == "transformer_engine"
+
+def embedding_forward(self, input_ids, position_ids, tokentype_ids=None, inference_params=None):
+    # Embeddings.
+    words_embeddings = self.word_embeddings(input_ids)
+    if self.add_position_embedding:
+        position_embeddings = self.position_embeddings(position_ids)
+        embeddings = words_embeddings + position_embeddings
+    else:
+        embeddings = words_embeddings
+
+    if tokentype_ids is not None:
+        assert self.tokentype_embeddings is not None
+        embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
+    else:
+        assert self.tokentype_embeddings is None
+
+    # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
+    embeddings = embeddings.transpose(0, 1).contiguous()
+
+    # If the input flag for fp32 residual connection is set, convert for float.
+    if self.fp32_residual_connection:
+        embeddings = embeddings.float()
+
+    # Dropout.
+    if self.sequence_parallel and not inference_params:
+        if self.use_ixte:
+            embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings, ixte_extensions.get_embedding_tp_overlap_size())
+        else:
+            embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings)
+        # `scatter_to_sequence_parallel_region` returns a view, which prevents
+        # the original tensor from being garbage collected. Clone to facilitate GC.
+        # Has a small runtime cost (~0.5%).
+        if self.clone_scatter_output_in_embedding:
+            embeddings = embeddings.clone()
+        with tensor_parallel.get_cuda_rng_tracker().fork():
+            embeddings = self.embedding_dropout(embeddings)
+    else:
+        embeddings = self.embedding_dropout(embeddings)
+
+    return embeddings
+
+# class Embedding(MegatronModule):
+#     """Language model embeddings.
+
+#     Args:
+#         hidden_size: hidden size
+#         vocab_size: vocabulary size
+#         max_sequence_length: maximum size of sequence. This
+#                              is used for positional embedding
+#         embedding_dropout_prob: dropout probability for embeddings
+#         init_method: weight initialization method
+#         num_tokentypes: size of the token-type embeddings. 0 value
+#                         will ignore this embedding
+#     """
+
+#     def __init__(self,
+#                  hidden_size,
+#                  vocab_size,
+#                  max_sequence_length,
+#                  embedding_dropout_prob,
+#                  config,
+#                  num_tokentypes=0,
+#                  embedding_weights_in_fp32=False):
+#         super(Embedding, self).__init__()
+
+#         self.hidden_size = hidden_size
+#         self.init_method = config.init_method
+#         self.num_tokentypes = num_tokentypes
+
+#         args = get_args()
+#         self.deepspeed = args.deepspeed
+
+#         # Word embeddings (parallel).
+#         self.embedding_weights_in_fp32 = embedding_weights_in_fp32
+#         self.params_dtype = args.params_dtype
+#         self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
+#             vocab_size, self.hidden_size, config=config, init_method=config.init_method)
+#         self._word_embeddings_key = 'word_embeddings'
+
+#         # Position embedding (serial).
+#         self.add_position_embedding = args.position_embedding_type == 'learned_absolute'
+#         if self.add_position_embedding:
+#             self._position_embeddings_key = 'position_embeddings'
+#             if self.deepspeed and args.sequence_parallel:
+#                 self.position_embeddings = tensor_parallel.layers.SequenceParallelPositionEmbedding(
+#                     max_sequence_length, self.hidden_size)
+#                 # Initialize the position embeddings.
+#                 self.init_method(self.position_embeddings.local_embeddings.weight)
+#             else:
+#                 self.position_embeddings = torch.nn.Embedding(
+#                     max_sequence_length, self.hidden_size)
+#                 # Initialize the position embeddings.
+#                 if args.perform_initialization:
+#                     if args.zero_stage == 3:
+#                         gather_and_init(self.position_embeddings.weight, self.init_method)
+#                     else:
+#                         self.init_method(self.position_embeddings.weight)
+
+#         # Token type embedding.
+#         # Add this as an optional field that can be added through
+#         # method call so we can load a pretrain model without
+#         # token types and add them as needed.
+#         self._tokentype_embeddings_key = 'tokentype_embeddings'
+#         if self.num_tokentypes > 0:
+#             self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes,
+#                                                            self.hidden_size)
+#             # Initialize the token-type embeddings.
+#             if args.perform_initialization:
+#                 if args.zero_stage == 3:
+#                     gather_and_init(self.tokentype_embeddings.weight, self.init_method)
+#                 else:
+#                     self.init_method(self.tokentype_embeddings.weight)
+#         else:
+#             self.tokentype_embeddings = None
+
+#         self.fp32_residual_connection = args.fp32_residual_connection
+#         self.sequence_parallel = args.sequence_parallel
+#         self.clone_scatter_output_in_embedding = args.clone_scatter_output_in_embedding
+#         # Embeddings dropout
+#         self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
+#         self.use_ixte = ixte_extensions._USE_IXTE and args.transformer_impl == "transformer_engine"
+
+#     def zero_parameters(self):
+#         """Zero out all parameters in embedding."""
+#         self.word_embeddings.weight.data.fill_(0)
+#         self.word_embeddings.weight.shared = True
+#         if self.add_position_embedding:
+#             self.position_embeddings.weight.data.fill_(0)
+#             self.position_embeddings.weight.shared = True
+#         if self.num_tokentypes > 0:
+#             self.tokentype_embeddings.weight.data.fill_(0)
+#             self.tokentype_embeddings.weight.shared = True
+
+#     def add_tokentype_embeddings(self, num_tokentypes):
+#         """Add token-type embedding. This function is provided so we can add
+#         token-type embeddings in case the pretrained model does not have it.
+#         This allows us to load the model normally and then add this embedding.
+#         """
+#         if self.tokentype_embeddings is not None:
+#             raise Exception('tokentype embeddings is already initialized')
+#         if torch.distributed.get_rank() == 0:
+#             print('adding embedding for {} tokentypes'.format(num_tokentypes),
+#                   flush=True)
+#         self.num_tokentypes = num_tokentypes
+#         self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes,
+#                                                        self.hidden_size)
+#         # Initialize the token-type embeddings.
+#         args = get_args()
+#         self.init_method(self.tokentype_embeddings.weight)
+
+#     def forward(self, input_ids, position_ids, tokentype_ids=None):
+#         # Embeddings.
+#         words_embeddings = self.word_embeddings(input_ids)
+#         if self.add_position_embedding:
+#             position_embeddings = self.position_embeddings(position_ids)
+#             embeddings = words_embeddings + position_embeddings
+#         else:
+#             embeddings = words_embeddings
+
+#         if tokentype_ids is not None:
+#             assert self.tokentype_embeddings is not None
+#             embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
+#         else:
+#             assert self.tokentype_embeddings is None
+
+#         # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
+#         embeddings = embeddings.transpose(0, 1).contiguous()
+
+#         # If the input flag for fp32 residual connection is set, convert for float.
+#         if self.fp32_residual_connection:
+#             embeddings = embeddings.float()
+
+#         # Dropout.
+#         if self.sequence_parallel:
+#             if self.use_ixte:
+#                 embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings, ixte_extensions.get_embedding_tp_overlap_size())
+#             else:
+#                 embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings)
+#             # `scatter_to_sequence_parallel_region` returns a view, which prevents
+#             # the original tensor from being garbage collected. Clone to facilitate GC.
+#             # Has a small runtime cost (~0.5%).
+#             if self.clone_scatter_output_in_embedding:
+#                 embeddings = embeddings.clone()
+#             with tensor_parallel.get_cuda_rng_tracker().fork():
+#                 embeddings = self.embedding_dropout(embeddings)
+#         else:
+#             embeddings = self.embedding_dropout(embeddings)
+
+#         return embeddings
+
+#     def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
+#         """For easy load."""
+
+#         state_dict_ = {}
+#         state_dict_[self._word_embeddings_key] \
+#             = self.word_embeddings.state_dict(prefix=prefix,
+#                                               keep_vars=keep_vars)
+#         if self.add_position_embedding:
+#             state_dict_[self._position_embeddings_key] \
+#                 = self.position_embeddings.state_dict(prefix=prefix,
+#                                                   keep_vars=keep_vars)
+#         if self.num_tokentypes > 0:
+#             state_dict_[self._tokentype_embeddings_key] \
+#                 = self.tokentype_embeddings.state_dict(prefix=prefix,
+#                                                        keep_vars=keep_vars)
+
+#         return state_dict_
+
+#     def load_state_dict(self, state_dict, strict=True):
+#         """Customized load."""
+
+#         # Word embedding.
+#         if self._word_embeddings_key in state_dict:
+#             state_dict_ = state_dict[self._word_embeddings_key]
+#         else:
+#             # for backward compatibility.
+#             state_dict_ = {}
+#             for key in state_dict.keys():
+#                 if 'word_embeddings' in key:
+#                     state_dict_[key.split('word_embeddings.')[1]] \
+#                         = state_dict[key]
+#         self.word_embeddings.load_state_dict(state_dict_, strict=strict)
+
+#         # Position embedding.
+#         if self.add_position_embedding:
+#             if self._position_embeddings_key in state_dict:
+#                 state_dict_ = state_dict[self._position_embeddings_key]
+#             else:
+#                 # for backward compatibility.
+#                 state_dict_ = {}
+#                 for key in state_dict.keys():
+#                     if 'position_embeddings' in key:
+#                         state_dict_[key.split('position_embeddings.')[1]] \
+#                             = state_dict[key]
+#             self.position_embeddings.load_state_dict(state_dict_, strict=strict)
+
+#         # Tokentype embedding.
+#         if self.num_tokentypes > 0:
+#             state_dict_ = {}
+#             if self._tokentype_embeddings_key in state_dict:
+#                 state_dict_ = state_dict[self._tokentype_embeddings_key]
+#             else:
+#                 # for backward compatibility.
+#                 for key in state_dict.keys():
+#                     if 'tokentype_embeddings' in key:
+#                         state_dict_[key.split('tokentype_embeddings.')[1]] \
+#                             = state_dict[key]
+#             if len(state_dict_.keys()) > 0:
+#                 self.tokentype_embeddings.load_state_dict(state_dict_,
+#                                                           strict=strict)
+#             else:
+#                 print('***WARNING*** expected tokentype embeddings in the '
+#                       'checkpoint but could not find it', flush=True)
+
+class EmbeddingPipe(Embedding):
+
+    def forward(self, inputs, **kwargs):
+        if not hasattr(self, '_args'):
+            self._args = get_args()
+
+        input_ids = inputs[0]
+        position_ids = inputs[1]
+        if hasattr(self._args, 'attn_mask'):
+            attention_mask = None
+        else:
+            attention_mask = inputs[2]
+
+        if len(inputs) == 4:
+            tokentype_ids = inputs[3]
+        else:
+            tokentype_ids = None
+        
+        embeddings = super().forward(input_ids, position_ids, tokentype_ids=tokentype_ids)
+
+        # If cmd args has attn_mask, we don't forward it as an activation.
+        if hasattr(self._args, 'attn_mask'):
+            return embeddings
+        else:
+            assert False
+            return embeddings, attention_mask
+
+
+    @property
+    def word_embeddings_weight(self):
+        """Easy accessory for the DeepSpeed pipeline engine to tie embeddings across stages."""
+        return self.word_embeddings.weight
+
+def transformer_language_model_init(self,
+                                    config,
+                                    encoder_attn_mask_type,
+                                    num_tokentypes=0,
+                                    add_encoder=True,
+                                    add_decoder=False,
+                                    decoder_attn_mask_type=AttnMaskType.causal,
+                                    add_pooler=False,
+                                    pre_process=True,
+                                    post_process=True,
+                                    rlhf_training=False):
+    if rlhf_training:
+        args = get_rlhf_args()
+    else:
+        args = get_args()
+    # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.
+    if args.untie_embeddings_and_output_weights: assert not add_decoder
+    super(TransformerLanguageModel, self).__init__(share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights)
+
+    self.pre_process = pre_process
+    self.post_process = post_process
+    self.hidden_size = config.hidden_size
+    self.num_tokentypes = num_tokentypes
+    self.init_method = config.init_method
+    self.add_encoder = add_encoder
+    self.encoder_attn_mask_type = encoder_attn_mask_type
+    self.add_decoder = add_decoder
+    self.decoder_attn_mask_type = decoder_attn_mask_type
+    self.add_pooler = add_pooler
+    self.encoder_hidden_state = None
+    self.add_retriever = args.retro_add_retriever
+    self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights
+
+    # Embeddings.
+    if self.pre_process:
+        self.embedding = Embedding(self.hidden_size,
+                                    args.padded_vocab_size,
+                                    args.max_position_embeddings,
+                                    args.hidden_dropout,
+                                    config,
+                                    self.num_tokentypes,
+                                    args.embedding_weights_in_fp32,
+                                    rlhf_training=rlhf_training)
+        self._embedding_key = 'embedding'
+
+    # Rotary positional embeddings
+    self.use_rotary_position_embeddings = \
+        args.position_embedding_type == 'rope'
+    if self.use_rotary_position_embeddings:
+        self.seq_length = args.seq_length
+        rotary_dim = args.hidden_size // args.num_attention_heads \
+            if args.kv_channels is None else args.kv_channels
+
+        # partial rotary embeddings, which is better than full rotary
+        # Wang and Komatsuzaki et al
+        # https://github.com/kingoflolz/mesh-transformer-jax/
+        self.rotary_pos_emb = RotaryEmbedding(
+            kv_channels=rotary_dim,
+            rotary_percent=args.rotary_percent,
+            seq_len_interpolation_factor=args.rotary_seq_len_interpolation_factor,
+            rotary_base=args.rope_theta
+        )
+        self.use_const_rope = (config.context_parallel_size == 1 and not args.variable_seq_lengths)
+        if self.use_const_rope:
+            rotary_pos_emb_out = self.rotary_pos_emb(self.seq_length)
+            self.rotary_pos_emb_v = rotary_pos_emb_out
+
+    # Encoder (usually set to True, False if part of an encoder-decoder
+    # architecture and in encoder-only stage).
+    if self.add_encoder:
+        self.encoder = ParallelTransformer(
+            config,
+            model_type=args.model_type if not args.retro_add_retriever \
+                else ModelType.retro_decoder,
+            self_attn_mask_type=self.encoder_attn_mask_type,
+            pre_process=self.pre_process,
+            post_process=self.post_process,
+            rlhf_training=rlhf_training
+        )
+        self._encoder_key = 'encoder'
+    else:
+        self.encoder = None
+
+    # Decoder (usually set to False, True if part of an encoder-decoder
+    # architecture and in decoder-only stage).
+    if self.add_decoder:
+        self.decoder = ParallelTransformer(
+            config,
+            model_type=args.model_type,
+            layer_type=LayerType.decoder,
+            self_attn_mask_type=self.decoder_attn_mask_type,
+            pre_process=self.pre_process,
+            post_process=self.post_process,
+            rlhf_training=rlhf_training)
+        self._decoder_key = 'decoder'
+    else:
+        self.decoder = None
+
+    if self.post_process:
+        # Pooler.
+        if self.add_pooler:
+            self.pooler = Pooler(self.hidden_size, self.init_method)
+            self._pooler_key = 'pooler'
+
+        if self.untie_embeddings_and_output_weights:
+            if rlhf_training:
+                self.output_layer = torch.nn.Linear(args.hidden_size, 1, bias=False, dtype=config.params_dtype)
+            else:
+                self.output_layer = tensor_parallel.ColumnParallelLinear(
+                    args.hidden_size,
+                    args.padded_vocab_size,
+                    config=config,
+                    init_method=self.init_method,
+                    bias=False) # Setting bias to False always to keep it consistent with embedding tying that also does not have a bias.
+            self._output_layer_key = 'output_layer'
+
+def transformer_language_model_forward_wrapper(fn):
+    @wraps(fn)
+    def wrapper(self, enc_input_ids, enc_position_ids, enc_attn_mask,
+                dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None,
+                retriever_input_ids=None,
+                retriever_position_ids=None,
+                retriever_attn_mask=None,
+                enc_dec_attn_mask=None, tokentype_ids=None,
+                inference_params=None,
+                pooling_sequence_index=0,
+                enc_hidden_states=None, output_enc_hidden=False,
+                config=None):
+        args = get_args()
+
+        # if not args.deepspeed:
+        #     return fn(self, enc_input_ids, enc_position_ids, enc_attn_mask,
+        #               dec_input_ids=dec_input_ids, dec_position_ids=dec_position_ids,
+        #               dec_attn_mask=dec_attn_mask,
+        #               retriever_input_ids=retriever_input_ids,
+        #               retriever_position_ids=retriever_position_ids,
+        #               retriever_attn_mask=retriever_attn_mask,
+        #               enc_dec_attn_mask=enc_dec_attn_mask,
+        #               tokentype_ids=tokentype_ids,
+        #               inference_params=inference_params,
+        #               pooling_sequence_index=pooling_sequence_index,
+        #               enc_hidden_states=enc_hidden_states,
+        #               output_enc_hidden=output_enc_hidden,
+        #               config=None)
+
+        # Encoder embedding.
+        if self.pre_process:
+            encoder_input = self.embedding(enc_input_ids, enc_position_ids,
+                                            tokentype_ids=tokentype_ids,
+                                            inference_params=inference_params)
+        else:
+            encoder_input = None
+
+        # Retriever embedding.
+        if self.add_retriever and self.pre_process:
+            retriever_input = self.embedding(retriever_input_ids,
+                                                retriever_position_ids,
+                                                tokentype_ids=tokentype_ids,
+                                                inference_params=inference_params)
+        else:
+            retriever_input = None
+
+        # Rotary positional embeddings
+        rotary_pos_emb = None
+        if self.use_rotary_position_embeddings:
+            if self.use_const_rope and inference_params is None and self.training:
+                rotary_pos_emb = self.rotary_pos_emb_v
+            else:
+                if inference_params is not None:
+                    rotary_pos_emb = \
+                        self.rotary_pos_emb(inference_params.max_sequence_length)
+                else:
+                    if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning:
+                        rotary_pos_emb = self.rotary_pos_emb(args.curriculum_seqlen)
+                    else:
+                        if config is not None:
+                            rotary_pos_emb = self.rotary_pos_emb(config.seq_length)
+                        else:
+                            rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
+
+        # Run encoder.
+        if enc_hidden_states is None:
+            if self.encoder is not None:
+                output = self.encoder(
+                    encoder_input,
+                    enc_attn_mask,
+                    retriever_input=retriever_input,
+                    retriever_attn_mask=retriever_attn_mask,
+                    inference_params=inference_params,
+                    rotary_pos_emb=rotary_pos_emb,
+                    position_ids=enc_position_ids)
+
+                if args.deepspeed:
+                    encoder_output, *encoder_moe_losses = output
+                else:
+                    encoder_output = output
+            else:
+                encoder_output, *encoder_moe_losses = self.encoder_hidden_state, []
+        else:
+            encoder_output, *encoder_moe_losses = enc_hidden_states.to(encoder_input.dtype), []
+
+        if self.post_process:
+            if self.add_pooler:
+                pooled_output = self.pooler(encoder_output,
+                                            pooling_sequence_index)
+
+        # output_enc_hidden refers to when we just need the encoder's
+        # output. For example, it is helpful to compute
+        # similarity between two sequences by average pooling
+        if not self.add_decoder or output_enc_hidden:
+            if args.deepspeed:
+                if self.add_pooler and self.post_process:
+                    return encoder_output, pooled_output, encoder_moe_losses
+                else:
+                    return encoder_output, encoder_moe_losses
+            else:
+                if self.add_pooler and self.post_process:
+                    return encoder_output, pooled_output
+                else:
+                    return encoder_output
+
+        # Decoder embedding.
+        if self.pre_process:
+            decoder_input = self.embedding(dec_input_ids,
+                                            dec_position_ids)
+        else:
+            decoder_input = None
+
+        # Run decoder.
+        output = self.decoder(
+            decoder_input,
+            dec_attn_mask,
+            encoder_output=encoder_output,
+            enc_dec_attn_mask=enc_dec_attn_mask,
+            inference_params=inference_params,
+            rotary_pos_emb=rotary_pos_emb)
+
+        if args.deepspeed:
+            decoder_output, *decoder_moe_losses = output
+            if self.add_pooler and self.post_process:
+                return decoder_output, encoder_output, pooled_output, decoder_moe_losses, encoder_moe_losses
+            else:
+                return decoder_output, encoder_output, decoder_moe_losses, encoder_moe_losses
+        else:
+            decoder_output = output
+            if self.add_pooler and self.post_process:
+                return decoder_output, encoder_output, pooled_output
+            else:
+                return decoder_output, encoder_output
+    return wrapper
+
+def transformer_language_model_state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
+    """For easy load."""
+
+    args = get_args()
+    state_dict_ = {}
+    moe_state_dict = {}
+    if self.pre_process:
+        state_dict_[self._embedding_key] \
+            = self.embedding.state_dict_for_save_checkpoint(prefix=prefix,
+                                                            keep_vars=keep_vars)
+    if self.add_encoder:
+        encoder_state_dict = self.encoder.state_dict_for_save_checkpoint(
+            prefix=prefix, keep_vars=keep_vars)
+        for key in list(encoder_state_dict.keys()):
+            if 'expert' in key and 'moe.gate.wg.weight' not in key:
+                moe_state_dict[self._encoder_key+key] = encoder_state_dict.pop(key)
+        state_dict_[self._encoder_key] = encoder_state_dict
+    if self.post_process:
+        if self.add_pooler:
+            state_dict_[self._pooler_key] \
+                = self.pooler.state_dict_for_save_checkpoint(prefix=prefix,
+                                                                keep_vars=keep_vars)
+        if self.untie_embeddings_and_output_weights:
+            state_dict_[self._output_layer_key] \
+                = self.output_layer.state_dict(prefix=prefix, keep_vars=keep_vars)
+
+    if self.add_decoder:
+        state_dict_[self._decoder_key] \
+            = self.decoder.state_dict_for_save_checkpoint(prefix=prefix,
+                                                            keep_vars=keep_vars)
+
+    state_dict_["moe_state_dict"] = moe_state_dict
+    return state_dict_
+
+def transformer_language_model_load_state_dict(self, state_dict, strict=True):
+    """Customized load."""
+
+    # Embedding.
+    if self.pre_process:
+        if self._embedding_key in state_dict:
+            state_dict_ = state_dict[self._embedding_key]
+        else:
+            # for backward compatibility.
+            state_dict_ = {}
+            for key in state_dict.keys():
+                if '_embeddings' in key:
+                    state_dict_[key] = state_dict[key]
+        self.embedding.load_state_dict(state_dict_, strict=strict)
+
+    # Encoder.
+    if self.add_encoder:
+        if self._encoder_key in state_dict:
+            state_dict_ = state_dict[self._encoder_key]
+        # For backward compatibility.
+        elif 'transformer' in state_dict:
+            state_dict_ = state_dict['transformer']
+        else:
+            # For backward compatibility.
+            state_dict_ = {}
+            for key in state_dict.keys():
+                if 'transformer.' in key:
+                    state_dict_[key.split('transformer.')[1]] = state_dict[key]
+
+        # For backward compatibility.
+        # Somehow this backward compatibility could be wrong: sometimes
+        # '.attention.' is the actual key used so should not be replaced. Thus
+        # added another logic to only replace if the key does not match
+        state_dict_self_attention = {}
+        encoder_state_dict_keys = list(self.encoder.state_dict().keys())
+        for key in state_dict_.keys():
+            if '.attention.' in key and key not in encoder_state_dict_keys:
+                state_dict_self_attention[key.replace(".attention.",
+                    ".self_attention.")] = state_dict_[key]
+            else:
+                state_dict_self_attention[key] = state_dict_[key]
+        state_dict_ = state_dict_self_attention
+
+        # Gather encoder MoE states
+        if "moe_state_dict" in state_dict:
+            for key in list(state_dict["moe_state_dict"].keys()):
+                if self._encoder_key in key:
+                    key_list = key.split('.')
+                    while key_list[0] != 'encoder':
+                        key_list.pop(0)
+                    key_list.pop(0)
+                    actual_key = '.'.join(key_list)
+                    state_dict_[actual_key] = state_dict["moe_state_dict"].pop(key)
+            if len(state_dict["moe_state_dict"]) == 0:
+                del state_dict["moe_state_dict"]
+        self.encoder.load_state_dict(state_dict_, strict=strict)
+
+    # Pooler.
+    if self.post_process:
+        if self.add_pooler:
+            assert 'pooler' in state_dict, \
+                'could not find data for pooler in the checkpoint'
+            self.pooler.load_state_dict(state_dict[self._pooler_key],
+                                        strict=strict)
+        if self.untie_embeddings_and_output_weights:
+            assert 'output_layer' in state_dict, \
+                'could not find data for output_layer in the checkpoint'
+            self.output_layer.load_state_dict(state_dict[self._output_layer_key],
+                                                strict=strict)
+    # Decoder.
+    if self.add_decoder:
+        assert 'decoder' in state_dict, \
+            'could not find data for pooler in the checkpoint'
+        self.decoder.load_state_dict(state_dict[self._decoder_key],
+                                        strict=strict)
+
+# class TransformerLanguageModel(MegatronModule):
+#     """Transformer language model.
+
+#     Args:
+#         transformer_hparams: transformer hyperparameters
+#         vocab_size: vocabulary size
+#         max_sequence_length: maximum size of sequence. This
+#                              is used for positional embedding
+#         embedding_dropout_prob: dropout probability for embeddings
+#         num_tokentypes: size of the token-type embeddings. 0 value
+#                         will ignore this embedding
+#     """
+
+#     def __init__(self,
+#                  config,
+#                  encoder_attn_mask_type,
+#                  num_tokentypes=0,
+#                  add_encoder=True,
+#                  add_decoder=False,
+#                  decoder_attn_mask_type=AttnMaskType.causal,
+#                  add_pooler=False,
+#                  pre_process=True,
+#                  post_process=True):
+#         args = get_args()
+#         # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.
+#         if args.untie_embeddings_and_output_weights: assert not add_decoder
+#         super(TransformerLanguageModel, self).__init__(share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights)
+
+#         self.pre_process = pre_process
+#         self.post_process = post_process
+#         self.hidden_size = config.hidden_size
+#         self.num_tokentypes = num_tokentypes
+#         self.init_method = config.init_method
+#         self.add_encoder = add_encoder
+#         self.encoder_attn_mask_type = encoder_attn_mask_type
+#         self.add_decoder = add_decoder
+#         self.decoder_attn_mask_type = decoder_attn_mask_type
+#         self.add_pooler = add_pooler
+#         self.encoder_hidden_state = None
+#         self.add_retriever = args.retro_add_retriever
+#         self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights
+
+#         # Embeddings.
+#         if self.pre_process:
+#             self.embedding = Embedding(self.hidden_size,
+#                                        args.padded_vocab_size,
+#                                        args.max_position_embeddings,
+#                                        args.hidden_dropout,
+#                                        config,
+#                                        self.num_tokentypes)
+#             self._embedding_key = 'embedding'
+
+#         # Rotary positional embeddings
+#         self.use_rotary_position_embeddings = \
+#             args.position_embedding_type == 'rope'
+#         if self.use_rotary_position_embeddings:
+#             self.seq_length = args.seq_length
+#             rotary_dim = args.hidden_size // args.num_attention_heads \
+#                 if args.kv_channels is None else args.kv_channels
+
+#             # partial rotary embeddings, which is better than full rotary
+#             # Wang and Komatsuzaki et al
+#             # https://github.com/kingoflolz/mesh-transformer-jax/
+#             self.rotary_pos_emb = RotaryEmbedding(
+#                 kv_channels=rotary_dim,
+#                 rotary_percent=args.rotary_percent,
+#                 seq_len_interpolation_factor=args.rotary_seq_len_interpolation_factor,
+#                 rotary_base=args.rope_theta
+#             )
+#             self.use_const_rope = config.context_parallel_size == 1
+#             if self.use_const_rope:
+#                 rotary_pos_emb_out = self.rotary_pos_emb(self.seq_length)
+#                 self.rotary_pos_emb_v = rotary_pos_emb_out
+
+#         # Encoder (usually set to True, False if part of an encoder-decoder
+#         # architecture and in encoder-only stage).
+#         if self.add_encoder:
+#             self.encoder = ParallelTransformer(
+#                 config,
+#                 model_type=args.model_type if not args.retro_add_retriever \
+#                     else ModelType.retro_decoder,
+#                 self_attn_mask_type=self.encoder_attn_mask_type,
+#                 pre_process=self.pre_process,
+#                 post_process=self.post_process,
+#             )
+#             self._encoder_key = 'encoder'
+#         else:
+#             self.encoder = None
+
+#         # Decoder (usually set to False, True if part of an encoder-decoder
+#         # architecture and in decoder-only stage).
+#         if self.add_decoder:
+#             self.decoder = ParallelTransformer(
+#                 config,
+#                 model_type=args.model_type,
+#                 layer_type=LayerType.decoder,
+#                 self_attn_mask_type=self.decoder_attn_mask_type,
+#                 pre_process=self.pre_process,
+#                 post_process=self.post_process)
+#             self._decoder_key = 'decoder'
+#         else:
+#             self.decoder = None
+
+#         if self.post_process:
+#             # Pooler.
+#             if self.add_pooler:
+#                 self.pooler = Pooler(self.hidden_size, self.init_method)
+#                 self._pooler_key = 'pooler'
+
+#             if self.untie_embeddings_and_output_weights:
+#                 self.output_layer = tensor_parallel.ColumnParallelLinear(
+#                     args.hidden_size,
+#                     args.padded_vocab_size,
+#                     config=config,
+#                     init_method=self.init_method,
+#                     bias=False) # Setting bias to False always to keep it consistent with embedding tying that also does not have a bias.
+#                 self._output_layer_key = 'output_layer'
+
+#     def set_input_tensor(self, input_tensor):
+#         """ See megatron.legacy.model.transformer.set_input_tensor()"""
+
+#         # This is usually handled in schedules.py but some inference code still
+#         # gives us non-lists or None
+#         if not isinstance(input_tensor, list):
+#             input_tensor = [input_tensor]
+
+#         if self.add_encoder and self.add_decoder:
+#             assert len(input_tensor) == 1, \
+#                 'input_tensor should only be length 1 for stage with both encoder and decoder'
+#             self.encoder.set_input_tensor(input_tensor[0])
+#         elif self.add_encoder:
+#             assert len(input_tensor) == 1, \
+#                 'input_tensor should only be length 1 for stage with only encoder'
+#             self.encoder.set_input_tensor(input_tensor[0])
+#         elif self.add_decoder:
+#             if len(input_tensor) == 2:
+#                 self.decoder.set_input_tensor(input_tensor[0])
+#                 self.encoder_hidden_state = input_tensor[1]
+#             elif len(input_tensor) == 1:
+#                 self.decoder.set_input_tensor(None)
+#                 self.encoder_hidden_state = input_tensor[0]
+#             else:
+#                 raise Exception('input_tensor must have either length 1 or 2')
+#         else:
+#             raise Exception('Stage must have at least either encoder or decoder')
+
+#     def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask,
+#                 dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None,
+#                 retriever_input_ids=None,
+#                 retriever_position_ids=None,
+#                 retriever_attn_mask=None,
+#                 enc_dec_attn_mask=None, tokentype_ids=None,
+#                 inference_params=None,
+#                 pooling_sequence_index=0,
+#                 enc_hidden_states=None, output_enc_hidden=False):
+
+#         args = get_args()
+#         # Encoder embedding.
+#         if self.pre_process:
+#             encoder_input = self.embedding(enc_input_ids, enc_position_ids,
+#                                            tokentype_ids=tokentype_ids)
+#         else:
+#             encoder_input = None
+
+#         # Retriever embedding.
+#         if self.add_retriever and self.pre_process:
+#             retriever_input = self.embedding(retriever_input_ids,
+#                                              retriever_position_ids,
+#                                              tokentype_ids=tokentype_ids)
+#         else:
+#             retriever_input = None
+
+#         # Rotary positional embeddings
+#         rotary_pos_emb = None
+#         if self.use_rotary_position_embeddings:
+#             if self.use_const_rope and inference_params is None and self.training:
+#                 rotary_pos_emb = self.rotary_pos_emb_v
+#             else:
+#                 if inference_params is not None:
+#                     rotary_pos_emb = \
+#                         self.rotary_pos_emb(inference_params.max_sequence_length)
+#                 else:
+#                     if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning:
+#                         rotary_pos_emb = self.rotary_pos_emb(args.curriculum_seqlen)
+#                     else:
+#                         rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
+
+#         # Run encoder.
+#         if enc_hidden_states is None:
+#             if self.encoder is not None:
+#                 encoder_output, *encoder_moe_losses = self.encoder(
+#                     encoder_input,
+#                     enc_attn_mask,
+#                     retriever_input=retriever_input,
+#                     retriever_attn_mask=retriever_attn_mask,
+#                     inference_params=inference_params,
+#                     rotary_pos_emb=rotary_pos_emb)
+#             else:
+#                 encoder_output = self.encoder_hidden_state
+#         else:
+#             encoder_output, *encoder_moe_losses = enc_hidden_states.to(encoder_input.dtype), []
+
+#         if self.post_process:
+#             if self.add_pooler:
+#                 pooled_output = self.pooler(encoder_output,
+#                                             pooling_sequence_index)
+
+#         # output_enc_hidden refers to when we just need the encoder's
+#         # output. For example, it is helpful to compute
+#         # similarity between two sequences by average pooling
+#         if not self.add_decoder or output_enc_hidden:
+#             if self.add_pooler and self.post_process:
+#                 return encoder_output, pooled_output, encoder_moe_losses
+#             else:
+#                 return encoder_output, encoder_moe_losses
+
+#         # Decoder embedding.
+#         if self.pre_process:
+#             decoder_input = self.embedding(dec_input_ids,
+#                                            dec_position_ids)
+#         else:
+#             decoder_input = None
+
+#         # Run decoder.
+#         decoder_output, *decoder_moe_losses = self.decoder(
+#             decoder_input,
+#             dec_attn_mask,
+#             encoder_output=encoder_output,
+#             enc_dec_attn_mask=enc_dec_attn_mask,
+#             inference_params=inference_params,
+#             rotary_pos_emb=rotary_pos_emb)
+
+#         if self.add_pooler and self.post_process:
+#             return decoder_output, encoder_output, pooled_output, decoder_moe_losses, encoder_moe_losses
+#         else:
+#             return decoder_output, encoder_output, decoder_moe_losses, encoder_moe_losses
+
+#     def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
+#         """For easy load."""
+
+#         args = get_args()
+#         state_dict_ = {}
+#         moe_state_dict = {}
+#         if self.pre_process:
+#             state_dict_[self._embedding_key] \
+#                 = self.embedding.state_dict_for_save_checkpoint(prefix=prefix,
+#                                                                 keep_vars=keep_vars)
+#         if self.add_encoder:
+#             encoder_state_dict = self.encoder.state_dict_for_save_checkpoint(
+#                 prefix=prefix, keep_vars=keep_vars)
+#             for key in list(encoder_state_dict.keys()):
+#                 if 'expert' in key and 'moe.gate.wg.weight' not in key:
+#                     moe_state_dict[self._encoder_key+key] = encoder_state_dict.pop(key)
+#             state_dict_[self._encoder_key] = encoder_state_dict
+#         if self.post_process:
+#             if self.add_pooler:
+#                 state_dict_[self._pooler_key] \
+#                     = self.pooler.state_dict_for_save_checkpoint(prefix=prefix,
+#                                                                  keep_vars=keep_vars)
+#             if self.untie_embeddings_and_output_weights:
+#                 state_dict_[self._output_layer_key] \
+#                     = self.output_layer.state_dict(prefix=prefix, keep_vars=keep_vars)
+
+#         if self.add_decoder:
+#             state_dict_[self._decoder_key] \
+#                 = self.decoder.state_dict_for_save_checkpoint(prefix=prefix,
+#                                                               keep_vars=keep_vars)
+
+#         state_dict_["moe_state_dict"] = moe_state_dict
+#         return state_dict_
+
+#     def load_state_dict(self, state_dict, strict=True):
+#         """Customized load."""
+
+#         # Embedding.
+#         if self.pre_process:
+#             if self._embedding_key in state_dict:
+#                 state_dict_ = state_dict[self._embedding_key]
+#             else:
+#                 # for backward compatibility.
+#                 state_dict_ = {}
+#                 for key in state_dict.keys():
+#                     if '_embeddings' in key:
+#                         state_dict_[key] = state_dict[key]
+#             self.embedding.load_state_dict(state_dict_, strict=strict)
+
+#         # Encoder.
+#         if self.add_encoder:
+#             if self._encoder_key in state_dict:
+#                 state_dict_ = state_dict[self._encoder_key]
+#             # For backward compatibility.
+#             elif 'transformer' in state_dict:
+#                 state_dict_ = state_dict['transformer']
+#             else:
+#                 # For backward compatibility.
+#                 state_dict_ = {}
+#                 for key in state_dict.keys():
+#                     if 'transformer.' in key:
+#                         state_dict_[key.split('transformer.')[1]] = state_dict[key]
+
+#             # For backward compatibility.
+#             # Somehow this backward compatibility could be wrong: sometimes
+#             # '.attention.' is the actual key used so should not be replaced. Thus
+#             # added another logic to only replace if the key does not match
+#             state_dict_self_attention = {}
+#             encoder_state_dict_keys = list(self.encoder.state_dict().keys())
+#             for key in state_dict_.keys():
+#                 if '.attention.' in key and key not in encoder_state_dict_keys:
+#                     state_dict_self_attention[key.replace(".attention.",
+#                         ".self_attention.")] = state_dict_[key]
+#                 else:
+#                     state_dict_self_attention[key] = state_dict_[key]
+#             state_dict_ = state_dict_self_attention
+
+#             # Gather encoder MoE states
+#             if "moe_state_dict" in state_dict:
+#                 for key in list(state_dict["moe_state_dict"].keys()):
+#                     if self._encoder_key in key:
+#                         key_list = key.split('.')
+#                         while key_list[0] != 'encoder':
+#                             key_list.pop(0)
+#                         key_list.pop(0)
+#                         actual_key = '.'.join(key_list)
+#                         state_dict_[actual_key] = state_dict["moe_state_dict"].pop(key)
+#                 if len(state_dict["moe_state_dict"]) == 0:
+#                     del state_dict["moe_state_dict"]
+#             self.encoder.load_state_dict(state_dict_, strict=strict)
+
+#         # Pooler.
+#         if self.post_process:
+#             if self.add_pooler:
+#                 assert 'pooler' in state_dict, \
+#                     'could not find data for pooler in the checkpoint'
+#                 self.pooler.load_state_dict(state_dict[self._pooler_key],
+#                                             strict=strict)
+#             if self.untie_embeddings_and_output_weights:
+#                 assert 'output_layer' in state_dict, \
+#                     'could not find data for output_layer in the checkpoint'
+#                 self.output_layer.load_state_dict(state_dict[self._output_layer_key],
+#                                                   strict=strict)
+#         # Decoder.
+#         if self.add_decoder:
+#             assert 'decoder' in state_dict, \
+#                 'could not find data for pooler in the checkpoint'
+#             self.decoder.load_state_dict(state_dict[self._decoder_key],
+#                                          strict=strict)
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/module.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..4afe6e65c04edda5fa62b8b217915c817a814a5a
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/module.py
@@ -0,0 +1,11 @@
+"""Megatron Module"""
+
+import torch
+from torch.autograd import Variable
+from torch.nn.parameter import Parameter
+
+from megatron.training import get_args
+from megatron.core import mpu, tensor_parallel
+
+def megatron_module_universal_checkpoint_info(self):
+    return {}
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/multiple_choice.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/multiple_choice.py
similarity index 84%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/model/multiple_choice.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/multiple_choice.py
index 242946fc9ee6a2f825202d653626cdeaba060ab5..9002e8bbff1c4330fac09d8af1d072f3cd87f5b3 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/multiple_choice.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/multiple_choice.py
@@ -1,17 +1,15 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
 """Multiple choice model."""
 
 import torch
 
-from megatron_ds import get_args, print_rank_last
-from megatron_ds.model.enums import AttnMaskType
-from megatron_ds.model.bert_model import bert_extended_attention_mask, bert_position_ids
-from megatron_ds.model.language_model import get_language_model
-from megatron_ds.model.utils import get_linear_layer
-from megatron_ds.model.utils import init_method_normal
-from megatron_ds.model.utils import scaled_init_method_normal
-from .module import MegatronModule
+from megatron.training import get_args, print_rank_last
+from megatron.legacy.model.enums import AttnMaskType
+from megatron.legacy.model.bert_model import bert_extended_attention_mask, bert_position_ids
+from megatron.legacy.model.language_model import get_language_model
+from megatron.legacy.model.utils import get_linear_layer
+from megatron.legacy.model.utils import init_method_normal
+from megatron.legacy.model.utils import scaled_init_method_normal
+from megatron.legacy.model.module import MegatronModule
 
 
 class MultipleChoice(MegatronModule):
@@ -39,11 +37,12 @@ class MultipleChoice(MegatronModule):
         if self.post_process:
             self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout)
             self.multichoice_head = get_linear_layer(args.hidden_size, 1,
-                                                     init_method)
+                                                     init_method,
+                                                     gather_params_on_init=args.zero_stage == 3)
             self._multichoice_head_key = 'multichoice_head'
 
     def set_input_tensor(self, input_tensor):
-        """See megatron_ds.model.transformer.set_input_tensor()"""
+        """See megatron.legacy.model.transformer.set_input_tensor()"""
         self.language_model.set_input_tensor(input_tensor)
 
     def forward(self, model_input, attention_mask, tokentype_ids=None):
@@ -74,7 +73,7 @@ class MultipleChoice(MegatronModule):
             tokentype_ids=tokentype_ids
         )
         if self.post_process:
-            _, pooled_output = lm_output
+            _, pooled_output = lm_output[0], lm_output[1]
             multichoice_output = self.multichoice_dropout(pooled_output)
             multichoice_logits = self.multichoice_head(multichoice_output)
 
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/realm_model.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/realm_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..046fda64f5917710674cd5d030c25eb160dd67d8
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/realm_model.py
@@ -0,0 +1,72 @@
+from functools import wraps
+
+import torch
+
+from megatron.training import get_args, print_rank_0
+from megatron.training.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name
+from megatron.legacy.model import BertModel
+from megatron.legacy.model.module import MegatronModule
+from megatron.core import mpu
+from megatron.legacy.model.enums import AttnMaskType
+from megatron.legacy.model.utils import get_linear_layer
+from megatron.legacy.model.utils import init_method_normal
+from megatron.legacy.model.language_model import get_language_model
+from megatron.legacy.model.utils import scaled_init_method_normal
+from megatron.legacy.model.bert_model import bert_extended_attention_mask, bert_position_ids
+
+class IREncoderBertModel(MegatronModule):
+    """BERT-based encoder for queries or blocks used for learned information retrieval."""
+    def __init__(self, ict_head_size, num_tokentypes=2, parallel_output=True):
+        super(IREncoderBertModel, self).__init__()
+        args = get_args()
+
+        self.ict_head_size = ict_head_size
+        self.parallel_output = parallel_output
+        init_method = init_method_normal(args.init_method_std)
+        scaled_init_method = scaled_init_method_normal(args.init_method_std,
+                                                       args.num_layers)
+
+        self.language_model, self._language_model_key = get_language_model(
+            num_tokentypes=num_tokentypes,
+            add_pooler=True,
+            encoder_attn_mask_type=AttnMaskType.padding,
+            init_method=init_method,
+            scaled_init_method=scaled_init_method)
+
+        self.ict_head = get_linear_layer(args.hidden_size, ict_head_size, init_method, gather_params_on_init=args.zero_stage == 3)
+        self._ict_head_key = 'ict_head'
+
+    def forward(self, input_ids, attention_mask, tokentype_ids=None):
+        extended_attention_mask = bert_extended_attention_mask(
+            attention_mask, next(self.language_model.parameters()).dtype)
+        position_ids = bert_position_ids(input_ids)
+
+        lm_output, pooled_output = self.language_model(
+            input_ids,
+            position_ids,
+            extended_attention_mask,
+            tokentype_ids=tokentype_ids)
+
+        # Output.
+        ict_logits = self.ict_head(pooled_output)
+        return ict_logits, None
+
+    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
+        """For easy load when model is combined with other heads,
+        add an extra key."""
+
+        state_dict_ = {}
+        state_dict_[self._language_model_key] \
+            = self.language_model.state_dict_for_save_checkpoint(prefix=prefix,
+                                                                 keep_vars=keep_vars)
+        state_dict_[self._ict_head_key] \
+            = self.ict_head.state_dict(prefix=prefix,
+                                       keep_vars=keep_vars)
+        return state_dict_
+
+    def load_state_dict(self, state_dict, strict=True):
+        """Customized load."""
+        self.language_model.load_state_dict(
+            state_dict[self._language_model_key], strict=strict)
+        self.ict_head.load_state_dict(
+            state_dict[self._ict_head_key], strict=strict)
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/rms_norm.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/rms_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cbb91718f1220958327974c6dc19fc35ebe0535
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/rms_norm.py
@@ -0,0 +1,16 @@
+import torch
+from torch.nn.parameter import Parameter
+
+# Taken from facebookresearch/llama
+class RMSNorm(torch.nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = Parameter(torch.ones(dim))
+
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+    def forward(self, x):
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/rotary_pos_embedding.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/rotary_pos_embedding.py
similarity index 43%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/model/rotary_pos_embedding.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/rotary_pos_embedding.py
index 4d4497e0cd931293549c8d647dc9384bc8bb69e2..b283e8b586c7458b5666087115e70f096e560f1f 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/rotary_pos_embedding.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/rotary_pos_embedding.py
@@ -1,15 +1,9 @@
-# coding=utf-8
-
-# The following code has been taken from https://github.com/NVIDIA/NeMo/blob/ \
-# 782b4e1652aaa43c8be390d9db0dc89544afa080/nemo/collections/nlp/modules/ \
-# common/megatron/rotary_pos_embedding.py
-
 import importlib.util
 import torch
 
 from torch import einsum, nn
 
-__all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb']
+__all__ = ['RotaryEmbedding']
 
 class RotaryEmbedding(nn.Module):
     def __init__(self, dim, theta=10000):
@@ -28,29 +22,3 @@ class RotaryEmbedding(nn.Module):
         # emb [seq_length, .., dim]
         from einops import rearrange
         return rearrange(emb, 'n d -> n 1 1 d')
-
-
-def _rotate_half(x):
-    """
-    change sign so the last dimension becomes [-odd, +even]
-    """
-    from einops import rearrange
-    x = rearrange(x, '... (j d) -> ... j d', j=2)
-    x1, x2 = x.unbind(dim=-2)
-    return torch.cat((-x2, x1), dim=-1)
-
-
-def apply_rotary_pos_emb(t, freqs):
-    """
-    input tensor t is of shape [seq_length, ..., dim]
-    rotary positional embeding tensor freqs is of shape [seq_length, ..., dim]
-    check https://kexue.fm/archives/8265 for detailed formulas
-    """
-    rot_dim = freqs.shape[-1]
-    # ideally t_pass is empty so rotary pos embedding is applied to all tensor t
-    t, t_pass = t[..., :rot_dim], t[..., rot_dim:]
-
-    # first part is cosine component
-    # second part is sine component, need to change signs with _rotate_half method
-    t = (t * freqs.cos().to(t.dtype)) + (_rotate_half(t) * freqs.sin().to(t.dtype))
-    return t if t_pass.shape[-1] == 0 else torch.cat((t, t_pass), dim=-1)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/t5_model.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/t5_model.py
similarity index 83%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/model/t5_model.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/t5_model.py
index 8be9a43fe993911a76b24ed081c418aa45c2b7ba..9b66a01fc907911d7d2b38080f465514385a227f 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/t5_model.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/t5_model.py
@@ -1,45 +1,27 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
 """T5 model."""
 
 import torch
 
-from megatron_ds import get_args
-from megatron_ds.core import tensor_parallel
-from megatron_ds.model.enums import AttnMaskType
-from megatron_ds.model.language_model import parallel_lm_logits, get_language_model
-from megatron_ds.model import LayerNorm
-from megatron_ds.model.utils import (
+from megatron.training import get_args
+from megatron.core import tensor_parallel
+from megatron.legacy.model.enums import AttnMaskType
+from megatron.legacy.model.language_model import parallel_lm_logits, get_language_model
+from megatron.legacy.model import LayerNorm
+from megatron.legacy.model.utils import (
     openai_gelu,
     get_linear_layer
 )
-from .module import MegatronModule
-
-
-def t5_extended_attention_mask(attention_mask_list):
-
-    def attn_mask_postprocess(attn_mask):
-        # [b, 1, s, s]
-        extended_attention_mask = attn_mask.unsqueeze(1)
-        return extended_attention_mask
-
-    return [attn_mask_postprocess(attn_mask) for attn_mask in attention_mask_list]
-
-
-def t5_position_ids(token_ids):
-    # Create position ids
-    seq_length = token_ids.size(1)
-    position_ids = torch.arange(seq_length, dtype=torch.long,
-                                device=token_ids.device)
-    position_ids = position_ids.unsqueeze(0).expand_as(token_ids)
-
-    return position_ids
+from megatron.legacy.model.module import MegatronModule
+from megatron.legacy.model.t5_model import (
+    t5_extended_attention_mask,
+    t5_position_ids
+)
 
 
 class T5LMHead(MegatronModule):
     """Masked LM head for T5
 
-    Arguments:
+    Args:
         mpu_vocab_size: model parallel size of vocabulary.
         parallel_output: wether output logits being distributed or not.
     """
@@ -71,7 +53,8 @@ class T5Model(MegatronModule):
                  pre_process=True,
                  post_process=True,
                  add_encoder=True,
-                 add_decoder=True):
+                 add_decoder=True,
+                 return_moe_loss=False):
         super().__init__(config=config)
         args = get_args()
 
@@ -81,6 +64,7 @@ class T5Model(MegatronModule):
         self.post_process = post_process
         self.add_encoder = add_encoder
         self.add_decoder = add_decoder
+        self.return_moe_loss = return_moe_loss
 
         self.language_model, self._language_model_key = get_language_model(
             config=config,
@@ -101,7 +85,7 @@ class T5Model(MegatronModule):
             self._lm_head_key = 'lm_head'
 
     def set_input_tensor(self, input_tensor):
-        """See megatron_ds.model.transformer.set_input_tensor()"""
+        """See megatron.legacy.model.transformer.set_input_tensor()"""
         self.language_model.set_input_tensor(input_tensor)
 
     def forward(self, encoder_input_ids, decoder_input_ids, encoder_attn_mask,
@@ -126,7 +110,7 @@ class T5Model(MegatronModule):
                                         enc_hidden_states=enc_hidden_states)
 
         if self.post_process and self.add_decoder:
-            decoder_output, encoder_output = lm_output
+            decoder_output, encoder_output, dec_moe_losses, enc_moe_losses = lm_output
             # Output. [s, b, h]
             lm_logits = self.lm_head(decoder_output,
                                      self.shared_embedding_or_output_weight())
@@ -148,10 +132,12 @@ class T5Model(MegatronModule):
             return lm_loss
         elif self.add_decoder and not self.add_encoder:
             decoder_output, encoder_output = lm_output
-            return decoder_output
+            return decoder_output, dec_moe_losses, enc_moe_losses if self.return_moe_loss else lm_loss
         else:
-            encoder_output = lm_output
-            return encoder_output
+            decoder_output, _, decoder_moe_losses, _= lm_output
+            return decoder_output, decoder_moe_losses
+
+        return lm_output
 
     def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
         """For easy load when model is combined with other heads,
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/transformer.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c98df461516431ff7293386191b01b368cb53c6c
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/transformer.py
@@ -0,0 +1,3205 @@
+"""Transformer."""
+from contextlib import nullcontext
+import os
+import math
+import numpy as np
+import torch
+from functools import wraps
+import torch.nn.functional as F
+from typing import Optional
+
+from megatron import core
+from apex.corex.activations import swiglu
+from apex.transformer.functional import fused_apply_rotary_pos_emb
+from megatron.training import get_timers, get_args, get_rlhf_args
+from megatron.legacy.model.module import MegatronModule
+from megatron.legacy.model.transformer import (
+    ParallelMLP,
+    CoreAttention,
+    ParallelAttention,
+    ParallelTransformerLayer,
+    ParallelTransformer
+)
+from megatron.core import mpu, tensor_parallel
+from megatron.core.enums import ModelType
+from megatron.legacy.model.enums import AttnMaskType, LayerType, AttnType
+from megatron.legacy.model.fused_softmax import FusedScaleMaskSoftmax
+from megatron.legacy.model.fused_bias_gelu import bias_gelu_impl
+from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding, apply_rotary_pos_emb
+from megatron.core.num_microbatches_calculator import get_num_microbatches
+from megatron.legacy.model.utils import attention_mask_func, openai_gelu, erf_gelu, get_norm, get_rmsnorm_residual
+from megatron.core.tensor_parallel import (
+    gather_from_sequence_parallel_region_to_moe,
+    reduce_scatter_to_sequence_parallel_region_from_moe,
+    get_cuda_rng_tracker,
+    get_data_parallel_rng_tracker_name
+)
+from megatron.core.parallel_state import get_tensor_model_parallel_group, get_tensor_and_expert_parallel_group
+from megatron.core.jit import jit_fuser
+from megatron.core import ixte_extensions
+import megatron.legacy.model.transformer
+from megatron.legacy.model.transformer import (
+    DropPath,
+    SwitchMLP,
+    bias_dropout_add_fused_train,
+    bias_dropout_add_fused_inference,
+    get_bias_dropout_add,
+    _get_num_layers,
+    _get_layer_type,
+    NoopTransformerLayer,
+    FlashSelfAttention
+)
+from megatronspeed.core import parallel_state
+
+import deepspeed
+from deepspeed.moe.layer import MoE
+from deepspeed.accelerator import get_accelerator
+
+try:
+    from deepspeed.sequence.layer import DistributedAttention
+    dist_attn_supported = True
+except ImportError:
+    dist_attn_supported = False
+
+try:
+    from einops import rearrange
+except ImportError:
+    rearrange = None
+
+try:
+    from flash_attn.flash_attn_interface import flash_attn_unpadded_func
+except ImportError:
+    try:
+        from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_unpadded_func
+    except ImportError:
+        flash_attn_unpadded_func = None
+
+
+def parallel_mlp_init(self, config, is_expert=False, moe=False, enable_expert_tensor_parallelism=False, rlhf_training=False):
+    super(ParallelMLP, self).__init__()
+    args = get_args()
+
+    self.add_bias = config.add_bias_linear
+
+    ffn_hidden_size = config.ffn_hidden_size
+    if config.gated_linear_unit:
+        ffn_hidden_size *= 2
+
+    # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
+    self.dense_h_to_4h = tensor_parallel.ColumnParallelLinear(
+        config.hidden_size,
+        ffn_hidden_size,
+        config=config,
+        init_method=config.init_method,
+        bias=self.add_bias,
+        gather_output=False,
+        skip_bias_add=True,
+        is_expert=is_expert,
+        moe=moe,
+        enable_expert_tensor_parallelism=enable_expert_tensor_parallelism
+    )
+
+    self.bias_gelu_fusion = False
+    self.activation_func = None
+    self.swiglu = args.swiglu
+
+    if args.openai_gelu:
+        self.activation_func = openai_gelu
+    elif args.onnx_safe:
+        self.activation_func = erf_gelu
+    elif args.swiglu:
+        # def swiglu(x):
+        #     x = torch.chunk(x, 2, dim=-1)
+        #     return F.silu(x[0]) * x[1]
+        self.activation_func = swiglu
+    elif args.squared_relu:
+        def squared_relu(x):
+            return torch.pow(F.relu(x), 2)
+        self.activation_func = squared_relu
+    else:
+        self.bias_gelu_fusion = args.bias_gelu_fusion
+        self.activation_func = F.gelu
+
+    # Project back to h.
+    self.dense_4h_to_h = tensor_parallel.RowParallelLinear(
+        config.ffn_hidden_size,
+        config.hidden_size,
+        config=config,
+        init_method=config.output_layer_init_method,
+        bias=self.add_bias,
+        skip_bias_add=True,
+        input_is_parallel=True,
+        is_expert=is_expert,
+        moe=moe,
+        enable_expert_tensor_parallelism=enable_expert_tensor_parallelism
+    )
+
+def parallel_mlp_forward_wrapper(fn):
+    @wraps(fn)
+    def wrapper(self, hidden_states, inference_params=None):
+        args = get_args()
+
+        # if not args.deepspeed:
+        #     return fn(self, hidden_states)
+
+        # [s, b, 4hp]
+        intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states, inference_params=inference_params)
+
+        if self.bias_gelu_fusion:
+            assert self.add_bias is True
+            # DeepSpeed FLOPS profiler temporarily substitues functions like F.gelu to calculate the throughput
+            assert hasattr(self, "__flops__") or self.activation_func == F.gelu
+            intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel)
+        else:
+            if bias_parallel is not None:
+                intermediate_parallel = intermediate_parallel + bias_parallel
+            intermediate_parallel = self.activation_func(intermediate_parallel)
+
+        # [s, b, h]
+        output, output_bias = self.dense_4h_to_h(intermediate_parallel, inference_params=inference_params)
+        return output, output_bias
+    return wrapper
+
+# class ParallelMLP(MegatronModule):
+#     """MLP.
+
+#     MLP will take the input with h hidden state, project it to 4*h
+#     hidden dimension, perform nonlinear transformation, and project the
+#     state back into h hidden dimension.
+#     """
+
+#     def __init__(self, config, is_expert=False, moe=False, enable_expert_tensor_parallelism=False):
+#         super(ParallelMLP, self).__init__()
+#         args = get_args()
+
+#         self.add_bias = config.add_bias_linear
+
+#         ffn_hidden_size = config.ffn_hidden_size
+#         if config.gated_linear_unit:
+#             ffn_hidden_size *= 2
+
+#         # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
+#         self.dense_h_to_4h = tensor_parallel.ColumnParallelLinear(
+#             config.hidden_size,
+#             ffn_hidden_size,
+#             config=config,
+#             init_method=config.init_method,
+#             bias=self.add_bias,
+#             gather_output=False,
+#             skip_bias_add=True,
+#             is_expert=is_expert,
+#             moe=moe,
+#             enable_expert_tensor_parallelism=enable_expert_tensor_parallelism
+#         )
+
+#         self.bias_gelu_fusion = False
+#         self.activation_func = None
+#         self.swiglu = args.swiglu
+
+#         if args.openai_gelu:
+#             self.activation_func = openai_gelu
+#         elif args.onnx_safe:
+#             self.activation_func = erf_gelu
+#         elif args.swiglu:
+#             def swiglu(x):
+#                 x = torch.chunk(x, 2, dim=-1)
+#                 return F.silu(x[0]) * x[1]
+#             self.activation_func = swiglu
+#         elif args.squared_relu:
+#             def squared_relu(x):
+#                 return torch.pow(F.relu(x), 2)
+#             self.activation_func = squared_relu
+#         else:
+#             self.bias_gelu_fusion = args.bias_gelu_fusion
+#             self.activation_func = F.gelu
+
+#         # Project back to h.
+#         self.dense_4h_to_h = tensor_parallel.RowParallelLinear(
+#             config.ffn_hidden_size,
+#             config.hidden_size,
+#             config=config,
+#             init_method=config.output_layer_init_method,
+#             bias=self.add_bias,
+#             skip_bias_add=True,
+#             input_is_parallel=True,
+#             is_expert=is_expert,
+#             moe=moe,
+#             enable_expert_tensor_parallelism=enable_expert_tensor_parallelism
+#         )
+
+#     def forward(self, hidden_states):
+
+#         # [s, b, 4hp]
+#         intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states)
+
+#         if self.bias_gelu_fusion:
+#             assert self.add_bias is True
+#             # DeepSpeed FLOPS profiler temporarily substitues functions like F.gelu to calculate the throughput
+#             assert hasattr(self, "__flops__") or self.activation_func == F.gelu
+#             intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel)
+#         else:
+#             if bias_parallel is not None:
+#                 intermediate_parallel = intermediate_parallel + bias_parallel
+#             intermediate_parallel = self.activation_func(intermediate_parallel)
+
+#         # [s, b, h]
+#         output, output_bias = self.dense_4h_to_h(intermediate_parallel)
+#         return output, output_bias
+
+def core_attention_init(self, layer_number, config,
+                        attn_mask_type=AttnMaskType.padding):
+    super(CoreAttention, self).__init__()
+    self.fp16 = config.fp16
+    self.bf16 = config.bf16
+
+    self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
+    self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
+    if self.apply_query_key_layer_scaling:
+        self.attention_softmax_in_fp32 = True
+    self.layer_number = max(1, layer_number)
+    self.attn_mask_type = attn_mask_type
+    self.sequence_parallel = config.sequence_parallel
+
+    projection_size = config.kv_channels * config.num_attention_heads
+
+    # Per attention head and per partition values.
+    seq_parallel_world_size = 1
+    if parallel_state.sequence_parallel_is_initialized():
+        seq_parallel_world_size = mpu.get_tensor_model_parallel_world_size()
+    world_size = seq_parallel_world_size if seq_parallel_world_size > 1 else mpu.get_tensor_model_parallel_world_size()
+    self.hidden_size_per_partition = core.utils.divide(projection_size,
+                                                        world_size)
+    self.hidden_size_per_attention_head = core.utils.divide(
+        projection_size, config.num_attention_heads)
+    self.num_attention_heads_per_partition = core.utils.divide(
+        config.num_attention_heads, world_size)
+
+    coeff = None
+    self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
+    if self.apply_query_key_layer_scaling:
+        coeff = self.layer_number
+        self.norm_factor *= coeff
+
+    self.scale_mask_softmax = FusedScaleMaskSoftmax(
+        self.fp16, self.bf16,
+        self.attn_mask_type,
+        config.masked_softmax_fusion,
+        attention_mask_func,
+        self.attention_softmax_in_fp32,
+        coeff)
+
+    # Dropout. Note that for a single iteration, this layer will generate
+    # different outputs on different number of parallel partitions but
+    # on average it should not be partition dependent.
+    self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
+
+# class CoreAttention(MegatronModule):
+
+#     def __init__(self, layer_number, config,
+#                  attn_mask_type=AttnMaskType.padding):
+#         super(CoreAttention, self).__init__()
+#         self.fp16 = config.fp16
+#         self.bf16 = config.bf16
+
+#         self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
+#         self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
+#         if self.apply_query_key_layer_scaling:
+#             self.attention_softmax_in_fp32 = True
+#         self.layer_number = max(1, layer_number)
+#         self.attn_mask_type = attn_mask_type
+#         self.sequence_parallel = config.sequence_parallel
+
+#         projection_size = config.kv_channels * config.num_attention_heads
+
+#         # Per attention head and per partition values.
+#         seq_parallel_world_size = 1
+#         if parallel_state.sequence_parallel_is_initialized():
+#             seq_parallel_world_size = mpu.get_tensor_model_parallel_world_size()
+#         world_size = seq_parallel_world_size if seq_parallel_world_size > 1 else mpu.get_tensor_model_parallel_world_size()
+#         self.hidden_size_per_partition = core.utils.divide(projection_size,
+#                                                            world_size)
+#         self.hidden_size_per_attention_head = core.utils.divide(
+#             projection_size, config.num_attention_heads)
+#         self.num_attention_heads_per_partition = core.utils.divide(
+#             config.num_attention_heads, world_size)
+
+#         coeff = None
+#         self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
+#         if self.apply_query_key_layer_scaling:
+#             coeff = self.layer_number
+#             self.norm_factor *= coeff
+
+#         self.scale_mask_softmax = FusedScaleMaskSoftmax(
+#             self.fp16, self.bf16,
+#             self.attn_mask_type,
+#             config.masked_softmax_fusion,
+#             attention_mask_func,
+#             self.attention_softmax_in_fp32,
+#             coeff)
+
+#         # Dropout. Note that for a single iteration, this layer will generate
+#         # different outputs on different number of parallel partitions but
+#         # on average it should not be partition dependent.
+#         self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
+
+#     def forward(self, query_layer, key_layer,
+#                 value_layer, attention_mask):
+
+#         # ===================================
+#         # Raw attention scores. [b, np, s, s]
+#         # ===================================
+
+#         # [b, np, sq, sk]
+#         output_size = (query_layer.size(1),
+#                        query_layer.size(2),
+#                        query_layer.size(0),
+#                        key_layer.size(0))
+
+#         # [sq, b, np, hn] -> [sq, b * np, hn]
+#         query_layer = query_layer.reshape(output_size[2],
+#                                           output_size[0] * output_size[1], -1)
+#         # [sk, b, np, hn] -> [sk, b * np, hn]
+#         key_layer = key_layer.view(output_size[3],
+#                                    output_size[0] * output_size[1], -1)
+
+#         # preallocting input tensor: [b * np, sq, sk]
+#         matmul_input_buffer = mpu.get_global_memory_buffer().get_tensor(
+#             (output_size[0]*output_size[1], output_size[2], output_size[3]),
+#             query_layer.dtype, "mpu")
+
+#         # Raw attention scores. [b * np, sq, sk]
+#         matmul_result = torch.baddbmm(
+#             matmul_input_buffer,
+#             query_layer.transpose(0, 1),   # [b * np, sq, hn]
+#             key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
+#             beta=0.0, alpha=(1.0/self.norm_factor))
+
+#         # change view to [b, np, sq, sk]
+#         attention_scores = matmul_result.view(*output_size)
+
+#         # ===========================
+#         # Attention probs and dropout
+#         # ===========================
+
+#         # attention scores and attention mask [b, np, sq, sk]
+#         attention_probs = self.scale_mask_softmax(attention_scores,
+#                                                   attention_mask)
+
+#         # This is actually dropping out entire tokens to attend to, which might
+#         # seem a bit unusual, but is taken from the original Transformer paper.
+#         if not self.sequence_parallel:
+#             with tensor_parallel.get_cuda_rng_tracker().fork():
+#                 attention_probs = self.attention_dropout(attention_probs)
+#         else:
+#             attention_probs = self.attention_dropout(attention_probs)
+
+#         # =========================
+#         # Context layer. [sq, b, hp]
+#         # =========================
+
+#         # value_layer -> context layer.
+#         # [sk, b, np, hn] --> [b, np, sq, hn]
+
+#         # context layer shape: [b, np, sq, hn]
+#         output_size = (value_layer.size(1),
+#                        value_layer.size(2),
+#                        query_layer.size(0),
+#                        value_layer.size(3))
+
+#         # change view [sk, b * np, hn]
+#         value_layer = value_layer.view(value_layer.size(0),
+#                                        output_size[0] * output_size[1], -1)
+
+#         # change view [b * np, sq, sk]
+#         attention_probs = attention_probs.view(output_size[0] * output_size[1],
+#                                                output_size[2], -1)
+
+#         # matmul: [b * np, sq, hn]
+#         context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
+
+#         # change view [b, np, sq, hn]
+#         context_layer = context_layer.view(*output_size)
+
+#         # [b, np, sq, hn] --> [sq, b, np, hn]
+#         context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
+
+#         # [sq, b, np, hn] --> [sq, b, hp]
+#         new_context_layer_shape = context_layer.size()[:-2] + \
+#             (self.hidden_size_per_partition,)
+#         context_layer = context_layer.view(*new_context_layer_shape)
+
+#         return context_layer
+
+def flash_selfattention_forward_wrapper(fn):
+    @wraps(fn)
+    def wrapper(self, q, k, v):
+        """Implements the multihead softmax attention.
+        Arguments
+        ---------
+            q, k, v: The tensor containing the query, key, and value. (B, S, H, D)
+        """
+
+        assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q,k,v)))
+        assert all((i.is_cuda for i in (q,k,v)))
+
+        batch_size, seqlen_q, num_heads = q.shape[0], q.shape[1], q.shape[2]
+        seqlen_k = k.shape[1]
+
+        q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]]
+        cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32,
+                                    device=q.device)
+
+        if self.training:
+            # during training q,k,v always have same seqlen
+            assert seqlen_k == seqlen_q
+
+            is_causal = self.causal
+            cu_seqlens_k = cu_seqlens_q
+            dropout_p = self.dropout_p
+        else:
+            # turn off FA causal mask after first inference autoregressive iteration
+            # only on first autoregressive step q,k,v have same seqlen
+            is_causal = seqlen_q == seqlen_k
+            cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32,
+                        device=q.device)
+            dropout_p = 0
+
+        output = flash_attn_unpadded_func(
+            q, k, v, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k,
+            dropout_p,
+            softmax_scale=self.softmax_scale, causal=is_causal,
+            imp_mode=1 if batch_size * num_heads >= 32 else 0
+        )
+
+        output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
+        return output
+    return wrapper
+
+def parallel_attention_init(self, config, layer_number,
+                            attention_type=AttnType.self_attn,
+                            attn_mask_type=AttnMaskType.padding,
+                            rlhf_training=False):
+    super(ParallelAttention, self).__init__()
+    if rlhf_training:
+        args = get_rlhf_args()
+    else:
+        args = get_args()
+    self.layer_number = max(1, layer_number)
+    self.attention_type = attention_type
+    self.attn_mask_type = attn_mask_type
+    self.params_dtype = config.params_dtype
+    self.sequence_parallel = config.sequence_parallel
+    self.config = config
+    self.group_query_attention = args.group_query_attention
+    self.num_query_groups = args.num_query_groups
+    self.num_attention_heads = config.num_attention_heads
+    self.num_key_value_heads = args.num_query_groups
+    self.use_gqa = args.group_query_attention
+
+    query_projection_size = config.kv_channels * config.num_attention_heads
+    if self.group_query_attention:
+        kv_projection_size = args.kv_channels * args.num_query_groups
+    else:
+        kv_projection_size = args.kv_channels * args.num_attention_heads
+
+    self.use_flash_attn = args.use_flash_attn \
+        and attention_type == AttnType.self_attn \
+        and self.attn_mask_type == AttnMaskType.causal
+    if self.use_flash_attn:
+        if flash_attn_unpadded_func is None:
+            raise ImportError('FlashAttention is not installed, please install with '
+                                'pip install flash-attn')
+        assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports '
+                                                        'self-attention for now')
+        assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only '
+                                                            'supports causal mask for now')
+        if rearrange is None:
+            raise ImportError('einops is not installed, please install with pip install einops')
+
+    # Per attention head and per partition values.
+    world_size = mpu.get_tensor_model_parallel_world_size()
+    self.hidden_size_per_attention_head = core.utils.divide(
+        query_projection_size, config.num_attention_heads)
+    self.num_attention_heads_per_partition = core.utils.divide(
+        config.num_attention_heads, world_size)
+
+    if self.group_query_attention:
+        if args.num_query_groups % world_size != 0:
+            raise NotImplementedError('Currently the num_query_groups should be '
+                                        'a multiple of the tensor parallel size')
+        self.num_query_groups_per_partition = core.utils.divide(
+                    args.num_query_groups, world_size)
+    else:
+        self.num_query_groups_per_partition = self.num_attention_heads_per_partition
+
+    # Strided linear layer.
+    if attention_type == AttnType.self_attn:
+        self.query_key_value = tensor_parallel.ColumnParallelLinear(
+            config.hidden_size,
+            query_projection_size + 2 * kv_projection_size,
+            config=config,
+            init_method=config.init_method,
+            bias=args.add_bias_linear or args.add_qkv_bias,
+            gather_output=False)
+    else:
+        assert attention_type == AttnType.cross_attn
+
+        if self.group_query_attention:
+            raise NotImplementedError("Grouped query attention not implemented for cross-attention.")
+        assert query_projection_size == kv_projection_size
+
+        self.query = tensor_parallel.ColumnParallelLinear(
+            config.hidden_size,
+            query_projection_size,
+            config=config,
+            init_method=config.init_method,
+            bias=config.add_bias_linear,
+            gather_output=False)
+
+        self.key_value = tensor_parallel.ColumnParallelLinear(
+            config.hidden_size,
+            2 * kv_projection_size,
+            config=config,
+            init_method=config.init_method,
+            bias=config.add_bias_linear,
+            gather_output=False)
+
+    # Currently FlashAttention only works with causal mask
+    if self.use_flash_attn:
+        local_attn = FlashSelfAttention(causal=True, attention_dropout=config.attention_dropout)
+    else:
+        local_attn = CoreAttention(self.layer_number, config, self.attn_mask_type)
+
+    self.enable_ds_sequence_parallel = False
+
+    if args.deepspeed:
+        self.enable_ds_sequence_parallel = parallel_state.get_sequence_parallel_world_size() > 1 \
+                                            or args.force_ds_sequence_parallel
+
+    if self.enable_ds_sequence_parallel:
+        assert dist_attn_supported, 'Distributed attention is not supported in this DeepSpeed version'
+        assert args.num_attention_heads % parallel_state.get_sequence_parallel_world_size() == 0
+        self.dist_attn = DistributedAttention(
+            local_attn, 
+            parallel_state.get_sequence_parallel_group(), 
+            gather_idx=1 if args.use_flash_attn else 0) 
+        # flash_attn_cuda assumes [b, s, nh, hd] layout, we need to make sure all2all gathers into the correct sequence dimension.
+    else:
+        if self.use_flash_attn:
+            self.core_attention_flash = local_attn
+        else:
+            self.core_attention = local_attn
+            self.checkpoint_core_attention = config.recompute_granularity == 'selective'
+
+    # Output.
+    self.dense = tensor_parallel.RowParallelLinear(
+        query_projection_size,
+        config.hidden_size,
+        config=config,
+        init_method=config.output_layer_init_method,
+        bias=args.add_bias_linear,
+        input_is_parallel=True,
+        skip_bias_add=True)
+
+def parallel_attention_forward_wrapper(fn):
+    @wraps(fn)
+    def wrapper(self, hidden_states, attention_mask,
+                encoder_output=None, inference_params=None,
+                rotary_pos_emb=None, position_ids=None):
+        # hidden_states: [sq, b, h]
+
+        # Inference or Forward 使用, 会影响 RoPE
+        if position_ids is not None:
+            # position_ids = position_ids.transpose(1, 0) #[s, b]
+            ## 适配rope fused kernel
+            position_ids = position_ids.transpose(1, 0)[:, 0].unsqueeze(-1) #[s, b] -> [s, b] -> [s, 1]  rope position ids embedding 在同一位置是一样的
+
+        # =================================================
+        # Pre-allocate memory for key-values for inference.
+        # =================================================
+        is_first_step = False
+        if inference_params:
+            if self.layer_number not in inference_params.key_value_memory_dict:
+                inf_max_seq_len = inference_params.max_sequence_length
+                inf_max_batch_size = inference_params.max_batch_size
+                inference_key_memory = self._allocate_memory(
+                    inf_max_seq_len, inf_max_batch_size,
+                    self.num_query_groups_per_partition)
+                inference_value_memory = self._allocate_memory(
+                    inf_max_seq_len, inf_max_batch_size,
+                    self.num_query_groups_per_partition)
+
+                inference_params.key_value_memory_dict[self.layer_number] = (
+                    inference_key_memory, inference_value_memory)
+                is_first_step = True
+            else:
+                inference_key_memory, inference_value_memory = \
+                    inference_params.key_value_memory_dict[self.layer_number]
+
+            # 存储 inference position_ids
+            if is_first_step and position_ids is not None \
+                    and "position_ids" not in inference_params.key_value_memory_dict:
+                inference_params.key_value_memory_dict["position_ids"] = position_ids
+
+        # =====================
+        # Query, Key, and Value
+        # =====================
+
+        if self.attention_type == AttnType.self_attn:
+            # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)]
+            mixed_x_layer, _ = self.query_key_value(hidden_states, inference_params=inference_params)
+
+            # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn]
+            new_tensor_shape = mixed_x_layer.size()[:-1] + (
+                self.num_query_groups_per_partition,
+                (
+                    (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2)
+                    * self.hidden_size_per_attention_head
+                ),
+            )
+            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
+
+            # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn]
+            (query_layer,
+            key_layer,
+            value_layer) = torch.split(
+                mixed_x_layer,
+                [
+                    (
+                        self.num_attention_heads_per_partition // self.num_query_groups_per_partition
+                        * self.hidden_size_per_attention_head
+                    ),
+                    self.hidden_size_per_attention_head,
+                    self.hidden_size_per_attention_head
+                ],
+                dim=3)
+
+            # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] -
+            query_layer = query_layer.contiguous().view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
+        else:
+            # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
+            mixed_kv_layer, _ = self.key_value(encoder_output)
+
+            # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn]
+            new_tensor_shape = mixed_kv_layer.size()[:-1] + \
+                (self.num_attention_heads_per_partition,
+                    2 * self.hidden_size_per_attention_head)
+            mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape)
+
+            # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn]
+            (key_layer,
+                value_layer) = tensor_parallel.split_tensor_along_last_dim(mixed_kv_layer, 2)
+
+            # Attention head [sq, b, h] --> [sq, b, hp]
+            query_layer, _ = self.query(hidden_states)
+            # [sq, b, hp] --> [sq, b, np, hn]
+            new_tensor_shape = query_layer.size()[:-1] + \
+                (self.num_attention_heads_per_partition,
+                    self.hidden_size_per_attention_head)
+            query_layer = query_layer.view(*new_tensor_shape)
+
+        # ==================================
+        # Adjust key and value for inference
+        # ==================================
+
+        # duplicate the pos_emb for self attention
+        if rotary_pos_emb is not None:
+            if isinstance(rotary_pos_emb, tuple):
+                rotary_pos_emb = rotary_pos_emb
+            else:
+                rotary_pos_emb = ((rotary_pos_emb,) * 2)
+
+        if inference_params:
+            batch_start = inference_params.batch_size_offset
+            batch_end = batch_start + key_layer.size(1)
+            assert batch_end <= inference_key_memory.size(1)
+            sequence_start = inference_params.sequence_len_offset
+            sequence_end = sequence_start + key_layer.size(0)
+            assert sequence_end <= inference_key_memory.size(0)
+            # Copy key and values.
+            inference_key_memory[sequence_start:sequence_end,
+                                    batch_start:batch_end, ...] = key_layer
+            inference_value_memory[sequence_start:sequence_end,
+                                    batch_start:batch_end, ...] = value_layer
+            key_layer = inference_key_memory[
+                :sequence_end, batch_start:batch_end, ...]
+            value_layer = inference_value_memory[
+                :sequence_end, batch_start:batch_end, ...]
+
+
+            # adjust the key rotary positional embedding
+            if rotary_pos_emb is not None:
+                q_pos_emb, k_pos_emb = rotary_pos_emb
+                # need to cross check this condition during inference
+                # if not set_inference_key_value_memory:
+                if not is_first_step:
+                    # In inference, we compute one token at a time.
+                    # Select the correct query positional embedding (only the last token in the sequence)
+                    if position_ids is not None:
+                        # 取 last position_id 对应的 q_pos_emb
+                        assert position_ids.shape[0] == 1
+                        # cur_pos_id = position_ids[-1].item()
+                        q_pos_emb = q_pos_emb[position_ids].squeeze(2) # [1, bs, 1, dim]
+
+                        # 取 position_id 对应的 k_pos_emb
+                        k_pos_emb = k_pos_emb.squeeze(1).squeeze(1) # [max_seq, dim]
+                        mem_position_ids = inference_params.key_value_memory_dict["position_ids"]
+                        if mem_position_ids.shape[0] == sequence_end:
+                            k_pos_emb = k_pos_emb[mem_position_ids].unsqueeze(2) # [sequence_end, b, 1, dim]
+                        elif mem_position_ids.shape[0] == sequence_end - 1:
+                            new_position_ids = torch.concat((mem_position_ids, position_ids), 0)
+                            k_pos_emb = k_pos_emb[new_position_ids].unsqueeze(2) # [sequence_end, b, 1, dim]
+                            inference_params.key_value_memory_dict["position_ids"] = new_position_ids # update memory position_ids
+                        else:
+                            raise Exception("input position_ids shape wrong.")
+                    else:
+                        q_pos_emb = q_pos_emb[sequence_end - 1 : sequence_end] # [1, 1, 1, dim]
+                        k_pos_emb = k_pos_emb[:sequence_end, :, :, :] # [sequence_end, 1, 1, dim]
+                else:
+                    # In the first forward pass of inference, we use the entire provided prefix.
+                    # q_pos_emb here has the rope embeddings of the entire prefix + to-be-generated output
+                    # so we slice to just the prefix.
+                    if position_ids is not None:
+                        assert position_ids.shape[0] <= q_pos_emb.shape[0] and q_pos_emb.shape[0] == k_pos_emb.shape[0]
+                        q_pos_emb = q_pos_emb.squeeze(1).squeeze(1) # [max_seq, dim]
+                        q_pos_emb = q_pos_emb[position_ids].unsqueeze(2) # [s, b, 1, dim]
+                        k_pos_emb = k_pos_emb.squeeze(1).squeeze(1) # [max_seq, dim]
+                        k_pos_emb = k_pos_emb[position_ids].unsqueeze(2) # [s, b, 1, dim]
+                    else:
+                        q_pos_emb = q_pos_emb[:sequence_end, :, :, :] # [sequence_end, 1, 1, dim]
+                        k_pos_emb = k_pos_emb[:sequence_end, :, :, :] # [sequence_end, 1, 1, dim]
+
+                rotary_pos_emb = (q_pos_emb, k_pos_emb)
+
+
+        # ==================================
+        # core attention computation
+        # ==================================
+
+        # expand the key_layer and value_layer [sk, b, ng, hn] -> [sk, b, np, hn]
+        # Flash attention support group attention
+        if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1 and not self.use_flash_attn:
+            key_layer = key_layer.repeat_interleave(
+                self.num_attention_heads_per_partition // self.num_query_groups_per_partition,
+                dim = 2
+            )
+            value_layer = value_layer.repeat_interleave(
+                self.num_attention_heads_per_partition // self.num_query_groups_per_partition,
+                dim = 2
+            )
+
+        # apply relative positional encoding (rotary embedding)
+        if rotary_pos_emb is not None:
+            q_pos_emb, k_pos_emb = rotary_pos_emb
+            # query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb,self.config)
+            # key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb,self.config)
+            query_layer = fused_apply_rotary_pos_emb(query_layer, q_pos_emb)
+            key_layer = fused_apply_rotary_pos_emb(key_layer, k_pos_emb)
+            # TODO, can apply positional embedding to value_layer so it has
+            # absolute positional embedding.
+            # otherwise, only relative positional embedding takes effect
+            # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb)
+
+        if self.enable_ds_sequence_parallel:
+            batch_dim_idx = 1
+            if self.use_flash_attn:
+                query_layer, key_layer, value_layer = [rearrange(x, 's b ... -> b s ...').contiguous()
+                        for x in (query_layer, key_layer, value_layer)]
+                batch_dim_idx = 0
+
+                context_layer = self.dist_attn(query_layer, key_layer, value_layer, batch_dim_idx)
+
+                context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous()
+            else:
+                context_layer = self.dist_attn(query_layer, key_layer, value_layer, attention_mask)
+        else:
+            if not self.use_flash_attn:
+                if self.checkpoint_core_attention:
+                    context_layer = self._checkpointed_attention_forward(
+                        query_layer, key_layer, value_layer, attention_mask)
+                else:
+                    context_layer = self.core_attention(
+                        query_layer, key_layer, value_layer, attention_mask)
+            else:
+                q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous()
+                        for x in (query_layer, key_layer, value_layer)]
+                if not self.sequence_parallel:
+                    with tensor_parallel.get_cuda_rng_tracker().fork():
+                        context_layer = self.core_attention_flash(q, k, v)
+                else:
+                    context_layer = self.core_attention_flash(q, k, v)
+                context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous()
+
+        # =================
+        # Output. [sq, b, h]
+        # =================
+
+        output, bias = self.dense(context_layer, inference_params=inference_params)
+
+        return output, bias
+    return wrapper
+
+# class ParallelAttention(MegatronModule):
+#     """Parallel self-attention layer abstract class.
+
+#     Self-attention layer takes input with size [s, b, h]
+#     and returns output of the same size.
+#     """
+
+#     def __init__(self, config, layer_number,
+#                  attention_type=AttnType.self_attn,
+#                  attn_mask_type=AttnMaskType.padding):
+#         super(ParallelAttention, self).__init__()
+#         args = get_args()
+#         self.layer_number = max(1, layer_number)
+#         self.attention_type = attention_type
+#         self.attn_mask_type = attn_mask_type
+#         self.params_dtype = config.params_dtype
+#         self.sequence_parallel = config.sequence_parallel
+#         self.config = config
+#         self.group_query_attention = args.group_query_attention
+#         self.num_query_groups = args.num_query_groups
+#         self.num_attention_heads = config.num_attention_heads
+#         self.num_key_value_heads = args.num_query_groups
+#         self.use_gqa = args.group_query_attention
+
+#         query_projection_size = config.kv_channels * config.num_attention_heads
+#         if self.group_query_attention:
+#             kv_projection_size = args.kv_channels * args.num_query_groups
+#         else:
+#             kv_projection_size = args.kv_channels * args.num_attention_heads
+
+#         self.use_flash_attn = args.use_flash_attn \
+#             and attention_type == AttnType.self_attn \
+#             and self.attn_mask_type == AttnMaskType.causal
+#         if self.use_flash_attn:
+#             if flash_attn_unpadded_func is None:
+#                 raise ImportError('FlashAttention is not installed, please install with '
+#                                   'pip install flash-attn')
+#             assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports '
+#                                                           'self-attention for now')
+#             assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only '
+#                                                                 'supports causal mask for now')
+#             if rearrange is None:
+#                 raise ImportError('einops is not installed, please install with pip install einops')
+
+#         # Per attention head and per partition values.
+#         world_size = mpu.get_tensor_model_parallel_world_size()
+#         self.hidden_size_per_attention_head = core.utils.divide(
+#             query_projection_size, config.num_attention_heads)
+#         self.num_attention_heads_per_partition = core.utils.divide(
+#             config.num_attention_heads, world_size)
+
+#         if self.group_query_attention:
+#             if args.num_query_groups % world_size != 0:
+#                 raise NotImplementedError('Currently the num_query_groups should be '
+#                                           'a multiple of the tensor parallel size')
+#             self.num_query_groups_per_partition = core.utils.divide(
+#                         args.num_query_groups, world_size)
+#         else:
+#             self.num_query_groups_per_partition = self.num_attention_heads_per_partition
+
+#         # Strided linear layer.
+#         if attention_type == AttnType.self_attn:
+#             self.query_key_value = tensor_parallel.ColumnParallelLinear(
+#                 config.hidden_size,
+#                 query_projection_size + 2 * kv_projection_size,
+#                 config=config,
+#                 init_method=config.init_method,
+#                 bias=args.add_bias_linear or args.add_qkv_bias,
+#                 gather_output=False)
+#         else:
+#             assert attention_type == AttnType.cross_attn
+
+#             if self.group_query_attention:
+#                 raise NotImplementedError("Grouped query attention not implemented for cross-attention.")
+#             assert query_projection_size == kv_projection_size
+
+#             self.query = tensor_parallel.ColumnParallelLinear(
+#                 config.hidden_size,
+#                 query_projection_size,
+#                 config=config,
+#                 init_method=config.init_method,
+#                 bias=config.add_bias_linear,
+#                 gather_output=False)
+
+#             self.key_value = tensor_parallel.ColumnParallelLinear(
+#                 config.hidden_size,
+#                 2 * kv_projection_size,
+#                 config=config,
+#                 init_method=config.init_method,
+#                 bias=config.add_bias_linear,
+#                 gather_output=False)
+
+#         # Currently FlashAttention only works with causal mask
+#         if self.use_flash_attn:
+#             local_attn = FlashSelfAttention(causal=True, attention_dropout=config.attention_dropout)
+#         else:
+#             local_attn = CoreAttention(self.layer_number, config, self.attn_mask_type)
+
+#         self.enable_ds_sequence_parallel = parallel_state.get_sequence_parallel_world_size() > 1 \
+#                                            or args.force_ds_sequence_parallel
+#         if self.enable_ds_sequence_parallel:
+#             assert dist_attn_supported, 'Distributed attention is not supported in this DeepSpeed version'
+#             assert args.num_attention_heads % parallel_state.get_sequence_parallel_world_size() == 0
+#             self.dist_attn = DistributedAttention(
+#                 local_attn, 
+#                 parallel_state.get_sequence_parallel_group(), 
+#                 gather_idx=1 if args.use_flash_attn else 0) 
+#             # flash_attn_cuda assumes [b, s, nh, hd] layout, we need to make sure all2all gathers into the correct sequence dimension.
+#         else:
+#             if self.use_flash_attn:
+#                 self.core_attention_flash = local_attn
+#             else:
+#                 self.core_attention = local_attn
+#                 self.checkpoint_core_attention = config.recompute_granularity == 'selective'
+
+#         # Output.
+#         self.dense = tensor_parallel.RowParallelLinear(
+#             query_projection_size,
+#             config.hidden_size,
+#             config=config,
+#             init_method=config.output_layer_init_method,
+#             bias=args.add_bias_linear,
+#             input_is_parallel=True,
+#             skip_bias_add=True)
+
+
+#     def _checkpointed_attention_forward(self, query_layer, key_layer,
+#                                         value_layer, attention_mask,
+#                                         rotary_pos_emb=None):
+#         """Forward method with activation checkpointing."""
+#         def custom_forward(*inputs):
+#             query_layer = inputs[0]
+#             key_layer = inputs[1]
+#             value_layer = inputs[2]
+#             attention_mask = inputs[3]
+#             output_ = self.core_attention(query_layer, key_layer,
+#                                           value_layer, attention_mask)
+#             return output_
+
+#         q_pos_emb, k_pos_emb = (None, None) if rotary_pos_emb is None \
+#             else rotary_pos_emb
+
+#         hidden_states = tensor_parallel.checkpoint(
+#             custom_forward,
+#             False, query_layer, key_layer, value_layer, attention_mask,
+#             q_pos_emb, k_pos_emb)
+
+#         return hidden_states
+
+#     def _allocate_memory(self, inference_max_sequence_len, batch_size, num_attention_heads):
+#         return torch.empty(
+#             inference_max_sequence_len,
+#             batch_size,
+#             num_attention_heads,
+#             self.hidden_size_per_attention_head,
+#             dtype=self.params_dtype,
+#             device=torch.cuda.current_device())
+
+#     def repeat_kv(self, hidden_states, n_rep):
+#         slen, batch, num_key_value_heads_per_partition, head_dim = hidden_states.shape
+#         if n_rep == 1:
+#             return hidden_states
+#         elif num_key_value_heads_per_partition == 1:
+#             # If no of KV heads is 1 then just perform expand operation
+#             # instead of unsqueeze, expand and reshape to match query states.
+#             return hidden_states.expand(slen, batch, n_rep, head_dim)
+#         else:
+#             hidden_states = hidden_states[:, :, :, None, :].expand(
+#                 slen, batch, num_key_value_heads_per_partition, n_rep, head_dim)
+#             return hidden_states.reshape(slen, batch,
+#                                          num_key_value_heads_per_partition * n_rep,
+#                                          head_dim)
+                                     
+#     def split_tensor(self, mixed_x_layer):
+#         query_layer, key_layer, value_layer = torch.split(mixed_x_layer, [self.num_key_value_groups, 1, 1], dim=-2)
+#         query_layer = query_layer.reshape(mixed_x_layer.shape[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head))
+#         key_layer = torch.squeeze(key_layer, -2)
+#         value_layer = torch.squeeze(value_layer, -2)
+
+#         return query_layer, key_layer, value_layer
+
+#     def forward(self, hidden_states, attention_mask,
+#                 encoder_output=None, inference_params=None,
+#                 rotary_pos_emb=None):
+#         # hidden_states: [sq, b, h]
+
+#         # =================================================
+#         # Pre-allocate memory for key-values for inference.
+#         # =================================================
+#         is_first_step = False
+#         if inference_params:
+#             if self.layer_number not in inference_params.key_value_memory_dict:
+#                 inf_max_seq_len = inference_params.max_sequence_length
+#                 inf_max_batch_size = inference_params.max_batch_size
+#                 inference_key_memory = self._allocate_memory(
+#                     inf_max_seq_len, inf_max_batch_size,
+#                     self.num_query_groups_per_partition)
+#                 inference_value_memory = self._allocate_memory(
+#                     inf_max_seq_len, inf_max_batch_size,
+#                     self.num_query_groups_per_partition)
+
+#                 inference_params.key_value_memory_dict[self.layer_number] = (
+#                     inference_key_memory, inference_value_memory)
+#                 is_first_step = True
+#             else:
+#                 inference_key_memory, inference_value_memory = \
+#                     inference_params.key_value_memory_dict[self.layer_number]
+
+#         # =====================
+#         # Query, Key, and Value
+#         # =====================
+
+#         if self.attention_type == AttnType.self_attn:
+#             # Attention heads [sq, b, h] --> [sq, b, ng * (np/ng + 2) * hn)]
+#             mixed_x_layer, _ = self.query_key_value(hidden_states)
+
+#             # [sq, b, hp] --> [sq, b, ng, (np/ng + 2) * hn]
+#             new_tensor_shape = mixed_x_layer.size()[:-1] + (
+#                 self.num_query_groups_per_partition,
+#                 (
+#                     (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2)
+#                     * self.hidden_size_per_attention_head
+#                 ),
+#             )
+#             mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
+
+#             # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn]
+#             (query_layer,
+#             key_layer,
+#             value_layer) = torch.split(
+#                 mixed_x_layer,
+#                 [
+#                     (
+#                         self.num_attention_heads_per_partition // self.num_query_groups_per_partition
+#                         * self.hidden_size_per_attention_head
+#                     ),
+#                     self.hidden_size_per_attention_head,
+#                     self.hidden_size_per_attention_head
+#                 ],
+#                 dim=3)
+
+#             # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] -
+#             query_layer = query_layer.reshape(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
+#         else:
+#             # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
+#             mixed_kv_layer, _ = self.key_value(encoder_output)
+
+#             # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn]
+#             new_tensor_shape = mixed_kv_layer.size()[:-1] + \
+#                 (self.num_attention_heads_per_partition,
+#                  2 * self.hidden_size_per_attention_head)
+#             mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape)
+
+#             # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn]
+#             (key_layer,
+#              value_layer) = tensor_parallel.split_tensor_along_last_dim(mixed_kv_layer, 2)
+
+#             # Attention head [sq, b, h] --> [sq, b, hp]
+#             query_layer, _ = self.query(hidden_states)
+#             # [sq, b, hp] --> [sq, b, np, hn]
+#             new_tensor_shape = query_layer.size()[:-1] + \
+#                 (self.num_attention_heads_per_partition,
+#                  self.hidden_size_per_attention_head)
+#             query_layer = query_layer.view(*new_tensor_shape)
+
+#         # ==================================
+#         # Adjust key and value for inference
+#         # ==================================
+
+#         # duplicate the pos_emb for self attention
+#         if rotary_pos_emb is not None:
+#             if isinstance(rotary_pos_emb, tuple):
+#                 rotary_pos_emb = rotary_pos_emb
+#             else:
+#                 rotary_pos_emb = ((rotary_pos_emb,) * 2)
+
+#         if inference_params:
+#             batch_start = inference_params.batch_size_offset
+#             batch_end = batch_start + key_layer.size(1)
+#             assert batch_end <= inference_key_memory.size(1)
+#             sequence_start = inference_params.sequence_len_offset
+#             sequence_end = sequence_start + key_layer.size(0)
+#             assert sequence_end <= inference_key_memory.size(0)
+#             # Copy key and values.
+#             inference_key_memory[sequence_start:sequence_end,
+#                                  batch_start:batch_end, ...] = key_layer
+#             inference_value_memory[sequence_start:sequence_end,
+#                                    batch_start:batch_end, ...] = value_layer
+#             key_layer = inference_key_memory[
+#                 :sequence_end, batch_start:batch_end, ...]
+#             value_layer = inference_value_memory[
+#                 :sequence_end, batch_start:batch_end, ...]
+
+
+#             # adjust the key rotary positional embedding
+#             if rotary_pos_emb is not None:
+#                 q_pos_emb, k_pos_emb = rotary_pos_emb
+#                 # need to cross check this condition during inference
+#                 # if not set_inference_key_value_memory:
+#                 if not is_first_step:
+#                     # In inference, we compute one token at a time.
+#                     # Select the correct positional embedding
+#                     # (only the last token in the sequence)
+#                     q_pos_emb = q_pos_emb[sequence_end - 1 : sequence_end]
+#                 else:
+#                     # In the first forward pass of inference,
+#                     # we use the entire provided prefix.
+#                     # q_pos_emb here has the rope embeddings of the entire
+#                     # prefix + to-be-generated output so
+#                     # we slice to just the prefix.
+#                     q_pos_emb = q_pos_emb[:sequence_end, :, :, :]
+#                 k_pos_emb = k_pos_emb[:sequence_end, :, :, :]
+#                 rotary_pos_emb = (q_pos_emb, k_pos_emb)
+
+
+#         # ==================================
+#         # core attention computation
+#         # ==================================
+
+#         # expand the key_layer and value_layer [sk, b, ng, hn] -> [sk, b, np, hn]
+#         # Flash attention support group attention
+#         if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1 and not self.use_flash_attn:
+#             key_layer = key_layer.repeat_interleave(
+#                 self.num_attention_heads_per_partition // self.num_query_groups_per_partition,
+#                 dim = 2
+#             )
+#             value_layer = value_layer.repeat_interleave(
+#                 self.num_attention_heads_per_partition // self.num_query_groups_per_partition,
+#                 dim = 2
+#             )
+
+#         # apply relative positional encoding (rotary embedding)
+#         if rotary_pos_emb is not None:
+#             q_pos_emb, k_pos_emb = rotary_pos_emb
+#             query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb,self.config)
+#             key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb,self.config)
+#             # TODO, can apply positional embedding to value_layer so it has
+#             # absolute positional embedding.
+#             # otherwise, only relative positional embedding takes effect
+#             # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb)
+
+#         if self.enable_ds_sequence_parallel:
+#             batch_dim_idx = 1
+#             if self.use_flash_attn:
+#                 query_layer, key_layer, value_layer = [rearrange(x, 's b ... -> b s ...').contiguous()
+#                         for x in (query_layer, key_layer, value_layer)]
+#                 batch_dim_idx = 0
+
+#                 context_layer = self.dist_attn(query_layer, key_layer, value_layer, batch_dim_idx)
+
+#                 context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous()
+#             else:
+#                 context_layer = self.dist_attn(query_layer, key_layer, value_layer, attention_mask)
+#         else:
+#             if not self.use_flash_attn:
+#                 if self.checkpoint_core_attention:
+#                     context_layer = self._checkpointed_attention_forward(
+#                         query_layer, key_layer, value_layer, attention_mask)
+#                 else:
+#                     context_layer = self.core_attention(
+#                         query_layer, key_layer, value_layer, attention_mask)
+#             else:
+#                 q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous()
+#                         for x in (query_layer, key_layer, value_layer)]
+#                 if not self.sequence_parallel:
+#                     with tensor_parallel.get_cuda_rng_tracker().fork():
+#                         context_layer = self.core_attention_flash(q, k, v)
+#                 else:
+#                     context_layer = self.core_attention_flash(q, k, v)
+#                 context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous()
+
+#         # =================
+#         # Output. [sq, b, h]
+#         # =================
+
+#         output, bias = self.dense(context_layer)
+
+#         return output, bias
+
+def parallel_transformer_layer_init(self, config,
+                                    layer_number, layer_type=LayerType.encoder,
+                                    self_attn_mask_type=AttnMaskType.padding,
+                                    drop_path_rate=0., num_experts=1,
+                                    rlhf_training=False):
+    if rlhf_training:
+        args = get_rlhf_args()
+    else:
+        args = get_args()
+    self.args = args
+
+    super(ParallelTransformerLayer, self).__init__()
+    self.layer_number = layer_number
+    self.layer_type = layer_type
+
+    self.apply_residual_connection_post_norm \
+        = config.apply_residual_connection_post_layernorm
+
+    self.bf16 = config.bf16
+    self.fp32_residual_connection = config.fp32_residual_connection
+
+    # Normalize the input data.
+    self.input_norm = get_norm(config)
+
+    # Self attention.
+    self.self_attention = ParallelAttention(
+        config,
+        layer_number,
+        attention_type=AttnType.self_attn,
+        attn_mask_type=self_attn_mask_type,
+        rlhf_training=rlhf_training)
+    self.hidden_dropout = config.hidden_dropout
+    self.bias_dropout_fusion = config.bias_dropout_fusion
+    self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else None
+
+    # Normalize the attention output
+    if self.args.normalization != "RMSNorm":
+        self.post_attention_norm = get_norm(config)
+    else:
+        self.post_attention_norm = get_rmsnorm_residual(config)
+
+    # Cross attention.
+    if self.layer_type in (LayerType.decoder,
+                            LayerType.retro_decoder,
+                            LayerType.retro_decoder_with_retriever,
+                            LayerType.retro_encoder):
+        self.inter_attention = ParallelAttention(
+            config,
+            layer_number,
+            attention_type=AttnType.cross_attn,
+            rlhf_training=rlhf_training)
+        # Normalize the attention output.
+        self.post_inter_attention_norm = get_norm(config)
+
+    # MLP
+    self.num_experts = num_experts
+    if not args.deepspeed:
+        if args.num_experts is not None:
+            self.mlp = SwitchMLP(config) # Megatron-LM's MoE
+        else:
+            self.mlp = ParallelMLP(config, rlhf_training=rlhf_training)
+    else:
+        if self.num_experts <= 1: # dense, not MoE
+            self.mlp = ParallelMLP(config, rlhf_training=rlhf_training)
+        else: # DeepSpeed's MoE
+            enable_expert_tensor_parallelism = args.enable_expert_tensor_parallelism
+            self.mlp = MoE(args.hidden_size,
+                            ParallelMLP(config,
+                                        moe=True,
+                                        enable_expert_tensor_parallelism=enable_expert_tensor_parallelism),
+                            num_experts=self.num_experts,
+                            ep_size=args.moe_expert_parallel_size,
+                            k=args.topk,
+                            use_residual=(args.mlp_type == 'residual'),
+                            capacity_factor=args.moe_train_capacity_factor,
+                            eval_capacity_factor=args.moe_eval_capacity_factor,
+                            min_capacity=args.moe_min_capacity,
+                            drop_tokens=args.moe_token_dropping,
+                            use_tutel=args.use_tutel,
+                            enable_expert_tensor_parallelism=enable_expert_tensor_parallelism,
+                            top2_2nd_expert_sampling=args.moe_top2_2nd_expert_sampling)
+
+    # Set bias+dropout+add fusion grad_enable execution handler.
+    TORCH_MAJOR = int(torch.__version__.split('.')[0])
+    TORCH_MINOR = int(torch.__version__.split('.')[1])
+    use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)
+    self.bias_dropout_add_exec_handler = \
+            nullcontext if use_nvfuser else torch.enable_grad
+
+    if args.retro_add_retriever:
+        self.retro_num_neighbors = args.retro_num_neighbors
+        self.retro_chunk_length = args.retro_chunk_length
+        self.retro_retrieved_length = \
+            args.retro_num_retrieved_chunks * args.retro_chunk_length
+
+    # Retriever (bi-directional transformer with cross attention)
+    if layer_type == LayerType.retro_decoder_with_retriever:
+        self.retriever = ParallelTransformer(
+            config=config,
+            model_type=ModelType.retro_encoder,
+            self_attn_mask_type=AttnMaskType.padding,
+            pre_process=True,
+            post_process=False,
+        )
+        self._retriever_key = 'retriever'
+    else:
+        self.retriever = None
+
+def parallel_transformer_layer_forward_wrapper(fn):
+    @wraps(fn)
+    def wrapper(self, hidden_states, attention_mask=None,
+                encoder_output=None, enc_dec_attn_mask=None,
+                retriever_input=None,
+                retriever_output=None,
+                retriever_attn_mask=None,
+                inference_params=None,
+                rotary_pos_emb=None,
+                position_ids=None, **kwargs):
+        # Update the params in case the retro param changes during inference
+        # TODO: better redesign with inference param
+        args = get_args()
+
+        # if not args.deepspeed:
+        #     return fn(self, hidden_states, attention_mask=attention_mask,
+        #               encoder_output=encoder_output, enc_dec_attn_mask=enc_dec_attn_mask,
+        #               retriever_input=retriever_input,
+        #               retriever_output=retriever_output,
+        #               retriever_attn_mask=retriever_attn_mask,
+        #               inference_params=inference_params,
+        #               rotary_pos_emb=rotary_pos_emb, **kwargs)
+
+        if args.retro_add_retriever:
+            self.retro_num_neighbors = args.retro_num_neighbors
+            self.retro_chunk_length = args.retro_chunk_length
+            self.retro_retrieved_length = \
+                args.retro_num_retrieved_chunks * args.retro_chunk_length
+
+        # hidden_states: [s, b, h]
+
+        # Layer norm at the beginning of the transformer layer.
+        norm_output = self.input_norm(hidden_states)
+
+        # Self attention.
+        attention_output, attention_bias = \
+            self.self_attention(
+                norm_output,
+                attention_mask,
+                inference_params=inference_params,
+                rotary_pos_emb=rotary_pos_emb,
+                position_ids=position_ids,)
+
+        # Residual connection.
+        if self.apply_residual_connection_post_norm:
+            residual = norm_output
+        else:
+            residual = hidden_states
+
+        if self.drop_path is None:
+            # jit scripting for a nn.module (with dropout) is not
+            # trigerring the fusion kernel. For now, we use two
+            # different nn.functional routines to account for varying
+            # dropout semantics during training and inference phases.
+            if self.bias_dropout_fusion:
+                if self.training:
+                    bias_dropout_add_func = bias_dropout_add_fused_train
+                else:
+                    bias_dropout_add_func = bias_dropout_add_fused_inference
+            else:
+                bias_dropout_add_func = get_bias_dropout_add(self.training)
+
+            if attention_bias is not None:
+                attention_bias = attention_bias.expand_as(residual)
+            with self.bias_dropout_add_exec_handler():
+            #     norm_input = bias_dropout_add_func(
+            #         attention_output,
+            #         attention_bias,
+            #         residual,
+            #         self.hidden_dropout)
+                if self.args.normalization != "RMSNorm":
+                    norm_input = bias_dropout_add_func(
+                        attention_output,
+                        attention_bias,
+                        residual,
+                        self.hidden_dropout)
+                else:
+                    if attention_bias is not None:
+                        attention_output = attention_output + attention_bias
+                    out = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
+                    norm_output, norm_input = self.post_attention_norm(out, residual)
+        else:
+            out = torch.nn.functional.dropout(attention_output + attention_bias,
+                                                p=self.hidden_dropout,
+                                                training=self.training)
+            # norm_input = residual + self.drop_path(out)
+            if self.args.normalization != "RMSNorm":
+                norm_input = residual + self.drop_path(out)
+            else:
+                norm_output, norm_input = self.post_attention_norm(self.drop_path(out), residual)
+
+        # Layer norm post the self attention.
+        # norm_output = self.post_attention_norm(norm_input)
+        if self.args.normalization != "RMSNorm":
+            norm_output = self.post_attention_norm(norm_input)
+
+        # Cross attention.
+        if self.layer_type == LayerType.encoder:
+            pass
+        elif self.layer_type == LayerType.decoder:
+            norm_input, norm_output = \
+                self.default_decoder_cross_attention(
+                    encoder_output,
+                    enc_dec_attn_mask,
+                    norm_input,
+                    norm_output,
+                    bias_dropout_add_func)
+        elif self.layer_type == LayerType.retro_encoder:
+            norm_input, norm_output = \
+                self.retro_encoder_cross_attention(
+                    retriever_output,
+                    norm_input,
+                    norm_output,
+                    bias_dropout_add_func)
+        elif self.layer_type in (LayerType.retro_decoder,
+                                    LayerType.retro_decoder_with_retriever):
+            retriever_output, norm_input, norm_output = \
+                self.retro_decoder_cross_attention(
+                    retriever_input,
+                    retriever_output,
+                    retriever_attn_mask,
+                    norm_input,
+                    norm_output,
+                    inference_params,
+                    bias_dropout_add_func)
+        else:
+            raise Exception("Unsupported layer type, '%s'." %
+                            self.layer_type.name)
+
+        # MLP.
+        moe_loss = torch.tensor(0.0, device=norm_output.device, dtype=norm_output.dtype)
+        mlp_bias = torch.tensor(0.0, device=norm_output.device, dtype=norm_output.dtype)
+
+        if self.num_experts > 1 and args.deepspeed:
+            mlp_output, moe_loss, _ = self.mlp(norm_output)
+        else:
+            mlp_output, mlp_bias = self.mlp(norm_output, inference_params=inference_params)
+
+        # Second residual connection.
+        if self.apply_residual_connection_post_norm:
+            residual = norm_output
+        else:
+            residual = norm_input
+
+        if self.drop_path is None:
+            if mlp_bias is not None:
+                mlp_bias = mlp_bias.expand_as(residual)
+            with self.bias_dropout_add_exec_handler():
+                output = bias_dropout_add_func(
+                    mlp_output,
+                    mlp_bias,
+                    residual,
+                    self.hidden_dropout)
+
+            # Jit compiled function creates 'view' tensor. This tensor
+            # potentially gets saved in the MPU checkpoint function context,
+            # which rejects view tensors. While making a viewless tensor here
+            # won't result in memory savings (like the data loader, or
+            # p2p_communication), it serves to document the origin of this
+            # 'view' tensor.
+            output = core.utils.make_viewless_tensor(inp = output,
+                                                        requires_grad = output.requires_grad,
+                                                        keep_graph = True)
+
+        else:
+            if mlp_bias is not None:
+                mlp_output = mlp_output + mlp_bias
+            out = torch.nn.functional.dropout(mlp_output,
+                                                p=self.hidden_dropout,
+                                                training=self.training)
+            output = residual + self.drop_path(out)
+
+        if args.deepspeed:
+            if self.layer_type == LayerType.retro_decoder_with_retriever:
+                return output, retriever_output, moe_loss
+            else:
+                return output, moe_loss
+        else:
+            if self.layer_type == LayerType.retro_decoder_with_retriever:
+                return output, retriever_output
+            else:
+                return output
+    return wrapper
+
+# class ParallelTransformerLayer(MegatronModule):
+#     """A single transformer layer.
+
+#     Transformer layer takes input with size [s, b, h] and returns an
+#     output of the same size.
+#     """
+
+#     def __init__(self, config,
+#                  layer_number, layer_type=LayerType.encoder,
+#                  self_attn_mask_type=AttnMaskType.padding,
+#                  drop_path_rate=0., num_experts=1):
+#         args = get_args()
+
+#         super(ParallelTransformerLayer, self).__init__()
+#         self.layer_number = layer_number
+#         self.layer_type = layer_type
+
+#         self.apply_residual_connection_post_norm \
+#             = config.apply_residual_connection_post_layernorm
+
+#         self.bf16 = config.bf16
+#         self.fp32_residual_connection = config.fp32_residual_connection
+
+#         # Normalize the input data.
+#         self.input_norm = get_norm(config)
+
+#         # Self attention.
+#         self.self_attention = ParallelAttention(
+#             config,
+#             layer_number,
+#             attention_type=AttnType.self_attn,
+#             attn_mask_type=self_attn_mask_type)
+#         self.hidden_dropout = config.hidden_dropout
+#         self.bias_dropout_fusion = config.bias_dropout_fusion
+#         self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else None
+
+#         # Normalize the attention output
+#         self.post_attention_norm = get_norm(config)
+
+#         # Cross attention.
+#         if self.layer_type in (LayerType.decoder,
+#                                LayerType.retro_decoder,
+#                                LayerType.retro_decoder_with_retriever,
+#                                LayerType.retro_encoder):
+#             self.inter_attention = ParallelAttention(
+#                 config,
+#                 layer_number,
+#                 attention_type=AttnType.cross_attn)
+#             # Normalize the attention output.
+#             self.post_inter_attention_norm = get_norm(config)
+
+#         # MLP
+#         self.num_experts = num_experts
+#         if not args.deepspeed:
+#             if args.num_experts is not None:
+#                 self.mlp = SwitchMLP(config) # Megatron-LM's MoE
+#             else:
+#                 self.mlp = ParallelMLP(config)
+#         else:
+#             if self.num_experts <= 1: # dense, not MoE
+#                 self.mlp = ParallelMLP(config)
+#             else: # DeepSpeed's MoE
+#                 enable_expert_tensor_parallelism = args.enable_expert_tensor_parallelism
+#                 self.mlp = MoE(args.hidden_size,
+#                                ParallelMLP(config,
+#                                            moe=True,
+#                                            enable_expert_tensor_parallelism=enable_expert_tensor_parallelism),
+#                                num_experts=self.num_experts,
+#                                ep_size=args.moe_expert_parallel_size,
+#                                k=args.topk,
+#                                use_residual=(args.mlp_type == 'residual'),
+#                                capacity_factor=args.moe_train_capacity_factor,
+#                                eval_capacity_factor=args.moe_eval_capacity_factor,
+#                                min_capacity=args.moe_min_capacity,
+#                                drop_tokens=args.moe_token_dropping,
+#                                use_tutel=args.use_tutel,
+#                                enable_expert_tensor_parallelism=enable_expert_tensor_parallelism,
+#                                top2_2nd_expert_sampling=args.moe_top2_2nd_expert_sampling)
+
+#         # Set bias+dropout+add fusion grad_enable execution handler.
+#         TORCH_MAJOR = int(torch.__version__.split('.')[0])
+#         TORCH_MINOR = int(torch.__version__.split('.')[1])
+#         use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)
+#         self.bias_dropout_add_exec_handler = \
+#                 nullcontext if use_nvfuser else torch.enable_grad
+
+#         if args.retro_add_retriever:
+#             self.retro_num_neighbors = args.retro_num_neighbors
+#             self.retro_chunk_length = args.retro_chunk_length
+#             self.retro_retrieved_length = \
+#                 args.retro_num_retrieved_chunks * args.retro_chunk_length
+
+#         # Retriever (bi-directional transformer with cross attention)
+#         if layer_type == LayerType.retro_decoder_with_retriever:
+#             self.retriever = ParallelTransformer(
+#                 config=config,
+#                 model_type=ModelType.retro_encoder,
+#                 self_attn_mask_type=AttnMaskType.padding,
+#                 pre_process=True,
+#                 post_process=False,
+#             )
+#             self._retriever_key = 'retriever'
+#         else:
+#             self.retriever = None
+
+#     def default_decoder_cross_attention(self,
+#                                         encoder_output,
+#                                         enc_dec_attn_mask,
+#                                         norm_input,
+#                                         norm_output,
+#                                         bias_dropout_add_func):
+#         '''Cross attention for a standard encoder-decoder model.'''
+
+#         # Attention.
+#         attention_output, attention_bias = \
+#             self.inter_attention(norm_output,
+#                                  enc_dec_attn_mask,
+#                                  encoder_output=encoder_output)
+
+#         # Residual connection.
+#         if self.apply_residual_connection_post_norm:
+#             residual = norm_output
+#         else:
+#             residual = norm_input
+
+#         if attention_bias is not None:
+#             attention_bias = attention_bias.expand_as(residual)
+
+#         # Bias-dropout-add.
+#         with self.bias_dropout_add_exec_handler():
+#             norm_input = bias_dropout_add_func(
+#                 attention_output,
+#                 attention_bias,
+#                 residual,
+#                 self.hidden_dropout)
+
+#         # Normalize.
+#         norm_output = self.post_inter_attention_norm(norm_input)
+
+#         return norm_input, norm_output
+
+#     def retro_encoder_cross_attention(self,
+#                                       retriever_output,
+#                                       norm_input,
+#                                       norm_output,
+#                                       bias_dropout_add_func):
+#         """Cross attention for Retro encoder.
+
+#         Notation:
+#             ns : Sequence length.
+#             bs : Batch size.
+#             d  : Hidden size.
+#             l  : Number of chunks per sample (i.e., seq_length/chunk_length).
+#             k  : Number of neighbors.
+#             r  : Number of retrieved tokens (neighbors + continuation).
+#         """
+
+#         ns, bs, d = norm_output.shape # [r, bs * l * k, d]
+
+#         # Divide sequence dimension into chunks.
+#         chunked_outputs = norm_output.reshape(self.retro_retrieved_length,
+#                                               -1,
+#                                               self.retro_num_neighbors,
+#                                               d)
+#         chunked_outputs_before_norm = \
+#             norm_input.reshape(self.retro_retrieved_length, -1,
+#                                self.retro_num_neighbors, d) # [r, bs*l, k, d]
+
+#         # Per-chunk attention.
+#         norm_inputs = []
+#         norm_outputs = []
+#         for k in range(self.retro_num_neighbors):
+
+#             # Attention.
+#             chunked_output = chunked_outputs[:,:,k].contiguous()
+#             attention_output, attention_bias = \
+#                 self.inter_attention(
+#                     chunked_output, # Q (neighbor embedding)
+#                     None,
+#                     encoder_output=retriever_output) # K, V (hidden act)
+
+#             # Residual connection.
+#             if self.apply_residual_connection_post_norm:
+#                 residual = chunked_output
+#             else:
+#                 residual = chunked_outputs_before_norm[:,:,k]
+
+#             # Re-enable torch grad to enable fused optimization.
+#             with torch.enable_grad():
+#                 norm_input = bias_dropout_add_func(
+#                     attention_output,
+#                     None if attention_bias is None else attention_bias.expand_as(residual),
+#                     residual,
+#                     self.hidden_dropout)
+#                 norm_inputs.append(norm_input)
+
+#             # Layer norm.
+#             norm_output = self.post_inter_attention_norm(norm_input)
+#             norm_outputs.append(norm_output)
+
+#         # Concatenate layer norms.
+#         # norm_input : [r, k * bs * l, d]
+#         # norm_output : [r, k * bs * l, d]
+#         norm_input = torch.stack(norm_inputs, dim=1).reshape(ns, bs, d)
+#         norm_output = torch.stack(norm_outputs, dim=1).reshape(ns, bs, d)
+
+#         return norm_input, norm_output
+
+#     def retro_decoder_cross_attention(self,
+#                                       retriever_input,
+#                                       retriever_output,
+#                                       retriever_attn_mask,
+#                                       norm_input,
+#                                       norm_output,
+#                                       inference_params,
+#                                       bias_dropout_add_func):
+#         """Cross attention for Retro decoder.
+
+#         Notation:
+#             ns : Sequence length.
+#             bs : Batch size.
+#             d  : Hidden size.
+#             l  : Number of chunks per sample (i.e., seq_length/chunk_length).
+#             m  : Number of tokens per chunk.
+#             k  : Number of neighbors.
+#             r  : Number of retrieved tokens (neighbors + continuation).
+#         """
+
+#         ns, bs, d = norm_output.shape
+#         l = int(np.ceil(ns / self.retro_chunk_length))
+
+#         # Retrieve neighbors.
+#         if self.layer_type == LayerType.retro_decoder_with_retriever:
+#             first_ns = ns % self.retro_chunk_length
+#             if first_ns > 0:
+#                 first_chunk, rest_chunk = \
+#                     norm_output[:first_ns], norm_output[first_ns:]
+#                 first_chunk = torch.nn.functional.pad(
+#                     first_chunk,
+#                     (0, 0, 0, 0, 0, self.retro_chunk_length - first_ns),
+#                     'constant',
+#                     0)
+#                 chunked_output = \
+#                     torch.cat((first_chunk, rest_chunk), dim=0) # [l * m, bs, d]
+#             else:
+#                 chunked_output = norm_output # [l * m, bs, d]
+#             chunked_output = chunked_output \
+#                 .reshape(l, self.retro_chunk_length, bs, d) \
+#                 .permute(1, 2, 0, 3) \
+#                 .reshape(self.retro_chunk_length, bs * l, d) \
+#                 .contiguous()
+
+#             # Get Encoder Output
+#             retriever_output = self.retriever(
+#                 hidden_states=retriever_input,
+#                 attention_mask=retriever_attn_mask,
+#                 retriever_output=chunked_output,
+#                 retriever_attn_mask=retriever_attn_mask,
+#                 inference_params=inference_params) # [r, k * bs * l , d]
+#             retriever_output = retriever_output.reshape(
+#                 self.retro_retrieved_length * self.retro_num_neighbors, bs * l, d) # [r * k, bs * l, d]
+
+#         # Chunks.
+#         pad = (ns - 1) % self.retro_chunk_length
+#         attending_chunks = norm_output[pad:]
+#         padded_chunks = torch.nn.functional.pad(
+#             attending_chunks,
+#             (0, 0, 0, 0, 0, self.retro_chunk_length - 1),
+#             'constant', 0)
+#         padded_chunked_output = padded_chunks \
+#             .reshape(l, self.retro_chunk_length, bs, d) \
+#             .permute(1, 2, 0, 3)
+#         padded_chunked_output = padded_chunked_output.reshape(
+#             self.retro_chunk_length, bs * l, d).contiguous()
+
+#         # Encoder output.
+#         attention_output, attention_bias = \
+#             self.inter_attention(padded_chunked_output,
+#                                  None,
+#                                  encoder_output=retriever_output)
+
+#         # Residual connection.
+#         if self.apply_residual_connection_post_norm:
+#             residual = norm_output
+#         else:
+#             residual = norm_input
+
+#         # Re-enable torch grad to enable fused optimization.
+#         with torch.enable_grad():
+#             norm_input = bias_dropout_add_func(
+#                 attention_output,
+#                 None if attention_bias is None else attention_bias.expand_as(attention_output),
+#                 torch.zeros_like(attention_output),
+#                 self.hidden_dropout)
+#             norm_input = norm_input \
+#                 .reshape(self.retro_chunk_length, bs, l, d) \
+#                 .permute(2, 0, 1, 3) # [l, m, bs, d]
+#             norm_input = norm_input.reshape(self.retro_chunk_length * l, bs, d)
+#             norm_input = torch.nn.functional.pad(
+#                 norm_input,
+#                 (0, 0, 0, 0, pad, 0),
+#                 'constant', 0)[:ns] # [ns, b, d]
+#             # TODO: better redesign with inference param
+#             args = get_args()
+#             norm_input = args.retro_attention_gate * norm_input + residual
+
+#         # Layer norm post the decoder attention
+#         norm_output = self.post_inter_attention_norm(norm_input)
+
+#         return retriever_output, norm_input, norm_output
+
+#     def forward(self, hidden_states, attention_mask=None,
+#                 encoder_output=None, enc_dec_attn_mask=None,
+#                 retriever_input=None,
+#                 retriever_output=None,
+#                 retriever_attn_mask=None,
+#                 inference_params=None,
+#                 rotary_pos_emb=None, **kwargs):
+
+#         # Update the params in case the retro param changes during inference
+#         # TODO: better redesign with inference param
+#         args = get_args()
+#         if args.retro_add_retriever:
+#             self.retro_num_neighbors = args.retro_num_neighbors
+#             self.retro_chunk_length = args.retro_chunk_length
+#             self.retro_retrieved_length = \
+#                 args.retro_num_retrieved_chunks * args.retro_chunk_length
+
+#         # hidden_states: [s, b, h]
+
+#         # Layer norm at the beginning of the transformer layer.
+#         norm_output = self.input_norm(hidden_states)
+
+#         # Self attention.
+#         attention_output, attention_bias = \
+#             self.self_attention(
+#                 norm_output,
+#                 attention_mask,
+#                 inference_params=inference_params,
+#                 rotary_pos_emb=rotary_pos_emb)
+
+#         # Residual connection.
+#         if self.apply_residual_connection_post_norm:
+#             residual = norm_output
+#         else:
+#             residual = hidden_states
+
+#         if self.drop_path is None:
+#             # jit scripting for a nn.module (with dropout) is not
+#             # trigerring the fusion kernel. For now, we use two
+#             # different nn.functional routines to account for varying
+#             # dropout semantics during training and inference phases.
+#             if self.bias_dropout_fusion:
+#                 if self.training:
+#                     bias_dropout_add_func = bias_dropout_add_fused_train
+#                 else:
+#                     bias_dropout_add_func = bias_dropout_add_fused_inference
+#             else:
+#                 bias_dropout_add_func = get_bias_dropout_add(self.training)
+
+#             if attention_bias is not None:
+#                 attention_bias = attention_bias.expand_as(residual)
+#             with self.bias_dropout_add_exec_handler():
+#                 norm_input = bias_dropout_add_func(
+#                     attention_output,
+#                     attention_bias,
+#                     residual,
+#                     self.hidden_dropout)
+#         else:
+#             out = torch.nn.functional.dropout(attention_output + attention_bias,
+#                                               p=self.hidden_dropout,
+#                                               training=self.training)
+#             norm_input = residual + self.drop_path(out)
+
+#         # Layer norm post the self attention.
+#         norm_output = self.post_attention_norm(norm_input)
+
+#         # Cross attention.
+#         if self.layer_type == LayerType.encoder:
+#             pass
+#         elif self.layer_type == LayerType.decoder:
+#             norm_input, norm_output = \
+#                 self.default_decoder_cross_attention(
+#                     encoder_output,
+#                     enc_dec_attn_mask,
+#                     norm_input,
+#                     norm_output,
+#                     bias_dropout_add_func)
+#         elif self.layer_type == LayerType.retro_encoder:
+#             norm_input, norm_output = \
+#                 self.retro_encoder_cross_attention(
+#                     retriever_output,
+#                     norm_input,
+#                     norm_output,
+#                     bias_dropout_add_func)
+#         elif self.layer_type in (LayerType.retro_decoder,
+#                                  LayerType.retro_decoder_with_retriever):
+#             retriever_output, norm_input, norm_output = \
+#                 self.retro_decoder_cross_attention(
+#                     retriever_input,
+#                     retriever_output,
+#                     retriever_attn_mask,
+#                     norm_input,
+#                     norm_output,
+#                     inference_params,
+#                     bias_dropout_add_func)
+#         else:
+#             raise Exception("Unsupported layer type, '%s'." %
+#                             self.layer_type.name)
+
+#         # MLP.
+#         moe_loss = torch.tensor(0.0, device=norm_output.device, dtype=norm_output.dtype)
+#         mlp_bias = torch.tensor(0.0, device=norm_output.device, dtype=norm_output.dtype)
+
+#         if self.num_experts > 1 and args.deepspeed:
+#             mlp_output, moe_loss, _ = self.mlp(norm_output)
+#         else:
+#             mlp_output, mlp_bias = self.mlp(norm_output)
+
+#         # Second residual connection.
+#         if self.apply_residual_connection_post_norm:
+#             residual = norm_output
+#         else:
+#             residual = norm_input
+
+#         if self.drop_path is None:
+#             if mlp_bias is not None:
+#                 mlp_bias = mlp_bias.expand_as(residual)
+#             with self.bias_dropout_add_exec_handler():
+#                 output = bias_dropout_add_func(
+#                     mlp_output,
+#                     mlp_bias,
+#                     residual,
+#                     self.hidden_dropout)
+
+#             # Jit compiled function creates 'view' tensor. This tensor
+#             # potentially gets saved in the MPU checkpoint function context,
+#             # which rejects view tensors. While making a viewless tensor here
+#             # won't result in memory savings (like the data loader, or
+#             # p2p_communication), it serves to document the origin of this
+#             # 'view' tensor.
+#             output = core.utils.make_viewless_tensor(inp = output,
+#                                                      requires_grad = output.requires_grad,
+#                                                      keep_graph = True)
+
+#         else:
+#             if mlp_bias is not None:
+#                 mlp_output = mlp_output + mlp_bias
+#             out = torch.nn.functional.dropout(mlp_output,
+#                                               p=self.hidden_dropout,
+#                                               training=self.training)
+#             output = residual + self.drop_path(out)
+
+#         if self.layer_type == LayerType.retro_decoder_with_retriever:
+#             return output, retriever_output, moe_loss
+#         else:
+#             return output, moe_loss
+
+
+class ParallelTransformerLayerPipe(ParallelTransformerLayer):
+    """Extends ParallelTransformerLayer to forward attention_mask through the pipeline.
+
+    Forward has two usages that affect attention mask communication:
+
+    1) forward((input, attn_mask) , **kwargs) -> (output, mask)
+       When the attention mask is provided as the second positional
+       argument, typical pipeline behavior is used and both the output
+       *and* mask are returned in a tuple. This tuple is then forwarded
+       to the next stage in the pipeline.
+
+       This version is useful if masks are dynamic.
+
+    2) forward(input, **kwargs) -> output
+       When the mask is static over all samples, it is advantageous to
+       cache the mask and avoid communicating it.
+
+       If no mask is provided, the module will query `self._args.attn_mask`
+       for the mask and only return `super().forward(...)`
+    """
+    def __init__(self, config,
+                 layer_number, layer_type=LayerType.encoder,
+                 self_attn_mask_type=AttnMaskType.padding,
+                 drop_path_rate=0., num_experts=1,
+                 input_aggregated_moe_loss=False, return_aggregated_moe_loss=False):
+        self.input_aggregated_moe_loss = input_aggregated_moe_loss
+        self.return_aggregated_moe_loss = return_aggregated_moe_loss
+        super().__init__(config, layer_number, layer_type, self_attn_mask_type, drop_path_rate, num_experts)
+
+    def forward(self, inputs, **kwargs):
+        assert torch.is_tensor(inputs) or isinstance(inputs, tuple)
+        if not hasattr(self, '_args'):
+            self._args = get_args()
+        rotary_pos_emb = self._args.rotary_pos_emb if self._args.use_rotary_position_embeddings else None
+        if torch.is_tensor(inputs) or len(inputs) == 1:
+            assert not self.input_aggregated_moe_loss, f'Expecting an input tuple of size >= 2'
+            # No attention mask forwarded, search for args.attn_mask
+            hidden_states, attention_mask = inputs, self._args.attn_mask
+            output, moe_loss = super().forward(hidden_states, attention_mask, **kwargs, rotary_pos_emb=rotary_pos_emb)
+            return (output, moe_loss) if self.return_aggregated_moe_loss else output
+        elif len(inputs) in (2, 3):
+            # Attention mask and aggregated_moe can both be activations.
+            return_attention_mask = False
+            if len(inputs) == 2:
+                if self.input_aggregated_moe_loss:
+                    hidden_states, aggregated_moe_loss = inputs[0], inputs[1]
+                    attention_mask = self._args.attn_mask
+                else:
+                    hidden_states, attention_mask = inputs[0], inputs[1]
+                    return_attention_mask = True
+            else:
+                hidden_states, attention_mask, aggregated_moe_loss = inputs[0], inputs[1], inputs[2]
+
+            # Forward aggregated_moe_loss to ParallelTransformerLayer for further accumulation
+            if self.input_aggregated_moe_loss:
+                kwargs.update({'aggregated_moe_loss': aggregated_moe_loss})
+
+            output, moe_loss = super().forward(hidden_states, attention_mask, **kwargs, rotary_pos_emb=rotary_pos_emb)
+
+            ret = (output, )
+            if return_attention_mask:
+                ret += (attention_mask, )
+            if self.return_aggregated_moe_loss:
+                ret += (moe_loss, )
+            return ret
+        else:
+            raise RuntimeError('Received more inputs than understood.')
+
+def get_num_experts_per_layer(num_experts: list, num_layers: int, expert_interval: int, offset: int = 0) -> list:
+    assert len(num_experts) == 1 or len(num_experts) == num_layers // expert_interval, \
+        'num_experts must be either a single value or a list of the same length as the number of MoE layers'
+    if len(num_experts) == 1:
+        num_experts = num_experts * (num_layers // expert_interval)
+    experts_per_layer = []
+    for i in range(num_layers):
+        layer_num = i + 1 + offset
+        n_e = num_experts[(layer_num-1) // expert_interval] if layer_num % expert_interval == 0 else 1
+        experts_per_layer.append(n_e)
+    return experts_per_layer
+
+def parallel_transformer_init(self, config,
+                              model_type, layer_type=LayerType.encoder,
+                              self_attn_mask_type=AttnMaskType.padding,
+                              post_norm=True,
+                              pre_process=True,
+                              post_process=True,
+                              drop_path_rate=0.0,
+                              rlhf_training=False):
+    super(ParallelTransformer, self).__init__()
+    if rlhf_training:
+        args = get_rlhf_args()
+    else:
+        args = get_args()
+
+    self.layer_type = layer_type
+    self.model_type = model_type
+    self.bf16 = config.bf16
+    self.fp32_residual_connection = config.fp32_residual_connection
+    self.post_norm = post_norm
+    self.pre_process = pre_process
+    self.post_process = post_process
+    self.input_tensor = None
+    self.drop_path_rate = drop_path_rate
+    self.transformer_impl = args.transformer_impl
+    self.retro_add_retriever = args.retro_add_retriever
+    self.ds_inference = args.ds_inference
+    self.deepspeed = args.deepspeed
+
+    # Store activation checkpoiting flag.
+    self.checkpoint_activations = args.checkpoint_activations
+    self.checkpoint_num_layers = args.checkpoint_num_layers
+    self.recompute_granularity = config.recompute_granularity
+    if args.recompute_method_per_stage != None:
+        if args.virtual_pipeline_model_parallel_size != None:
+            if args.recompute_method_per_stage[mpu.get_virtual_pipeline_model_parallel_rank() * args.pipeline_model_parallel_size + mpu.get_pipeline_model_parallel_rank()] == 0:
+                self.recompute_method = 'uniform'
+            elif args.recompute_method_per_stage[mpu.get_virtual_pipeline_model_parallel_rank() * args.pipeline_model_parallel_size + mpu.get_pipeline_model_parallel_rank()] == 1:
+                self.recompute_method = 'block'
+        else:
+            if args.recompute_method_per_stage[mpu.get_pipeline_model_parallel_rank()] == 0:
+                self.recompute_method = 'uniform'
+            elif args.recompute_method_per_stage[mpu.get_pipeline_model_parallel_rank()] == 1:
+                self.recompute_method = 'block'
+    else:
+        self.recompute_method = config.recompute_method
+    if args.recompute_num_layers_per_stage != None:
+        if args.virtual_pipeline_model_parallel_size != None:
+            self.recompute_num_layers = args.recompute_num_layers_per_stage[mpu.get_virtual_pipeline_model_parallel_rank() * args.pipeline_model_parallel_size + mpu.get_pipeline_model_parallel_rank()]
+        else:
+            self.recompute_num_layers = args.recompute_num_layers_per_stage[mpu.get_pipeline_model_parallel_rank()]
+    else:
+        self.recompute_num_layers = config.recompute_num_layers
+    self.distribute_saved_activations = \
+        config.distribute_saved_activations and not config.sequence_parallel
+
+    self.sequence_parallel = config.sequence_parallel
+
+    # Transformer Engine Init.
+    self.transformer_engine_rope_available = False
+    self.transformer_engine_v_0_10 = False
+    self.transformer_engine_v_0_11 = False
+    self.transformer_engine_v_0_8 = False
+    self.ixte_v_0_2_3 = False
+    self.use_ixte = False
+    if self.transformer_impl == 'transformer_engine':
+        global transformer_engine
+        import transformer_engine
+        megatron.legacy.model.transformer.transformer_engine = transformer_engine
+        from importlib.metadata import version
+        from pkg_resources import packaging
+
+        if ixte_extensions._USE_IXTE:
+            te_version = packaging.version.Version(ixte_extensions.te_version())
+            self.use_ixte = True
+            ixte_version = packaging.version.Version(ixte_extensions.ixte_version())
+            if ixte_version >= packaging.version.Version("0.2.3"):
+                self.ixte_v_0_2_3 = True
+        else:
+            te_version = packaging.version.Version(version("transformer-engine"))
+        if te_version >= packaging.version.Version("0.8.0"):
+            self.transformer_engine_v_0_8 = True
+        if te_version >= packaging.version.Version("0.10.0"):
+            self.transformer_engine_v_0_10 = True
+        if te_version >= packaging.version.Version("0.11.0"):
+            self.transformer_engine_v_0_11 = True
+        if te_version >= packaging.version.Version("0.10.0"):
+            self.transformer_engine_rope_available = True
+
+        del version, packaging
+
+        assert not args.squared_relu, "TransformerEngine does not support squared relu activation."
+
+    self.use_fp8 = args.fp8 is not None
+    self.fp8_recipe = None
+    self.fp8_group = None
+    if self.use_fp8:
+        assert args.transformer_impl == 'transformer_engine', \
+            'transformer-engine required for fp8 training and inference'
+        self.fp8_group = mpu.get_amax_reduction_group()
+        if args.fp8 == "e4m3":
+            fp8_format = transformer_engine.common.recipe.Format.E4M3
+        elif args.fp8 == "hybrid":
+            fp8_format = transformer_engine.common.recipe.Format.HYBRID
+        else:
+            raise ValueError("The DelayedScaling recipe only supports E4M3 and HYBRID formats.")
+        self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling(
+            margin=args.fp8_margin,
+            interval=args.fp8_interval,
+            fp8_format=fp8_format,
+            amax_history_len=args.fp8_amax_history_len,
+            amax_compute_algo=args.fp8_amax_compute_algo,
+            override_linear_precision=(False, False, not args.fp8_wgrad),
+        )
+
+    self.num_microbatches_in_previous_step = -1
+    self.microbatch_count = 0
+    self.checkpoint_core_attention = config.recompute_granularity == 'selective'
+
+    ## check custom parition pp stage
+    if args.num_layers_per_stage is not None:
+        assert sum(args.num_layers_per_stage) == args.num_layers, \
+            f"total custom partition pp stage transformer layers should equal to model layers" \
+            f"get total custom partition layers ({sum(args.num_layers_per_stage)})  !=   model layers ({args.num_layers})"
+
+    # Number of layers.
+    self.num_layers = _get_num_layers(args, model_type,
+                                        layer_type==LayerType.decoder)
+
+    self.drop_path_rates = [
+        rate.item() for rate in
+        torch.linspace(0, self.drop_path_rate, config.num_layers)]
+
+    self.retro_layer_numbers = None
+    if model_type == ModelType.retro_decoder:
+        retro_layer_start = 6 if config.num_layers <= 15 else 9
+        self.retro_layer_numbers = \
+            np.arange(retro_layer_start, args.num_layers + 1, 3).tolist()
+    if model_type == ModelType.retro_encoder:
+        self.retro_layer_numbers = [1]
+
+    # Transformer layers.
+    if args.retro_add_retriever:
+        assert self.recompute_granularity != 'full', \
+            "Full recompute not supported for Retro."
+        assert args.transformer_impl == 'local', \
+            "Transformer engine does not support Retro layers."
+    def build_layer(layer_number, n_e=1):
+        if args.transformer_impl == 'local':
+            current_layer_type = _get_layer_type(
+                model_type, layer_type, self.retro_layer_numbers,
+                layer_number)
+            return ParallelTransformerLayer(
+                config,
+                layer_number,
+                layer_type=current_layer_type,
+                self_attn_mask_type=self_attn_mask_type,
+                drop_path_rate=self.drop_path_rates[layer_number - 1],
+                num_experts=n_e,
+                rlhf_training=rlhf_training)
+        else:
+            # This argument is only available from TE v0.10 onwards.
+            extra_transformer_engine_kwargs = {}
+            if self.transformer_engine_v_0_8:
+                extra_transformer_engine_kwargs["bias"] = args.add_bias_linear
+            if self.transformer_engine_v_0_10:
+                extra_transformer_engine_kwargs["activation"] = "swiglu" if args.swiglu else "gelu"
+            if self.transformer_engine_v_0_11:
+                extra_transformer_engine_kwargs["normalization"] = args.normalization
+            if not ixte_extensions._USE_IXTE:
+                assert config.attention_softmax_in_fp32, "TransformerEngine only supports softmax compute in FP32."
+            if self.use_ixte:
+                extra_transformer_engine_kwargs["use_alibi"] = args.position_embedding_type == "alibi"
+                if self.ixte_v_0_2_3:
+                    extra_transformer_engine_kwargs["qkv_bias"] = args.add_qkv_bias
+                elif args.add_qkv_bias and not args.add_bias_linear:
+                    raise NotImplementedError("Please update ixTE version to 0.2.3 to support individual qkv_bias!")
+            assert (
+                (bool(int(os.getenv("NVTE_APPLY_QK_LAYER_SCALING", "0"))) and args.fp16) == config.apply_query_key_layer_scaling
+            ), ("Unsupported config for apply_query_key_layer_scaling in TransformerEngine. If --apply-query-key-layer-scaling is "
+                "provided, set env-var NVTE_APPLY_QK_LAYER_SCALING=1 and you must be using fp16.")
+            layer_tmp = transformer_engine.pytorch.TransformerLayer(
+                config.hidden_size,
+                config.ffn_hidden_size,
+                config.num_attention_heads,
+                num_gqa_groups=config.num_query_groups,
+                layernorm_epsilon=config.layernorm_epsilon,
+                hidden_dropout=config.hidden_dropout,
+                attention_dropout=config.attention_dropout,
+                init_method=config.init_method,
+                output_layer_init_method=config.output_layer_init_method,
+                layer_number=layer_number,
+                kv_channels=config.kv_channels,
+                self_attn_mask_type=self_attn_mask_type.name,
+                tp_group=mpu.get_tensor_model_parallel_group() if mpu.is_initialized() else None,
+                tp_size=mpu.get_tensor_model_parallel_world_size(),
+                get_rng_state_tracker=get_cuda_rng_tracker
+                if hasattr(get_cuda_rng_tracker(), 'is_initialized') and get_cuda_rng_tracker().is_initialized()
+                else None,
+                fuse_wgrad_accumulation=config.gradient_accumulation_fusion,
+                seq_length=args.seq_length,
+                micro_batch_size=args.micro_batch_size,
+                sequence_parallel=config.sequence_parallel,
+                params_dtype=config.params_dtype,
+                apply_residual_connection_post_layernorm=config.apply_residual_connection_post_layernorm,
+                output_layernorm=False,
+                layer_type="encoder",
+                drop_path_rate=self.drop_path_rates[layer_number - 1],
+                set_parallel_mode=True,
+                fuse_qkv_params=True,
+                **extra_transformer_engine_kwargs)
+            return layer_tmp
+
+    if config.virtual_pipeline_model_parallel_size is not None:
+        if args.num_layers_per_stage is None:
+            assert config.num_layers % config.virtual_pipeline_model_parallel_size == 0, \
+                'num_layers_per_stage must be divisible by ' \
+                'virtual_pipeline_model_parallel_size'
+            assert args.model_type != ModelType.encoder_and_decoder
+            # Number of layers in each model chunk is the number of layers in the stage,
+            # divided by the number of model chunks in a stage.
+            self.num_layers = self.num_layers // config.virtual_pipeline_model_parallel_size
+            # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of
+            # layers to stages like (each list is a model chunk):
+            # Stage 0: [0]  [2]  [4]  [6]
+            # Stage 1: [1]  [3]  [5]  [7]
+            # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of
+            # layers to stages like (each list is a model chunk):
+            # Stage 0: [0, 1]  [4, 5]
+            # Stage 1: [2, 3]  [6, 7]
+            offset = mpu.get_virtual_pipeline_model_parallel_rank() * (
+                config.num_layers // config.virtual_pipeline_model_parallel_size) + \
+                (mpu.get_pipeline_model_parallel_rank() * self.num_layers)
+        else:
+            offset_list = [0] * len(args.num_layers_per_stage)
+            for i in range(len(args.num_layers_per_stage)):
+                for j in range(i):
+                    offset_list[i] += args.num_layers_per_stage[j]
+            offset = offset_list[mpu.get_virtual_pipeline_model_parallel_rank() * mpu.get_pipeline_model_parallel_world_size() + mpu.get_pipeline_model_parallel_rank()]
+    else:
+        # Each stage gets a contiguous set of layers.
+        if args.model_type == ModelType.encoder_and_decoder and \
+                mpu.get_pipeline_model_parallel_world_size() > 1:
+            pipeline_rank = mpu.get_pipeline_model_parallel_rank()
+            if layer_type == LayerType.encoder:
+                offset = pipeline_rank * self.num_layers
+            else:
+                num_ranks_in_enc = args.pipeline_model_parallel_split_rank
+                offset = (pipeline_rank - num_ranks_in_enc) * self.num_layers
+        else:
+            if args.num_layers_per_stage is not None:
+                offset_list = [0] * len(args.num_layers_per_stage)
+                for i in range(len(args.num_layers_per_stage)):
+                    for j in range(i):
+                        offset_list[i] += args.num_layers_per_stage[j]
+                offset = offset_list[mpu.get_pipeline_model_parallel_rank()]
+            else:
+                offset = mpu.get_pipeline_model_parallel_rank() * self.num_layers
+
+    if self.num_layers == 0:
+        # When a standalone embedding stage is used (e.g.,
+        # args.standalone_embedding_stage == True), virtual pipeline ranks
+        # on pipeline rank 0 will have zero transformer layers assigned to
+        # them. This results in the model's input and output tensors to be
+        # the same, which will cause failure for certain output tensor
+        # optimizations (e.g., pipeline output deallocation). To remedy
+        # this, we assign a 'no-op' layer on these ranks, which will
+        # disconnect the input tensor from the output tensor.
+        self.num_layers = 1
+        self.layers = torch.nn.ModuleList([ NoopTransformerLayer(1) ])
+    else:
+        # Build the layers
+        if not args.deepspeed:
+            self.layers = torch.nn.ModuleList(
+                [build_layer(i + 1 + offset) for i in range(self.num_layers)])
+        else:
+            self.layers = []
+            num_experts = args.ds_num_experts
+            experts_per_layer = get_num_experts_per_layer(num_experts, self.num_layers, args.expert_interval, offset)
+            for i in range(self.num_layers):
+                layer_num = i + 1 + offset
+                n_e = experts_per_layer[i]
+                self.layers.append(build_layer(layer_num, n_e))
+            self.layers = torch.nn.ModuleList(self.layers)
+
+        # Update dropout rate for Retro encoder.
+        if model_type == ModelType.retro_encoder:
+            for layer in self.layers:
+                if layer.self_attention.use_flash_attn:
+                    layer.self_attention.core_attention_flash.dropout_p = \
+                        torch.nn.Dropout(args.retro_encoder_attention_dropout)
+                else:
+                    layer.self_attention.core_attention.attention_dropout.p =\
+                        args.retro_encoder_attention_dropout
+                layer.hidden_dropout = args.retro_encoder_hidden_dropout
+
+    if self.post_process and self.post_norm:
+        # Final layer norm before output.
+        self.final_norm = get_norm(config)
+
+def parallel_transformer__checkpointed_forward_wrapper(fn):
+    @wraps(fn)
+    def wrapper(
+        self, hidden_states, attention_mask,
+        encoder_output, enc_dec_attn_mask,
+        rotary_pos_emb, is_first_microbatch):
+        args = get_args()
+
+        if not args.deepspeed:
+            return fn(self, hidden_states, attention_mask,
+                      encoder_output, enc_dec_attn_mask,
+                      rotary_pos_emb, is_first_microbatch)
+
+        """Forward method with activation checkpointing."""
+        def custom(start, end):
+            def custom_forward(*args, **kwargs):
+                x_, *args = args
+                moe_losses = []
+                for index in range(start, end):
+                    # Is recompute last layer
+                    # Network last layer also can be optimized, because vocab gemm always save forward tenor for backward!
+                    layer = self._get_layer(index)
+                    is_recompute_lastlayer = kwargs.pop('is_recompute_forward', False) and index == end - 1
+                    can_opt_last_gemm = hasattr(layer, 'mlp') and hasattr(layer.mlp, 'fc2') and hasattr(layer.mlp.fc2, 'is_recompute_lastlayer')
+                    if can_opt_last_gemm:
+                        layer.mlp.fc2.is_recompute_lastlayer = is_recompute_lastlayer
+                    output = layer(x_, *args, **kwargs)
+                    if can_opt_last_gemm:
+                        layer.mlp.fc2.is_recompute_lastlayer = False
+                    if isinstance(output, tuple):
+                        x_, moe_loss = output
+                    else:
+                        x_ = output
+                        moe_loss = torch.tensor(0.0, device=x_.device, dtype=x_.dtype, requires_grad=True)
+                    moe_losses.append(moe_loss)
+                return (x_, *moe_losses)
+            return custom_forward
+        
+        if args.deepspeed and args.deepspeed_activation_checkpointing:
+            moe_losses = []
+            # Make sure memory is freed.
+            tensor_parallel.reset_checkpointed_activations_memory_buffer()
+            l = 0
+            while l < self.num_layers:
+                hidden_states, *local_moe_losses = tensor_parallel.checkpoint(
+                    custom(l, l + self.checkpoint_num_layers), False,
+                    hidden_states, attention_mask, encoder_output, enc_dec_attn_mask,
+                    None, None, None, None, rotary_pos_emb)
+                moe_losses.extend(local_moe_losses)
+                l += self.checkpoint_num_layers
+
+            return hidden_states, moe_losses
+        else:
+            moe_losses = []
+            te_forward_kwargs = {}
+            if self.transformer_impl == 'transformer_engine':
+                te_forward_kwargs['is_first_microbatch'] = is_first_microbatch
+                if self.transformer_engine_v_0_10:
+                    te_forward_kwargs['rotary_pos_emb'] = rotary_pos_emb
+
+            if self.recompute_method == 'uniform':
+                # Uniformly divide the total number of Transformer layers and
+                # checkpoint the input activation of each divided chunk.
+                # A method to further reduce memory usage reducing checkpoints.
+                l = 0
+                while l < self.num_layers:
+                    if self.transformer_impl == 'transformer_engine':
+                        hidden_states, *local_moe_losses = transformer_engine.pytorch.checkpoint(
+                            custom(l, l + self.recompute_num_layers),
+                            self.distribute_saved_activations,
+                            tensor_parallel.get_cuda_rng_tracker,
+                            mpu.get_tensor_model_parallel_group(),
+                            hidden_states, attention_mask, encoder_output,
+                            enc_dec_attn_mask, **te_forward_kwargs)
+                    else:
+                        hidden_states, *local_moe_losses = tensor_parallel.checkpoint(
+                            custom(l, l + self.recompute_num_layers),
+                            self.distribute_saved_activations,
+                            hidden_states, attention_mask,
+                            encoder_output, enc_dec_attn_mask,
+                            None, None, None, None, rotary_pos_emb)
+                    moe_losses.extend(local_moe_losses)
+                    l += self.recompute_num_layers
+            elif self.recompute_method == 'block':
+                # Checkpoint the input activation of only a set number of individual
+                # Transformer layers and skip the rest.
+                # A method fully use the device memory removing redundant re-computation.
+                for l in range(self.num_layers):
+                    if l < self.recompute_num_layers:
+                        if self.transformer_impl == 'transformer_engine':
+                            hidden_states, *local_moe_losses = transformer_engine.pytorch.checkpoint(
+                                custom(l, l + 1),
+                                self.distribute_saved_activations,
+                                tensor_parallel.get_cuda_rng_tracker,
+                                mpu.get_tensor_model_parallel_group(),
+                                hidden_states, attention_mask, encoder_output,
+                                enc_dec_attn_mask, **te_forward_kwargs)
+                        else:
+                            hidden_states, *local_moe_losses = tensor_parallel.checkpoint(
+                                custom(l, l + 1),
+                                self.distribute_saved_activations,
+                                hidden_states, attention_mask,
+                                encoder_output, enc_dec_attn_mask,
+                                None, None, None, None, rotary_pos_emb)
+                    else:
+                        if self.transformer_impl == 'transformer_engine':
+                            hidden_states, *local_moe_losses = custom(l, l + 1)(
+                                hidden_states, attention_mask, encoder_output,
+                                enc_dec_attn_mask, **te_forward_kwargs)
+                        else:
+                            hidden_states, *local_moe_losses = custom(l, l + 1)(
+                                hidden_states, attention_mask,
+                                encoder_output, enc_dec_attn_mask,
+                                None, None, None, None, rotary_pos_emb)
+                            
+                    moe_losses.extend(local_moe_losses)
+            else:
+                raise ValueError("Invalid activation recompute method.")
+            return hidden_states, moe_losses
+    return wrapper
+
+def parallel_transformer_forward_wrapper(fn):
+    @wraps(fn)
+    def wrapper(
+        self, hidden_states, attention_mask,
+        encoder_output=None, enc_dec_attn_mask=None,
+        retriever_input=None,
+        retriever_output=None,
+        retriever_attn_mask=None,
+        inference_params=None,
+        rotary_pos_emb=None,
+        position_ids=None):
+        # hidden_states: [s, b, h]
+
+        args = get_args()
+        # if not args.deepspeed:
+        #     return fn(self, hidden_states, attention_mask,
+        #             encoder_output=encoder_output, enc_dec_attn_mask=enc_dec_attn_mask,
+        #             retriever_input=retriever_input,
+        #             retriever_output=retriever_output,
+        #             retriever_attn_mask=retriever_attn_mask,
+        #             inference_params=inference_params,
+        #             rotary_pos_emb=rotary_pos_emb)
+
+        # Checks.
+        if inference_params:
+            assert self.recompute_granularity is None, \
+                'inference does not work with activation checkpointing'
+
+        if not self.pre_process:
+            # See set_input_tensor()
+            hidden_states = self.input_tensor
+
+        # Viewless tensor.
+        # - We only need to create a viewless tensor in the case of micro batch
+        #   size (mbs) == 1, since in this case, 'hidden_states.transpose()'
+        #   above creates a view tensor, and '.contiguous()' is a pass-through.
+        #   For mbs >= 2, '.contiguous()' creates a new tensor, eliminating
+        #   the need to make it viewless.
+        #
+        #   However, we don't explicitly check mbs == 1 here because
+        #   make_viewless_tensor() has negligible overhead when its input
+        #   is already viewless.
+        #
+        # - For the 'else' case above, calling make_viewless_tensor() here is
+        #   likely redundant, since p2p_communication.py (likely originator)
+        #   already creates viewless tensors. That said, make_viewless_tensor()
+        #   is called here to be future-proof and corner-case-proof.
+        hidden_states = core.utils.make_viewless_tensor(
+            hidden_states,
+            requires_grad=True,
+            keep_graph=True,
+        )
+
+        # RNG context.
+        if self.sequence_parallel and not inference_params:
+            rng_context = tensor_parallel.get_cuda_rng_tracker().fork()
+        else:
+            rng_context = nullcontext()
+
+        # Forward layers.
+        with rng_context:
+            # The fp8_autocast context manager is a no-op when enabled=True
+            # The if...else serves to short circuit name resolution for fp8_autocast
+            with transformer_engine.pytorch.fp8_autocast(
+                enabled=self.use_fp8,
+                fp8_recipe=self.fp8_recipe,
+                fp8_group=self.fp8_group
+            ) if self.use_fp8 else nullcontext():
+                # Determine if the current iteration is first microbatch
+                if self.num_microbatches_in_previous_step != get_num_microbatches():
+                    self.microbatch_count = 0 # Reset count on new batch size rampup interval
+                self.num_microbatches_in_previous_step = get_num_microbatches()
+                is_first_microbatch = self.microbatch_count % get_num_microbatches() == 0
+
+                # Forward pass.
+                moe_losses = []
+                if self.deepspeed and self.checkpoint_activations:
+                    # deepspeed use the args.checkpoint_activations
+                    hidden_states, moe_losses = self._checkpointed_forward(hidden_states,
+                                                                attention_mask,
+                                                                encoder_output,
+                                                                enc_dec_attn_mask,
+                                                                rotary_pos_emb,
+                                                                is_first_microbatch)
+                elif self.recompute_granularity == 'full':
+                    # megatron-lm use args.recompute_granularity and args.recompute_method
+                    hidden_states = self._checkpointed_forward(hidden_states,
+                                                                attention_mask,
+                                                                encoder_output,
+                                                                enc_dec_attn_mask,
+                                                                rotary_pos_emb,
+                                                                is_first_microbatch)
+                else:
+                    forward_kwargs = {
+                        'encoder_output': encoder_output,
+                        'enc_dec_attn_mask': enc_dec_attn_mask,
+                        'inference_params': inference_params,
+                    }
+
+                    if self.transformer_impl == 'transformer_engine':
+                        forward_kwargs['is_first_microbatch'] = is_first_microbatch
+                        forward_kwargs['checkpoint_core_attention'] = self.checkpoint_core_attention
+                        if self.transformer_engine_v_0_10:
+                            forward_kwargs['rotary_pos_emb'] = rotary_pos_emb
+                    else:
+                        forward_kwargs['rotary_pos_emb'] = rotary_pos_emb
+                        forward_kwargs['retriever_input'] = retriever_input
+                        forward_kwargs['retriever_output'] = retriever_output
+                        forward_kwargs['retriever_attn_mask'] = retriever_attn_mask
+                        forward_kwargs['position_ids'] = position_ids
+
+                    for index in range(self.num_layers):
+                        layer = self._get_layer(index)
+
+                        hidden_states = layer(
+                            hidden_states,
+                            attention_mask,
+                            **forward_kwargs)
+
+                        # First Retro decoder layer returns both hidden_states
+                        # and retriever_output. Make retriever_output available
+                        # to subsequence Retro layers.
+                        if isinstance(hidden_states, tuple):
+                            assert (len(hidden_states) == 2 or len(hidden_states) == 3)
+                            if len(hidden_states) == 2:
+                                if not self.ds_inference:
+                                    hidden_states, moe_loss = hidden_states
+                                    moe_losses.append(moe_loss)
+                            else:
+                                forward_kwargs["retriever_output"] = hidden_states[1]
+                                if not self.ds_inference:
+                                    hidden_states, _, moe_loss = hidden_states
+                                    moe_losses.append(moe_loss)
+
+                # Skip counter update for eval and activation checkpointing
+                if torch.is_grad_enabled() and self.training:
+                    self.microbatch_count += 1
+
+        # Final layer norm.
+        if self.post_process and self.post_norm:
+            hidden_states = self.final_norm(hidden_states)
+
+        if args.deepspeed:
+            return (hidden_states, *moe_losses)
+        else:
+            return hidden_states
+    return wrapper
+
+# class ParallelTransformer(MegatronModule):
+#     """Transformer class."""
+
+#     def __init__(self, config,
+#                  model_type, layer_type=LayerType.encoder,
+#                  self_attn_mask_type=AttnMaskType.padding,
+#                  post_norm=True,
+#                  pre_process=True,
+#                  post_process=True,
+#                  drop_path_rate=0.0):
+#         super(ParallelTransformer, self).__init__()
+#         args = get_args()
+
+#         self.layer_type = layer_type
+#         self.model_type = model_type
+#         self.bf16 = config.bf16
+#         self.fp32_residual_connection = config.fp32_residual_connection
+#         self.post_norm = post_norm
+#         self.pre_process = pre_process
+#         self.post_process = post_process
+#         self.input_tensor = None
+#         self.drop_path_rate = drop_path_rate
+#         self.transformer_impl = args.transformer_impl
+#         self.retro_add_retriever = args.retro_add_retriever
+#         self.ds_inference = args.ds_inference
+#         self.deepspeed = args.deepspeed
+
+#         # Store activation checkpoiting flag.
+#         self.checkpoint_activations = args.checkpoint_activations
+#         self.checkpoint_num_layers = args.checkpoint_num_layers
+#         self.recompute_granularity = config.recompute_granularity
+#         if args.recompute_method_per_stage != None:
+#             if args.virtual_pipeline_model_parallel_size != None:
+#                 if args.recompute_method_per_stage[mpu.get_virtual_pipeline_model_parallel_rank() * args.pipeline_model_parallel_size + mpu.get_pipeline_model_parallel_rank()] == 0:
+#                     self.recompute_method = 'uniform'
+#                 elif args.recompute_method_per_stage[mpu.get_virtual_pipeline_model_parallel_rank() * args.pipeline_model_parallel_size + mpu.get_pipeline_model_parallel_rank()] == 1:
+#                     self.recompute_method = 'block'
+#             else:
+#                 if args.recompute_method_per_stage[mpu.get_pipeline_model_parallel_rank()] == 0:
+#                     self.recompute_method = 'uniform'
+#                 elif args.recompute_method_per_stage[mpu.get_pipeline_model_parallel_rank()] == 1:
+#                     self.recompute_method = 'block'
+#         else:
+#             self.recompute_method = config.recompute_method
+#         if args.recompute_num_layers_per_stage != None:
+#             if args.virtual_pipeline_model_parallel_size != None:
+#                 self.recompute_num_layers = args.recompute_num_layers_per_stage[mpu.get_virtual_pipeline_model_parallel_rank() * args.pipeline_model_parallel_size + mpu.get_pipeline_model_parallel_rank()]
+#             else:
+#                 self.recompute_num_layers = args.recompute_num_layers_per_stage[mpu.get_pipeline_model_parallel_rank()]
+#         else:
+#             self.recompute_num_layers = config.recompute_num_layers
+#         self.distribute_saved_activations = \
+#             config.distribute_saved_activations and not config.sequence_parallel
+
+#         self.sequence_parallel = config.sequence_parallel
+
+#         # Transformer Engine Init.
+#         self.transformer_engine_rope_available = False
+#         self.transformer_engine_v_0_10 = False
+#         self.transformer_engine_v_0_11 = False
+#         self.transformer_engine_v_0_8 = False
+#         self.use_ixte = False
+#         if self.transformer_impl == 'transformer_engine':
+#             global transformer_engine
+#             import transformer_engine
+#             from importlib.metadata import version
+#             from pkg_resources import packaging
+
+#             if ixte_extensions._USE_IXTE:
+#                 te_version = packaging.version.Version(ixte_extensions.te_version())
+#                 self.use_ixte = True
+#             else:
+#                 te_version = packaging.version.Version(version("transformer-engine"))
+#             if te_version >= packaging.version.Version("0.8.0"):
+#                 self.transformer_engine_v_0_8 = True
+#             if te_version >= packaging.version.Version("0.10.0"):
+#                 self.transformer_engine_v_0_10 = True
+#             if te_version >= packaging.version.Version("0.11.0"):
+#                 self.transformer_engine_v_0_11 = True
+#             if te_version >= packaging.version.Version("0.10.0"):
+#                 self.transformer_engine_rope_available = True
+
+#             del version, packaging
+
+#             assert not args.squared_relu, "TransformerEngine does not support squared relu activation."
+
+#         self.use_fp8 = args.fp8 is not None
+#         self.fp8_recipe = None
+#         self.fp8_group = None
+#         if self.use_fp8:
+#             assert args.transformer_impl == 'transformer_engine', \
+#                 'transformer-engine required for fp8 training and inference'
+#             self.fp8_group = mpu.get_amax_reduction_group()
+#             if args.fp8 == "e4m3":
+#                 fp8_format = transformer_engine.common.recipe.Format.E4M3
+#             elif args.fp8 == "hybrid":
+#                 fp8_format = transformer_engine.common.recipe.Format.HYBRID
+#             else:
+#                 raise ValueError("The DelayedScaling recipe only supports E4M3 and HYBRID formats.")
+#             self.fp8_recipe = transformer_engine.common.recipe.DelayedScaling(
+#                 margin=args.fp8_margin,
+#                 interval=args.fp8_interval,
+#                 fp8_format=fp8_format,
+#                 amax_history_len=args.fp8_amax_history_len,
+#                 amax_compute_algo=args.fp8_amax_compute_algo,
+#                 override_linear_precision=(False, False, not args.fp8_wgrad),
+#             )
+
+#         self.num_microbatches_in_previous_step = -1
+#         self.microbatch_count = 0
+#         self.checkpoint_core_attention = config.recompute_granularity == 'selective'
+
+#         # Number of layers.
+#         self.num_layers = _get_num_layers(args, model_type,
+#                                           layer_type==LayerType.decoder)
+
+#         self.drop_path_rates = [
+#             rate.item() for rate in
+#             torch.linspace(0, self.drop_path_rate, config.num_layers)]
+
+#         self.retro_layer_numbers = None
+#         if model_type == ModelType.retro_decoder:
+#             retro_layer_start = 6 if config.num_layers <= 15 else 9
+#             self.retro_layer_numbers = \
+#                 np.arange(retro_layer_start, args.num_layers + 1, 3).tolist()
+#         if model_type == ModelType.retro_encoder:
+#             self.retro_layer_numbers = [1]
+
+#         # Transformer layers.
+#         if args.retro_add_retriever:
+#             assert self.recompute_granularity != 'full', \
+#                 "Full recompute not supported for Retro."
+#             assert args.transformer_impl == 'local', \
+#                 "Transformer engine does not support Retro layers."
+#         def build_layer(layer_number, n_e=1):
+#             if args.transformer_impl == 'local':
+#                 current_layer_type = _get_layer_type(
+#                     model_type, layer_type, self.retro_layer_numbers,
+#                     layer_number)
+#                 return ParallelTransformerLayer(
+#                     config,
+#                     layer_number,
+#                     layer_type=current_layer_type,
+#                     self_attn_mask_type=self_attn_mask_type,
+#                     drop_path_rate=self.drop_path_rates[layer_number - 1],
+#                     num_experts=n_e)
+#             else:
+#                 # This argument is only available from TE v0.10 onwards.
+#                 extra_transformer_engine_kwargs = {}
+#                 if self.transformer_engine_v_0_8:
+#                     extra_transformer_engine_kwargs["bias"] = args.add_bias_linear
+#                 if self.transformer_engine_v_0_10:
+#                     extra_transformer_engine_kwargs["activation"] = "swiglu" if args.swiglu else "gelu"
+#                 if self.transformer_engine_v_0_11:
+#                     extra_transformer_engine_kwargs["normalization"] = args.normalization
+#                 if not ixte_extensions._USE_IXTE:
+#                     assert config.attention_softmax_in_fp32, "TransformerEngine only supports softmax compute in FP32."
+#                 if self.use_ixte:
+#                     extra_transformer_engine_kwargs["use_alibi"] = args.position_embedding_type == "alibi"
+#                 assert (
+#                     (bool(int(os.getenv("NVTE_APPLY_QK_LAYER_SCALING", "0"))) and args.fp16) == config.apply_query_key_layer_scaling
+#                 ), ("Unsupported config for apply_query_key_layer_scaling in TransformerEngine. If --apply-query-key-layer-scaling is "
+#                     "provided, set env-var NVTE_APPLY_QK_LAYER_SCALING=1 and you must be using fp16.")
+#                 layer_tmp = transformer_engine.pytorch.TransformerLayer(
+#                     config.hidden_size,
+#                     config.ffn_hidden_size,
+#                     config.num_attention_heads,
+#                     num_gqa_groups=config.num_query_groups,
+#                     layernorm_epsilon=config.layernorm_epsilon,
+#                     hidden_dropout=config.hidden_dropout,
+#                     attention_dropout=config.attention_dropout,
+#                     init_method=config.init_method,
+#                     output_layer_init_method=config.output_layer_init_method,
+#                     layer_number=layer_number,
+#                     kv_channels=config.kv_channels,
+#                     self_attn_mask_type=self_attn_mask_type.name,
+#                     tp_group=mpu.get_tensor_model_parallel_group() if mpu.is_initialized() else None,
+#                     tp_size=mpu.get_tensor_model_parallel_world_size(),
+#                     get_rng_state_tracker=get_cuda_rng_tracker
+#                     if get_cuda_rng_tracker().is_initialized()
+#                     else None,
+#                     fuse_wgrad_accumulation=config.gradient_accumulation_fusion,
+#                     seq_length=args.seq_length,
+#                     micro_batch_size=args.micro_batch_size,
+#                     sequence_parallel=config.sequence_parallel,
+#                     params_dtype=config.params_dtype,
+#                     apply_residual_connection_post_layernorm=config.apply_residual_connection_post_layernorm,
+#                     output_layernorm=False,
+#                     layer_type="encoder",
+#                     drop_path_rate=self.drop_path_rates[layer_number - 1],
+#                     set_parallel_mode=True,
+#                     fuse_qkv_params=True,
+#                     **extra_transformer_engine_kwargs)
+#                 return layer_tmp
+
+#         if config.virtual_pipeline_model_parallel_size is not None:
+#             if args.num_layers_per_stage is None:
+#                 assert config.num_layers % config.virtual_pipeline_model_parallel_size == 0, \
+#                     'num_layers_per_stage must be divisible by ' \
+#                     'virtual_pipeline_model_parallel_size'
+#                 assert args.model_type != ModelType.encoder_and_decoder
+#                 # Number of layers in each model chunk is the number of layers in the stage,
+#                 # divided by the number of model chunks in a stage.
+#                 self.num_layers = self.num_layers // config.virtual_pipeline_model_parallel_size
+#                 # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of
+#                 # layers to stages like (each list is a model chunk):
+#                 # Stage 0: [0]  [2]  [4]  [6]
+#                 # Stage 1: [1]  [3]  [5]  [7]
+#                 # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of
+#                 # layers to stages like (each list is a model chunk):
+#                 # Stage 0: [0, 1]  [4, 5]
+#                 # Stage 1: [2, 3]  [6, 7]
+#                 offset = mpu.get_virtual_pipeline_model_parallel_rank() * (
+#                     config.num_layers // config.virtual_pipeline_model_parallel_size) + \
+#                     (mpu.get_pipeline_model_parallel_rank() * self.num_layers)
+#             else:
+#                 offset_list = [0] * len(args.num_layers_per_stage)
+#                 for i in range(len(args.num_layers_per_stage)):
+#                     for j in range(i):
+#                         offset_list[i] += args.num_layers_per_stage[j]
+#                 offset = offset_list[mpu.get_virtual_pipeline_model_parallel_rank() * mpu.get_pipeline_model_parallel_world_size() + mpu.get_pipeline_model_parallel_rank()]
+#         else:
+#             # Each stage gets a contiguous set of layers.
+#             if args.model_type == ModelType.encoder_and_decoder and \
+#                     mpu.get_pipeline_model_parallel_world_size() > 1:
+#                 pipeline_rank = mpu.get_pipeline_model_parallel_rank()
+#                 if layer_type == LayerType.encoder:
+#                     offset = pipeline_rank * self.num_layers
+#                 else:
+#                     num_ranks_in_enc = args.pipeline_model_parallel_split_rank
+#                     offset = (pipeline_rank - num_ranks_in_enc) * self.num_layers
+#             else:
+#                 if args.num_layers_per_stage is not None:
+#                     offset_list = [0] * len(args.num_layers_per_stage)
+#                     for i in range(len(args.num_layers_per_stage)):
+#                         for j in range(i):
+#                             offset_list[i] += args.num_layers_per_stage[j]
+#                     offset = offset_list[mpu.get_pipeline_model_parallel_rank()]
+#                 else:
+#                     offset = mpu.get_pipeline_model_parallel_rank() * self.num_layers
+
+#         if self.num_layers == 0:
+#             # When a standalone embedding stage is used (e.g.,
+#             # args.standalone_embedding_stage == True), virtual pipeline ranks
+#             # on pipeline rank 0 will have zero transformer layers assigned to
+#             # them. This results in the model's input and output tensors to be
+#             # the same, which will cause failure for certain output tensor
+#             # optimizations (e.g., pipeline output deallocation). To remedy
+#             # this, we assign a 'no-op' layer on these ranks, which will
+#             # disconnect the input tensor from the output tensor.
+#             self.num_layers = 1
+#             self.layers = torch.nn.ModuleList([ NoopTransformerLayer(1) ])
+#         else:
+#             # Build the layers
+#             if not args.deepspeed:
+#                 self.layers = torch.nn.ModuleList(
+#                     [build_layer(i + 1 + offset) for i in range(self.num_layers)])
+#             else:
+#                 self.layers = []
+#                 num_experts = args.ds_num_experts
+#                 experts_per_layer = get_num_experts_per_layer(num_experts, self.num_layers, args.expert_interval, offset)
+#                 for i in range(self.num_layers):
+#                     layer_num = i + 1 + offset
+#                     n_e = experts_per_layer[i]
+#                     self.layers.append(build_layer(layer_num, n_e))
+#                 self.layers = torch.nn.ModuleList(self.layers)
+
+#             # Update dropout rate for Retro encoder.
+#             if model_type == ModelType.retro_encoder:
+#                 for layer in self.layers:
+#                     if layer.self_attention.use_flash_attn:
+#                         layer.self_attention.core_attention_flash.dropout_p = \
+#                             torch.nn.Dropout(args.retro_encoder_attention_dropout)
+#                     else:
+#                         layer.self_attention.core_attention.attention_dropout.p =\
+#                             args.retro_encoder_attention_dropout
+#                     layer.hidden_dropout = args.retro_encoder_hidden_dropout
+
+#         if self.post_process and self.post_norm:
+#             # Final layer norm before output.
+#             self.final_norm = get_norm(config)
+
+#     def _get_layer(self, layer_number):
+#         return self.layers[layer_number]
+
+#     def _checkpointed_forward(self, hidden_states, attention_mask,
+#                               encoder_output, enc_dec_attn_mask,
+#                               rotary_pos_emb, is_first_microbatch):
+#         args = get_args()
+
+#         """Forward method with activation checkpointing."""
+#         def custom(start, end):
+#             def custom_forward(*args, **kwargs):
+#                 x_, *args = args
+#                 moe_losses = []
+#                 for index in range(start, end):
+#                     # Is recompute last layer
+#                     # Network last layer also can be optimized, because vocab gemm always save forward tenor for backward!
+#                     if self.transformer_impl == 'transformer_engine' and ixte_extensions._USE_IXTE:
+#                         kwargs["is_recompute_lastlayer"] = index == end - 1
+#                     layer = self._get_layer(index)
+#                     output = layer(x_, *args, **kwargs)
+#                     if isinstance(output, tuple):
+#                         x_, moe_loss = output
+#                     else:
+#                         x_ = output
+#                         moe_loss = torch.tensor(0.0, device=x_.device, dtype=x_.dtype, requires_grad=True)
+#                     moe_losses.append(moe_loss)
+#                 return (x_, *moe_losses)
+#             return custom_forward
+        
+#         if args.deepspeed and args.deepspeed_activation_checkpointing:
+#             moe_losses = []
+#             # Make sure memory is freed.
+#             tensor_parallel.reset_checkpointed_activations_memory_buffer()
+#             l = 0
+#             while l < self.num_layers:
+#                 hidden_states, *local_moe_losses = tensor_parallel.checkpoint(
+#                     custom(l, l + self.checkpoint_num_layers), False,
+#                     hidden_states, attention_mask, encoder_output, enc_dec_attn_mask,
+#                     None, None, None, None, rotary_pos_emb)
+#                 moe_losses.extend(local_moe_losses)
+#                 l += self.checkpoint_num_layers
+
+#             return hidden_states, moe_losses
+#         else:
+#             moe_losses = []
+#             te_forward_kwargs = {}
+#             if self.transformer_impl == 'transformer_engine':
+#                 te_forward_kwargs['is_first_microbatch'] = is_first_microbatch
+#                 if self.transformer_engine_v_0_10:
+#                     te_forward_kwargs['rotary_pos_emb'] = rotary_pos_emb
+
+#             if self.recompute_method == 'uniform':
+#                 # Uniformly divide the total number of Transformer layers and
+#                 # checkpoint the input activation of each divided chunk.
+#                 # A method to further reduce memory usage reducing checkpoints.
+#                 l = 0
+#                 while l < self.num_layers:
+#                     if self.transformer_impl == 'transformer_engine':
+#                         hidden_states, *local_moe_losses = transformer_engine.pytorch.checkpoint(
+#                             custom(l, l + self.recompute_num_layers),
+#                             self.distribute_saved_activations,
+#                             tensor_parallel.get_cuda_rng_tracker,
+#                             mpu.get_tensor_model_parallel_group(),
+#                             hidden_states, attention_mask, encoder_output,
+#                             enc_dec_attn_mask, **te_forward_kwargs)
+#                     else:
+#                         hidden_states, *local_moe_losses = tensor_parallel.checkpoint(
+#                             custom(l, l + self.recompute_num_layers),
+#                             self.distribute_saved_activations,
+#                             hidden_states, attention_mask,
+#                             encoder_output, enc_dec_attn_mask,
+#                             None, None, None, None, rotary_pos_emb)
+#                     moe_losses.extend(local_moe_losses)
+#                     l += self.recompute_num_layers
+#             elif self.recompute_method == 'block':
+#                 # Checkpoint the input activation of only a set number of individual
+#                 # Transformer layers and skip the rest.
+#                 # A method fully use the device memory removing redundant re-computation.
+#                 for l in range(self.num_layers):
+#                     if l < self.recompute_num_layers:
+#                         if self.transformer_impl == 'transformer_engine':
+#                             hidden_states, *local_moe_losses = transformer_engine.pytorch.checkpoint(
+#                                 custom(l, l + 1),
+#                                 self.distribute_saved_activations,
+#                                 tensor_parallel.get_cuda_rng_tracker,
+#                                 mpu.get_tensor_model_parallel_group(),
+#                                 hidden_states, attention_mask, encoder_output,
+#                                 enc_dec_attn_mask, **te_forward_kwargs)
+#                         else:
+#                             hidden_states, *local_moe_losses = tensor_parallel.checkpoint(
+#                                 custom(l, l + 1),
+#                                 self.distribute_saved_activations,
+#                                 hidden_states, attention_mask,
+#                                 encoder_output, enc_dec_attn_mask,
+#                                 None, None, None, None, rotary_pos_emb)
+#                     else:
+#                         if self.transformer_impl == 'transformer_engine':
+#                             hidden_states, *local_moe_losses = custom(l, l + 1)(
+#                                 hidden_states, attention_mask, encoder_output,
+#                                 enc_dec_attn_mask, **te_forward_kwargs)
+#                         else:
+#                             hidden_states, *local_moe_losses = custom(l, l + 1)(
+#                                 hidden_states, attention_mask,
+#                                 encoder_output, enc_dec_attn_mask,
+#                                 None, None, None, None, rotary_pos_emb)
+                            
+#                     moe_losses.extend(local_moe_losses)
+#             else:
+#                 raise ValueError("Invalid activation recompute method.")
+#             return hidden_states, moe_losses
+
+#     def set_input_tensor(self, input_tensor):
+#         """Set input tensor to be used instead of forward()'s input.
+
+#         When doing pipeline parallelism the input from the previous
+#         stage comes from communication, not from the input, so the
+#         model's forward_step_func won't have it. This function is thus
+#         used by internal code to bypass the input provided by the
+#         forward_step_func"""
+#         self.input_tensor = input_tensor
+
+#     def forward(self, hidden_states, attention_mask,
+#                 encoder_output=None, enc_dec_attn_mask=None,
+#                 retriever_input=None,
+#                 retriever_output=None,
+#                 retriever_attn_mask=None,
+#                 inference_params=None,
+#                 rotary_pos_emb=None):
+#         # hidden_states: [s, b, h]
+
+#         # Checks.
+#         if inference_params:
+#             assert self.recompute_granularity is None, \
+#                 'inference does not work with activation checkpointing'
+
+#         if not self.pre_process:
+#             # See set_input_tensor()
+#             hidden_states = self.input_tensor
+
+#         # Viewless tensor.
+#         # - We only need to create a viewless tensor in the case of micro batch
+#         #   size (mbs) == 1, since in this case, 'hidden_states.transpose()'
+#         #   above creates a view tensor, and '.contiguous()' is a pass-through.
+#         #   For mbs >= 2, '.contiguous()' creates a new tensor, eliminating
+#         #   the need to make it viewless.
+#         #
+#         #   However, we don't explicitly check mbs == 1 here because
+#         #   make_viewless_tensor() has negligible overhead when its input
+#         #   is already viewless.
+#         #
+#         # - For the 'else' case above, calling make_viewless_tensor() here is
+#         #   likely redundant, since p2p_communication.py (likely originator)
+#         #   already creates viewless tensors. That said, make_viewless_tensor()
+#         #   is called here to be future-proof and corner-case-proof.
+#         hidden_states = core.utils.make_viewless_tensor(
+#             hidden_states,
+#             requires_grad=True,
+#             keep_graph=True,
+#         )
+
+#         # RNG context.
+#         if self.sequence_parallel:
+#             rng_context = tensor_parallel.get_cuda_rng_tracker().fork()
+#         else:
+#             rng_context = nullcontext()
+
+#         # Forward layers.
+#         with rng_context:
+#             # The fp8_autocast context manager is a no-op when enabled=True
+#             # The if...else serves to short circuit name resolution for fp8_autocast
+#             with transformer_engine.pytorch.fp8_autocast(
+#                 enabled=self.use_fp8,
+#                 fp8_recipe=self.fp8_recipe,
+#                 fp8_group=self.fp8_group
+#             ) if self.use_fp8 else nullcontext():
+#                 # Determine if the current iteration is first microbatch
+#                 if self.num_microbatches_in_previous_step != get_num_microbatches():
+#                     self.microbatch_count = 0 # Reset count on new batch size rampup interval
+#                 self.num_microbatches_in_previous_step = get_num_microbatches()
+#                 is_first_microbatch = self.microbatch_count % get_num_microbatches() == 0
+
+#                 # Forward pass.
+#                 moe_losses = []
+#                 if self.deepspeed and self.checkpoint_activations:
+#                     hidden_states, moe_losses = self._checkpointed_forward(hidden_states,
+#                                                                attention_mask,
+#                                                                encoder_output,
+#                                                                enc_dec_attn_mask,
+#                                                                rotary_pos_emb,
+#                                                                is_first_microbatch)
+#                 elif self.recompute_granularity == 'full':
+#                     hidden_states, moe_losses = self._checkpointed_forward(hidden_states,
+#                                                                attention_mask,
+#                                                                encoder_output,
+#                                                                enc_dec_attn_mask,
+#                                                                rotary_pos_emb,
+#                                                                is_first_microbatch)
+#                 else:
+#                     forward_kwargs = {
+#                         'encoder_output': encoder_output,
+#                         'enc_dec_attn_mask': enc_dec_attn_mask,
+#                         'inference_params': inference_params,
+#                     }
+
+#                     if self.transformer_impl == 'transformer_engine':
+#                         forward_kwargs['is_first_microbatch'] = is_first_microbatch
+#                         forward_kwargs['checkpoint_core_attention'] = self.checkpoint_core_attention
+#                         if self.transformer_engine_v_0_10:
+#                             forward_kwargs['rotary_pos_emb'] = rotary_pos_emb
+#                     else:
+#                         forward_kwargs['rotary_pos_emb'] = rotary_pos_emb
+#                         forward_kwargs['retriever_input'] = retriever_input
+#                         forward_kwargs['retriever_output'] = retriever_output
+#                         forward_kwargs['retriever_attn_mask'] = retriever_attn_mask
+
+#                     for index in range(self.num_layers):
+#                         layer = self._get_layer(index)
+
+#                         hidden_states = layer(
+#                             hidden_states,
+#                             attention_mask,
+#                             **forward_kwargs)
+
+#                         # First Retro decoder layer returns both hidden_states
+#                         # and retriever_output. Make retriever_output available
+#                         # to subsequence Retro layers.
+#                         if isinstance(hidden_states, tuple):
+#                             assert (len(hidden_states) == 2 or len(hidden_states) == 3)
+#                             if len(hidden_states) == 2:
+#                                 if not self.ds_inference:
+#                                     hidden_states, moe_loss = hidden_states
+#                                     moe_losses.append(moe_loss)
+#                             else:
+#                                 forward_kwargs["retriever_output"] = hidden_states[1]
+#                                 if not self.ds_inference:
+#                                     hidden_states, _, moe_loss = hidden_states
+#                                     moe_losses.append(moe_loss)
+
+#                 # Skip counter update for eval and activation checkpointing
+#                 if torch.is_grad_enabled() and self.training:
+#                     self.microbatch_count += 1
+
+#         # Final layer norm.
+#         if self.post_process and self.post_norm:
+#             hidden_states = self.final_norm(hidden_states)
+
+#         return (hidden_states, *moe_losses)
+
+#     def load_state_dict(self, state_dict, strict=True):
+#         """Customize load."""
+
+#         # Handle renaming layernorm -> norm in component names
+#         state_dict_ = {}
+#         for key in state_dict.keys():
+#             # Bypass TransformerEngine module parameters.
+#             if "layernorm_qkv" in key or "layernorm_mlp" in key:
+#                 state_dict_[key] = state_dict[key]
+#                 continue
+#             newkey = key.replace("layernorm", "norm")
+#             state_dict_[newkey] = state_dict[key]
+
+#         super().load_state_dict(state_dict_, strict)
+
+class LMHeadPipe(MegatronModule):
+    """
+    Arguments:
+        vocab_size: size of vocabulary.
+        hidden_size: hidden size
+        gather_output: wether output logits being gathered or not.
+        init_method: init method for weight initialization
+        config:
+    """
+
+    def __init__(self, hidden_size, vocab_size, config):
+        args = get_args()
+        super(LMHeadPipe, self).__init__()
+        self.lm_head = tensor_parallel.ColumnParallelLinear(input_size=hidden_size,
+                                                            output_size=vocab_size,
+                                                            bias=False,
+                                                            config=config,
+                                                            init_method=config.init_method,)
+
+    def forward(self, inputs, **kwargs):
+        assert torch.is_tensor(inputs) or isinstance(inputs, tuple)
+        if isinstance(inputs, tuple):
+            hidden_states = inputs[0]
+        else:
+            hidden_states = inputs
+
+        if not hasattr(self, '_args'):
+            self._args = get_args()
+
+        if hasattr(self._args, 'attn_mask'):
+            attention_mask = None
+        else:
+            attention_mask = inputs[1]
+
+        logits, _ = self.lm_head(hidden_states)
+
+        # If cmd args has attn_mask, we don't forward it as an activation.
+        if hasattr(self._args, 'attn_mask'):
+            return logits
+        else:
+            return logits, attention_mask
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/model/utils.py b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/utils.py
similarity index 35%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/model/utils.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/utils.py
index 09eca92b6afe7afbcd0bc6930f039ab0a77940f4..134736be455c06c75ff871e3d709a1a95f340a29 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/model/utils.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/legacy/model/utils.py
@@ -1,38 +1,19 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
 """Utilities for models."""
 
 import math
 
 import torch
 
-from megatron_ds import get_args
-from megatron_ds.model import LayerNorm, RMSNorm, RMSNormResidual
+from megatron.training import get_args
+from megatron.legacy.model import LayerNorm, RMSNorm, RMSNormResidual
+from megatron.core.jit import jit_fuser
 
 from deepspeed.runtime.zero import GatheredParameters
 
-def init_method_normal(sigma):
-    """Init method based on N(0, sigma)."""
-    def init_(tensor):
-        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
-
-    return init_
-
-
-def scaled_init_method_normal(sigma, num_layers):
-    """Init method based on N(0, sigma/sqrt(2*num_layers)."""
-    std = sigma / math.sqrt(2.0 * num_layers)
-
-    def init_(tensor):
-        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
-
-    return init_
-
 
 def gather_and_init(param, init_method):
     with GatheredParameters(param, modifier_rank=0):
         init_method(param)
-        
 
 def attention_mask_func(attention_scores, attention_mask):
     args = get_args()
@@ -47,7 +28,6 @@ def attention_mask_func(attention_scores, attention_mask):
         attention_scores.masked_fill_(attention_mask, -10000.0)
     return attention_scores
 
-
 def get_linear_layer(rows, columns, init_method, gather_params_on_init=False):
     """Simple linear layer with weight initialization."""
     layer = torch.nn.Linear(rows, columns)
@@ -55,48 +35,6 @@ def get_linear_layer(rows, columns, init_method, gather_params_on_init=False):
         with GatheredParameters(layer.weight, modifier_rank=0, enabled=gather_params_on_init):
             init_method(layer.weight)
     with torch.no_grad():
-        with GatheredParameters(layer.bias, modifier_rank=0, enabled=gather_params_on_init):
+        with GatheredParameters(layer.weight, modifier_rank=0, enabled=gather_params_on_init):
             layer.bias.zero_()
     return layer
-
-@torch.jit.script
-def gelu_impl(x):
-    """OpenAI's gelu implementation."""
-    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
-                                       (1.0 + 0.044715 * x * x)))
-def openai_gelu(x):
-    return gelu_impl(x)
-
-#This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter
-@torch.jit.script
-def erf_gelu(x):
-    return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype))
-
-
-def get_norm(config):
-    args = get_args()
-    if args.normalization == "LayerNorm":
-        return LayerNorm(
-            config.hidden_size,
-            eps=config.layernorm_epsilon,
-            no_persist_layer_norm=not config.persist_layer_norm,
-            sequence_parallel=config.sequence_parallel,
-            apply_layernorm_1p=args.apply_layernorm_1p)
-    elif args.normalization == "RMSNorm":
-        if args.apply_layernorm_1p:
-            raise NotImplementedError('RMSNorm does not currently support the layernorm_1p formulation.')
-
-        return RMSNorm(dim=config.hidden_size,
-                       eps=config.layernorm_epsilon,
-                       sequence_parallel=config.sequence_parallel)
-    else:
-        raise Exception(f"unsupported norm type '{args.normalization}'.")
-def get_rmsnorm_residual(config):
-    args = get_args()
-    return RMSNormResidual(
-        config.hidden_size,
-        eps=config.layernorm_epsilon,
-        no_persist_layer_norm=not config.persist_layer_norm,
-        sequence_parallel=config.sequence_parallel,
-        apply_layernorm_1p=args.apply_layernorm_1p,
-        apply_layernorm_rms = True)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/log_handler.py b/toolbox/Megatron-DeepSpeed/megatronspeed/log_handler.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/log_handler.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/log_handler.py
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/megatron_adaptor.py b/toolbox/Megatron-DeepSpeed/megatronspeed/megatron_adaptor.py
new file mode 100644
index 0000000000000000000000000000000000000000..de1207bae5bf2d9a461549ca5683d27658ed978f
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/megatron_adaptor.py
@@ -0,0 +1,319 @@
+import sys
+import argparse
+from functools import wraps
+import torch
+from torch.distributed import all_gather_into_tensor, reduce_scatter_tensor
+from megatronspeed.training.arguments import process_args
+
+_ARGS = None
+
+IS_ADAPTED = False
+
+def add_args(args, key, value):
+    if key is not None:
+        key = key[2:].replace('-', '_')
+        if value is None:
+            value = True
+        elif len(value) == 1:
+            value = value[0]
+        setattr(args, key, value)
+
+
+def parser_unknown_args(args, unknown):
+    i = 0
+    key = value = None
+    while i < len(unknown):
+        if unknown[i].startswith("--"):
+            add_args(args, key, value)
+            key = unknown[i]
+            value = None
+        else:
+            if value is None:
+                value = [unknown[i]]
+            else:
+                value.append(unknown[i])
+        i += 1
+    add_args(args, key, value)
+
+
+def version_wrapper(fn):
+    @wraps(fn)
+    def wrapper(name, *args, **kwargs):
+        if name == 'transformer-engine':
+            return '0.0'
+        res = fn(name, *args, **kwargs)
+        return res
+
+    return wrapper
+
+
+def te_adaptation(aspm):
+    # Need replace modules before import megatron
+    # aspm.register_patch('importlib.metadata.version', version_wrapper)
+    pass
+
+
+def apex_adaptation(aspm):
+    pass
+
+
+def torch_adaptation(aspm):
+    aspm.register_patch('torch.distributed._all_gather_base', all_gather_into_tensor)
+    aspm.register_patch('torch.distributed._reduce_scatter_base', reduce_scatter_tensor)
+
+
+def mcore_models_adaptation(aspm):
+    from .core.utils import get_model_config
+
+    aspm.register_patch('megatron.core.utils.get_model_config', get_model_config)
+
+
+def preparation_adaption(aspm):
+    from .training.global_vars import get_rlhf_args, set_rlhf_args
+
+    aspm.register_patch('megatron.training.global_vars.get_rlhf_args', get_rlhf_args)
+    aspm.register_patch('megatron.training.global_vars.set_rlhf_args', set_rlhf_args)
+    aspm.register_patch('megatron.training.get_rlhf_args', get_rlhf_args)
+    aspm.register_patch('megatron.training.set_rlhf_args', set_rlhf_args)
+
+
+def mcore_tensor_parallel_adaptation(aspm):
+    from .core.tensor_parallel.random import init_checkpointed_activations_memory_buffer, reset_checkpointed_activations_memory_buffer, \
+        get_cuda_rng_tracker, model_parallel_cuda_manual_seed, model_parallel_reconfigure_tp_seed, checkpoint, \
+        checkpoint_function_forward, checkpoint_function_backward
+    from .core.tensor_parallel.layers import linear_with_grad_accumulation_and_async_allreduce_forward, linear_with_grad_accumulation_and_async_allreduce_backward, \
+        linear_with_grad_accumulation_and_async_allreduce, SequenceParallelPositionEmbedding, column_parallel_linear_init, column_parallel_linear_forward, \
+        row_parallel_linear_init, row_parallel_linear_forward
+    from .core.tensor_parallel.data import _build_key_size_numel_dictionaries, broadcast_data
+
+    aspm.register_patch('megatron.core.tensor_parallel.random.init_checkpointed_activations_memory_buffer', init_checkpointed_activations_memory_buffer)
+    aspm.register_patch('megatron.core.tensor_parallel.random.reset_checkpointed_activations_memory_buffer', reset_checkpointed_activations_memory_buffer)
+    aspm.register_patch('megatron.core.tensor_parallel.random.get_cuda_rng_tracker', get_cuda_rng_tracker)
+    aspm.register_patch('megatron.core.tensor_parallel.random.model_parallel_cuda_manual_seed', model_parallel_cuda_manual_seed)
+    aspm.register_patch('megatron.core.tensor_parallel.random.model_parallel_reconfigure_tp_seed', model_parallel_reconfigure_tp_seed)
+    aspm.register_patch('megatron.core.tensor_parallel.random.CheckpointFunction.forward', checkpoint_function_forward)
+    aspm.register_patch('megatron.core.tensor_parallel.random.CheckpointFunction.backward', checkpoint_function_backward)
+    aspm.register_patch('megatron.core.tensor_parallel.random.checkpoint', checkpoint)
+    aspm.register_patch('megatron.core.tensor_parallel.layers.LinearWithGradAccumulationAndAsyncCommunication.forward',
+                        linear_with_grad_accumulation_and_async_allreduce_forward)
+    aspm.register_patch('megatron.core.tensor_parallel.layers.LinearWithGradAccumulationAndAsyncCommunication.backward',
+                        linear_with_grad_accumulation_and_async_allreduce_backward)
+    aspm.register_patch('megatron.core.tensor_parallel.layers.linear_with_grad_accumulation_and_async_allreduce', linear_with_grad_accumulation_and_async_allreduce)
+    aspm.register_patch('megatron.core.tensor_parallel.layers.SequenceParallelPositionEmbedding', SequenceParallelPositionEmbedding)
+    aspm.register_patch('megatron.core.tensor_parallel.layers.ColumnParallelLinear.__init__', column_parallel_linear_init)
+    aspm.register_patch('megatron.core.tensor_parallel.layers.ColumnParallelLinear.forward', column_parallel_linear_forward)
+    aspm.register_patch('megatron.core.tensor_parallel.layers.RowParallelLinear.__init__', row_parallel_linear_init)
+    aspm.register_patch('megatron.core.tensor_parallel.layers.RowParallelLinear.forward', row_parallel_linear_forward)
+    aspm.register_patch('megatron.core.tensor_parallel.data._build_key_size_numel_dictionaries', _build_key_size_numel_dictionaries)
+    aspm.register_patch('megatron.core.tensor_parallel.data.broadcast_data', broadcast_data)
+
+
+def mcore_pipeline_parallel_adaptation(aspm):
+    from .core.pipeline_parallel.schedules import backward_step, forward_backward_no_pipelining, forward_backward_pipelining_without_interleaving
+
+    aspm.register_patch('megatron.core.pipeline_parallel.schedules.backward_step', backward_step)
+    aspm.register_patch('megatron.core.pipeline_parallel.schedules.forward_backward_no_pipelining', forward_backward_no_pipelining)
+    aspm.register_patch('megatron.core.pipeline_parallel.schedules.forward_backward_pipelining_without_interleaving',
+                        forward_backward_pipelining_without_interleaving)
+
+
+def mcore_transformer_adaptation(aspm):
+    from .core.transformer.utils import get_linear_layer
+
+    aspm.register_patch('megatron.core.transformer.utils.get_linear_layer', get_linear_layer)
+
+def legacy_model_transformer(aspm):
+    from .legacy.model.module import megatron_module_universal_checkpoint_info
+    from .legacy.model.utils import gather_and_init, attention_mask_func, get_linear_layer
+    from .legacy.model.transformer import parallel_mlp_init, parallel_mlp_forward_wrapper, core_attention_init, flash_selfattention_forward_wrapper, parallel_attention_init, \
+        parallel_attention_forward_wrapper, parallel_transformer_layer_init, parallel_transformer_layer_forward_wrapper, parallel_transformer_init, \
+        parallel_transformer__checkpointed_forward_wrapper, parallel_transformer_forward_wrapper
+    from .legacy.model.realm_model import IREncoderBertModel
+    from .legacy.model.multiple_choice import MultipleChoice
+    from .legacy.model.language_model import parallel_lm_logits, get_language_model, pooler_init, embedding_init, embedding_forward, transformer_language_model_init, \
+        transformer_language_model_forward_wrapper, transformer_language_model_state_dict_for_save_checkpoint, transformer_language_model_load_state_dict
+    from .legacy.model.gpt_model import post_language_model_processing, gpt_model_init, gpt_model_forward_wrapper, gpt_model_state_dict_for_save_checkpoint, \
+        gpt_model_load_state_dict, gpt_model_universal_checkpoint_info, GPTModelPipe
+    from .legacy.model.classification import Classification
+    from .legacy.model.bert_model import BertModel
+    from .legacy.model.t5_model import T5Model
+    from .legacy.model.biencoder_model import PretrainedBertModel
+
+    aspm.register_patch('megatron.legacy.model.module.MegatronModule.universal_checkpoint_info', megatron_module_universal_checkpoint_info, create_dummy=True)
+    aspm.register_patch('megatron.legacy.model.utils.gather_and_init', gather_and_init)
+    aspm.register_patch('megatron.legacy.model.utils.attention_mask_func', attention_mask_func)
+    aspm.register_patch('megatron.legacy.model.utils.get_linear_layer', get_linear_layer)
+    aspm.register_patch('megatron.legacy.model.transformer.ParallelMLP.__init__', parallel_mlp_init)
+    aspm.register_patch('megatron.legacy.model.transformer.ParallelMLP.forward', parallel_mlp_forward_wrapper)
+    aspm.register_patch('megatron.legacy.model.transformer.CoreAttention.__init__', core_attention_init)
+    aspm.register_patch('megatron.legacy.model.transformer.FlashSelfAttention.forward', flash_selfattention_forward_wrapper)
+    aspm.register_patch('megatron.legacy.model.transformer.ParallelAttention.__init__', parallel_attention_init)
+    aspm.register_patch('megatron.legacy.model.transformer.ParallelAttention.forward', parallel_attention_forward_wrapper)
+    aspm.register_patch('megatron.legacy.model.transformer.ParallelTransformerLayer.__init__', parallel_transformer_layer_init)
+    aspm.register_patch('megatron.legacy.model.transformer.ParallelTransformerLayer.forward', parallel_transformer_layer_forward_wrapper)
+    aspm.register_patch('megatron.legacy.model.transformer.ParallelTransformer.__init__', parallel_transformer_init)
+    aspm.register_patch('megatron.legacy.model.transformer.ParallelTransformer._checkpointed_forward', parallel_transformer__checkpointed_forward_wrapper)
+    aspm.register_patch('megatron.legacy.model.transformer.ParallelTransformer.forward', parallel_transformer_forward_wrapper)
+    aspm.register_patch('megatron.legacy.model.language_model.parallel_lm_logits', parallel_lm_logits)
+    aspm.register_patch('megatron.legacy.model.language_model.get_language_model', get_language_model)
+    aspm.register_patch('megatron.legacy.model.language_model.Pooler.__init__', pooler_init)
+    aspm.register_patch('megatron.legacy.model.language_model.Embedding.__init__', embedding_init)
+    aspm.register_patch('megatron.legacy.model.language_model.Embedding.forward', embedding_forward)
+    aspm.register_patch('megatron.legacy.model.language_model.TransformerLanguageModel.__init__',
+                        transformer_language_model_init)
+    aspm.register_patch('megatron.legacy.model.language_model.TransformerLanguageModel.forward',
+                        transformer_language_model_forward_wrapper)
+    aspm.register_patch('megatron.legacy.model.language_model.TransformerLanguageModel.state_dict_for_save_checkpoint',
+                        transformer_language_model_state_dict_for_save_checkpoint)
+    aspm.register_patch('megatron.legacy.model.language_model.TransformerLanguageModel.load_state_dict',
+                        transformer_language_model_load_state_dict)
+    aspm.register_patch('megatron.legacy.model.gpt_model.post_language_model_processing', post_language_model_processing)
+    aspm.register_patch('megatron.legacy.model.gpt_model.GPTModel.__init__', gpt_model_init)
+    aspm.register_patch('megatron.legacy.model.gpt_model.GPTModel.forward', gpt_model_forward_wrapper)
+    aspm.register_patch('megatron.legacy.model.gpt_model.GPTModel.state_dict_for_save_checkpoint', gpt_model_state_dict_for_save_checkpoint)
+    aspm.register_patch('megatron.legacy.model.gpt_model.GPTModel.load_state_dict', gpt_model_load_state_dict)
+    aspm.register_patch('megatron.legacy.model.gpt_model.GPTModel.universal_checkpoint_info', gpt_model_universal_checkpoint_info)
+    aspm.register_patch('megatron.legacy.model.GPTModelPipe', GPTModelPipe)
+    aspm.register_patch('megatron.legacy.model.realm_model.IREncoderBertModel', IREncoderBertModel)
+    aspm.register_patch('megatron.legacy.model.multiple_choice.MultipleChoice', MultipleChoice)
+    aspm.register_patch('megatron.legacy.model.classification.Classification', Classification)
+    aspm.register_patch('megatron.legacy.model.bert_model.BertModel', BertModel)
+    aspm.register_patch('megatron.legacy.model.t5_model.T5Model', T5Model)
+    aspm.register_patch('megatron.legacy.model.biencoder_model.PretrainedBertModel', PretrainedBertModel)
+
+
+def legacy_data_base(aspm):
+    from .legacy.data.blendable_dataset import BlendableDataset
+    from .legacy.data.indexed_dataset import make_dataset
+
+    aspm.register_patch('megatron.legacy.data.blendable_dataset.BlendableDataset', BlendableDataset, create_dummy=True)
+    aspm.register_patch('megatron.legacy.data.indexed_dataset.make_dataset', make_dataset, create_dummy=True)
+
+
+def legacy_data_adaption(aspm):
+    from .legacy.data.gpt_dataset import GPTDataset, build_train_valid_test_datasets
+
+    aspm.register_patch('megatron.legacy.data.gpt_dataset.GPTDataset', GPTDataset, create_dummy=True)
+    aspm.register_patch('megatron.legacy.data.gpt_dataset.build_train_valid_test_datasets', build_train_valid_test_datasets, create_dummy=True)
+
+
+
+
+def mcore_optimizer_adapation(aspm):
+    from .core.optimizer import get_param_groups, _get_param_groups_mod, get_megatron_optimizer_wrapper
+
+    aspm.register_patch('megatron.core.optimizer.get_param_groups', get_param_groups)
+    aspm.register_patch('megatron.core.optimizer._get_param_groups', _get_param_groups_mod)
+    aspm.register_patch('megatron.core.optimizer.get_megatron_optimizer', get_megatron_optimizer_wrapper)
+
+
+def megatron_training_adaptation(aspm):
+    from .training.arguments import parse_args_wrapper, validate_args
+    from .training.training import pretrain, get_model, setup_model_and_optimizer, train_step, training_log, train, evaluate, \
+        evaluate_and_print_results, build_train_valid_test_data_loaders
+    from .training.initialize import initialize_megatron, _compile_dependencies, _initialize_distributed, _warmup_jit_function
+    from .training.checkpointing import check_checkpoint_args, save_checkpoint, generate_state_dict, load_checkpoint
+    from .training.utils import get_ltor_masks_and_position_ids, update_rotary_pos_emb
+    from .training.tokenizer import build_tokenizer
+
+    aspm.register_patch('megatron.training.utils.get_ltor_masks_and_position_ids', get_ltor_masks_and_position_ids)
+    aspm.register_patch('megatron.training.utils.update_rotary_pos_emb', update_rotary_pos_emb)
+    aspm.register_patch('megatron.training.arguments.parse_args', parse_args_wrapper)
+    aspm.register_patch('megatron.training.arguments.validate_args', validate_args)
+    aspm.register_patch('megatron.training.yaml_arguments.validate_yaml', validate_args)
+    aspm.register_patch('megatron.training.training.pretrain', pretrain)
+    aspm.register_patch('megatron.training.training.train', train)
+    aspm.register_patch('megatron.training.training.get_model', get_model)
+    aspm.register_patch('megatron.training.training.setup_model_and_optimizer', setup_model_and_optimizer)
+    aspm.register_patch('megatron.training.training.train_step', train_step)
+    aspm.register_patch('megatron.training.training.training_log', training_log)
+    aspm.register_patch('megatron.training.training.evaluate', evaluate)
+    aspm.register_patch('megatron.training.training.evaluate_and_print_results', evaluate_and_print_results)
+    aspm.register_patch('megatron.training.training.build_train_valid_test_data_loaders', build_train_valid_test_data_loaders)
+    aspm.register_patch('megatron.training.initialize.initialize_megatron', initialize_megatron)
+    aspm.register_patch('megatron.training.initialize._compile_dependencies', _compile_dependencies)
+    aspm.register_patch('megatron.training.initialize._initialize_distributed', _initialize_distributed)
+    aspm.register_patch('megatron.training.initialize._warmup_jit_function', _warmup_jit_function)
+    aspm.register_patch('megatron.training.checkpointing.check_checkpoint_args', check_checkpoint_args)
+    aspm.register_patch('megatron.training.checkpointing.save_checkpoint', save_checkpoint)
+    aspm.register_patch('megatron.training.checkpointing.generate_state_dict', generate_state_dict)
+    aspm.register_patch('megatron.training.checkpointing.load_checkpoint', load_checkpoint)
+    aspm.register_patch('megatron.training.tokenizer.tokenizer.build_tokenizer', build_tokenizer)
+
+
+def mcore_parallel_state_adaptation(aspm):
+    from .core.parallel_state import initialize_model_parallel_wrapper, destroy_model_parallel_wrapper
+    from .core.parallel_state import sequence_parallel_is_initialized, sequence_data_parallel_is_initialized, \
+        get_sequence_parallel_group, get_sequence_data_parallel_group, set_sequence_parallel_world_size, set_sequence_data_parallel_world_size, \
+        get_model_parallel_world_size, get_sequence_parallel_world_size, get_sequence_data_parallel_world_size, get_model_parallel_rank, \
+        set_sequence_parallel_rank, set_sequence_data_parallel_rank, get_sequence_parallel_rank, get_sequence_data_parallel_rank, \
+        get_sequence_parallel_src_rank
+
+    aspm.register_patch('megatron.core.parallel_state.initialize_model_parallel', initialize_model_parallel_wrapper)
+    aspm.register_patch('megatron.core.parallel_state.destroy_model_parallel', destroy_model_parallel_wrapper)
+    aspm.register_patch('megatron.core.parallel_state.sequence_parallel_is_initialized', sequence_parallel_is_initialized)
+    aspm.register_patch('megatron.core.parallel_state.sequence_data_parallel_is_initialized', sequence_data_parallel_is_initialized)
+    aspm.register_patch('megatron.core.parallel_state.get_sequence_parallel_group', get_sequence_parallel_group)
+    aspm.register_patch('megatron.core.parallel_state.get_sequence_data_parallel_group', get_sequence_data_parallel_group)
+    aspm.register_patch('megatron.core.parallel_state.set_sequence_parallel_world_size', set_sequence_parallel_world_size)
+    aspm.register_patch('megatron.core.parallel_state.set_sequence_data_parallel_world_size', set_sequence_data_parallel_world_size)
+    aspm.register_patch('megatron.core.parallel_state.get_model_parallel_world_size', get_model_parallel_world_size)
+    aspm.register_patch('megatron.core.parallel_state.get_sequence_parallel_world_size', get_sequence_parallel_world_size)
+    aspm.register_patch('megatron.core.parallel_state.get_sequence_data_parallel_world_size', get_sequence_data_parallel_world_size)
+    aspm.register_patch('megatron.core.parallel_state.get_model_parallel_rank', get_model_parallel_rank)
+    aspm.register_patch('megatron.core.parallel_state.set_sequence_parallel_rank', set_sequence_parallel_rank)
+    aspm.register_patch('megatron.core.parallel_state.set_sequence_data_parallel_rank', set_sequence_data_parallel_rank)
+    aspm.register_patch('megatron.core.parallel_state.get_sequence_parallel_rank', get_sequence_parallel_rank)
+    aspm.register_patch('megatron.core.parallel_state.get_sequence_data_parallel_rank', get_sequence_data_parallel_rank)
+    aspm.register_patch('megatron.core.parallel_state.get_sequence_parallel_src_rank', get_sequence_parallel_src_rank)
+
+
+def adaptation_l0(aspm):
+    """
+    The minimum patch set for megatron to adapt to NPU
+    """
+    # transformer_engine
+    te_adaptation(aspm)
+    apex_adaptation(aspm)
+    torch_adaptation(aspm)
+    legacy_data_base(aspm)
+    preparation_adaption(aspm)
+    # Need replace transformer_engine modules before import megatron
+    aspm.apply_patches()
+
+    mcore_models_adaptation(aspm)
+    mcore_tensor_parallel_adaptation(aspm)
+    mcore_pipeline_parallel_adaptation(aspm)
+    mcore_transformer_adaptation(aspm)
+    legacy_model_transformer(aspm)
+    legacy_data_adaption(aspm)
+    mcore_optimizer_adapation(aspm)
+    megatron_training_adaptation(aspm)
+    mcore_parallel_state_adaptation(aspm)
+
+
+def get_megatronspeed_args():
+    global _ARGS
+    if _ARGS is None:
+        parser = argparse.ArgumentParser(description='Megatron-Deepspeed Arguments', allow_abbrev=False)
+        _ARGS, unknown = process_args(parser).parse_known_args()
+        parser_unknown_args(_ARGS, unknown)
+    return _ARGS
+
+
+def exe_adaptation():
+    global IS_ADAPTED
+    if IS_ADAPTED:
+        return
+
+    megatronspeed_args = get_megatronspeed_args()
+    from .patch_utils import MegatronPatchesManager as aspm
+
+    adaptation_l0(aspm)
+
+    aspm.apply_patches()
+
+    IS_ADAPTED = True
+
+
+exe_adaptation()
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/p2p_communication.py b/toolbox/Megatron-DeepSpeed/megatronspeed/p2p_communication.py
similarity index 98%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/p2p_communication.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/p2p_communication.py
index 15df395e0153e0fd6665eba5ac7ebbae4d44d8f1..eed513fc3019c3438aa3615d96c475f6670e4258 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/p2p_communication.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/p2p_communication.py
@@ -17,14 +17,14 @@ from functools import reduce
 import operator
 import torch
 from deepspeed.accelerator import get_accelerator
-from megatron_ds import get_args
-from megatron_ds.core import mpu
+from megatron.training import get_args
+from megatron.core import mpu
 
 
 def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
                  use_ring_exchange=False):
     """Communicate tensors between stages. Used as helper method in other
-    communication methods that are used in megatron_ds/schedules.py.
+    communication methods that are used in megatron/schedules.py.
 
     Takes the following arguments:
         tensor_send_next: tensor to send to next rank (no tensor sent if
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/patch_utils.py b/toolbox/Megatron-DeepSpeed/megatronspeed/patch_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d2f6b1a21189f42e9eac63745b5d13377f692c9
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/patch_utils.py
@@ -0,0 +1,118 @@
+import importlib
+import sys
+import types
+
+
+def get_func_name(func):
+    if isinstance(func, str):
+        return func
+    return '.'.join((func.__module__, func.__qualname__))
+
+
+def dummy_function_wrapper(func_name):
+    def dummy_function(*args, **kwargs):
+        raise RuntimeError('function {} no exist'.format(func_name))
+
+    return dummy_function
+
+
+class Patch:
+    def __init__(self, orig_func_name, new_func, create_dummy):
+        split_name = orig_func_name.rsplit('.', 1)
+        if len(split_name) == 1:
+            self.orig_module_name, self.orig_func_name = orig_func_name, None
+        else:
+            self.orig_module_name, self.orig_func_name = split_name
+        self.orig_module = None
+        self.orig_func = None
+
+        self.patch_func = None
+        self.wrappers = []
+        if new_func is None:
+            new_func = dummy_function_wrapper(orig_func_name)
+        self.set_patch_func(new_func)
+        self.is_applied = False
+        self.create_dummy = create_dummy
+
+    @property
+    def orig_func_id(self):
+        return id(self.orig_func)
+
+    @property
+    def patch_func_id(self):
+        return id(self.patch_func)
+
+    def set_patch_func(self, new_func, force_patch=False):
+        if hasattr(new_func, '__name__') and new_func.__name__.endswith(('wrapper', 'decorator')):
+            self.wrappers.append(new_func)
+        else:
+            if self.patch_func and not force_patch:
+                raise RuntimeError('the patch of {} exist !'.format(self.orig_func_name))
+            self.patch_func = new_func
+        self.is_applied = False
+
+    def apply_patch(self):
+        if self.is_applied:
+            return
+
+        self.orig_module, self.orig_func = Patch.parse_path(self.orig_module_name, self.orig_func_name, self.create_dummy)
+        if self.patch_func is None:
+            self.patch_func = self.orig_func
+
+        for wrapper in self.wrappers:
+            self.patch_func = wrapper(self.patch_func)
+
+        if self.orig_func_name is not None:
+            setattr(self.orig_module, self.orig_func_name, self.patch_func)
+        for key, value in sys.modules.copy().items():
+            if self.orig_func_name is not None and hasattr(value, self.orig_func_name) \
+                    and id(getattr(value, self.orig_func_name)) == self.orig_func_id:
+                setattr(value, self.orig_func_name, self.patch_func)
+        self.is_applied = True
+
+    @staticmethod
+    def parse_path(module_path, function_name, create_dummy):
+        from importlib.machinery import ModuleSpec
+        modules = module_path.split('.')
+        for i in range(1, len(modules) + 1):
+            parent = '.'.join(modules[:i - 1])
+            path = '.'.join(modules[:i])
+            try:
+                importlib.import_module(path)
+            except ModuleNotFoundError as e:
+                if not parent or not hasattr(importlib.import_module(parent), modules[i - 1]):
+                    if not create_dummy:
+                        raise ModuleNotFoundError(e) from e
+                    sys.modules[path] = types.ModuleType(path)
+                    sys.modules[path].__file__ = 'megatronspeed.dummy_module.py'
+                    sys.modules[path].__spec__ = ModuleSpec(path, None)
+                    if parent:
+                        setattr(importlib.import_module(parent), modules[i - 1], sys.modules[path])
+                else:
+                    module = getattr(importlib.import_module(parent), modules[i - 1])
+                    if hasattr(module, function_name):
+                        return module, getattr(module, function_name)
+                    elif create_dummy:
+                        return module, dummy_function_wrapper(function_name)
+                    else:
+                        raise RuntimeError('no exist {} of {}'.format(function_name, module))
+
+        if function_name is not None and not hasattr(sys.modules[module_path], function_name):
+            setattr(sys.modules[module_path], function_name, None)
+        return sys.modules[module_path], getattr(sys.modules[module_path], function_name) if function_name is not None else None
+
+
+class MegatronPatchesManager:
+    patches_info = {}
+
+    @staticmethod
+    def register_patch(orig_func_name, new_func=None, force_patch=False, create_dummy=False):
+        if orig_func_name not in MegatronPatchesManager.patches_info:
+            MegatronPatchesManager.patches_info[orig_func_name] = Patch(orig_func_name, new_func, create_dummy)
+        else:
+            MegatronPatchesManager.patches_info.get(orig_func_name).set_patch_func(new_func, force_patch)
+
+    @staticmethod
+    def apply_patches():
+        for patch in MegatronPatchesManager.patches_info.values():
+            patch.apply_patch()
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/data/__init__.py b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/__init__.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/data/__init__.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/__init__.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/checkpointing_rlhf.py b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/checkpointing_rlhf.py
similarity index 98%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/checkpointing_rlhf.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/checkpointing_rlhf.py
index 77a6516ae2c2c2e9aab9f61e970cf124f09f0817..a05b375ec4ea25232cbab8c3f4a78e7be3decd30 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/checkpointing_rlhf.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/checkpointing_rlhf.py
@@ -8,9 +8,9 @@ import torch
 import random
 import numpy as np
 
-from megatron_ds.core import mpu, tensor_parallel
-from ..global_vars import get_args, set_args, set_rlhf_args
-from ..utils import unwrap_model, print_rank_0
+from megatron.core import mpu, tensor_parallel
+from megatron.training.global_vars import get_args, set_args, set_rlhf_args
+from megatron.training.utils import unwrap_model, print_rank_0
 
 
 
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/fused_kernels/tests/__init__.py b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/generation/__init__.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/fused_kernels/tests/__init__.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/generation/__init__.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/generation/communication_rlhf.py b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/generation/communication_rlhf.py
similarity index 99%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/generation/communication_rlhf.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/generation/communication_rlhf.py
index ecfbb43858b1f100fe1c649067ff5eeb1c2c931b..dee32077f34904f7585fab0f5180a5d014f7829f 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/generation/communication_rlhf.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/generation/communication_rlhf.py
@@ -5,7 +5,7 @@
 
 import torch
 
-from megatron_ds.core import mpu
+from megatron.core import mpu
 
 
 
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/generation/forward_rlhf.py b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/generation/forward_rlhf.py
similarity index 97%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/generation/forward_rlhf.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/generation/forward_rlhf.py
index d8552d3c269b70ff3577d7429b3e4bdda7dff189..8854d9f8567beb182c6a1d278af0fb222388c55b 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/generation/forward_rlhf.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/generation/forward_rlhf.py
@@ -4,9 +4,9 @@
 
 import torch
 
-from megatron_ds import get_args
-from megatron_ds.core import mpu, InferenceParams
-from megatron_ds.core.utils import get_attr_wrapped_model
+from megatron.training import get_args
+from megatron.core import mpu, InferenceParams
+from megatron.core.utils import get_attr_wrapped_model
 from .communication_rlhf import (
     send_to_next_pipeline_rank,
     recv_from_prev_pipeline_rank_)
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/generation/generation_rlhf.py b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/generation/generation_rlhf.py
similarity index 95%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/generation/generation_rlhf.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/generation/generation_rlhf.py
index 34004829dfd697be60f9bebf7325bb7533a92ba6..6fdc995fdd477d01ff207b017e16dee3b2273765 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/generation/generation_rlhf.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/generation/generation_rlhf.py
@@ -5,9 +5,9 @@
 import torch
 import torch.nn.functional as F
 
-from megatron_ds import get_tokenizer
-from megatron_ds.core import mpu, tensor_parallel
-from megatron_ds.utils import get_ltor_masks_and_position_ids
+from megatron.training import get_tokenizer
+from megatron.core import mpu, tensor_parallel
+from megatron.training.utils import get_ltor_masks_and_position_ids
 from .communication_rlhf import (
     copy_from_last_to_first_pipeline_stage,
     broadcast_float_list, broadcast_int_list,
@@ -146,7 +146,7 @@ def get_attention_mask_and_position_ids(data, pad_token_id=None):
         # 针对 left_padding 部分更新 attention_mask 和 position_ids
         for b in range(micro_batch_size):
             num_left_padding = 0
-            while data[b][num_left_padding] == pad_token_id:
+            while num_left_padding < len(data[b]) and data[b][num_left_padding] == pad_token_id:
                 num_left_padding += 1
 
             # 更新 attention_mask
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/initialize_rlhf.py b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/initialize_rlhf.py
similarity index 94%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/initialize_rlhf.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/initialize_rlhf.py
index 0d3059744f7ec2640688ce9090ca0df02d1317ce..3d698babd8d596ddbc5b8f916522e1cd02339bd8 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/initialize_rlhf.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/initialize_rlhf.py
@@ -1,19 +1,20 @@
 """Megatron RLHF Initialization."""
 
 import random
+import os
 import time
 import torch
 
 import numpy as np
 from datetime import timedelta
 
-from megatron_ds import fused_kernels
-from megatron_ds import get_args, get_adlr_autoresume, get_tensorboard_writer, print_rank_0
-from megatron_ds.core import mpu, tensor_parallel
-from megatron_ds.arguments import parse_args, validate_args
-from megatron_ds.global_vars import set_global_variables
-from megatron_ds.model.transformer import bias_dropout_add_fused_train
-from megatron_ds.model.fused_bias_gelu import bias_gelu
+from megatron.legacy import fused_kernels
+from megatron.training import get_args, get_adlr_autoresume, get_tensorboard_writer, print_rank_0
+from megatron.core import mpu, tensor_parallel
+from megatron.training.arguments import parse_args, validate_args
+from megatron.training.global_vars import set_global_variables
+from megatron.legacy.model.transformer import bias_dropout_add_fused_train
+from megatron.legacy.model.fused_bias_gelu import bias_gelu
 
 
 def initialize_megatron(
@@ -91,18 +92,18 @@ def _compile_dependencies():
     # Compile dataset C++ code.
     # =========================
     # TODO: move this to ninja
-    if torch.distributed.get_rank() == 0:
+    if not torch.distributed.is_initialized() or int(os.environ["LOCAL_RANK"]) == 0:
         if args.deepspeed:
             start_time = time.time()
             print('> compiling dataset index builder ...')
-            from megatron_ds.data.dataset_utils import compile_helper
+            from megatronspeed.legacy.data.dataset_utils import compile_helper
             compile_helper()
             print('>>> done with dataset index builder. Compilation time: {:.3f} '
                 'seconds'.format(time.time() - start_time), flush=True)
         else:
             start_time = time.time()
             print("> compiling dataset index builder ...")
-            from megatron_ds.core.datasets.utils import compile_helpers
+            from megatron.core.datasets.utils import compile_helpers
 
             compile_helpers()
             print(
@@ -211,9 +212,9 @@ def _initialize_distributed():
             mpu.initialize_model_parallel(
                 args.tensor_model_parallel_size,
                 args.pipeline_model_parallel_size,
-                args.ds_sequence_parallel_size,
                 args.virtual_pipeline_model_parallel_size,
                 args.pipeline_model_parallel_split_rank,
+                args.ds_sequence_parallel_size,
                 context_parallel_size=args.context_parallel_size,
                 expert_model_parallel_size=args.expert_model_parallel_size,
                 nccl_communicator_config_path=args.nccl_communicator_config_path,
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/schedules_rlhf.py b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/schedules_rlhf.py
similarity index 99%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/schedules_rlhf.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/schedules_rlhf.py
index ed7f4dfe5d729311d674619241554a1275029c45..9b6d2af9e67d75960ad174a68dce66ee9055b862 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/schedules_rlhf.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/schedules_rlhf.py
@@ -6,10 +6,10 @@ from typing import Callable, Iterator, List, Optional, Union
 import torch
 from torch.autograd.variable import Variable
 
-from megatron_ds.core import parallel_state
-from megatron_ds.core.enums import ModelType
-from megatron_ds.core.pipeline_parallel import p2p_communication
-from megatron_ds.core.utils import get_attr_wrapped_model, get_model_config, get_model_type
+from megatron.core import parallel_state
+from megatron.core.enums import ModelType
+from megatron.core.pipeline_parallel import p2p_communication
+from megatron.core.utils import get_attr_wrapped_model, get_model_config, get_model_type
 
 # Types
 Shape = Union[List[int], torch.Size]
@@ -69,7 +69,7 @@ def get_forward_backward_func():
         iterators in the case of interleaved pipeline parallelism.
 
     model (required): the actual model. Expected to be a list of modules in the case of interleaved
-        pipeline parallelism. Must be a (potentially wrapped) megatron_ds.core.models.MegatronModule.
+        pipeline parallelism. Must be a (potentially wrapped) megatron.core.models.MegatronModule.
 
     num_microbatches (int, required):
         The number of microbatches to go through
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/training_rlhf.py b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/training_rlhf.py
similarity index 82%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/training_rlhf.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/training_rlhf.py
index b4deac9482834866f4edce18a0d3ac42fcd580f5..c76288a296e192124128642734d2061997a684b7 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/rlhf/training_rlhf.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/rlhf/training_rlhf.py
@@ -1,5 +1,8 @@
-import gc
+"""Pretrain utilities."""
+
+import dataclasses
 from datetime import datetime
+import gc
 from functools import partial
 import math
 import logging
@@ -7,47 +10,68 @@ import sys
 from ..log_handler import CustomHandler
 # Make default logging level INFO, but filter out all log messages not from MCore.
 logging.basicConfig(handlers=[CustomHandler()], level=logging.INFO)
-from ..theoretical_memory_usage import report_theoretical_memory
 import time
 import os
 # The earliest we can measure the start time.
 _TRAIN_START_TIME = time.time()
 import torch
-from torch import Tensor
 import copy
 import torch.nn.functional as F
 
-from megatron_ds import get_args, get_rlhf_args, set_rlhf_args, set_args
-from megatron_ds import get_signal_handler
-from megatron_ds import get_timers
-from megatron_ds import get_tokenizer
-from megatron_ds import get_tensorboard_writer
-from megatron_ds import get_wandb_writer
-from megatron_ds import get_current_global_batch_size
-from megatron_ds import get_num_microbatches
-from megatron_ds import update_num_microbatches
-from megatron_ds.core import mpu, tensor_parallel
-from megatron_ds.core.utils import get_model_config
-from megatron_ds import print_rank_0, print_rank_last, is_last_rank
-from megatron_ds.model import Float16Module
-from megatron_ds.model import GPTModel
-from megatron_ds.core.distributed import DistributedDataParallel as DDP
-from megatron_ds.core.distributed import finalize_model_grads
-from megatron_ds.core.enums import ModelType
-from megatron_ds.optimizer import get_megatron_optimizer
-from megatron_ds.optimizer_param_scheduler import OptimizerParamScheduler
-from megatron_ds.utils import check_adlr_autoresume_termination
-from megatron_ds.utils import unwrap_model
-from megatron_ds.data.data_samplers import build_pretraining_data_loader
-from megatron_ds.utils import calc_params_l2_norm
-from megatron_ds.utils import report_memory, throughput_calculator
-from megatron_ds.utils import get_ltor_masks_and_position_ids, get_batch_on_this_cp_rank, average_losses_across_data_parallel_group
-
-from megatron_ds.rlhf.schedules_rlhf import get_forward_backward_func
-from megatron_ds.rlhf.initialize_rlhf import initialize_megatron, write_args_to_tensorboard
-from megatron_ds.rlhf.checkpointing_rlhf import load_state_dict_into_model, load_state_dict, save_checkpoint, set_args_from_state_dict
-from megatron_ds.rlhf.generation.generation_rlhf import generate_tokens_and_return_on_first_stage, get_attention_mask_and_position_ids
-from megatron_ds.rlhf.generation.communication_rlhf import broadcast_from_last_pipeline_stage
+from megatron.core import mpu, tensor_parallel
+from megatron.core.utils import get_model_config
+from megatron.legacy.model import Float16Module, GPTModel
+from megatron.core.distributed import DistributedDataParallelConfig
+from megatron.core.distributed import DistributedDataParallel as DDP
+from megatron.core.distributed import finalize_model_grads
+from megatron.core.enums import ModelType
+from megatron.core.optimizer import get_megatron_optimizer, OptimizerConfig
+from megatron.training.optimizer_param_scheduler import OptimizerParamScheduler
+from megatron.legacy.data.data_samplers import build_pretraining_data_loader
+from megatron.core.num_microbatches_calculator import (
+    get_current_global_batch_size,
+    get_num_microbatches,
+    update_num_microbatches)
+from megatron.training.utils import (
+    calc_params_l2_norm,
+    check_adlr_autoresume_termination,
+    is_last_rank,
+    print_rank_0,
+    print_rank_last,
+    report_memory,
+    unwrap_model,
+    append_to_progress_log,
+    get_ltor_masks_and_position_ids,
+    get_batch_on_this_tp_rank,
+    get_batch_on_this_cp_rank,
+    average_losses_across_data_parallel_group
+)
+from megatron.training.global_vars import (
+    get_args,
+    get_rlhf_args,
+    set_rlhf_args,
+    set_args,
+    get_tokenizer,
+    get_signal_handler,
+    get_timers,
+    get_tensorboard_writer,
+    get_wandb_writer,
+    get_one_logger)
+from megatron.training import one_logger_utils
+from megatron.training.training import (
+    print_datetime,
+    num_floating_point_operations,
+    update_train_iters,
+    build_train_valid_test_data_iterators,
+    get_optimizer_param_scheduler,
+    build_train_valid_test_datasets,
+    save_checkpoint_and_time
+)
+from megatronspeed.rlhf.schedules_rlhf import get_forward_backward_func
+from megatronspeed.rlhf.initialize_rlhf import initialize_megatron, write_args_to_tensorboard
+from megatronspeed.rlhf.checkpointing_rlhf import load_state_dict_into_model, load_state_dict, save_checkpoint, set_args_from_state_dict
+from megatronspeed.rlhf.generation.generation_rlhf import generate_tokens_and_return_on_first_stage, get_attention_mask_and_position_ids
+from megatronspeed.rlhf.generation.communication_rlhf import broadcast_from_last_pipeline_stage
 
 
 
@@ -157,9 +181,9 @@ class RLHFPPOTrainer():
         # copy args to rlhf_args, which will be updated during loading model
         self.rlhf_args = copy.deepcopy(self.args)
         set_rlhf_args(self.rlhf_args)
-        # reset custom_partition argument
-        if self.args.custom_partition is not None and self.args.num_layers != sum(self.args.custom_partition):
-            setattr(self.args, "custom_partition", None)
+        # reset num_layers_per_stage argument
+        if self.args.num_layers_per_stage is not None and self.args.num_layers != sum(self.args.num_layers_per_stage):
+            setattr(self.args, "num_layers_per_stage", None)
             set_args(self.args)
 
         self.timers = get_timers()
@@ -216,6 +240,12 @@ class RLHFPPOTrainer():
         state_dict = load_state_dict(ckpt_dir)
         set_args_from_state_dict(args, state_dict, rlhf_training=rlhf_training)
 
+        args.encoder_num_layers = args.num_layers
+        if rlhf_training:
+            set_rlhf_args(args)
+        else:
+            set_args(args)
+
         # Model
         model = get_model(self.model_provider, self.model_type,
                           rlhf_training=rlhf_training)
@@ -223,10 +253,18 @@ class RLHFPPOTrainer():
         # Optimizer
         optimizer, opt_param_scheduler = None, None
         if model_prefix in {"actor", "critic"}:
-            lr = getattr(args, f"{model_prefix}_learning_rate")
-            weight_decay = getattr(args, f"{model_prefix}_weight_decay")
-            optimizer = get_megatron_optimizer(model, lr=lr, weight_decay=weight_decay)
-            opt_param_scheduler = get_optimizer_param_scheduler(optimizer, lr=lr)
+            kwargs = {}
+            for f in dataclasses.fields(OptimizerConfig):
+                if hasattr(args, f.name):
+                    kwargs[f.name] = getattr(args, f.name)
+            config = OptimizerConfig(**kwargs)
+            config.timers = self.timers
+            config.lr = getattr(args, f"{model_prefix}_learning_rate")
+            config.weight_decay = getattr(args, f"{model_prefix}_weight_decay")
+            args.lr = config.lr
+            args.weight_decay = config.weight_decay
+            optimizer = get_megatron_optimizer(config, model)
+            opt_param_scheduler = get_optimizer_param_scheduler(optimizer)
 
         if ckpt_dir is not None:
             self.timers(f'load {model_prefix} model', log_level=0).start(barrier=True)
@@ -588,7 +626,7 @@ class RLHFPPOTrainer():
             # If using distributed optimizer, don't zero buffer here; zeroing of buffer is
             # handled automatically by the optimizer after all-gathers finish.
             # Otherwise, zero the buffer.
-            model_chunk.zero_grad_buffer(zero_buffer=(not self.args.use_distributed_optimizer))
+            model_chunk.zero_grad_buffer()
         self.actor_optimizer.zero_grad()
 
         actor_loss = self.forward_backward_func(
@@ -610,7 +648,7 @@ class RLHFPPOTrainer():
 
         # Update parameters.
         self.timers('optimizer', log_level=1).start(barrier=self.args.barrier_with_L1_time)
-        update_successful, grad_norm, num_zeros_in_grad = self.actor_optimizer.step(self.args, self.timers)
+        update_successful, grad_norm, num_zeros_in_grad = self.actor_optimizer.step()
         self.timers('optimizer').stop()
 
         # Update learning rate.
@@ -643,7 +681,7 @@ class RLHFPPOTrainer():
             # If using distributed optimizer, don't zero buffer here; zeroing of buffer is
             # handled automatically by the optimizer after all-gathers finish.
             # Otherwise, zero the buffer.
-            model_chunk.zero_grad_buffer(zero_buffer=(not self.args.use_distributed_optimizer))
+            model_chunk.zero_grad_buffer()
         self.critic_optimizer.zero_grad()
 
         critic_loss = self.forward_backward_func(
@@ -665,7 +703,7 @@ class RLHFPPOTrainer():
 
         # Update parameters.
         self.timers('optimizer', log_level=1).start(barrier=self.args.barrier_with_L1_time)
-        update_successful, grad_norm, num_zeros_in_grad = self.critic_optimizer.step(self.args, self.timers)
+        update_successful, grad_norm, num_zeros_in_grad = self.critic_optimizer.step()
         self.timers('optimizer').stop()
 
         # Update learning rate.
@@ -918,35 +956,6 @@ class RLHFPPOTrainer():
         print_rank_last(f"Average reward score: {average_reward}")
         print_rank_last(f"------------------------------------------------------------------------------------------------------------------------------------")
 
-def update_train_iters(args):
-
-    # For iteration-based training, we don't need to do anything
-    if args.train_iters:
-        return
-
-    # Constant batch size with sample-based training.
-    if args.rampup_batch_size is None:
-        args.train_iters = args.train_samples // args.global_batch_size
-
-    else:
-        # Sample based training with rampup batch size.
-        iterations = 0
-        consumed_samples = 0
-        # Rampup phase.
-        while consumed_samples <= int(args.rampup_batch_size[2]):
-            update_num_microbatches(consumed_samples, consistency_check=False)
-            consumed_samples += get_current_global_batch_size()
-            iterations += 1
-        # Reset
-        update_num_microbatches(0, consistency_check=False)
-        # Constant phase
-        # Note that we throw away any partial last batch.
-        iterations += (args.train_samples - consumed_samples) // \
-                      args.global_batch_size
-        args.train_iters = iterations
-
-    print_rank_0('setting training iterations to {}'.format(args.train_iters))
-
 
 def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True, rlhf_training=False):
     """Build the model."""
@@ -1030,6 +1039,7 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap
             sum([sum([p.ds_numel if hasattr(p,'ds_id') else p.nelement() for p in model_module.parameters()])
                  for model_module in model])), flush=True)
 
+
     # GPU allocation.
     for model_module in model:
         model_module.cuda(torch.cuda.current_device())
@@ -1040,12 +1050,16 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap
 
     if wrap_with_ddp:
         config = get_model_config(model[0])
+        ddp_config = DistributedDataParallelConfig(
+            grad_reduce_in_fp32=args.accumulate_allreduce_grads_in_fp32,
+            overlap_grad_reduce=args.overlap_grad_reduce,
+            use_distributed_optimizer=args.use_distributed_optimizer,
+            check_for_nan_in_grad=args.check_for_nan_in_loss_and_grad,
+            bucket_size=args.ddp_bucket_size,
+            average_in_collective=args.ddp_average_in_collective)
         model = [DDP(config,
+                     ddp_config,
                      model_chunk,
-                     data_parallel_group=mpu.get_data_parallel_group(with_context_parallel=True),
-                     accumulate_allreduce_grads_in_fp32=args.accumulate_allreduce_grads_in_fp32,
-                     overlap_grad_reduce=args.overlap_grad_reduce,
-                     use_distributed_optimizer=args.use_distributed_optimizer,
                      # Turn off bucketing for model_chunk 2 onwards, since communication for these
                      # model chunks is overlapped with compute anyway.
                      disable_bucketing=(model_chunk_idx > 0))
@@ -1059,60 +1073,6 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap
     return model
 
 
-def get_optimizer_param_scheduler(optimizer, lr=None):
-    """Build the learning rate scheduler."""
-    args = get_args()
-
-    if lr is None:
-        lr = args.lr
-
-    # Iteration-based training.
-    if args.train_iters:
-        if args.lr_decay_iters is None:
-            args.lr_decay_iters = args.train_iters
-        lr_decay_steps = args.lr_decay_iters * args.global_batch_size
-        wd_incr_steps = args.train_iters * args.global_batch_size
-        if args.lr_warmup_fraction is not None:
-            lr_warmup_steps = args.lr_warmup_fraction * lr_decay_steps
-        else:
-            lr_warmup_steps = args.lr_warmup_iters * args.global_batch_size
-    # Sample-based training.
-    elif args.train_samples:
-        # We need to set training iters for later use. Technically
-        # we need to adjust the training samples too (due to last
-        # batch being incomplete) but we leave it as is for now.
-        update_train_iters(args)
-        if args.lr_decay_samples is None:
-            args.lr_decay_samples = args.train_samples
-        lr_decay_steps = args.lr_decay_samples
-        wd_incr_steps = args.train_samples
-        if args.lr_warmup_fraction is not None:
-            lr_warmup_steps = args.lr_warmup_fraction * lr_decay_steps
-        else:
-            lr_warmup_steps = args.lr_warmup_samples
-    else:
-        raise Exception(
-            'either train-iters or train-samples should be provided.')
-
-    opt_param_scheduler = OptimizerParamScheduler(
-        optimizer,
-        init_lr=args.lr_warmup_init,
-        max_lr=lr,
-        min_lr=args.min_lr,
-        lr_warmup_steps=lr_warmup_steps,
-        lr_decay_steps=lr_decay_steps,
-        lr_decay_style=args.lr_decay_style,
-        start_wd=args.start_weight_decay,
-        end_wd=args.end_weight_decay,
-        wd_incr_steps=wd_incr_steps,
-        wd_incr_style=args.weight_decay_incr_style,
-        use_checkpoint_opt_param_scheduler=args.use_checkpoint_opt_param_scheduler,
-        override_opt_param_scheduler=args.override_opt_param_scheduler)
-
-    return opt_param_scheduler
-
-
-
 
 def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler):
     timers = get_timers()
@@ -1304,7 +1264,7 @@ def get_batch(data_iterator):
     tokenizer = get_tokenizer()
 
     # Items and their type.
-    keys = ['text']
+    keys = ['tokens']
     datatype = torch.int64
 
     # Broadcast data.
@@ -1312,20 +1272,26 @@ def get_batch(data_iterator):
         data = next(data_iterator)
     else:
         data = None
-    data_b = tensor_parallel.broadcast_data(keys, data, datatype)
+    data_tokens = tensor_parallel.broadcast_data(['tokens'], data, torch.int64)
+    data_labels = tensor_parallel.broadcast_data(['labels'], data, torch.int64)
+    data_loss_mask = tensor_parallel.broadcast_data(['loss_mask'], data, torch.float32)
+    data_attention_mask = tensor_parallel.broadcast_data(['attention_mask'], data, torch.bool) if args.create_attention_mask_in_dataloader else None
+    data_position_ids = tensor_parallel.broadcast_data(['position_ids'], data, torch.int64)
 
     # Unpack.
-    tokens_ = data_b['text'].long()
-    labels = tokens_[:, 1:].contiguous()
-    tokens = tokens_[:, :-1].contiguous()
+    tokens = data_tokens['tokens'].contiguous()
+    labels = data_labels['labels'].contiguous()
+    loss_mask = data_loss_mask['loss_mask'].contiguous()
+    attention_mask = data_attention_mask['attention_mask'].contiguous() if data_attention_mask is not None else None
+    position_ids = data_position_ids['position_ids'].contiguous()
 
     # Get the masks and postition ids.
-    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
-        tokens,
-        tokenizer.eod,
-        args.reset_position_ids,
-        args.reset_attention_mask,
-        args.eod_mask_loss)
+    # attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
+    #     tokens,
+    #     tokenizer.eod,
+    #     args.reset_position_ids,
+    #     args.reset_attention_mask,
+    #     args.eod_mask_loss)
 
     batch = {
         'tokens': tokens,
@@ -1340,119 +1306,119 @@ def get_batch(data_iterator):
     return batch.values()
 
 
-def build_train_valid_test_datasets(build_train_valid_test_datasets_provider):
-    """Build pretraining datasets."""
-
-    args = get_args()
-
-    # Number of train/valid/test samples.
-    if args.train_samples:
-        train_samples = args.train_samples
-    else:
-        train_samples = args.train_iters * args.global_batch_size
-    eval_iters = (args.train_iters // args.eval_interval + 1) * \
-                 args.eval_iters
-    test_iters = args.eval_iters
-    train_val_test_num_samples = [train_samples,
-                                  eval_iters * args.global_batch_size,
-                                  test_iters * args.global_batch_size]
-    print_rank_0(' > datasets target sizes (minimum size):')
-    print_rank_0('    train:      {}'.format(train_val_test_num_samples[0]))
-    print_rank_0('    validation: {}'.format(train_val_test_num_samples[1]))
-    print_rank_0('    test:       {}'.format(train_val_test_num_samples[2]))
-
-    # Build the datasets.
-    return build_train_valid_test_datasets_provider(train_val_test_num_samples)
-
-
-def build_train_valid_test_data_loaders(
-        build_train_valid_test_datasets_provider):
-    """Build pretraining data loaders."""
-
-    args = get_args()
-
-    (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None)
-
-    print_rank_0('> building train, validation, and test datasets ...')
-
-    # Backward compatibility, assume fixed batch size.
-    if args.iteration > 0 and args.consumed_train_samples == 0:
-        assert args.train_samples is None, \
-            'only backward compatiblity support for iteration-based training'
-        args.consumed_train_samples = args.iteration * args.global_batch_size
-    if args.iteration > 0 and args.consumed_valid_samples == 0:
-        if args.train_samples is None:
-            args.consumed_valid_samples = (args.iteration // args.eval_interval) * \
-                args.eval_iters * args.global_batch_size
-
-    # Rely on distributed-aware core datasets, temporary
-    is_distributed = getattr(build_train_valid_test_datasets_provider, "is_distributed", False)
-
-    # Construct the data pipeline
-    if is_distributed or mpu.get_tensor_model_parallel_rank() == 0:
-
-        # Build datasets.
-        train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
-            build_train_valid_test_datasets_provider)
-        # Build dataloders.
-        train_dataloader = build_pretraining_data_loader(
-            train_ds, args.consumed_train_samples)
-        if args.skip_train:
-            valid_dataloader = build_pretraining_data_loader(valid_ds, 0)
-        else:
-            valid_dataloader = build_pretraining_data_loader(
-                valid_ds, args.consumed_valid_samples)
-        test_dataloader = build_pretraining_data_loader(test_ds, 0)
-
-        # Flags to know if we need to do training/validation/testing.
-        do_train = train_dataloader is not None and args.train_iters > 0
-        do_valid = valid_dataloader is not None and args.eval_iters > 0
-        do_test = test_dataloader is not None and args.eval_iters > 0
-        flags = torch.cuda.LongTensor(
-            [int(do_train), int(do_valid), int(do_test)])
-    else:
-        flags = torch.cuda.LongTensor([0, 0, 0])
-
-    torch.distributed.broadcast(flags, 0)
-
-    args.do_train = getattr(args, "do_train", False) or flags[0].item()
-    args.do_valid = getattr(args, "do_valid", False) or flags[1].item()
-    args.do_test = getattr(args, "do_test", False) or flags[2].item()
-
-    return train_dataloader, valid_dataloader, test_dataloader
-
-
-def build_train_valid_test_data_iterators(
-        build_train_valid_test_datasets_provider):
-    """Build pretraining data iterators."""
-
-    args = get_args()
-
-    # Build loaders.
-    train_dataloader, valid_dataloader, test_dataloader = \
-        build_train_valid_test_data_loaders(
-            build_train_valid_test_datasets_provider)
-
-    # Build iterators.
-    dl_type = args.dataloader_type
-    assert dl_type in ['single', 'cyclic']
-
-    if train_dataloader is not None:
-        train_data_iterator = iter(train_dataloader) if dl_type == 'single' \
-                              else iter(cyclic_iter(train_dataloader))
-    else:
-        train_data_iterator = None
-
-    if valid_dataloader is not None:
-        valid_data_iterator = iter(valid_dataloader) if dl_type == 'single' \
-                              else iter(cyclic_iter(valid_dataloader))
-    else:
-        valid_data_iterator = None
-
-    if test_dataloader is not None:
-        test_data_iterator = iter(test_dataloader) if dl_type == 'single' \
-                             else iter(cyclic_iter(test_dataloader))
-    else:
-        test_data_iterator = None
-
-    return train_data_iterator, valid_data_iterator, test_data_iterator
+# def build_train_valid_test_datasets(build_train_valid_test_datasets_provider):
+#     """Build pretraining datasets."""
+
+#     args = get_args()
+
+#     # Number of train/valid/test samples.
+#     if args.train_samples:
+#         train_samples = args.train_samples
+#     else:
+#         train_samples = args.train_iters * args.global_batch_size
+#     eval_iters = (args.train_iters // args.eval_interval + 1) * \
+#                  args.eval_iters
+#     test_iters = args.eval_iters
+#     train_val_test_num_samples = [train_samples,
+#                                   eval_iters * args.global_batch_size,
+#                                   test_iters * args.global_batch_size]
+#     print_rank_0(' > datasets target sizes (minimum size):')
+#     print_rank_0('    train:      {}'.format(train_val_test_num_samples[0]))
+#     print_rank_0('    validation: {}'.format(train_val_test_num_samples[1]))
+#     print_rank_0('    test:       {}'.format(train_val_test_num_samples[2]))
+
+#     # Build the datasets.
+#     return build_train_valid_test_datasets_provider(train_val_test_num_samples)
+
+
+# def build_train_valid_test_data_loaders(
+#         build_train_valid_test_datasets_provider):
+#     """Build pretraining data loaders."""
+
+#     args = get_args()
+
+#     (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None)
+
+#     print_rank_0('> building train, validation, and test datasets ...')
+
+#     # Backward compatibility, assume fixed batch size.
+#     if args.iteration > 0 and args.consumed_train_samples == 0:
+#         assert args.train_samples is None, \
+#             'only backward compatiblity support for iteration-based training'
+#         args.consumed_train_samples = args.iteration * args.global_batch_size
+#     if args.iteration > 0 and args.consumed_valid_samples == 0:
+#         if args.train_samples is None:
+#             args.consumed_valid_samples = (args.iteration // args.eval_interval) * \
+#                 args.eval_iters * args.global_batch_size
+
+#     # Rely on distributed-aware core datasets, temporary
+#     is_distributed = getattr(build_train_valid_test_datasets_provider, "is_distributed", False)
+
+#     # Construct the data pipeline
+#     if is_distributed or mpu.get_tensor_model_parallel_rank() == 0:
+
+#         # Build datasets.
+#         train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
+#             build_train_valid_test_datasets_provider)
+#         # Build dataloders.
+#         train_dataloader = build_pretraining_data_loader(
+#             train_ds, args.consumed_train_samples)
+#         if args.skip_train:
+#             valid_dataloader = build_pretraining_data_loader(valid_ds, 0)
+#         else:
+#             valid_dataloader = build_pretraining_data_loader(
+#                 valid_ds, args.consumed_valid_samples)
+#         test_dataloader = build_pretraining_data_loader(test_ds, 0)
+
+#         # Flags to know if we need to do training/validation/testing.
+#         do_train = train_dataloader is not None and args.train_iters > 0
+#         do_valid = valid_dataloader is not None and args.eval_iters > 0
+#         do_test = test_dataloader is not None and args.eval_iters > 0
+#         flags = torch.cuda.LongTensor(
+#             [int(do_train), int(do_valid), int(do_test)])
+#     else:
+#         flags = torch.cuda.LongTensor([0, 0, 0])
+
+#     torch.distributed.broadcast(flags, 0)
+
+#     args.do_train = getattr(args, "do_train", False) or flags[0].item()
+#     args.do_valid = getattr(args, "do_valid", False) or flags[1].item()
+#     args.do_test = getattr(args, "do_test", False) or flags[2].item()
+
+#     return train_dataloader, valid_dataloader, test_dataloader
+
+
+# def build_train_valid_test_data_iterators(
+#         build_train_valid_test_datasets_provider):
+#     """Build pretraining data iterators."""
+
+#     args = get_args()
+
+#     # Build loaders.
+#     train_dataloader, valid_dataloader, test_dataloader = \
+#         build_train_valid_test_data_loaders(
+#             build_train_valid_test_datasets_provider)
+
+#     # Build iterators.
+#     dl_type = args.dataloader_type
+#     assert dl_type in ['single', 'cyclic']
+
+#     if train_dataloader is not None:
+#         train_data_iterator = iter(train_dataloader) if dl_type == 'single' \
+#                               else iter(cyclic_iter(train_dataloader))
+#     else:
+#         train_data_iterator = None
+
+#     if valid_dataloader is not None:
+#         valid_data_iterator = iter(valid_dataloader) if dl_type == 'single' \
+#                               else iter(cyclic_iter(valid_dataloader))
+#     else:
+#         valid_data_iterator = None
+
+#     if test_dataloader is not None:
+#         test_data_iterator = iter(test_dataloader) if dl_type == 'single' \
+#                              else iter(cyclic_iter(test_dataloader))
+#     else:
+#         test_data_iterator = None
+
+#     return train_data_iterator, valid_data_iterator, test_data_iterator
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation_utils.py b/toolbox/Megatron-DeepSpeed/megatronspeed/text_generation_utils.py
similarity index 98%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/text_generation_utils.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/text_generation_utils.py
index 88dd1d93a9a9989256c544a72cd534aad6623d88..6bb3b104866783841cb5ea0a7ccac5624e0e2372 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/text_generation_utils.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/text_generation_utils.py
@@ -22,16 +22,16 @@ import time
 
 import torch
 import torch.nn.functional as F
-from megatron_ds import get_args
-from megatron_ds import get_tokenizer
-from megatron_ds.core import mpu
-from megatron_ds.utils import get_ltor_masks_and_position_ids, unwrap_model
-from megatron_ds.p2p_communication import recv_forward, send_forward
+from megatron.training import get_args
+from megatron.training import get_tokenizer
+from megatron.core import mpu
+from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model
+from megatron.p2p_communication import recv_forward, send_forward
 
-# These are needed to unwrap the model, would be nice to put these in megatron_ds.utils if possible?
+# These are needed to unwrap the model, would be nice to put these in megatron.utils if possible?
 from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
-from megatron_ds.model import DistributedDataParallel as LocalDDP
-from megatron_ds.model import Float16Module
+from megatron.core.distributed import DistributedDataParallel as LocalDDP
+from megatron.legacy.model import Float16Module
 from deepspeed.accelerator import get_accelerator
 def get_batch(context_tokens):
     """Generate batch from context tokens."""
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/theoretical_memory_usage.py b/toolbox/Megatron-DeepSpeed/megatronspeed/theoretical_memory_usage.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/theoretical_memory_usage.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/theoretical_memory_usage.py
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/training/__init__.py b/toolbox/Megatron-DeepSpeed/megatronspeed/training/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..37752b70bef18fd63a91441d910473a64c9a95fa
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/training/__init__.py
@@ -0,0 +1 @@
+from .global_vars import get_rlhf_args, set_rlhf_args
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/arguments.py b/toolbox/Megatron-DeepSpeed/megatronspeed/training/arguments.py
similarity index 36%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/arguments.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/training/arguments.py
index fe409212f280ec066067f3aad2596121fdffbe79..974a5c847206d91239d7c814a47571c697672133 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/arguments.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/training/arguments.py
@@ -1,698 +1,389 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-"""Megatron arguments."""
-
-import argparse
-import dataclasses
-import json
 import os
+from functools import wraps
+import argparse
+
 import torch
+
+from megatron.training.arguments import load_retro_args, _print_args, _check_arg_is_not_none
 import deepspeed
-import types
+from packaging import version
+
+def extra_args_provider_decorator(extra_args_provider):
+    @wraps(extra_args_provider)
+    def wrapper(parser):
+        if extra_args_provider is not None:
+            parser = extra_args_provider(parser)
+        parser = process_args(parser)
+        return parser
+
+    return wrapper
 
-import torch.nn.functional as F
-from megatron_ds.global_vars import set_retro_args, get_retro_args
-#from tools.retro.utils import get_args_path as get_retro_args_path
-def get_retro_args_path(workdir):
-    '''Argument copy stored within retro workdir.'''
-    return os.path.join(workdir, "args.json")
-# from megatron_ds.core.models.retro import RetroConfig
-from megatron_ds.core.transformer import TransformerConfig
 
+def parse_args_wrapper(parse_args):
+    @wraps(parse_args)
+    def wrapper(extra_args_provider=None, ignore_unknown_args=False):
+        decorated_provider = extra_args_provider_decorator(extra_args_provider)
+        args = parse_args(decorated_provider, ignore_unknown_args)
 
-def parse_args(extra_args_provider=None, ignore_unknown_args=False):
-    """Parse all arguments."""
-    parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
-                                     allow_abbrev=False)
+        # helper argument to set deepspeed pipeline parallel or not
+        args.ds_pipeline_enabled = not args.no_pipeline_parallel
 
-    # Standard arguments.
+        return args
+
+    return wrapper
+
+
+def process_args(parser):
+    parser.conflict_handler = 'resolve'
     parser = _add_network_size_args(parser)
     parser = _add_regularization_args(parser)
     parser = _add_training_args(parser)
-    parser = _add_initialization_args(parser)
     parser = _add_learning_rate_args(parser)
     parser = _add_checkpointing_args(parser)
     parser = _add_mixed_precision_args(parser)
     parser = _add_distributed_args(parser)
-    parser = _add_validation_args(parser)
     parser = _add_data_args(parser)
-    parser = _add_autoresume_args(parser)
-    parser = _add_biencoder_args(parser)
-    parser = _add_vision_args(parser)
     parser = _add_logging_args(parser)
     parser = _add_zero_args(parser)
     parser = _add_memoryopt_args(parser)
     parser = _add_activation_checkpoint_args(parser)
-    parser = _add_distillation_args(parser)
-    parser = _add_inference_args(parser)
     parser = _add_transformer_engine_args(parser)
     parser = _add_retro_args(parser)
-    parser = _add_experimental_args(parser)
-
-    # Custom arguments.
-    if extra_args_provider is not None:
-        parser = extra_args_provider(parser)
 
     parser = deepspeed.add_config_arguments(parser)
 
-    # Parse.
-    if ignore_unknown_args:
-        args, _ = parser.parse_known_args()
-    else:
-        args = parser.parse_args()
+    return parser
 
-    # helper argument to set deepspeed pipeline parallel or not
-    args.ds_pipeline_enabled = not args.no_pipeline_parallel
 
-    # Args from environment
-    args.rank = int(os.getenv('RANK', '0'))
-    args.world_size = int(os.getenv("WORLD_SIZE", '1'))
+def _add_transformer_engine_args(parser):
+    group = parser.add_argument_group(title='Transformer-Engine')
 
-    return args
+    # group.add_argument('--fp8-e4m3', action='store_true',
+    #                     help='E4M3 TransformerLayer', dest='fp8_e4m3')
+    # group.add_argument('--fp8-hybrid', action='store_true',
+    #                     help='Hybrid FP8 TransformerLayer', dest='fp8_hybrid')
+    group.add_argument('--transformer-impl', default='local',
+                       choices=['local', 'transformer_engine'],
+                       help='Which Transformer implementation to use.')
+    return parser
 
-def validate_args(args, defaults={}):
-    # Tensor model parallel size.
-    args.tensor_model_parallel_size = min(
-        args.tensor_model_parallel_size, args.world_size)
-    assert args.world_size % args.tensor_model_parallel_size == 0, 'world size'\
-        ' ({}) is not divisible by tensor model parallel size ({})'.format(
-            args.world_size, args.tensor_model_parallel_size)
-    # Pipeline model parallel size.
-    args.pipeline_model_parallel_size = min(
-        args.pipeline_model_parallel_size,
-        (args.world_size // args.tensor_model_parallel_size))
-    args.transformer_pipeline_model_parallel_size = (
-        args.pipeline_model_parallel_size - 1
-        if args.standalone_embedding_stage else
-        args.pipeline_model_parallel_size
-    )
-    # Checks.
-    model_parallel_size = args.pipeline_model_parallel_size * \
-                          args.tensor_model_parallel_size
-    assert args.world_size % (model_parallel_size * args.context_parallel_size) == 0, \
-        'world size ({}) is not divisible by tensor parallel size ({}) times ' \
-        'pipeline parallel size ({}) times context parallel size ({})'.format(
-        args.world_size, args.tensor_model_parallel_size,
-        args.pipeline_model_parallel_size, args.context_parallel_size)
-    args.data_parallel_size = args.world_size // (model_parallel_size * args.context_parallel_size)
-    if args.rank == 0:
-        print('using world size: {}, data-parallel size: {}, '
-              'context-parallel size: {} '
-              'tensor-model-parallel size: {}, '
-              'pipeline-model-parallel size: {} '.format(
-                  args.world_size, args.data_parallel_size,
-                  args.context_parallel_size,
-                  args.tensor_model_parallel_size,
-                  args.pipeline_model_parallel_size), flush=True)
-    if args.pipeline_model_parallel_size > 1:
-        if args.pipeline_model_parallel_split_rank is not None:
-            assert args.pipeline_model_parallel_split_rank < \
-                    args.pipeline_model_parallel_size, 'split rank needs'\
-                    ' to be less than pipeline model parallel size ({})'.format(
-                            args.pipeline_model_parallel_size)
+def _add_network_size_args(parser):
+    group = parser.add_argument_group(title='network size')
 
-    if args.tp_comm_overlap:
-        assert args.sequence_parallel == True, 'Tensor parallel communication/GEMM overlap can happen only when sequence parallelism is enabled'
+    group.add_argument('--ds-num-experts', type=int, nargs='+', default=[1,],
+                           help='number of experts list, MoE related.')
+    group.add_argument('--mlp-type', type=str, default='standard',
+                           help='Only applicable when num-experts > 1, accepts [standard, residual]')
+    group.add_argument('--topk', type=int, default=1,
+                           help='Sets the k in TopK gating for MoE layers')
+    group.add_argument('--expert-interval', type=int, default=1,
+                           help='Use experts in every "expert-interval" layers')
+    group.add_argument('--num-key-value-heads', type=int, default=None,
+                       help='Number of key_value heads that should be used to implement Grouped Query Attention.')
+    group.add_argument('--rotary-position-embeddings-theta', type=int, default=10000,
+                       help='Rotary positional embeddings theta value.',
+                       dest='rope_theta')
+    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
+                       help='Layer norm epsilon.')
+    group.add_argument('--disable-mem-efficient-ln', action='store_false', 
+                       help='Disable the memory-efficient fused LayerNorm optimization '
+                       'introduced in https://github.com/NVIDIA/apex/pull/1715', dest='mem_efficient_ln')
+    group.add_argument('--num-experts-switch', type=int, default=None,
+                       help='Number of Experts in Switch Transformer (None means no Switch)')
+    group.add_argument('--embedding-weights-in-fp32', action='store_true',
+                       help='Cast word embedding weights to fp32 before embedding fwd.'),
+    group.add_argument('--kill-switch-file', type=str, default=None,
+                       help='Location of kill switch file. ' 
+                            'If found will automatically exit the program at runtime.')
+    return parser
 
+def _add_logging_args(parser):
+    group = parser.add_argument_group(title='logging')
 
-    # Deprecated arguments
-    assert args.batch_size is None, '--batch-size argument is no longer ' \
-        'valid, use --micro-batch-size instead'
-    del args.batch_size
-    assert args.warmup is None, '--warmup argument is no longer valid, use ' \
-        '--lr-warmup-fraction instead'
-    del args.warmup
-    assert args.model_parallel_size is None, '--model-parallel-size is no ' \
-        'longer valid, use --tensor-model-parallel-size instead'
-    del args.model_parallel_size
+    group.add_argument('--log-optimizer-states-to-tensorboard',
+                       action='store_true',
+                       help='If set, write various optimizer states to '
+                       'tensorboard. This feature may consume extra GPU memory.')
+    return parser
 
-    # HACK: below is commented because DeepSpeed still relies on the old
-    # activation checkpointing mechanism.
-    # if args.checkpoint_activations:
-    #     if args.rank == 0:
-    #         print('--checkpoint-activations is no longer valid, use --recompute-activations, '
-    #               'or, for more control, --recompute-granularity and --recompute-method.')
-    #     exit()
-    # del args.checkpoint_activations
+def _add_regularization_args(parser):
+    group = parser.add_argument_group(title='regularization')
 
-    if args.recompute_activations:
-        args.recompute_granularity = 'selective'
-    del args.recompute_activations
+    group.add_argument('--actor-weight-decay', type=float, default=0.01,
+                       help='RLHF actor model weight decay coefficient for L2 regularization.')
+    group.add_argument('--critic-weight-decay', type=float, default=0.01,
+                       help='RLHF critic model weight decay coefficient for L2 regularization.')
+    return parser
 
-    # Set input defaults.
-    for key in defaults:
-        # For default to be valid, it should not be provided in the
-        # arguments that are passed to the program. We check this by
-        # ensuring the arg is set to None.
-        if getattr(args, key, None) is not None:
-            if args.rank == 0:
-                print('WARNING: overriding default arguments for {key}:{v} \
-                       with {key}:{v2}'.format(key=key, v=defaults[key],
-                                               v2=getattr(args, key)),
-                                               flush=True)
-        else:
-            setattr(args, key, defaults[key])
+def _add_training_args(parser):
+    group = parser.add_argument_group(title='training')
 
-    # Batch size.
-    assert args.micro_batch_size is not None
-    assert args.micro_batch_size > 0
-    if args.global_batch_size is None:
-        args.global_batch_size = args.micro_batch_size * args.data_parallel_size
-        if args.rank == 0:
-            print('setting global batch size to {}'.format(
-                args.global_batch_size), flush=True)
-    assert args.global_batch_size > 0
-    if args.num_layers_per_virtual_pipeline_stage is not None:
-        assert args.pipeline_model_parallel_size > 2, \
-            'pipeline-model-parallel size should be greater than 2 with ' \
-            'interleaved schedule'
-        assert args.num_layers % args.transformer_pipeline_model_parallel_size == 0, \
-            'number of layers should be divisible by the pipeline parallel size'
-        num_layers_per_pipeline_stage = args.num_layers // args.transformer_pipeline_model_parallel_size
-        assert num_layers_per_pipeline_stage % args.num_layers_per_virtual_pipeline_stage == 0, \
-            'number of layers per pipeline stage must be divisible number of layers per virtual pipeline stage'
-        args.virtual_pipeline_model_parallel_size = num_layers_per_pipeline_stage // \
-            args.num_layers_per_virtual_pipeline_stage
-    else:
-        args.virtual_pipeline_model_parallel_size = None
-        # Overlap P2P communication is disabled if not using the interleaved schedule.
-        args.overlap_p2p_comm = False
-        if args.rank == 0:
-            print('WARNING: Setting args.overlap_p2p_comm to False since non-interleaved '
-                  'schedule does not support overlapping p2p communication')
-    ## RLHF Batch size check
-    if args.RLHF:
-        assert args.global_batch_size == args.micro_batch_size * args.data_parallel_size, \
-            f"error with batch size setting. GBS should equal to MBS * DP"
+    group.add_argument('--rlhf-train-mbs', type=int, default=None,
+                       help='Micro batch size in RLHF train time')
+    group.add_argument('--custom-recompute-layers-per-stage', nargs='*', type=int, default=None,
+                       help='custom recompute num layers in each PP stage, it should be equal to PP size ')
+    group.add_argument('--enable-zbh1-pipeline', action='store_true',
+                       help='Activate zero bubble pipeline parallelism schedule method')
+    group.add_argument('--enable-zbh1-exact-semantics', action='store_true',
+                       help='Use an exact semantics for zbh1 schedule, might be slower than the default.')
 
-    if args.overlap_param_gather:
-        assert args.use_distributed_optimizer, \
-            '--overlap-param-gather only supported with distributed optimizer'
 
-    # Parameters dtype.
-    args.params_dtype = torch.float
-    if args.fp16:
-        assert not args.bf16
-        args.params_dtype = torch.half
-    if args.bf16:
-        assert not args.fp16
-        args.params_dtype = torch.bfloat16
-        # bfloat16 requires gradient accumulation and all-reduce to
-        # be done in fp32.
-        if not args.accumulate_allreduce_grads_in_fp32:
-            args.accumulate_allreduce_grads_in_fp32 = True
-            if args.rank == 0:
-                print('accumulate and all-reduce gradients in fp32 for '
-                      'bfloat16 data type.', flush=True)
+    # deprecated
+    # HACK: added back arguments because DeepSpeed still relies on the old
+    # activation checkpointing mechanism.
+    group.add_argument('--distribute-checkpointed-activations',
+                       action='store_true',
+                       help='If set, distribute checkpointed activations '
+                       'across model parallel group.')
+    group.add_argument('--checkpoint-num-layers', type=int, default=1,
+                       help='chunk size (number of layers) for checkpointing.')
+    group.add_argument('--train-tokens', type=int, default=None,
+                       help='Total number of tokens to train over all '
+                       'training runs.')
+    group.add_argument('--random-ltd',
+                       action='store_true',
+                       help='enable random layer token drop')
+    group.add_argument('--disable-moe-token-dropping', action='store_false',
+                       help='Disable MoE expert token dropping.',
+                       dest='moe_token_dropping')
+    group.add_argument('--moe-train-capacity-factor', type=float, default=1.0,
+                       help='The capacity of the MoE expert at training time')
+    group.add_argument('--moe-eval-capacity-factor', type=float, default=1.0,
+                       help='The capacity of the MoE expert at eval time.')
+    group.add_argument('--moe-min-capacity', type=int, default=4,
+                       help='The minimum capacity per MoE expert regardless of the capacity_factor.')
+    group.add_argument('--moe-loss-coeff', type=float, default=0.1,
+                       help='Scaling coefficient for adding MoE loss to model loss')
+    group.add_argument('--create-moe-param-group', action='store_true',
+                       help='Create separate groups for MoE params.'
+                       'This is necessary for techniques like ZeRO.')
+    group.add_argument('--disable-moe-top2-2nd-expert-sampling', action='store_false',
+                       help='Disable MoE top2 sampling of the 2nd expert. Instead of sampling, use argmax.',
+                       dest='moe_top2_2nd_expert_sampling')
+    group.add_argument('--use-flash-attn-v1', dest='use_flash_attn_v1', action='store_true',
+                       help='use first version FlashAttention implementation of attention. '
+                       'https://arxiv.org/abs/2205.14135')
+    group.add_argument('--use-flash-attn-v2', action='store_true',
+                       help='use second version FlashAttention implementation of attention. '
+                       'https://arxiv.org/abs/2307.08691')
+    group.add_argument('--use-flash-attn-triton', action='store_true',
+                       help='use FlashAttention implementation of attention using Triton.')
+    group.add_argument('--use-flash-attn-builder', action='store_true',
+                       help='use FlashAttention op builder.')
+    group.add_argument('--ds-inference', action='store_true',
+                       help='DeepSpeed inference engine being used')
+    group.add_argument('--cpu-optimizer', action='store_true',
+                       help='Run optimizer on CPU')
+    group.add_argument('--cpu_torch_adam', action='store_true',
+                       help='Use Torch Adam as optimizer on CPU.')
+    group.add_argument('--ds_fused_adam', action='store_true',
+                       help='Use DeepSpeed FusedAdam as optimizer.')
+    group.add_argument('--no-pipeline-parallel', action='store_true',
+                       help='Disable Deepspeed pipeline parallelism')
+    group.add_argument('--use-tutel', action='store_true',
+                       help='Use Tutel optimization for MoE')
+    group.add_argument('--inference', action='store_true',
+                       help='Very basic inference mode: not allocating optim/lr - requires ZERO_STAGE=0')
+    group.add_argument('--ds-sequence-parallel-size', type=int, default=1,
+                       help='Enable DeepSpeed\'s sequence parallel. Cannot be combined with "--sequence-parallel", which enables Megatron-LM\'s sequence parallel.')
+    group.add_argument('--force-ds-sequence-parallel', action='store_true',
+                       help='use DeepSpeed sequence parallelism regardless of sequence parallel size.')
+    group.add_argument('--use-dataset-only', type=bool, required=False, default=False,
+                       help='If set to True, only use the megatron dataset for external trainer ')
+    group.add_argument('--RLHF', action="store_true",
+                       help='RLHF mode')
+    group.add_argument('--ppo-epoches', type=int, default=1,
+                       help='RLHF model train epoches')
+    return parser
 
-    if args.rank == 0:
-        print('using {} for parameters ...'.format(args.params_dtype),
-              flush=True)
+def _add_learning_rate_args(parser):
+    group = parser.add_argument_group(title='learning rate')
 
-    # If we do accumulation and all-reduces in fp32, we need to have local DDP
-    # and we should make sure use-contiguous-buffers-in-local-ddp is not off.
-    if args.accumulate_allreduce_grads_in_fp32:
-        assert args.DDP_impl == 'local'
-        assert args.use_contiguous_buffers_in_local_ddp
+    group.add_argument('--actor-learning-rate', type=float, default=None,
+                       help='Initial RLHF actor model learning rate. Depending on decay style '
+                       'and initial warmup, the learing rate at each '
+                       'iteration would be different.')
+    group.add_argument('--critic-learning-rate', type=float, default=None,
+                       help='Initial RLHF critic model learning rate. Depending on decay style '
+                       'and initial warmup, the learing rate at each '
+                       'iteration would be different.')
+    group.add_argument('--lr-decay-tokens', type=int, default=None,
+                       help='number of tokens to decay learning rate over,'
+                       ' If not None will override iter/sample-based decay')
+    group.add_argument('--lr-warmup-tokens', type=int, default=None,
+                       help='number of tokens to linearly warmup '
+                       'learning rate over.')
+    return parser
 
-    # If we use the distributed optimizer, we need to have local DDP
-    # and we should make sure use-contiguous-buffers-in-local-ddp is on.
-    if args.use_distributed_optimizer:
-        assert args.DDP_impl == 'local'
-        assert args.use_contiguous_buffers_in_local_ddp
+def _add_checkpointing_args(parser):
+    group = parser.add_argument_group(title='checkpointing')
 
-    # For torch DDP, we do not use contiguous buffer
-    # if args.DDP_impl == 'torch':
-    if args.DDP_impl != 'local':
-        args.use_contiguous_buffers_in_local_ddp = False
+    group.add_argument('--load-tag', type=str, default=None,
+                       help='Specific checkpoint tag to load. Ignores latest.')
+    parser.add_argument("--actor_model_name_or_path", type=str, default=None,
+                        help="Directory containing a actor_model checkpoint.")
+    parser.add_argument("--critic_model_name_or_path", type=str, default=None,
+                        help="Directory containing a critic_model checkpoint.")
+    group.add_argument('--no-load-lr-state', action='store_true',
+                       help='Do not load lr state when loading checkpoint.')
+    group.add_argument('--universal-checkpoint', action='store_true',
+                        help='Loading a universal format checkpoint.')
+    return parser
 
-    if args.dataloader_type is None:
-        args.dataloader_type = 'single'
+def _add_mixed_precision_args(parser):
+    group = parser.add_argument_group(title='mixed precision')
 
-    # Consumed tokens.
-    args.consumed_train_samples = 0
-    args.consumed_valid_samples = 0
-    args.consumed_train_tokens = 0
+    group.add_argument('--no-query-key-layer-scaling', action='store_false',
+                       help='Do not scale Q * K^T by 1 / layer-number.',
+                       dest='apply_query_key_layer_scaling')
+    return parser
 
-    # Support for variable sequence lengths across batches/microbatches.
-    # set it if the dataloader supports generation of variable sequence lengths
-    # across batches/microbatches. Due to additional communication overhead
-    # during pipeline parallelism, it should not be set if sequence length
-    # is constant during training.
-    args.variable_seq_lengths = False
+def _add_distributed_args(parser):
+    group = parser.add_argument_group(title='distributed')
 
-    # Iteration-based training.
-    if args.train_iters:
-        # If we use iteration-based training, make sure the
-        # sample-based options are off.
-        assert args.train_samples is None, \
-            'expected iteration-based training'
-        assert args.lr_decay_samples is None, \
-            'expected iteration-based learning rate decay'
-        assert args.lr_warmup_samples == 0, \
-            'expected iteration-based learning rate warmup'
-        assert args.rampup_batch_size is None, \
-            'expected no batch-size rampup for iteration-based training'
-        if args.lr_warmup_fraction is not None:
-            assert args.lr_warmup_iters == 0, \
-                'can only specify one of lr-warmup-fraction and lr-warmup-iters'
-
-    # Sample-based training.
-    if args.train_samples:
-        # If we use sample-based training, make sure the
-        # iteration-based options are off.
-        assert args.train_iters is None, \
-            'expected sample-based training'
-        assert args.lr_decay_iters is None, \
-            'expected sample-based learning rate decay'
-        assert args.lr_warmup_iters == 0, \
-            'expected sample-based learnig rate warmup'
-        if args.lr_warmup_fraction is not None:
-            assert args.lr_warmup_samples == 0, \
-                'can only specify one of lr-warmup-fraction ' \
-                'and lr-warmup-samples'
-
-    if args.num_layers is not None:
-        assert args.encoder_num_layers is None, \
-            'cannot have both num-layers and encoder-num-layers specified'
-        args.encoder_num_layers = args.num_layers
-    else:
-        if not args.use_dataset_only:
-            assert args.encoder_num_layers is not None, \
-                'either num-layers or encoder-num-layers should be specified'
-            args.num_layers = args.encoder_num_layers
-
-    # Check required arguments.
-    if not args.use_dataset_only:
-        required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
-                         'max_position_embeddings']
-        for req_arg in required_args:
-            _check_arg_is_not_none(args, req_arg)
-
-    # Checks.
-    if not args.use_dataset_only:
-        if args.ffn_hidden_size is None:
-            if args.swiglu:
-                # reduce the dimnesion for MLP since projections happens on
-                # two linear layers. this keeps the number of paramters in
-                # the same ballpark as the counterpart with 4*h size
-                # we keep it a multiple of 64, which means the actual tensor size
-                # will be a multiple of 64 / tp_size
-                args.ffn_hidden_size = int((4 * args.hidden_size * 2 / 3) / 64) * 64
-            else:
-                args.ffn_hidden_size = 4 * args.hidden_size
-
-        if args.kv_channels is None:
-            assert args.hidden_size % args.num_attention_heads == 0
-            args.kv_channels = args.hidden_size // args.num_attention_heads
-
-    if args.seq_length is not None:
-        assert args.encoder_seq_length is None
-        args.encoder_seq_length = args.seq_length
-    else:
-        assert args.encoder_seq_length is not None
-        args.seq_length = args.encoder_seq_length
-
-    if not args.use_dataset_only:
-        if args.seq_length is not None:
-            assert args.max_position_embeddings >= args.seq_length
-        if args.decoder_seq_length is not None:
-            assert args.max_position_embeddings >= args.decoder_seq_length
-    # When rotary position embeddings is used, set add_position_embedding
-    # to false to turn off absolute position embedding.
-    if args.use_rotary_position_embeddings:
-        args.add_position_embedding = False
-    if args.lr is not None:
-        assert args.min_lr <= args.lr
-    if args.save is not None:
-        assert args.save_interval is not None
-    # Mixed precision checks.
-    if args.fp16_lm_cross_entropy:
-        assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.'
-    if args.fp32_residual_connection:
-        assert args.fp16 or args.bf16, \
-            'residual connection in fp32 only supported when using fp16 or bf16.'
-
-    if not args.use_dataset_only:
-        if args.weight_decay_incr_style == 'constant':
-            assert args.start_weight_decay is None
-            assert args.end_weight_decay is None
-            args.start_weight_decay = args.weight_decay
-            args.end_weight_decay = args.weight_decay
-        else:
-            assert args.start_weight_decay is not None
-            assert args.end_weight_decay is not None
-
-    TORCH_MAJOR = int(torch.__version__.split('.')[0])
-    TORCH_MINOR = int(torch.__version__.split('.')[1])
-    # Persistent fused layer norm.
-    if TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 11):
-        args.no_persist_layer_norm = True
-        if args.rank == 0:
-            print('Persistent fused layer norm kernel is supported from '
-                  'pytorch v1.11 (nvidia pytorch container paired with v1.11). '
-                  'Defaulting to no_persist_layer_norm=True')
-
-    # Activation checkpointing.
-    if args.distribute_checkpointed_activations:
-        assert args.checkpoint_activations, \
-            'for distribute-checkpointed-activations to work you '\
-            'need to enable checkpoint-activations'
-
-    # Activation recomputing.
-    if args.distribute_saved_activations:
-        assert args.tensor_model_parallel_size > 1, 'can distribute ' \
-            'recomputed activations only across tensor model ' \
-            'parallel groups'
-        assert args.recompute_granularity == 'full', \
-            'distributed recompute activations is only '\
-            'application to full recompute granularity'
-        assert args.recompute_method is not None, \
-            'for distributed recompute activations to work you '\
-            'need to use a recompute method '
-        assert (TORCH_MAJOR, TORCH_MINOR) >= (1, 10), \
-            'distributed recompute activations are supported for pytorch ' \
-            'v1.10 and above (Nvidia Pytorch container >= 21.07). Current ' \
-            'pytorch version is v%s.%s.' % (TORCH_MAJOR, TORCH_MINOR)
-
-    # Tranformer-Engine/FP8 related checking
-    if args.fp8_e4m3 or args.fp8_hybrid:
-        assert args.transformer_impl == 'transformer_engine', \
-            'transformer-engine required for fp8 training and inference'
-
-    assert not (args.fp8_e4m3 and args.fp8_hybrid), \
-        'cannot train with both fp8 e4m3 and hybrid formatting'
-
-    if args.recompute_granularity == 'selective':
-        assert args.recompute_method is None, \
-            'recompute method is not yet supported for ' \
-            'selective recomputing granularity'
-
-    if args.custom_recompute_layers_per_stage:
-        if args.virtual_pipeline_model_parallel_size is not None:
-            assert len(args.custom_recompute_layers_per_stage) == args.pipeline_model_parallel_size * args.virtual_pipeline_model_parallel_size, \
-                f"custom recompute_num_layers_per_stage length ({len(args.custom_recompute_layers_per_stage)}) should equal to total virtual pp stage size ({args.pipeline_model_parallel_size * args.virtual_pipeline_model_parallel_size})"
-        else:
-            assert len(args.custom_recompute_layers_per_stage) == args.pipeline_model_parallel_size, \
-                f"custom recompute_num_layers_per_stage ({len(args.custom_recompute_layers_per_stage)}) length should equal to PP size ({args.pipeline_model_parallel_size})"
-        
-        ## 若是deepseed使用自定义重计算pp stage则不考虑如下
-        if not args.deepspeed:
-            assert args.recompute_granularity == 'full', \
-            'custom recompute layers pp stage is only '\
-            'application to full recompute granularity'
-        
-            if args.virtual_pipeline_model_parallel_size is None:
-                num_layers_per_stage = args.num_layers // args.pipeline_model_parallel_size
-            else:
-                num_layers_per_stage = args.num_layers_per_virtual_pipeline_stage
-            if args.custom_partition is None:
-                assert max(args.custom_recompute_layers_per_stage) <= num_layers_per_stage, \
-                "recompute layers per PP stage should small than num layers per stage." \
-                f"get max recompute layers: {max(args.custom_recompute_layers_per_stage)}" \
-                f"average num layers per stage: {num_layers_per_stage}"
-            else:
-                for i in range(args.pipeline_model_parallel_size):
-                    assert args.custom_recompute_layers_per_stage[i] <= args.custom_partition[i], \
-                    "recompute layers per PP stage should small the num layers of PP stage" \
-                    f"stage ({i}): recompute layers ({args.custom_recompute_layers_per_stage[i]})  >  stage layers ({args.custom_partition[i]})"
-    
-    # disable sequence parallelism when tp=1
-    # to avoid change in numerics when
-    # sequence_parallelism is enabled.
-    if args.tensor_model_parallel_size == 1:
-        args.sequence_parallel = False
-
-    # disable async_tensor_model_parallel_allreduce when
-    # model parallel memory optimization is enabled
-    if args.sequence_parallel:
-        args.async_tensor_model_parallel_allreduce = False
-
-    # TODO: currently DeepSpeed seems to be incompatible with
-    # async_tensor_model_parallel_allreduce thus temporarily disabling it.
-    # Need further investigation.
-    if args.deepspeed:
-        args.async_tensor_model_parallel_allreduce = False
-
-    if not args.use_dataset_only:
-        if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
-            if args.sequence_parallel:
-                raise RuntimeError(
-                    "Using sequence parallelism requires setting the environment variable "
-                    "CUDA_DEVICE_MAX_CONNECTIONS to 1")
-            if args.async_tensor_model_parallel_allreduce:
-                raise RuntimeError(
-                    "Using async gradient all reduce requires setting the environment "
-                    "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
-
-    # Disable bias gelu fusion if we are disabling bias altogether
-    if not args.add_bias_linear:
-        args.bias_gelu_fusion = False
-
-    # Retro checks.
-    if args.retro_add_retriever:
-
-        # Sequence parallelism unsupported.
-        assert not args.sequence_parallel, \
-            "retro currently does not support sequence parallelism."
-
-        # Pipeline parallelism unsupported.
-        assert args.pipeline_model_parallel_size == 1, \
-            "retro currently does not support pipeline parallelism."
-
-    # Load retro args.
-    if args.retro_workdir:
-        retro_args_path = get_retro_args_path(args.retro_workdir)
-        assert os.path.exists(retro_args_path), "retro workdir missing args.json"
-        with open(retro_args_path) as f:
-            retro_args = types.SimpleNamespace(**json.load(f))
-            retro_args.retro_return_doc_ids = args.retro_return_doc_ids
-            retro_args.retro_gpt_retrieved_length = \
-                args.retro_num_retrieved_chunks * \
-                retro_args.retro_gpt_chunk_length
-            set_retro_args(retro_args)
+    group.add_argument('--enable-expert-tensor-parallelism', action='store_true',
+                        default=False,
+                        help="use tensor parallelism for expert layers in MoE")
+    group.add_argument('--partition-method',
+                       type=str, default='type:transformer',
+                       help='use deepspeed to patition layers. method include: uniform, parameters, type:transformer, custom')
+    group.add_argument('--custom-partition', nargs='*',
+                       type=int, default=None,
+                       help='customized model layers to PP stages, parameter of partition-method should set < custom > to take this effect. \
+                       example: divide 32 layers to 6 PP stages: 5 5 5 6 6 5. it means there are 5/5/5/6/6/5 layers in 6 pp stages')
+    group.add_argument('--moe-expert-parallel-size', type=int, default=1,
+                       help='Degree of the MoE expert parallelism.')
+    group.add_argument('--DDP-impl', default='local',
+                       choices=['local', 'torch', 'FSDP'],
+                       help='which DistributedDataParallel implementation '
+                       'to use.')
+    group.add_argument('--no-contiguous-buffers-in-local-ddp',
+                       action='store_false', help='If set, dont use '
+                       'contiguous buffer in local DDP.',
+                       dest='use_contiguous_buffers_in_local_ddp')
+    group.add_argument('--pp-delay', action='store_true', 
+                       default=False, help='')
+    group.add_argument('--pp-split-size', type=int, default=1,
+                    help='')
+    return parser
     
-    ## meg-ds start
-    args.curriculum_learning_legacy = False
-    args.compression_training = False
-
-    # FlashAttention
-    args.use_flash_attn = args.use_flash_attn_v1 or args.use_flash_attn_triton or args.use_flash_attn_v2
-
-    # AML
-    if args.aml_data_download_path is not None:
-        data_paths = []
-        for path in args.data_path:
-            data_paths.append(f"{args.aml_data_download_path}/{path}")
-        args.data_path = data_paths
-
-    # GQA
-    if not args.use_dataset_only:
-        if args.num_key_value_heads is None:
-            args.num_key_value_heads = args.num_attention_heads
-        assert args.num_attention_heads % args.num_key_value_heads == 0, \
-            f"num_attention_heads must be divisible by num_key_value_heads (got `num_attention_heads`: {args.num_attention_heads} " \
-            f"and `num_key_value_heads`: {args.num_key_value_heads})."
-        if args.num_key_value_heads != args.num_attention_heads:
-            # if GQA
-            assert not args.mos, 'GQA currently does not support args.mos'
-            assert not args.kd, 'GQA currently does not support args.kd'
-    ## meg-ds end
-
-    # Legacy RoPE arguments
-    if args.use_rotary_position_embeddings:
-        args.position_embedding_type = 'rope'
-
-    # Would just need to add 'NoPE' as a position_embedding_type to support this, but for now
-    # don't allow it to keep things simple
-    if not args.add_position_embedding and args.position_embedding_type != 'rope':
-        raise RuntimeError('--no-position-embedding is deprecated, use --position-embedding-type')
-
-    # MoE Spec check
-    if args.num_experts is not None:
-        assert args.spec is None, "Model Spec must be None when using MoEs"
-
-    # Expert parallelism check
-    if args.expert_model_parallel_size  > 1:
-        assert args.num_experts is not None, "num_experts must be non None to use expert model parallelism"
-        assert args.num_experts % args.expert_model_parallel_size == 0, \
-            "Number of experts should be a multiple of expert model parallel_size."
-        assert not args.use_distributed_optimizer, \
-            "Expert parallelism is not suppored with distributed optimizer."
-        assert not args.fp16, \
-            "Expert parallelism is not supported with fp16 training."
-        if args.tensor_model_parallel_size > 1:
-            assert args.sequence_parallel, \
-                "When using expert parallelism and tensor parallelism, sequence parallelism must be used."
+def _add_data_args(parser):
+    group = parser.add_argument_group(title='data and dataloader')
 
-    # Print arguments.
-    _print_args("arguments", args)
-    retro_args = get_retro_args()
-    if retro_args and args != retro_args:
-        _print_args("retro arguments", types.SimpleNamespace(**{k:v for k,v in vars(retro_args).items() if k.startswith("retro")}, rank=args.rank))
+    group.add_argument('--aml-data-download-path', type=str, default=None,
+                       help='Path to mounted input dataset')
+    group.add_argument('--special-tokens-file', type=str, default=None,
+                       help='Path to the BPE special tokens file.')
+    parser.add_argument("--max-prompt-seq-len", type=int, default=256,
+                        help="The maximum prompt length during RLHF Training.")
+    group.add_argument('--mmap-warmup', action='store_true',
+                       help='Warm up mmap files.')
+    group.add_argument('--tokenizer-type', type=str,
+                       default=None,
+                       choices=['BertWordPieceLowerCase',
+                                'BertWordPieceCase',
+                                'GPT2BPETokenizer',
+                                'SentencePieceTokenizer',
+                                'GPTSentencePieceTokenizer',
+                                'HFTokenizer',
+                                'NullTokenizer',
+                                'AquilaTokenizer',
+                                'Llama2Tokenizer',
+                                'Llama3Tokenizer'],
+                       help='What type of tokenizer to use.')
+    group.add_argument('--trust-remote-code', action='store_true', default=False,
+                       help='To run HFTokenizer model from local path.')
+    group.add_argument('--data-impl', type=str, default='infer',
+                       choices=['mmap', 'infer'],
+                       help='Implementation of indexed datasets.')
+    group.add_argument('--train-data-exact-num-epochs', type=int, default=None,
+                       help='When building the train dataset, force it to be '
+                       'an exact number of epochs of the raw data')
+    group.add_argument('--return-data-index', action='store_true',
+                       help='Return the index of data sample.')
+    group.add_argument('--data-efficiency-curriculum-learning', action='store_true',
+                       help='Use DeepSpeed data efficiency library curriculum learning feature.')
+    group.add_argument('--train-idx-path', type=str, default=None,
+                       help='Force to use certain index file.')
+    group.add_argument('--train-desc-path', type=str, default=None,
+                       help='Force to use certain index file.')
+    group.add_argument('--train-doc-idx-path', type=str, default=None,
+                       help='Force to use certain index file.')
+    group.add_argument('--train-sample-idx-path', type=str, default=None,
+                       help='Force to use certain index file.')
+    group.add_argument('--train-shuffle-idx-path', type=str, default=None,
+                       help='Force to use certain index file.')
+    group.add_argument('--repeated-dataloader', action='store_true',
+                       help='Once all the data has been loaded, reuse the DataLoader.')
+    return parser
     
-    if args.pp_delay:
-        if not args.overlap_p2p_comm:
-            args.pp_delay = False
-
-    return args
-
-
-def _print_args(title, args):
-    """Print arguments."""
-    if args.rank == 0:
-        print(f'------------------------ {title} ------------------------',
-              flush=True)
-        str_list = []
-        for arg in vars(args):
-            dots = '.' * (48 - len(arg))
-            str_list.append('  {} {} {}'.format(arg, dots, getattr(args, arg)))
-        for arg in sorted(str_list, key=lambda x: x.lower()):
-            print(arg, flush=True)
-        print(f'-------------------- end of {title} ---------------------',
-              flush=True)
-
-
-def _check_arg_is_not_none(args, arg):
-    assert getattr(args, arg) is not None, '{} argument is None'.format(arg)
-
-def core_transformer_config_from_args(args):
-
-    # Translate args to core transformer configuration
-    kw_args = {}
-    for f in dataclasses.fields(TransformerConfig):
-        if hasattr(args, f.name):
-            kw_args[f.name] = getattr(args, f.name)
-    kw_args['persist_layer_norm'] = not args.no_persist_layer_norm
-    kw_args['layernorm_zero_centered_gamma'] = args.apply_layernorm_1p
-    kw_args['layernorm_epsilon'] = args.norm_epsilon
-    kw_args['deallocate_pipeline_outputs'] = True
-    kw_args['pipeline_dtype'] = args.params_dtype
-    kw_args['batch_p2p_comm'] = not args.overlap_p2p_comm
-    kw_args['num_moe_experts'] = args.num_experts
-    if args.swiglu:
-        kw_args['activation_func'] = F.silu
-        kw_args['gated_linear_unit'] = True
-        kw_args['bias_gelu_fusion'] = False
-    if args.squared_relu:
-        assert not args.swiglu
-        def squared_relu(x):
-            return torch.pow(F.relu(x), 2)
-        kw_args['activation_func'] = squared_relu
-    if args.init_method_xavier_uniform:
-        kw_args['init_method'] = torch.nn.init.xavier_uniform_
-        kw_args['scaled_init_method'] = torch.nn.init.xavier_uniform_
-    if args.group_query_attention:
-        kw_args['num_query_groups'] = args.num_query_groups
-    else:
-        kw_args['num_query_groups'] = None
 
-    # If using Retro, return Retro config.
-    # retro_args = get_retro_args()
-    # if retro_args:
-    #     kw_args['retro_preprocess'] = retro_args
-    #     return RetroConfig(**kw_args)
-
-    # Return Transformer config.
-    return TransformerConfig(**kw_args)
+def _add_zero_args(parser):
+    """Text generate arguments."""
 
+    group = parser.add_argument_group('ZeRO configurations', 'configurations')
+    group.add_argument("--zero-stage", type=int, default=1.0)
+    group.add_argument('--zero-reduce-scatter', action='store_true',
+                       help='Use reduce scatter if specified')
+    group.add_argument('--zero-contigious-gradients', action='store_true',
+                       help='Use contigious memory optimizaiton if specified')
+    group.add_argument("--zero-reduce-bucket-size", type=int, default=0.0)
+    group.add_argument("--zero-allgather-bucket-size", type=int, default=0.0)
+    group.add_argument('--remote-device', type=str, default='none', choices=['none', 'cpu', 'nvme'],
+                      help='Remote device for ZeRO-3 initialized parameters.')
+    group.add_argument('--use-pin-memory', action='store_true',
+                     help='Use pinned CPU memory for ZeRO-3 initialized model parameters.')
+    return parser
 
-def _add_transformer_engine_args(parser):
-    group = parser.add_argument_group(title='Transformer-Engine')
+def _add_memoryopt_args(parser):
+    """Memory optimization arguments."""
 
-    group.add_argument('--fp8-e4m3', action='store_true',
-                        help='E4M3 TransformerLayer', dest='fp8_e4m3')
-    group.add_argument('--fp8-hybrid', action='store_true',
-                        help='Hybrid FP8 TransformerLayer', dest='fp8_hybrid')
-    group.add_argument('--fp8-format', default=None,
-                       choices=['e4m3', 'hybrid'],
-                       help='Which fp8 format scheme to use for FP8 tensors in the forward and backward pass',
-                       dest='fp8')
-    group.add_argument('--fp8-margin', type=int, default=0,
-                       help='Scaling margin for fp8',
-                       dest='fp8_margin')
-    group.add_argument('--fp8-interval', type=int, default=1,
-                       help='Scaling update interval for fp8',
-                       dest='fp8_interval')
-    group.add_argument('--fp8-amax-history-len', type=int, default=1,
-                       help='Number of steps for which amax history is recorded per tensor',
-                       dest='fp8_amax_history_len')
-    group.add_argument('--fp8-amax-compute-algo', default='most_recent',
-                       choices=['most_recent', 'max'],
-                       help='Algorithm for computing amax from history',
-                       dest='fp8_amax_compute_algo')
-    group.add_argument('--no-fp8-wgrad', action='store_false',
-                       help='Execute wgrad in higher precision even for FP8 runs',
-                       dest='fp8_wgrad')
-    group.add_argument('--transformer-impl', default='local',
-                       choices=['local', 'transformer_engine'],
-                       help='Which Transformer implementation to use.')
+    group = parser.add_argument_group('Memory optimizations', 'configurations')
+    group.add_argument("--scattered-embeddings", action='store_true',
+                       help='Save memory by scattering embedding activations. '
+                            'Introduces dropout differences across MP configurations.')
+    group.add_argument("--split-transformers", action='store_true',
+                       help='Save memory by splitting transformer layers into two parts, '
+                       'allowing for more frequent activation checkpoint savings.')
+    group.add_argument("--memory-centric-tiled-linear", action="store_true",
+                       help='Save memory by tiling with deepspeed.zero.TiledLinear.')
+    group.add_argument("--tile-factor", type=int, default=1,
+                       help='Make all linear layers the same size of [hidden/tile_factor, hidden/tile_factor]. '
+                            'Must be enabled with --memory-centric-tiled-linear. '
+                            'Example A: if tile_factor=1, the qkv layer [hidden, 3* hidden] would be converted into [1,3] tiles of size [hidden,hidden]. '
+                            'Example B: if tile_factor=2, the intermediate layer [4*hidden, hidden] will be converted into [8, 2] tiles of size [hidden/2, hidden/2]. '
+                            'Default is 1.')
 
     return parser
 
-def _add_inference_args(parser):
-    group = parser.add_argument_group(title='inference')
-
-    group.add_argument('--inference-batch-times-seqlen-threshold',
-                       type=int, default=512,
-                       help='During inference, if batch-size times '
-                       'sequence-length is smaller than this threshold '
-                       'then we will not use pipelining, otherwise we will.')
-    group.add_argument('--max-tokens-to-oom',
-                       type=int, default=12000,
-                       help='Maximum number of tokens during inference'
-                       'tokens here is # in prompt + # to generate'
-                       'Allows us to throw an error before OOM crashes server')
-    group.add_argument('--output-bert-embeddings', action='store_true',
-                       help='Output Bert embeddings (via mean pooling) from '
-                       'model, rather than its binary head output or entire '
-                       'hidden batch.')
-    group.add_argument('--bert-embedder-type', default="megatron",
-                       choices=["megatron", "huggingface"],
-                       help='Select either Megatron or Huggingface as the '
-                       'Bert embedder.')
-
+def _add_activation_checkpoint_args(parser):
+    group = parser.add_argument_group('Activation Checkpointing',
+                                      'Checkpointing Configurations')
+    group.add_argument('--deepspeed-activation-checkpointing', action='store_true',
+                       help='uses activation checkpointing from deepspeed')
+    group.add_argument('--partition-activations', action='store_true',
+                       help='partition Activations across GPUs before checkpointing.')
+    group.add_argument('--contigious-checkpointing', action='store_true',
+                       help='Contigious memory checkpointing for activatoins.')
+    group.add_argument('--checkpoint-in-cpu', action='store_true',
+                       help='Move the activation checkpoints to CPU.')
+    group.add_argument('--synchronize-each-layer', action='store_true',
+                       help='does a synchronize at the beginning and end of each checkpointed layer.')
+    group.add_argument('--profile-backward', action='store_true',
+                       help='Enables backward pass profiling for checkpointed layers.')
     return parser
 
-
 def _add_retro_args(parser):
     group = parser.add_argument_group(title='retro')
-
     group.add_argument('--retro-workdir', default=None,
                        help='Retro working directory, which contains the '
                        'preprocessed data for for pretraining. This directory '
                        'is built during preprocessing (see '
                        'tools/retro/README.md), and contains subdirectories '
                        'for the chunk database and pretraining neighbors.')
-    group.add_argument('--retro-add-retriever',
-                       action='store_true', default=False,
-                       help='Add a retriever to the transformer, for use in '
-                       'pretraining a Retro model.')
-    group.add_argument('--retro-cyclic-train-iters', type=int, default=None,
-                       help='Set number of training iterations for cyclic '
-                       'Retro training.')
-    group.add_argument('--retro-encoder-layers', type=int, default=2,
-                       help='Number of layers to use for the retrieval '
-                       'encoder.')
-    group.add_argument('--retro-encoder-hidden-dropout',
-                       type=float, default=0.1, help='Hidden dropout for '
-                       'retrieval encoder.')
-    group.add_argument('--retro-encoder-attention-dropout',
-                       type=float, default=0.1, help='Attention dropout for '
-                       'retrieval encoder.')
-    group.add_argument("--retro-num-neighbors", type=int, default=2,
-                       help='Number of neighbors to retrieve during '
-                       'pretraining.')
-    group.add_argument("--retro-num-retrieved-chunks", type=int, default=2,
-                       help='Number of chunks to retrieve from the retrieval '
-                       'database.')
     group.add_argument("--retro-return-doc-ids", action="store_true",
                        help="Turn this on when preprocessing retro data.")
-    group.add_argument("--retro-no-verify-neighbor-count", action="store_false",
-                       dest="retro_verify_neighbor_count",
-                       help="Skip verifying that len(GPT dataset) == len(saved "
-                       "neighbors).")
-
+    
     # Enforce argument naming convention.
     for action in group._group_actions:
         prefix = action.dest.split("_")[0]
@@ -702,1086 +393,842 @@ def _add_retro_args(parser):
 
     return parser
 
+def validate_args(args, defaults={}):
 
-def _add_network_size_args(parser):
-    group = parser.add_argument_group(title='network size')
-
-    group.add_argument('--num-layers', type=int, default=None,
-                       help='Number of transformer layers.')
-    group.add_argument('--encoder-num-layers', type=int, default=None,
-                       help='Number of encoder transformer layers.')
-    group.add_argument('--decoder-num-layers', type=int, default=None,
-                       help='Number of decoder transformer layers.')
-    group.add_argument('--num-experts', type=int, nargs='+', default=[1,],
-                           help='number of experts list, MoE related.')
-    group.add_argument('--mlp-type', type=str, default='standard',
-                           help='Only applicable when num-experts > 1, accepts [standard, residual]')
-    group.add_argument('--topk', type=int, default=1,
-                           help='Sets the k in TopK gating for MoE layers')
-    group.add_argument('--expert-interval', type=int, default=1,
-                           help='Use experts in every "expert-interval" layers')
-    group.add_argument('--hidden-size', type=int, default=None,
-                       help='Tansformer hidden size.')
-    group.add_argument('--ffn-hidden-size', type=int, default=None,
-                       help='Transformer Feed-Forward Network hidden size. '
-                       'This is set to 4*hidden-size if not provided')
-    group.add_argument('--num-attention-heads', type=int, default=None,
-                       help='Number of transformer attention heads.')
-    group.add_argument('--num-key-value-heads', type=int, default=None,
-                       help='Number of key_value heads that should be used to implement Grouped Query Attention.')
-    group.add_argument('--kv-channels', type=int, default=None,
-                       help='Projection weights dimension in multi-head '
-                       'attention. This is set to '
-                       '   args.hidden_size // args.num_attention_heads '
-                       'if not provided.')
-    group.add_argument('--group-query-attention', action='store_true',
-                          help='Use group-query attention.')
-    group.add_argument('--num-query-groups', type=int, default=1)
-
-    group.add_argument('--max-position-embeddings', type=int, default=None,
-                       help='Maximum number of position embeddings to use. '
-                       'This is the size of position embedding.')
-    group.add_argument('--position-embedding-type', type=str, default='learned_absolute',
-                       choices=['learned_absolute', 'rope'],
-                       help='Position embedding type.')
-    group.add_argument('--use-rotary-position-embeddings', action='store_true',
-                       help='Use rotary positional embeddings or not. '
-                       'Deprecated: use --position-embedding-type')
-    group.add_argument('--rotary-position-embeddings-theta', type=int, default=10000,
-                       help='Rotary positional embeddings theta value.',
-                       dest='rope_theta')
-    group.add_argument('--rotary-percent', type=float, default=1.0,
-                       help='Percent of rotary dimension to use, default 100%%')
-    group.add_argument('--rotary-seq-len-interpolation-factor', type=int, default=None,
-                       help='Sequence length interpolation factor for rotary embeddings.')
-    group.add_argument('--no-position-embedding',
-                       action='store_false',
-                       help='Disable position embedding. Deprecated: use --position-embedding-type',
-                       dest='add_position_embedding')
-    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
-                       help='Pad the vocab size to be divisible by this value.'
-                       'This is added for computational efficieny reasons.')
-    group.add_argument('--normalization', default='LayerNorm',
-                       choices=['LayerNorm', 'RMSNorm'],
-                       help='Which normalization technique to use.')
-    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
-                       help='Layer norm epsilon.')
-    group.add_argument('--norm-epsilon', type=float, default=1e-5,
-                       help='Epsilon for layer norm and RMS norm.')
-    group.add_argument('--apply-layernorm-1p', action='store_true',
-                       help='Adjust LayerNorm weights such that they are centered '
-                       'around zero. This improves numerical stability.')
-    group.add_argument('--disable-mem-efficient-ln', action='store_false', 
-                       help='Disable the memory-efficient fused LayerNorm optimization '
-                       'introduced in https://github.com/NVIDIA/apex/pull/1715', dest='mem_efficient_ln')
-    group.add_argument('--apply-residual-connection-post-layernorm',
-                       action='store_true',
-                       help='If set, use original BERT residula connection '
-                       'ordering.')
-    group.add_argument('--openai-gelu', action='store_true',
-                       help='Use OpenAIs GeLU implementation. This option'
-                       'should not be used unless for backward compatibility'
-                       'reasons.')
-    group.add_argument('--squared-relu', action='store_true',
-                       help='Use squared relu activation instead of default gelu')
-    group.add_argument('--swiglu', action='store_true',
-                       help='Use gated linear units and SiLU activation instead of default gelu')
-    group.add_argument('--onnx-safe', type=bool, required=False,
-                       help='Use workarounds for known problems with '
-                       'Torch ONNX exporter')
-    group.add_argument('--bert-no-binary-head', action='store_false',
-                       help='Disable BERT binary head.',
-                       dest='bert_binary_head')
-    group.add_argument('--num-experts-switch', type=int, default=None,
-                       help='Number of Experts in Switch Transformer (None means no Switch)')
-    group.add_argument('--untie-embeddings-and-output-weights', action='store_true',
-                       help='Untie embeddings and output weights.'),
-    group.add_argument('--embedding-weights-in-fp32', action='store_true',
-                       help='Cast word embedding weights to fp32 before embedding fwd.'),
-    return parser
-
+    # Load saved args from Retro (if applicable).
+    load_retro_args(args)
 
-def _add_logging_args(parser):
-    group = parser.add_argument_group(title='logging')
+    if args.parallel_group_num != None:
+        assert args.parallel_group != None, \
+            'parallel-group should not be None, when parallel_group_num set!'
+        parallel_group_TP = args.parallel_group[::3]
+        parallel_group_DP = args.parallel_group[1::3]
+        parallel_group_PP = args.parallel_group[2::3]
+        
+        assert args.parallel_group_num == 2, \
+            'only support 2 parallel_group now!'
+            
+        assert args.untie_embeddings_and_output_weights, \
+            'not support shared embeddings and output weights'
+        
+        assert args.parallel_group_num == len(parallel_group_TP), \
+            'parallel-group-num should match parallel-group!'
+        assert args.world_size == sum(tp * dp * pp for tp, dp, pp in 
+                                      zip(parallel_group_TP, parallel_group_DP, parallel_group_PP)), \
+            'total world size should match sum of all tp x dp x pp!'
+        
+        #Pipeline model paralle size.
+        assert args.pipeline_model_parallel_size == sum(parallel_group_PP), \
+            'pipeline_model_parallel_size should match sum of paralle_group_PP!'
+        assert args.standalone_embedding_stage == False, \
+            'standalone not supported with parallel_group_num set!'
+        args.transformer_pipeline_model_parallel_size = args.pipeline_model_parallel_size
+        assert args.pipeline_model_parallel_split_rank == None, \
+            'pipeline_model_parallel_split_rank not supported with parallel_group_num set!'
+        
+        #Data parallel size.
+        assert all(x == parallel_group_DP[0] for x in parallel_group_DP), \
+            'all parallel group dp should be the same!'
+        args.data_parallel_size = parallel_group_DP[0]
+        
+        #Context parallel size.
+        assert args.context_parallel_size == 1, \
+            'cp!=1 not supported now!'
+            
+        #Virtual parallel size.
+        assert args.num_layers_per_virtual_pipeline_stage == None, \
+            'virtual pipeline not supported now!'
+            
+        #Expert parallel size.
+        assert args.expert_model_parallel_size == 1, \
+            'ep!=1 not supported now!'
+            
+        #Tensor model parallel size
+        num_device_of_each_pipeline_stage = []
+        tp_size_of_each_pipeline_stage = []
+        for i in range(len(parallel_group_PP)):
+            for j in range(parallel_group_PP[i]):
+                tp_size_of_each_pipeline_stage.append(parallel_group_TP[i])
+                num_device_of_each_pipeline_stage.append(parallel_group_TP[i] * args.data_parallel_size)
+            
+        # len = p + 1,  [0, sum(p0), sum(p0-p1), ..., sum(p0-pn-1)]
+        cumu_num_device_of_all_pipeline_stage = [sum(num_device_of_each_pipeline_stage[:i]) for i in range(args.pipeline_model_parallel_size + 1)]
+                
+        for i in range(args.pipeline_model_parallel_size):
+            if cumu_num_device_of_all_pipeline_stage[i] <= args.rank < cumu_num_device_of_all_pipeline_stage[i+1]:
+                args.tensor_model_parallel_size = tp_size_of_each_pipeline_stage[i]
+                    
+        args.parallel_group_TP = parallel_group_TP
+        args.parallel_group_DP = parallel_group_DP
+        args.parallel_group_PP = parallel_group_PP
+        args.cumu_num_device_of_all_pipeline_stage = cumu_num_device_of_all_pipeline_stage
+        args.tp_size_of_each_pipeline_stage = tp_size_of_each_pipeline_stage
+        
+        if args.rank == 0:
+            print('using world size: {}, data-parallel size: {}, '
+                'context-parallel size: {} '
+                'tensor-model-parallel size: {}, '
+                'pipeline-model-parallel size: {} '.format(
+                    args.world_size, args.data_parallel_size,
+                    args.context_parallel_size,
+                    args.tensor_model_parallel_size,
+                    args.pipeline_model_parallel_size), flush=True)
 
-    group.add_argument('--log-params-norm', action='store_true',
-                       help='If set, calculate and log parameters norm.')
-    group.add_argument('--log-num-zeros-in-grad', action='store_true',
-                       help='If set, calculate and log the number of zeros in gradient.')
-    group.add_argument('--log-throughput', action='store_true',
-                       help='If set, calculate and log throughput per GPU.')
-    group.add_argument('--timing-log-level', type=int,
-                       default=0, choices=range(0,3),
-                       help='Granularity level to measure and report timing. '
-                       '   0: report only iteration time and make sure timing '
-                       '      does not introduce extra overhead.'
-                       '   1: report timing for operations that are executed '
-                       '      very limited times (basically once) during '
-                       '      each iteration (such as gradient all-reduce) '
-                       '   2: report timing for operations that migh be '
-                       '      executed numerous times during each iteration. '
-                       'Note that setting the level to 1 or 2 might '
-                       'cause increase in iteration time.')
-    group.add_argument('--no-barrier-with-level-1-timing', action='store_false',
-                       help='If not set, use barrier with level 1 time '
-                       'measurements. Note that this is up to the user '
-                       'to make sure calling barrier with their timers '
-                       'will not result in hangs. This can happen if for '
-                       'example the user adds a level 1 timer that is not '
-                       'called by all ranks.',
-                       dest='barrier_with_L1_time')
-    group.add_argument('--timing-log-option', type=str, default='minmax',
-                       choices=['max', 'minmax', 'all'],
-                       help='Options for logging timing:'
-                       '  max: report the max timing across all ranks'
-                       '  minmax: report min and max timings across all ranks'
-                       '  all: report timings of all ranks.')
-    group.add_argument('--tensorboard-log-interval', type=int, default=1,
-                       help='Report to tensorboard interval.')
-    group.add_argument('--tensorboard-queue-size', type=int, default=1000,
-                       help='Size of the tensorboard queue for pending events '
-                       'and summaries before one of the ‘add’ calls forces a '
-                       'flush to disk.')
-    group.add_argument('--log-timers-to-tensorboard', action='store_true',
-                       help='If set, write timers to tensorboard.')
-    group.add_argument('--log-batch-size-to-tensorboard', action='store_true',
-                       help='If set, write batch-size to tensorboard.')
-    group.add_argument('--no-log-learnig-rate-to-tensorboard',
-                       action='store_false',
-                       help='Disable learning rate logging to tensorboard.',
-                       dest='log_learning_rate_to_tensorboard')
-    group.add_argument('--no-log-loss-scale-to-tensorboard',
-                       action='store_false',
-                       help='Disable loss-scale logging to tensorboard.',
-                       dest='log_loss_scale_to_tensorboard')
-    group.add_argument('--log-validation-ppl-to-tensorboard',
-                       action='store_true',
-                       help='If set, write validation perplexity to '
-                       'tensorboard.')
-    group.add_argument('--log-optimizer-states-to-tensorboard',
-                       action='store_true',
-                       help='If set, write various optimizer states to '
-                       'tensorboard. This feature may consume extra GPU memory.')
-    group.add_argument('--log-memory-to-tensorboard',
-                       action='store_true',
-                       help='Enable memory logging to tensorboard.')
-    group.add_argument('--log-world-size-to-tensorboard',
-                       action='store_true',
-                       help='Enable world size logging to tensorboard.')
-    group.add_argument('--wandb-project', type=str, default='',
-                       help='The wandb project name. Ignore wandb by default.')
-    group.add_argument('--wandb-exp-name', type=str, default='',
-                       help='The wandb experiment name.')
-    group.add_argument('--wandb-save-dir', type=str, default='',
-                       help='Path to save the wandb results locally.')
-    return parser
+    else:
+        # Tensor model parallel size.
+        args.tensor_model_parallel_size = min(
+            args.tensor_model_parallel_size, args.world_size)
+        assert args.world_size % args.tensor_model_parallel_size == 0, 'world size'\
+            ' ({}) is not divisible by tensor model parallel size ({})'.format(
+                args.world_size, args.tensor_model_parallel_size)
+
+        # Zero bubble pipeline is defined on deepspeed's scheduler
+        if args.enable_zbh1_pipeline:
+            assert args.deepspeed, 'Use DeepSpeed to use zero-bubble H1 pipeline'
+            assert args.sequence_parallel == False, "Sequence Parallel not tested, proceed at own will by removing this line"
+        if args.enable_zbh1_exact_semantics:
+            assert args.enable_zbh1_pipeline, 'Exact semantics require ZBH1 pipeline enabled'
+        # Pipeline model parallel size.
+        args.pipeline_model_parallel_size = min(
+            args.pipeline_model_parallel_size,
+            (args.world_size // args.tensor_model_parallel_size))
+        args.transformer_pipeline_model_parallel_size = (
+            args.pipeline_model_parallel_size - 1
+            if args.standalone_embedding_stage else
+            args.pipeline_model_parallel_size
+        )
+
+        # Checks.
+        # if args.no_pipeline_parallel:
+        #     assert args.pipeline_model_parallel_size == 1, \
+        #         "pipeline_model_parallel_size must be 1 if pipeline parallel is disabled"
+            
+        if args.ds_sequence_parallel_size > 1:
+            assert args.deepspeed, "deepspeed must be enable when ds_sequence_parallel_size > 1"
+            assert args.context_parallel_size <= 1, "Megatron-lm CP is not compatible with Deppspeed SP"
+            assert version.parse(deepspeed.__version__) >= version.parse("0.10.2"), "sequence parallelism requires DeepSpeed version 0.10.2+"
+
+        if args.deepspeed:
+            model_parallel_size = args.pipeline_model_parallel_size * \
+                                args.tensor_model_parallel_size * \
+                                args.ds_sequence_parallel_size
+            assert args.world_size % model_parallel_size == 0, 'world size ({}) is not'\
+                ' divisible by tensor parallel size ({}) times pipeline parallel ' \
+                'size ({}) times seqence parallel size ({})'.format(args.world_size, args.tensor_model_parallel_size,
+                                args.pipeline_model_parallel_size, args.ds_sequence_parallel_size)
+            args.data_parallel_size = args.world_size // model_parallel_size
+            if args.rank == 0:
+                print('using world size: {}, data-parallel-size: {}, '
+                    'sequence-parallel size: {}, '
+                    'tensor-model-parallel size: {}, '
+                    'pipeline-model-parallel size: {} '.format(
+                        args.world_size, args.data_parallel_size,
+                        args.ds_sequence_parallel_size,
+                        args.tensor_model_parallel_size,
+                        args.pipeline_model_parallel_size), flush=True)
+        else:
+            model_parallel_size = args.pipeline_model_parallel_size * \
+                                args.tensor_model_parallel_size
+            assert args.world_size % (model_parallel_size * args.context_parallel_size) == 0, \
+                'world size ({}) is not divisible by tensor parallel size ({}) times ' \
+                'pipeline parallel size ({}) times context parallel size ({})'.format(
+                args.world_size, args.tensor_model_parallel_size,
+                args.pipeline_model_parallel_size, args.context_parallel_size)
+            args.data_parallel_size = args.world_size // (model_parallel_size * args.context_parallel_size)
+            if args.rank == 0:
+                print('using world size: {}, data-parallel size: {}, '
+                    'context-parallel size: {} '
+                    'tensor-model-parallel size: {}, '
+                    'pipeline-model-parallel size: {} '.format(
+                        args.world_size, args.data_parallel_size,
+                        args.context_parallel_size,
+                        args.tensor_model_parallel_size,
+                        args.pipeline_model_parallel_size), flush=True)
+        if args.pipeline_model_parallel_size > 1:
+            if args.pipeline_model_parallel_split_rank is not None:
+                assert args.pipeline_model_parallel_split_rank < \
+                        args.pipeline_model_parallel_size, 'split rank needs'\
+                        ' to be less than pipeline model parallel size ({})'.format(
+                                args.pipeline_model_parallel_size)
 
+    if args.tp_comm_overlap:
+        assert args.sequence_parallel == True, 'Tensor parallel communication/GEMM overlap can happen only when sequence parallelism is enabled'
 
-def _add_regularization_args(parser):
-    group = parser.add_argument_group(title='regularization')
+    # Deprecated arguments
+    assert args.batch_size is None, '--batch-size argument is no longer ' \
+        'valid, use --micro-batch-size instead'
+    del args.batch_size
+    assert args.warmup is None, '--warmup argument is no longer valid, use ' \
+        '--lr-warmup-fraction instead'
+    del args.warmup
+    assert args.model_parallel_size is None, '--model-parallel-size is no ' \
+        'longer valid, use --tensor-model-parallel-size instead'
+    del args.model_parallel_size
 
-    group.add_argument('--attention-dropout', type=float, default=0.1,
-                       help='Post attention dropout probability.')
-    group.add_argument('--hidden-dropout', type=float, default=0.1,
-                       help='Dropout probability for hidden state transformer.')
-    group.add_argument('--weight-decay', type=float, default=0.01,
-                       help='Weight decay coefficient for L2 regularization.')
-    group.add_argument('--actor-weight-decay', type=float, default=0.01,
-                       help='RLHF actor model weight decay coefficient for L2 regularization.')
-    group.add_argument('--critic-weight-decay', type=float, default=0.01,
-                       help='RLHF critic model weight decay coefficient for L2 regularization.')
-    group.add_argument('--start-weight-decay', type=float,
-                       help='Initial weight decay coefficient for L2 regularization.')
-    group.add_argument('--end-weight-decay', type=float,
-                       help='End of run weight decay coefficient for L2 regularization.')
-    group.add_argument('--weight-decay-incr-style', type=str, default='constant',
-                       choices=['constant', 'linear', 'cosine'],
-                       help='Weight decay increment function.')
-    group.add_argument('--clip-grad', type=float, default=1.0,
-                       help='Gradient clipping based on global L2 norm.')
-    group.add_argument('--adam-beta1', type=float, default=0.9,
-                       help='First coefficient for computing running averages '
-                       'of gradient and its square')
-    group.add_argument('--adam-beta2', type=float, default=0.999,
-                       help='Second coefficient for computing running averages '
-                       'of gradient and its square')
-    group.add_argument('--adam-eps', type=float, default=1e-08,
-                       help='Term added to the denominator to improve'
-                       'numerical stability')
-    group.add_argument('--sgd-momentum', type=float, default=0.9,
-                       help='Momentum factor for sgd')
+    # HACK: below is commented because DeepSpeed still relies on the old
+    # activation checkpointing mechanism.
+    # if args.checkpoint_activations:
+    #     if args.rank == 0:
+    #         print('--checkpoint-activations is no longer valid, use --recompute-activations, '
+    #               'or, for more control, --recompute-granularity and --recompute-method.')
+    #     exit()
+    # del args.checkpoint_activations
 
-    return parser
+    if args.recompute_activations:
+        args.recompute_granularity = 'selective'
+    del args.recompute_activations
 
+    # Set input defaults.
+    for key in defaults:
+        # For default to be valid, it should not be provided in the
+        # arguments that are passed to the program. We check this by
+        # ensuring the arg is set to None.
+        if getattr(args, key, None) is not None:
+            if args.rank == 0:
+                print('WARNING: overriding default arguments for {key}:{v} \
+                       with {key}:{v2}'.format(key=key, v=defaults[key],
+                                               v2=getattr(args, key)),
+                                               flush=True)
+        else:
+            setattr(args, key, defaults[key])
 
-def _add_training_args(parser):
-    group = parser.add_argument_group(title='training')
+    if args.data_path is not None and args.split is None:
+        legacy_default_split_value = '969, 30, 1'
+        if args.rank == 0:
+            print('WARNING: Please specify --split when using --data-path. Using legacy default value '
+                  f'of "{legacy_default_split_value}"')
+        args.split = legacy_default_split_value
 
-    group.add_argument('--micro-batch-size', type=int, default=None,
-                       help='Batch size per model instance (local batch size). '
-                       'Global batch size is local batch size times data '
-                       'parallel size times number of micro batches.')
-    group.add_argument('--batch-size', type=int, default=None,
-                       help='Old batch size parameter, do not use. '
-                       'Use --micro-batch-size instead')
-    group.add_argument('--global-batch-size', type=int, default=None,
-                       help='Training batch size. If set, it should be a '
-                       'multiple of micro-batch-size times data-parallel-size. '
-                       'If this value is None, then '
-                       'use micro-batch-size * data-parallel-size as the '
-                       'global batch size. This choice will result in 1 for '
-                       'number of micro-batches.')
-    group.add_argument('--rlhf-train-mbs', type=int, default=None,
-                       help='Micro batch size in RLHF train time')
-    group.add_argument('--rampup-batch-size', nargs='*', default=None,
-                       help='Batch size ramp up with the following values:'
-                       '  --rampup-batch-size <start batch size> '
-                       '                      <batch size incerement> '
-                       '                      <ramp-up samples> '
-                       'For example:'
-                       '   --rampup-batch-size 16 8 300000 \ '
-                       '   --global-batch-size 1024'
-                       'will start with global batch size 16 and over '
-                       ' (1024 - 16) / 8 = 126 intervals will increase'
-                       'the batch size linearly to 1024. In each interval'
-                       'we will use approximately 300000 / 126 = 2380 samples.')
-    group.add_argument('--recompute-activations', action='store_true',
-                       help='recompute activation to allow for training '
-                       'with larger models, sequences, and batch sizes.')
-    group.add_argument('--recompute-granularity', type=str, default=None,
-                       choices=['full', 'selective'],
-                       help='Checkpoint activations to allow for training '
-                       'with larger models, sequences, and batch sizes. '
-                       'It is supported at two granularities 1) full: '
-                       'whole transformer layer is recomputed, '
-                       '2) selective: core attention part of the transformer '
-                       'layer is recomputed.')
-    group.add_argument('--no-check-for-nan-in-loss-and-grad', action='store_false',
-                       help='Check for NaNs in loss and grad',
-                       dest='check_for_nan_in_loss_and_grad')
-    group.add_argument('--distribute-saved-activations',
-                       action='store_true',
-                       help='If set, distribute recomputed activations '
-                       'across model parallel group.')
-    group.add_argument('--recompute-method', type=str, default=None,
-                       choices=['uniform', 'block'],
-                       help='1) uniform: uniformly divide the total number of '
-                       'Transformer layers and recompute the input activation of '
-                       'each divided chunk at specified granularity, '
-                       '2) recompute the input activations of only a set number of '
-                       'individual Transformer layers per pipeline stage and do the '
-                       'rest without any recomputing at specified granularity'
-                       'default) do not apply activations recompute to any layers')
-    group.add_argument('--recompute-num-layers', type=int, default=None,
-                       help='1) uniform: the number of Transformer layers in each '
-                       'uniformly divided recompute unit, '
-                       '2) block: the number of individual Transformer layers '
-                       'to recompute within each pipeline stage.')
-    group.add_argument('--custom-recompute-layers-per-stage', nargs='*', type=int, default=None,
-                       help='custom recompute num layers in each PP stage, it should be equal to PP size ')
-    group.add_argument('--no-clone-scatter-output-in-embedding', action='store_false',
-                       help='If not set, clone the output of the scatter in embedding layer to GC original tensor.',
-                       dest='clone_scatter_output_in_embedding')
-    group.add_argument('--profile', action='store_true',
-                       help='Enable nsys profiling. When using this option, nsys '
-                       'options should be specified in commandline. An example '
-                       'nsys commandline is `nsys profile -s none -t nvtx,cuda '
-                       '-o <path/to/output_file> --force-overwrite true '
-                       '--capture-range=cudaProfilerApi '
-                       '--capture-range-end=stop`.')
-    group.add_argument('--profile-step-start', type=int, default=10,
-                       help='Global step to start profiling.')
-    group.add_argument('--profile-step-end', type=int, default=12,
-                       help='Global step to stop profiling.')
-    group.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
-                       help='Global ranks to profile.')
-    group.add_argument('--tp-comm-overlap', action='store_true', help = 'Enables the '
-                       ' overlap of Tensor parallel communication and GEMM kernels.')
-    group.add_argument('--tp-comm-overlap-cfg', type=str, default=None, 
-                       help = 'Config file when tp_comm_overlap is enabled.')
-    group.add_argument('--disable-tp-comm-split-ag', action='store_false', 
-                       help = 'Disables the All-Gather overlap with fprop GEMM.',
-                       dest='tp_comm_split_ag')
-    group.add_argument('--disable-tp-comm-split-rs', action='store_false', 
-                       help = 'Disables the Reduce-Scatter overlap with fprop GEMM.',
-                       dest='tp_comm_split_rs')
-    group.add_argument('--disable-tp-comm-bulk-dgrad', action='store_false', 
-                       help = 'Disables the All-Gather overlap with bprop activation gradient GEMM.',
-                       dest='tp_comm_bulk_dgrad')
-    group.add_argument('--disable-tp-comm-bulk-wgrad', action='store_false', 
-                       help = 'Disables the Reduce-Scatter overlap with bprop weight gradient GEMM.',
-                       dest='tp_comm_bulk_wgrad')
+    if args.num_layers is not None:
+        assert args.encoder_num_layers is None, \
+            'cannot have both num-layers and encoder-num-layers specified'
+        args.encoder_num_layers = args.num_layers
+    else:
+        assert args.encoder_num_layers is not None, \
+            'either num-layers or encoder-num-layers should be specified'
+        args.num_layers = args.encoder_num_layers
+
+    remainder = args.num_layers % args.pipeline_model_parallel_size
+    if args.allow_padding_num_layers and remainder > 0:
+        assert not args.standalone_embedding_stage, "not support standalone embedding stage if allow_padding_num_layers is true"
+        # pad num_layers to make num_layers % pipeline_model_parallel_size == 0
+        num_layers_with_padding = args.num_layers - remainder + args.pipeline_model_parallel_size
+    else:
+        num_layers_with_padding = args.num_layers
+    args.num_layers_without_padding = args.num_layers
+    args.num_layers = num_layers_with_padding
+    args.encoder_num_layers = num_layers_with_padding
+    
+    if args.micro_batch_size_per_dp is not None:
+        assert args.micro_batch_size == None, \
+            'micro-batch-size must be None when use micro-batch-size-per-dp!'
+        assert args.context_parallel_size * args.expert_model_parallel_size == 1, \
+            "context parallel and expert model parallel can't be used with tp-pp-dp mapping."
+        assert args.dataloader_type == None or args.dataloader_type == 'single', \
+            "dataloader_type must be None or single when using micro_batch_size_per_dp."
+        assert args.use_tp_pp_dp_mapping == True, \
+            "use_tp_pp_dp_mapping must be True when using micro_batch_size_per_dp."
+        
+        data_parallel_split = args.micro_batch_size_per_dp[::2]
+        micro_batch_sizes_split = args.micro_batch_size_per_dp[1::2]
+        total_micro_batch_sizes_split = [micro_batch_sizes_split[i] for i, j in enumerate(data_parallel_split) for _ in range(j)]
+        args.data_parallel_splits = data_parallel_split
+        args.micro_batch_size_per_dp = micro_batch_sizes_split
+        args.num_micro_batches = None
+        args.num_micro_batches_grad_factor = 0.
+        assert sum(data_parallel_split) == args.data_parallel_size, \
+            'the length of micro_batch_size_per_dp (equal to sum of n0, n1, ... ) should be equal to data-parallel-size.'
+        
+        if args.num_micro_batches_per_dp is not None:
+            num_microbatches_splits = args.num_micro_batches_per_dp[1::2]
+            num_microbatches_data_parallel_splits = args.num_micro_batches_per_dp[::2]
+            args.num_micro_batches_per_dp = num_microbatches_splits
+            
+            assert sum(num_microbatches_data_parallel_splits) == args.data_parallel_size , \
+                "the length of num_micro_batches_per_dp (equal to sum of 'n0, n1, ...') should be equal to data-parallel-size."
+            assert num_microbatches_data_parallel_splits == data_parallel_split, \
+                "num micro batches' data parallel splits should be equal to micro batch sizes' data parallel splits one by one." \
+                "for example: micro batch size per dp is (1 A 1 B) then num micro batches per dp should be (1 X 1 Y)."
+            
+            total_num_microbatches_split = [num_microbatches_splits[i] for i, j in enumerate(num_microbatches_data_parallel_splits) for _ in range(j)]
+            
+            nmbs_dict = {}
+            for i in num_microbatches_splits:
+                nmbs_dict[i] = 0
+            assert len(nmbs_dict) <= 2, \
+                "the number of heterogeneous devices in parameter num_micro_batches_per_dp should be less than or equal to 2." \
+                f'but get {len(nmbs_dict)} for num micro batches.' \
+                "it means there are more than 2 heterogeneous devices in parameter num_micro_batches_per_dp! that is not supported yet."
+            
+            sum_micro_batches = sum([micro_batch_sizes_split[i] * total_num_microbatches_split[i] for i in range(len(micro_batch_sizes_split))])
+            
+            assert args.rampup_batch_size is None, 'num_micro_batches_per_dp is not currently supported for use with rampup_batch_size.'
+
+        offset = args.tensor_model_parallel_size * args.pipeline_model_parallel_size
+        for i in range(1, args.data_parallel_size + 1):
+            if args.rank < i * offset:
+                args.micro_batch_size = total_micro_batch_sizes_split[i - 1]
+                if args.num_micro_batches_per_dp is not None:
+                    args.num_micro_batches = total_num_microbatches_split[i - 1]
+                    args.num_micro_batches_grad_factor = total_micro_batch_sizes_split[i - 1] * total_num_microbatches_split[i - 1] / sum_micro_batches
+                break
+        if args.num_micro_batches_per_dp is None:
+            sum_of_micro_batch_sizes = sum(map(lambda x, y : x * y,
+                                            micro_batch_sizes_split,
+                                            data_parallel_split))
+        else:
+            sum_of_micro_batch_sizes = sum(map(lambda x, y, z : x * y * z,
+                                            micro_batch_sizes_split,
+                                            data_parallel_split,
+                                            num_microbatches_splits))
+        args.sum_micro_batch_sizes = sum_of_micro_batch_sizes
+        assert args.global_batch_size % sum_of_micro_batch_sizes == 0, \
+            'global batch size should be divisible by sum of micro batch size per dp! ' \
+            f'but get global batch size is {args.global_batch_size} and the sum of micro batch size per dp is {sum_of_micro_batch_sizes}.'
+    else:
+        args.num_micro_batches = None
+        args.data_parallel_splits = None
 
+    # Batch size.
+    assert args.micro_batch_size is not None
+    assert args.micro_batch_size > 0
+    if args.global_batch_size is None:
+        args.global_batch_size = args.micro_batch_size * args.data_parallel_size
+        if args.rank == 0:
+            print('setting global batch size to {}'.format(
+                args.global_batch_size), flush=True)
+    assert args.global_batch_size > 0
+    if args.num_layers_per_virtual_pipeline_stage is not None:
+        if args.overlap_p2p_comm:
+            assert args.pipeline_model_parallel_size > 1, \
+                'when interleaved schedule is used, pipeline-model-parallel size '\
+                'should be greater than 1'
+        else:
+            assert args.pipeline_model_parallel_size > 2, \
+                'when interleaved schedule is used and p2p communication overlap is disabled, '\
+                'pipeline-model-parallel size should be greater than 2 to avoid having multiple '\
+                'p2p sends and recvs between same 2 ranks per communication batch'
+        if args.num_layers_per_stage is None:
+            assert args.num_layers % args.num_layers_per_virtual_pipeline_stage == 0, \
+                'number of layers is not divisible by number of layers per virtual ' \
+                'pipeline stage'
+            args.virtual_pipeline_model_parallel_size = \
+                (args.num_layers // args.transformer_pipeline_model_parallel_size) // \
+                args.num_layers_per_virtual_pipeline_stage
+        else:
+            stage_split = args.num_layers_per_stage[::2]
+            num_layers_per_stage_split = args.num_layers_per_stage[1::2]
+            num_layers_per_stage = []
+            for i in range(len(stage_split)):
+                for j in range(stage_split[i]):
+                    num_layers_per_stage.append(num_layers_per_stage_split[i])
+            args.num_layers_per_stage = num_layers_per_stage
+            total_virtual_pipeline_stage_num = len(args.num_layers_per_stage)
+            assert total_virtual_pipeline_stage_num % args.pipeline_model_parallel_size == 0, \
+                'len(args.num_layers_per_stage) is not divisible by pp size'
+            args.virtual_pipeline_model_parallel_size = len(args.num_layers_per_stage) // \
+                args.pipeline_model_parallel_size
+    else:
+        if args.num_layers_per_stage is not None:
+            stage_split = args.num_layers_per_stage[::2]
+            num_layers_per_stage_split = args.num_layers_per_stage[1::2]
+            num_layers_per_stage = []
+            for i in range(len(stage_split)):
+                for j in range(stage_split[i]):
+                    num_layers_per_stage.append(num_layers_per_stage_split[i])
+            args.num_layers_per_stage = num_layers_per_stage
+            assert len(args.num_layers_per_stage) == args.pipeline_model_parallel_size, \
+                'len(args.num_layers_per_stage) do not match with pp size'
+        args.virtual_pipeline_model_parallel_size = None
+        # Overlap P2P communication is disabled if not using the interleaved schedule.
+        args.overlap_p2p_comm = False
+        if args.rank == 0:
+            print('WARNING: Setting args.overlap_p2p_comm to False since non-interleaved '
+                  'schedule does not support overlapping p2p communication')
 
-    # deprecated
-    # HACK: added back arguments because DeepSpeed still relies on the old
-    # activation checkpointing mechanism.
-    group.add_argument('--checkpoint-activations', action='store_true',
-                       help='Checkpoint activation to allow for training '
-                       'with larger models, sequences, and batch sizes.')
-    group.add_argument('--distribute-checkpointed-activations',
-                       action='store_true',
-                       help='If set, distribute checkpointed activations '
-                       'across model parallel group.')
-    group.add_argument('--checkpoint-num-layers', type=int, default=1,
-                       help='chunk size (number of layers) for checkpointing.')
-    group.add_argument('--train-iters', type=int, default=None,
-                       help='Total number of iterations to train over all '
-                       'training runs. Note that either train-iters or '
-                       'train-samples should be provided.')
-    group.add_argument('--train-samples', type=int, default=None,
-                       help='Total number of samples to train over all '
-                       'training runs. Note that either train-iters or '
-                       'train-samples should be provided.')
-    group.add_argument('--train-tokens', type=int, default=None,
-                       help='Total number of tokens to train over all '
-                       'training runs.')
-    group.add_argument('--random-ltd',
-                       action='store_true',
-                       help='enable random layer token drop')    
-    group.add_argument('--log-interval', type=int, default=100,
-                       help='Report loss and timing interval.')
-    group.add_argument('--exit-interval', type=int, default=None,
-                       help='Exit the program after the iteration is divisible '
-                       'by this value.')
-    group.add_argument('--exit-duration-in-mins', type=int, default=None,
-                       help='Exit the program after this many minutes.')
-    group.add_argument('--exit-signal-handler', action='store_true',
-                       help='Dynamically save the checkpoint and shutdown the '
-                       'training if SIGTERM is received')
-    group.add_argument('--tensorboard-dir', type=str, default=None,
-                       help='Write TensorBoard logs to this directory.')
-    group.add_argument('--no-masked-softmax-fusion',
-                       action='store_false',
-                       help='Disable fusion of query_key_value scaling, '
-                       'masking, and softmax.',
-                       dest='masked_softmax_fusion')
-    group.add_argument('--no-bias-gelu-fusion', action='store_false',
-                       help='Disable bias and gelu fusion.',
-                       dest='bias_gelu_fusion')
-    group.add_argument('--no-bias-dropout-fusion', action='store_false',
-                       help='Disable bias and dropout fusion.',
-                       dest='bias_dropout_fusion')
-    group.add_argument('--disable-moe-token-dropping', action='store_false',
-                       help='Disable MoE expert token dropping.',
-                       dest='moe_token_dropping')
-    group.add_argument('--moe-train-capacity-factor', type=float, default=1.0,
-                       help='The capacity of the MoE expert at training time')
-    group.add_argument('--moe-eval-capacity-factor', type=float, default=1.0,
-                       help='The capacity of the MoE expert at eval time.')
-    group.add_argument('--moe-min-capacity', type=int, default=4,
-                       help='The minimum capacity per MoE expert regardless of the capacity_factor.')
-    group.add_argument('--moe-loss-coeff', type=float, default=0.1,
-                       help='Scaling coefficient for adding MoE loss to model loss')
-    group.add_argument('--create-moe-param-group', action='store_true',
-                       help='Create separate groups for MoE params.'
-                       'This is necessary for techniques like ZeRO.')
-    group.add_argument('--use-flash-attn', '--use-flash-attn-v1', dest='use_flash_attn_v1', action='store_true',
-                       help='use first version FlashAttention implementation of attention. '
-                       'https://arxiv.org/abs/2205.14135')
-    group.add_argument('--use-flash-attn-v2', action='store_true',
-                       help='use second version FlashAttention implementation of attention. '
-                       'https://arxiv.org/abs/2307.08691')
-    group.add_argument('--use-flash-attn-triton', action='store_true',
-                       help='use FlashAttention implementation of attention using Triton.')
-    group.add_argument('--disable-bias-linear', action='store_false',
-                       help='Disable bias in the linear layers',
-                       dest='add_bias_linear')
-    group.add_argument('--optimizer', type=str, default='adam',
-                       choices=['adam', 'sgd'],
-                       help='Optimizer function')
-    group.add_argument('--dataloader-type', type=str, default=None,
-                       choices=['single', 'cyclic'],
-                       help='Single pass vs multiple pass data loader')
-    group.add_argument('--ds-inference', action='store_true',
-                       help='DeepSpeed inference engine being used')
-    group.add_argument('--cpu-optimizer', action='store_true',
-                       help='Run optimizer on CPU')
-    group.add_argument('--cpu_torch_adam', action='store_true',
-                       help='Use Torch Adam as optimizer on CPU.')
-    group.add_argument('--ds_fused_adam', action='store_true',
-                       help='Use DeepSpeed FusedAdam as optimizer.')
-    group.add_argument('--no-pipeline-parallel', action='store_true',
-                       help='Disable pipeline parallelism')
-    group.add_argument('--use-tutel', action='store_true',
-                       help='Use Tutel optimization for MoE')
-    group.add_argument('--inference', action='store_true',
-                       help='Very basic inference mode: not allocating optim/lr - requires ZERO_STAGE=0')
+    # TODO: validate more
+    if args.zero_bubble_v_schedule:
+        assert args.virtual_pipeline_model_parallel_size == 2
+        args.enable_zero_bubble = True
+    if args.enable_zero_bubble:
+        if args.use_distributed_optimizer:
+            assert args.fp16 or args.bf16, "not supported, because it is rarely used and makes code messy"
+            assert not args.overlap_param_gather, "the original code somehow doesn't work"
+            assert not args.overlap_grad_reduce, "not supported yet because we didn't verify the correctness"
+        assert args.pipeline_model_parallel_size > 1, "zero bubble must be enabled with pipeline parallelism"
+        if args.enable_optimizer_post_validation:
+            assert args.fp16 or args.bf16, "zero bubble post validation"
+        if args.zero_bubble_max_pending_backward == 'auto':
+            assert args.zero_bubble_adaptive_memory_limit_percentile > 0
+        else:
+            args.zero_bubble_max_pending_backward = int(args.zero_bubble_max_pending_backward)
+    else:
+        args.enable_optimizer_post_validation = False
 
-    group.add_argument('--no-async-tensor-model-parallel-allreduce',
-                       action='store_false',
-                       help='Disable asynchronous execution of '
-                       'tensor-model-parallel all-reduce with weight '
-                       'gradient compuation of a column-linear layer.',
-                       dest='async_tensor_model_parallel_allreduce')
-    group.add_argument('--no-persist-layer-norm', action='store_true',
-                       help='Disable using persistent fused layer norm kernel. '
-                       'This kernel supports only a set of hidden sizes. Please '
-                       'check persist_ln_hidden_sizes if your hidden '
-                       'size is supported.')
-    group.add_argument('--sequence-parallel', action='store_true',
-                       help='Enable Megatron-LM\'s sequence parallel optimization.')
-    group.add_argument('--ds-sequence-parallel-size', type=int, default=1,
-                       help='Enable DeepSpeed\'s sequence parallel. Cannot be combined with "--sequence-parallel", which enables Megatron-LM\'s sequence parallel.')
-    group.add_argument('--force-ds-sequence-parallel', action='store_true',
-                       help='use DeepSpeed sequence parallelism regardless of sequence parallel size.')
-    group.add_argument('--no-gradient-accumulation-fusion',
-                       action='store_false',
-                       help='Disable fusing gradient accumulation to weight '
-                       'gradient computation of linear layers',
-                       dest='gradient_accumulation_fusion')
-    group.add_argument('--use-dataset-only', type=bool, required=False, default=False,
-                       help='If set to True, only use the megatron dataset for external trainer ')
-    group.add_argument('--use-mcore-models', action='store_true',
-                       help='Use the implementation from megatron core')
-    group.add_argument('--manual-gc', action='store_true',
-                       help='Disable the threshold-based default garbage '
-                       'collector and trigger the garbage collection manually. '
-                       'Manual garbage collection helps to align the timing of '
-                       'the collection across ranks which mitigates the impact '
-                       'of CPU-associated jitters. When the manual gc is enabled, '
-                       'garbage collection is performed only at the start and the '
-                       'end of the validation routine by default.')
-    group.add_argument('--manual-gc-interval', type=int, default=0,
-                       help='Training step interval to trigger manual garbage '
-                       'collection. When the value is set to 0, garbage '
-                       'collection is not triggered between training steps.')
-    group.add_argument('--no-manual-gc-eval', action='store_false',
-                       help='When using manual garbage collection, disable '
-                       'garbage collection at the start and the end of each '
-                       'evaluation run.', dest='manual_gc_eval')
-    group.add_argument('--RLHF', action="store_true",
-                       help='RLHF mode')
-    group.add_argument('--ppo-epoches', type=int, default=1,
-                       help='RLHF model train epoches')
+    if args.overlap_param_gather:
+        assert args.use_distributed_optimizer, \
+            '--overlap-param-gather only supported with distributed optimizer'
+        assert args.overlap_grad_reduce, \
+            '--overlap-grad-reduce should be turned on when using --overlap-param-gather'
+        assert not args.use_legacy_models, \
+            '--overlap-param-gather only supported with MCore models'
 
-    return parser
+    ## RLHF Batch size check
+    if args.RLHF:
+        assert args.global_batch_size == args.micro_batch_size * args.data_parallel_size, \
+            f"error with batch size setting. GBS should equal to MBS * DP"
 
+    # Parameters dtype.
+    args.params_dtype = torch.float
+    if args.fp16:
+        assert not args.bf16
+        args.params_dtype = torch.half
+        # Turn off checking for NaNs in loss and grads if using dynamic loss scaling,
+        # where NaNs in grads / loss are signal to the loss scaler.
+        if not args.loss_scale:
+            args.check_for_nan_in_loss_and_grad = False
+            if args.rank == 0:
+                print('WARNING: Setting args.check_for_nan_in_loss_and_grad to False since '
+                      'dynamic loss scaling is being used')
+    if args.bf16:
+        assert not args.fp16
+        args.params_dtype = torch.bfloat16
+        # bfloat16 requires gradient accumulation and all-reduce to
+        # be done in fp32.
+        if not args.accumulate_allreduce_grads_in_fp32:
+            args.accumulate_allreduce_grads_in_fp32 = True
+            if args.rank == 0:
+                print('accumulate and all-reduce gradients in fp32 for '
+                      'bfloat16 data type.', flush=True)
 
-def _add_initialization_args(parser):
-    group = parser.add_argument_group(title='initialization')
+    if args.rank == 0:
+        print('using {} for parameters ...'.format(args.params_dtype),
+              flush=True)
 
-    group.add_argument('--seed', type=int, default=1234,
-                       help='Random seed used for python, numpy, '
-                       'pytorch, and cuda.')
-    group.add_argument('--data-parallel-random-init', action='store_true',
-                       help='Enable random initialization of params '
-                       'across data parallel ranks')
-    group.add_argument('--init-method-std', type=float, default=0.02,
-                       help='Standard deviation of the zero mean normal '
-                       'distribution used for weight initialization.')
-    group.add_argument('--init-method-xavier-uniform', action='store_true',
-                       help='Enable Xavier uniform parameter initialization')
+    if args.dataloader_type is None:
+        args.dataloader_type = 'single'
 
-    return parser
+    # data
+    assert args.num_dataset_builder_threads > 0
 
+    # Consumed tokens.
+    args.consumed_train_samples = 0
+    args.consumed_valid_samples = 0
+    args.consumed_train_tokens = 0
 
-def _add_learning_rate_args(parser):
-    group = parser.add_argument_group(title='learning rate')
+    # Support for variable sequence lengths across batches/microbatches.
+    # set it if the dataloader supports generation of variable sequence lengths
+    # across batches/microbatches. Due to additional communication overhead
+    # during pipeline parallelism, it should not be set if sequence length
+    # is constant during training.
+    # args.variable_seq_lengths = True
 
-    group.add_argument('--lr', type=float, default=None,
-                       help='Initial learning rate. Depending on decay style '
-                       'and initial warmup, the learing rate at each '
-                       'iteration would be different.')
-    group.add_argument('--actor-learning-rate', type=float, default=None,
-                       help='Initial RLHF actor model learning rate. Depending on decay style '
-                       'and initial warmup, the learing rate at each '
-                       'iteration would be different.')
-    group.add_argument('--critic-learning-rate', type=float, default=None,
-                       help='Initial RLHF critic model learning rate. Depending on decay style '
-                       'and initial warmup, the learing rate at each '
-                       'iteration would be different.')
-    group.add_argument('--lr-decay-style', type=str, default='linear',
-                       choices=['constant', 'linear', 'cosine', 'inverse-square-root'],
-                       help='Learning rate decay function.')
-    group.add_argument('--lr-decay-iters', type=int, default=None,
-                       help='number of iterations to decay learning rate over,'
-                       ' If None defaults to `--train-iters`')
-    group.add_argument('--lr-decay-samples', type=int, default=None,
-                       help='number of samples to decay learning rate over,'
-                       ' If None defaults to `--train-samples`')
-    group.add_argument('--lr-decay-tokens', type=int, default=None,
-                       help='number of tokens to decay learning rate over,'
-                       ' If not None will override iter/sample-based decay')
-    group.add_argument('--lr-warmup-fraction', type=float, default=None,
-                       help='fraction of lr-warmup-(iters/samples) to use '
-                       'for warmup (as a float)')
-    group.add_argument('--lr-warmup-iters', type=int, default=0,
-                       help='number of iterations to linearly warmup '
-                       'learning rate over.')
-    group.add_argument('--lr-warmup-samples', type=int, default=0,
-                       help='number of samples to linearly warmup '
-                       'learning rate over.')
-    group.add_argument('--lr-warmup-init', type=float, default=0.0,
-                       help='Initial value for learning rate warmup. The '
-                       'scheduler starts warmup from this value.')
-    group.add_argument('--warmup', type=int, default=None,
-                       help='Old lr warmup argument, do not use. Use one of the'
-                       '--lr-warmup-* arguments above')
-    group.add_argument('--min-lr', type=float, default=0.0,
-                       help='Minumum value for learning rate. The scheduler'
-                       'clip values below this threshold.')
-    group.add_argument('--override-opt_param-scheduler', action='store_true',
-                       help='Reset the values of the scheduler (learning rate,'
-                       'warmup iterations, minimum learning rate, maximum '
-                       'number of iterations, and decay style from input '
-                       'arguments and ignore values from checkpoints. Note'
-                       'that all the above values will be reset.')
-    group.add_argument('--use-checkpoint-opt_param-scheduler', action='store_true',
-                       help='Use checkpoint to set the values of the scheduler '
-                       '(learning rate, warmup iterations, minimum learning '
-                       'rate, maximum number of iterations, and decay style '
-                       'from checkpoint and ignore input arguments.')
+    # Iteration-based training.
+    if args.train_iters:
+        # If we use iteration-based training, make sure the
+        # sample-based options are off.
+        assert args.train_samples is None, \
+            'expected iteration-based training'
+        assert args.lr_decay_samples is None, \
+            'expected iteration-based learning rate decay'
+        assert args.lr_warmup_samples == 0, \
+            'expected iteration-based learning rate warmup'
+        assert args.rampup_batch_size is None, \
+            'expected no batch-size rampup for iteration-based training'
+        if args.lr_warmup_fraction is not None:
+            assert args.lr_warmup_iters == 0, \
+                'can only specify one of lr-warmup-fraction and lr-warmup-iters'
 
-    return parser
+    # Sample-based training.
+    if args.train_samples:
+        # If we use sample-based training, make sure the
+        # iteration-based options are off.
+        assert args.train_iters is None, \
+            'expected sample-based training'
+        assert args.lr_decay_iters is None, \
+            'expected sample-based learning rate decay'
+        assert args.lr_warmup_iters == 0, \
+            'expected sample-based learnig rate warmup'
+        if args.lr_warmup_fraction is not None:
+            assert args.lr_warmup_samples == 0, \
+                'can only specify one of lr-warmup-fraction ' \
+                'and lr-warmup-samples'
 
+    # if args.num_layers is not None:
+    #     assert args.encoder_num_layers is None, \
+    #         'cannot have both num-layers and encoder-num-layers specified'
+    #     args.encoder_num_layers = args.num_layers
+    # else:
+    #     assert args.encoder_num_layers is not None, \
+    #         'either num-layers or encoder-num-layers should be specified'
+    #     args.num_layers = args.encoder_num_layers
 
-def _add_checkpointing_args(parser):
-    group = parser.add_argument_group(title='checkpointing')
+    # Check required arguments.
+    if not args.use_dataset_only:
+        required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
+                        'max_position_embeddings']
+        for req_arg in required_args:
+            _check_arg_is_not_none(args, req_arg)
 
-    group.add_argument('--save', type=str, default=None,
-                       help='Output directory to save checkpoints to.')
-    group.add_argument('--save-interval', type=int, default=None,
-                       help='Number of iterations between checkpoint saves.')
-    group.add_argument('--no-save-optim', action='store_true', default=None,
-                       help='Do not save current optimizer.')
-    group.add_argument('--no-save-rng', action='store_true', default=None,
-                       help='Do not save current rng state.')
-    group.add_argument('--load', type=str, default=None,
-                       help='Directory containing a model checkpoint.')
-    group.add_argument('--load-tag', type=str, default=None,
-                       help='Specific checkpoint tag to load. Ignores latest.')
-    parser.add_argument("--actor_model_name_or_path", type=str, default=None,
-                        help="Directory containing a actor_model checkpoint.")
-    parser.add_argument("--critic_model_name_or_path", type=str, default=None,
-                        help="Directory containing a critic_model checkpoint.")
-    group.add_argument('--no-load-optim', action='store_true', default=None,
-                       help='Do not load optimizer when loading checkpoint.')
-    group.add_argument('--no-load-rng', action='store_true', default=None,
-                       help='Do not load rng state when loading checkpoint.')
-    group.add_argument('--no-load-lr-state', action='store_true',
-                       help='Do not load lr state when loading checkpoint.')   
-    group.add_argument('--finetune', action='store_true',
-                       help='Load model for finetuning. Do not load optimizer '
-                       'or rng state from checkpoint and set iteration to 0. '
-                       'Assumed when loading a release checkpoint.')
-    group.add_argument('--no-initialization', action='store_false',
-                       help='Do not perform initialization when building model, '
-                       'can reduce startup time when definitely loading from a '
-                       'checkpoint',
-                       dest='perform_initialization')
-    group.add_argument('--use-checkpoint-args', action='store_true',
-                       help='Override any command line arguments with arguments '
-                       'from the checkpoint')
-    group.add_argument('--exit-on-missing-checkpoint', action='store_true',
-                       help="If '--load' is set, but checkpoint is not found "
-                       "(e.g., path typo), then exit instead of random "
-                       "initialization.")
-    group.add_argument('--universal-checkpoint', action='store_true',
-                        help='Loading a universal format checkpoint.')
-    return parser
+    # Checks.
+    if not args.use_dataset_only:
+        if args.ffn_hidden_size is None:
+            if args.swiglu:
+                # reduce the dimnesion for MLP since projections happens on
+                # two linear layers. this keeps the number of paramters in
+                # the same ballpark as the counterpart with 4*h size
+                # we keep it a multiple of 64, which means the actual tensor size
+                # will be a multiple of 64 / tp_size
+                args.ffn_hidden_size = int((4 * args.hidden_size * 2 / 3) / 64) * 64
+            else:
+                args.ffn_hidden_size = 4 * args.hidden_size
 
+        if args.kv_channels is None:
+            assert args.hidden_size % args.num_attention_heads == 0
+            args.kv_channels = args.hidden_size // args.num_attention_heads
 
-def _add_mixed_precision_args(parser):
-    group = parser.add_argument_group(title='mixed precision')
+    if args.seq_length is not None and args.context_parallel_size > 1:
+        assert args.seq_length % (args.context_parallel_size * 2) == 0, \
+            'seq-length should be a multiple of 2 * context-parallel-size ' \
+            'if context-parallel-size > 1.'
 
-    group.add_argument('--fp16', action='store_true',
-                       help='Run model in fp16 mode.')
-    group.add_argument('--bf16', action='store_true',
-                       help='Run model in bfloat16 mode.')
-    group.add_argument('--loss-scale', type=float, default=None,
-                       help='Static loss scaling, positive power of 2 '
-                       'values can improve fp16 convergence. If None, dynamic'
-                       'loss scaling is used.')
-    group.add_argument('--initial-loss-scale', type=float, default=2**32,
-                       help='Initial loss-scale for dynamic loss scaling.')
-    group.add_argument('--min-loss-scale', type=float, default=1.0,
-                       help='Minimum loss scale for dynamic loss scale.')
-    group.add_argument('--loss-scale-window', type=float, default=1000,
-                       help='Window over which to raise/lower dynamic scale.')
-    group.add_argument('--hysteresis', type=int, default=2,
-                       help='hysteresis for dynamic loss scaling')
-    group.add_argument('--fp32-residual-connection', action='store_true',
-                       help='Move residual connections to fp32.')
-    group.add_argument('--no-query-key-layer-scaling', action='store_false',
-                       help='Do not scale Q * K^T by 1 / layer-number.',
-                       dest='apply_query_key_layer_scaling')
-    group.add_argument('--apply-query-key-layer-scaling', action='store_true',
-                       help='Scale Q * K^T by 1 / layer-number. '
-                       'Useful for fp16 training.')
-    group.add_argument('--attention-softmax-in-fp32', action='store_true',
-                       help='Run attention masking and softmax in fp32. '
-                       'This flag is ignored unless '
-                       '--no-query-key-layer-scaling is specified.')
-    group.add_argument('--accumulate-allreduce-grads-in-fp32',
-                       action='store_true',
-                       help='Gradient accumulation and all-reduce in fp32.')
-    group.add_argument('--fp16-lm-cross-entropy', action='store_true',
-                       help='Move the cross entropy unreduced loss calculation'
-                       'for lm head to fp16.')
+    if args.seq_length is not None:
+        assert args.encoder_seq_length is None
+        args.encoder_seq_length = args.seq_length
+    else:
+        assert args.encoder_seq_length is not None
+        args.seq_length = args.encoder_seq_length
 
-    return parser
+    if not args.use_dataset_only:
+        if args.seq_length is not None:
+            assert args.max_position_embeddings >= args.seq_length
+        if args.decoder_seq_length is not None:
+            assert args.max_position_embeddings >= args.decoder_seq_length
+    # When rotary position embeddings is used, set add_position_embedding
+    # to false to turn off absolute position embedding.
+    if args.use_rotary_position_embeddings:
+        args.add_position_embedding = False
+    if args.lr is not None:
+        assert args.min_lr <= args.lr
+    if args.save is not None:
+        assert args.save_interval is not None
+    # Mixed precision checks.
+    if args.fp16_lm_cross_entropy:
+        assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.'
+    if args.fp32_residual_connection:
+        assert args.fp16 or args.bf16, \
+            'residual connection in fp32 only supported when using fp16 or bf16.'
 
+    if args.moe_grouped_gemm:
+        assert args.bf16, 'Currently GroupedGEMM for MoE only supports bf16 dtype.'
+        #dc = torch.cuda.get_device_capability()
+        #assert dc[0] >= 8, "Unsupported compute capability for GroupedGEMM kernels."
 
-def _add_distributed_args(parser):
-    group = parser.add_argument_group(title='distributed')
+    assert not (args.moe_block_sparse_gemm and args.moe_grouped_gemm), \
+        'moe_block_sparse_gemm and moe_grouped_gemm cannot be used together.'
 
-    group.add_argument('--tensor-model-parallel-size', type=int, default=1,
-                       help='Degree of tensor model parallelism.')
-    group.add_argument('--enable-expert-tensor-parallelism', action='store_true',
-                        default=False,
-                        help="use tensor parallelism for expert layers in MoE")
-    group.add_argument('--pipeline-model-parallel-size', type=int, default=1,
-                       help='Degree of pipeline model parallelism.')
-    group.add_argument('--pipeline-model-parallel-split-rank',
-                       type=int, default=None,
-                       help='Rank where encoder and decoder should be split.')
-    group.add_argument('--partition-method',
-                       type=str, default='type:transformer',
-                       help='use deepspeed to patition layers. method include: uniform, parameters, type:transformer, custom')
-    group.add_argument('--custom-partition', nargs='*',
-                       type=int, default=None,
-                       help='customized model layers to PP stages, parameter of partition-method should set < custom > to take this effect. \
-                       example: divide 32 layers to 6 PP stages: 5 5 5 6 6 5. it means there are 5/5/5/6/6/5 layers in 6 pp stages')
-    group.add_argument('--moe-expert-parallel-size', type=int, default=1,
-                       help='Degree of the MoE expert parallelism.')
-    group.add_argument('--model-parallel-size', type=int, default=None,
-                       help='Old model parallel argument, do not use. Use '
-                       '--tensor-model-parallel-size instead.')
-    group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None,
-                       help='Number of layers per virtual pipeline stage')
-    group.add_argument('--no-overlap-p2p-communication', action='store_false',
-                       help='overlap pipeline parallel communication with forward and backward chunks',
-                       dest='overlap_p2p_comm')
-    group.add_argument('--distributed-backend', default='nccl',
-                       choices=['nccl', 'gloo'],
-                       help='Which backend to use for distributed training.')
-    group.add_argument('--distributed-timeout-minutes', type=int, default=10,
-                       help='Timeout minutes for torch.distributed.')
-    group.add_argument('--DDP-impl', default='local',
-                       choices=['local', 'torch', 'FSDP'],
-                       help='which DistributedDataParallel implementation '
-                       'to use.')
-    group.add_argument('--no-contiguous-buffers-in-local-ddp',
-                       action='store_false', help='If set, dont use '
-                       'contiguous buffer in local DDP.',
-                       dest='use_contiguous_buffers_in_local_ddp')
-    group.add_argument('--overlap-grad-reduce', action='store_true',
-                       default=False, help='If set, overlap DDP grad reduce.')
-    group.add_argument('--no-delay-grad-reduce', action='store_false',
-                       help='If not set, delay / synchronize grad reductions in all but first PP stage.',
-                       dest='delay_grad_reduce')
-    group.add_argument('--overlap-param-gather', action='store_true',
-                       default=False, help='If set, overlap param all-gather in distributed optimizer.')
-    group.add_argument('--delay-param-gather', action='store_true',
-                       default=False, help='If set, delay / synchronize param all-gathers in all but first PP stage.')
-    group.add_argument('--no-scatter-gather-tensors-in-pipeline', action='store_false',
-                       help='If not set, use scatter/gather to optimize communication of tensors in pipeline.',
-                       dest='scatter_gather_tensors_in_pipeline')
-    group.add_argument('--use-ring-exchange-p2p', action='store_true',
-                       default=False, help='If set, use custom-built ring exchange '
-                       'for p2p communications. Note that this option will require '
-                       'a custom built image that support ring-exchange p2p.')
-    group.add_argument('--local_rank', type=int, default=None,
-                       help='local rank passed from distributed launcher.')
-    group.add_argument('--lazy-mpu-init', type=bool, required=False,
-                       help='If set to True, initialize_megatron() '
-                       'skips DDP initialization and returns function to '
-                       'complete it instead.Also turns on '
-                       '--use-cpu-initialization flag. This is for '
-                       'external DDP manager.' )
-    group.add_argument('--use-cpu-initialization', action='store_true',
-                       default=None, help='If set, affine parallel weights '
-                       'initialization uses CPU' )
-    group.add_argument('--empty-unused-memory-level', default=0, type=int,
-                       choices=[0, 1, 2],
-                       help='Call torch.cuda.empty_cache() each iteration '
-                       '(training and eval), to reduce fragmentation.'
-                       '0=off, 1=moderate, 2=aggressive.')
-    group.add_argument('--standalone-embedding-stage', action='store_true',
-                       default=False, help='If set, *input* embedding layer '
-                       'is placed on its own pipeline stage, without any '
-                       'transformer layers. (For T5, this flag currently only '
-                       'affects the encoder embedding.)')
-    group.add_argument('--use-distributed-optimizer', action='store_true',
-                       help='Use distributed optimizer.')
-    group.add_argument('--expert-model-parallel-size', type=int, default=1,
-                       help='Degree of expert model parallelism.')
-    group.add_argument('--context-parallel-size', type=int, default=1,
-                       help='Degree of context parallelism.')
-    group.add_argument('--nccl-communicator-config-path', type=str, default=None,
-                       help='Path to the yaml file with NCCL communicator '
-                       'configurations. The number of min/max thread groups and thread '
-                       'group cluster size of each communicator can be configured by '
-                       'setting `min_ctas`, `max_ctas`, and `cga_cluster_size`.')
-    group.add_argument('--pp-delay', action='store_true', 
-                       default=False, help='')
-    group.add_argument('--pp-split-size', type=int, default=1,
-                    help='')
-    return parser
+    if not args.use_dataset_only:
+        if args.weight_decay_incr_style == 'constant':
+            assert args.start_weight_decay is None
+            assert args.end_weight_decay is None
+            args.start_weight_decay = args.weight_decay
+            args.end_weight_decay = args.weight_decay
+        else:
+            assert args.start_weight_decay is not None
+            assert args.end_weight_decay is not None
 
+    TORCH_MAJOR = int(torch.__version__.split('.')[0])
+    TORCH_MINOR = int(torch.__version__.split('.')[1])
+    # Persistent fused layer norm.
+    if TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 11):
+        args.no_persist_layer_norm = True
+        if args.rank == 0:
+            print('Persistent fused layer norm kernel is supported from '
+                  'pytorch v1.11 (nvidia pytorch container paired with v1.11). '
+                  'Defaulting to no_persist_layer_norm=True')
 
-def _add_validation_args(parser):
-    group = parser.add_argument_group(title='validation')
+    # Activation checkpointing.
+    if args.distribute_checkpointed_activations:
+        assert args.checkpoint_activations, \
+            'for distribute-checkpointed-activations to work you '\
+            'need to enable checkpoint-activations'
 
-    group.add_argument('--eval-iters', type=int, default=100,
-                       help='Number of iterations to run for evaluation'
-                       'validation/test for.')
-    group.add_argument('--eval-interval', type=int, default=1000,
-                       help='Interval between running evaluation on '
-                       'validation set.')
-    group.add_argument('--skip-train', action='store_true',
-                       default=False, help='If set, bypass the training loop, '
-                       'optionally do evaluation for validation/test, and exit.')
+    # Activation recomputing.
+    if args.distribute_saved_activations:
+        assert args.tensor_model_parallel_size > 1, 'can distribute ' \
+            'recomputed activations only across tensor model ' \
+            'parallel groups'
+        assert args.recompute_granularity == 'full', \
+            'distributed recompute activations is only '\
+            'application to full recompute granularity'
+        assert args.recompute_method is not None, \
+            'for distributed recompute activations to work you '\
+            'need to use a recompute method '
+        assert (TORCH_MAJOR, TORCH_MINOR) >= (1, 10), \
+            'distributed recompute activations are supported for pytorch ' \
+            'v1.10 and above (Nvidia Pytorch container >= 21.07). Current ' \
+            'pytorch version is v%s.%s.' % (TORCH_MAJOR, TORCH_MINOR)
 
-    return parser
+    # Tranformer-Engine/FP8 related checking
+    if args.fp8:
+        assert args.transformer_impl == 'transformer_engine', \
+            'transformer-engine required for fp8 training and inference'
 
+    if args.recompute_granularity == 'selective':
+        assert args.recompute_method is None, \
+            'recompute method is not yet supported for ' \
+            'selective recomputing granularity'
 
-def _add_data_args(parser):
-    group = parser.add_argument_group(title='data and dataloader')
+    if args.recompute_num_layers_per_stage != None:
+        assert args.recompute_granularity == 'full', \
+            'recompute-num-layers-per-stage is only'\
+            'application to full recompute granularity'
+        assert args.recompute_method_per_stage is not None, \
+            'recompute_method_per_stage must be used with '\
+            'recompute_num_layers_per_stage '
+
+        recompute_num_layers_stage_split = args.recompute_num_layers_per_stage[::2]
+        recompute_num_layers_layer_split = args.recompute_num_layers_per_stage[1::2]
+        recompute_methods_stage_split = args.recompute_method_per_stage[::2]
+        recompute_methods_method_split = args.recompute_method_per_stage[1::2]
+
+        assert len(recompute_num_layers_stage_split) == len(recompute_num_layers_layer_split), \
+            'args.recompute_num_layers_per_stage setting must match form: n0, layers0, n1, layers1, ...'
+        assert len(recompute_methods_stage_split) == len(recompute_methods_method_split), \
+            'args.recompute_method_per_stage setting must match form: n0, layers0, n1, layers1, ...'
+        if args.virtual_pipeline_model_parallel_size != None:
+            assert args.pipeline_model_parallel_size * args.virtual_pipeline_model_parallel_size == sum(recompute_num_layers_stage_split), \
+                'args.recompute_num_layers_per_stage setting:' \
+                'the sum of n0, n1, ... should be equal to pipeline-model-parallel-size * virtual_pipeline_model_parallel_size'
+            assert args.pipeline_model_parallel_size * args.virtual_pipeline_model_parallel_size == sum(recompute_methods_stage_split), \
+                'args.recompute_method_per_stage setting:' \
+                'the sum of n0, n1, ... should be equal to pipeline-model-parallel-size * virtual_pipeline_model_parallel_size'
+        else:
+            assert args.pipeline_model_parallel_size == sum(recompute_num_layers_stage_split), \
+                'args.recompute_num_layers_per_stage setting:' \
+                'the sum of n0, n1, ... should be equal to pipeline-model-parallel-size.'
+            assert args.pipeline_model_parallel_size == sum(recompute_methods_stage_split), \
+                'args.recompute_method_per_stage setting:' \
+                'the sum of n0, n1, ... should be equal to pipeline-model-parallel-size.'
+
+        recompute_num_layers_per_stage = []
+        for i in range(len(recompute_num_layers_stage_split)):
+            for j in range(recompute_num_layers_stage_split[i]):
+                recompute_num_layers_per_stage.append(recompute_num_layers_layer_split[i])
+        recompute_method_per_stage = []
+        for i in range(len(recompute_methods_stage_split)):
+            for j in range(recompute_methods_stage_split[i]):
+                recompute_method_per_stage.append(recompute_methods_method_split[i])
+
+        args.recompute_num_layers_per_stage = recompute_num_layers_per_stage
+        args.recompute_method_per_stage = recompute_method_per_stage
 
-    group.add_argument('--aml-data-download-path', type=str, default=None,
-                       help='Path to mounted input dataset')
-    group.add_argument('--data-path', nargs='*', default=None,
-                       help='Path to the training dataset. Accepted format:'
-                       '1) a single data path, 2) multiple datasets in the'
-                       'form: dataset1-weight dataset1-path dataset2-weight '
-                       'dataset2-path ... It is used with --split when a '
-                       'single dataset used for all three: train, valid '
-                       'and test. It is exclusive to the other '
-                       '--*-data-path args')
-    group.add_argument('--split', type=str, default='969, 30, 1',
-                       help='Comma-separated list of proportions for training,'
-                       ' validation, and test split. For example the split '
-                       '`90,5,5` will use 90%% of data for training, 5%% for '
-                       'validation and 5%% for test.')
-    group.add_argument('--train-data-path', nargs='*', default=None,
-                       help='Path to the training dataset. Accepted format:'
-                       '1) a single data path, 2) multiple datasets in the'
-                       'form: dataset1-weight dataset1-path dataset2-weight '
-                       'dataset2-path ...')
-    group.add_argument('--valid-data-path', nargs='*', default=None,
-                       help='Path to the validation dataset. Accepted format:'
-                       '1) a single data path, 2) multiple datasets in the'
-                       'form: dataset1-weight dataset1-path dataset2-weight '
-                       'dataset2-path ...')
-    group.add_argument('--test-data-path', nargs='*', default=None,
-                       help='Path to the test dataset. Accepted format:'
-                       '1) a single data path, 2) multiple datasets in the'
-                       'form: dataset1-weight dataset1-path dataset2-weight '
-                       'dataset2-path ...')
-    group.add_argument('--data-cache-path', default=None,
-                       help='Path to a directory to hold cached index files.')
-
-    group.add_argument('--vocab-size', type=int, default=None,
-                       help='Size of vocab before EOD or padding.')
-    group.add_argument('--vocab-file', type=str, default=None,
-                       help='Path to the vocab file.')
-    group.add_argument('--merge-file', type=str, default=None,
-                       help='Path to the BPE merge file.')
-    group.add_argument('--special-tokens-file', type=str, default=None,
-                       help='Path to the BPE special tokens file.')
-    group.add_argument('--vocab-extra-ids', type=int, default=0,
-                       help='Number of additional vocabulary tokens. '
-                            'They are used for span masking in the T5 model')
-    group.add_argument('--seq-length', type=int, default=None,
-                       help='Maximum sequence length to process.')
-    group.add_argument('--encoder-seq-length', type=int, default=None,
-                       help='Maximum encoder sequence length to process.'
-                       'This should be exclusive of --seq-length')
-    group.add_argument('--decoder-seq-length', type=int, default=None,
-                       help="Maximum decoder sequence length to process.")
-    group.add_argument('--retriever-seq-length', type=int, default=256,
-                       help='Maximum sequence length for the biencoder model for retriever')
-    parser.add_argument("--max-prompt-seq-len", type=int, default=256,
-                        help="The maximum prompt length during RLHF Training.")
-    group.add_argument('--sample-rate', type=float, default=1.0,
-                       help='sample rate for training data. Supposed to be 0 '
-                            ' < sample_rate < 1')
-    group.add_argument('--mask-prob', type=float, default=0.15,
-                       help='Probability of replacing a token with mask.')
-    group.add_argument('--short-seq-prob', type=float, default=0.1,
-                       help='Probability of producing a short sequence.')
-    group.add_argument('--mmap-warmup', action='store_true',
-                       help='Warm up mmap files.')
-    group.add_argument('--num-workers', type=int, default=2,
-                       help="Dataloader number of workers.")
-    group.add_argument('--tokenizer-type', type=str,
-                       default=None,
-                       choices=['BertWordPieceLowerCase',
-                                'BertWordPieceCase',
-                                'GPT2BPETokenizer',
-                                'SentencePieceTokenizer',
-                                'GPTSentencePieceTokenizer',
-                                'HFTokenizer',
-                                'NullTokenizer',
-                                'AquilaTokenizer',
-                                'Llama2Tokenizer',
-                                'Llama3Tokenizer'],
-                       help='What type of tokenizer to use.')
-    group.add_argument('--tokenizer-model', type=str, default=None,
-                       help='Sentencepiece tokenizer model.')
-    group.add_argument('--data-impl', type=str, default='infer',
-                       choices=['mmap', 'infer'],
-                       help='Implementation of indexed datasets.')
-    group.add_argument('--reset-position-ids', action='store_true',
-                       help='Reset posistion ids after end-of-document token.')
-    group.add_argument('--reset-attention-mask', action='store_true',
-                       help='Reset self attention maske after '
-                       'end-of-document token.')
-    group.add_argument('--eod-mask-loss', action='store_true',
-                       help='Mask loss for the end of document tokens.')
-    group.add_argument('--train-data-exact-num-epochs', type=int, default=None,
-                       help='When building the train dataset, force it to be '
-                       'an exact number of epochs of the raw data')
-    group.add_argument('--return-data-index', action='store_true',
-                       help='Return the index of data sample.')
-    group.add_argument('--data-efficiency-curriculum-learning', action='store_true',
-                       help='Use DeepSpeed data efficiency library curriculum learning feature.')
-    group.add_argument('--train-idx-path', type=str, default=None,
-                       help='Force to use certain index file.')
-    group.add_argument('--train-desc-path', type=str, default=None,
-                       help='Force to use certain index file.')
-    group.add_argument('--train-doc-idx-path', type=str, default=None,
-                       help='Force to use certain index file.')
-    group.add_argument('--train-sample-idx-path', type=str, default=None,
-                       help='Force to use certain index file.')
-    group.add_argument('--train-shuffle-idx-path', type=str, default=None,
-                       help='Force to use certain index file.')
-    group.add_argument('--repeated-dataloader', action='store_true',
-                       help='Once all the data has been loaded, reuse the DataLoader.')
-    return parser
+    if args.custom_recompute_layers_per_stage:
+        if args.virtual_pipeline_model_parallel_size is not None:
+            assert len(args.custom_recompute_layers_per_stage) == args.pipeline_model_parallel_size * args.virtual_pipeline_model_parallel_size, \
+                f"custom recompute_num_layers_per_stage length ({len(args.custom_recompute_layers_per_stage)}) should equal to total virtual pp stage size ({args.pipeline_model_parallel_size * args.virtual_pipeline_model_parallel_size})"
+        else:
+            assert len(args.custom_recompute_layers_per_stage) == args.pipeline_model_parallel_size, \
+                f"custom recompute_num_layers_per_stage ({len(args.custom_recompute_layers_per_stage)}) length should equal to PP size ({args.pipeline_model_parallel_size})"
+        
+        ## 若是deepseed使用自定义重计算pp stage则不考虑如下
+        if not args.deepspeed:
+            assert args.recompute_granularity == 'full', \
+            'custom recompute layers pp stage is only '\
+            'application to full recompute granularity'
+        
+            if args.virtual_pipeline_model_parallel_size is None:
+                num_layers_per_stage = args.num_layers // args.pipeline_model_parallel_size
+            else:
+                num_layers_per_stage = args.num_layers_per_virtual_pipeline_stage
+            if args.custom_partition is None:
+                assert max(args.custom_recompute_layers_per_stage) <= num_layers_per_stage, \
+                "recompute layers per PP stage should small than num layers per stage." \
+                f"get max recompute layers: {max(args.custom_recompute_layers_per_stage)}" \
+                f"average num layers per stage: {num_layers_per_stage}"
+            else:
+                for i in range(args.pipeline_model_parallel_size):
+                    assert args.custom_recompute_layers_per_stage[i] <= args.custom_partition[i], \
+                    "recompute layers per PP stage should small the num layers of PP stage" \
+                    f"stage ({i}): recompute layers ({args.custom_recompute_layers_per_stage[i]})  >  stage layers ({args.custom_partition[i]})"
 
+    if args.recompute_num_layers_per_stage is None and args.custom_recompute_layers_per_stage:
+        args.recompute_num_layers_per_stage = args.custom_recompute_layers_per_stage
+    elif args.recompute_num_layers_per_stage is not None and args.custom_recompute_layers_per_stage is None:
+        args.custom_recompute_layers_per_stage = args.recompute_num_layers_per_stage
 
-def _add_autoresume_args(parser):
-    group = parser.add_argument_group(title='autoresume')
+    if args.num_layers_per_stage is None and args.custom_partition:
+        args.num_layers_per_stage = args.custom_partition
+    elif args.num_layers_per_stage is not None and args.custom_partition is None:
+        args.custom_partition = args.num_layers_per_stage
 
-    group.add_argument('--adlr-autoresume', action='store_true',
-                       help='Enable autoresume on adlr cluster.')
-    group.add_argument('--adlr-autoresume-interval', type=int, default=1000,
-                       help='Intervals over which check for autoresume'
-                       'termination signal')
+    # disable sequence parallelism when tp=1
+    # to avoid change in numerics when
+    # sequence_parallelism is enabled.
+    if args.parallel_group_num == None:
+        if args.tensor_model_parallel_size == 1:
+            args.sequence_parallel = False
+    else:
+        if 1 in args.tp_size_of_each_pipeline_stage:
+            if args.rank == 0:
+                print("Set sequence_parallel False for some parallel group's tp size match 1")
+            args.sequence_parallel = False
 
-    return parser
+    # disable async_tensor_model_parallel_allreduce when
+    # model parallel memory optimization is enabled
+    if args.sequence_parallel:
+        args.async_tensor_model_parallel_allreduce = False
 
+    # TODO: currently DeepSpeed seems to be incompatible with
+    # async_tensor_model_parallel_allreduce thus temporarily disabling it.
+    # Need further investigation.
+    if args.deepspeed:
+        args.async_tensor_model_parallel_allreduce = False
 
-def _add_biencoder_args(parser):
-    group = parser.add_argument_group(title='biencoder')
+    if not args.use_dataset_only:
+        if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
+            if args.sequence_parallel:
+                raise RuntimeError(
+                    "Using sequence parallelism requires setting the environment variable "
+                    "CUDA_DEVICE_MAX_CONNECTIONS to 1")
+            if args.async_tensor_model_parallel_allreduce:
+                raise RuntimeError(
+                    "Using async gradient all reduce requires setting the environment "
+                    "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
 
-    # network size
-    group.add_argument('--ict-head-size', type=int, default=None,
-                       help='Size of block embeddings to be used in ICT and '
-                        'REALM (paper default: 128)')
-    group.add_argument('--biencoder-projection-dim', type=int, default=0,
-                       help='Size of projection head used in biencoder (paper'
-                        ' default: 128)')
-    group.add_argument('--biencoder-shared-query-context-model', action='store_true',
-                        help='Whether to share the parameters of the query '
-                        'and context models or not')
+    # Disable bias gelu fusion if we are disabling bias altogether
+    if not args.add_bias_linear:
+        args.bias_gelu_fusion = False
 
-    # checkpointing
-    group.add_argument('--ict-load', type=str, default=None,
-                       help='Directory containing an ICTBertModel checkpoint')
-    group.add_argument('--bert-load', type=str, default=None,
-                       help='Directory containing an BertModel checkpoint '
-                       '(needed to start ICT and REALM)')
+    # Retro checks.
+    if args.retro_add_retriever:
 
-    # data
-    group.add_argument('--titles-data-path', type=str, default=None,
-                       help='Path to titles dataset used for ICT')
-    group.add_argument('--query-in-block-prob', type=float, default=0.1,
-                       help='Probability of keeping query in block for '
-                       'ICT dataset')
-    group.add_argument('--use-one-sent-docs', action='store_true',
-                       help='Whether to use one sentence documents in ICT')
-    group.add_argument('--evidence-data-path', type=str, default=None,
-                       help='Path to Wikipedia Evidence frm DPR paper')
-
-    # training
-    group.add_argument('--retriever-report-topk-accuracies', nargs='+', type=int,
-                        default=[], help="Which top-k accuracies to report "
-                        "(e.g. '1 5 20')")
-    group.add_argument('--retriever-score-scaling', action='store_true',
-                       help='Whether to scale retriever scores by inverse '
-                        'square root of hidden size')
-
-    # faiss index
-    group.add_argument('--block-data-path', type=str, default=None,
-                       help='Where to save/load BlockData to/from')
-    group.add_argument('--embedding-path', type=str, default=None,
-                       help='Where to save/load Open-Retrieval Embedding'
-                        ' data to/from')
-
-    # indexer
-    group.add_argument('--indexer-batch-size', type=int, default=128,
-                       help='How large of batches to use when doing indexing '
-                       'jobs')
-    group.add_argument('--indexer-log-interval', type=int, default=1000,
-                       help='After how many batches should the indexer '
-                       'report progress')
-    return parser
+        # Train samples should be auto-loaded.
+        assert args.train_samples is not None, \
+            "args.train_samples should be auto-loaded from the retro config."
 
+        # Sequence parallelism unsupported.
+        assert not args.sequence_parallel, \
+            "retro currently does not support sequence parallelism."
 
-def _add_vision_args(parser):
-    group = parser.add_argument_group(title="vision")
-
-    # general vision arguements
-    group.add_argument('--num-classes', type=int, default=1000,
-                       help='num of classes in vision classificaiton task')
-    group.add_argument('--img-h', type=int, default=224,
-                       help='Image height for vision classification task')
-    group.add_argument('--img-w', type=int, default=224,
-                       help='Image height for vision classification task')
-    group.add_argument('--num-channels', type=int, default=3,
-                       help='Number of channels in input image data')
-    group.add_argument('--patch-dim', type=int, default=16,
-                       help='patch dimension')
-    group.add_argument('--classes-fraction', type=float, default=1.0,
-                       help='training with fraction of classes.')
-    group.add_argument('--data-per-class-fraction', type=float, default=1.0,
-                       help='training with fraction of data per class.')
-    group.add_argument('--no-data-sharding', action='store_false',
-                       help='Disable data sharding.',
-                       dest='data_sharding')
-    group.add_argument('--head-lr-mult', type=float, default=1.0,
-                       help='learning rate multiplier for head during finetuning')
-
-    # pretraining type and backbone selection`
-    group.add_argument('--vision-pretraining', action='store_true',
-                       help='flag to indicate vision pretraining')
-    group.add_argument('--vision-pretraining-type', type=str, default='classify',
-                       choices=['classify', 'inpaint', 'dino'],
-                       help='pretraining objectives')
-    group.add_argument('--vision-backbone-type', type=str, default='vit',
-                       choices=['vit', 'mit', 'swin'],
-                       help='backbone types types')
-    group.add_argument('--swin-backbone-type', type=str, default='tiny',
-                       choices=['tiny', 'base', 'h3'],
-                       help='pretraining objectives')
-
-    # inpainting arguments
-    group.add_argument('--mask-type', type=str, default='random',
-                       choices=['random', 'row'],
-                       help='mask types')
-    group.add_argument('--mask-factor', type=float, default=1.0,
-                       help='mask size scaling parameter')
-
-    # dino arguments
-    group.add_argument('--iter-per-epoch', type=int, default=1250,
-                       help='iterations per epoch')
-    group.add_argument('--dino-local-img-size', type=int, default=96,
-                       help='Image size for vision classification task')
-    group.add_argument('--dino-local-crops-number', type=int, default=10,
-                       help='Number of local crops')
-    group.add_argument('--dino-head-hidden-size', type=int, default=2048,
-                       help='Hidden dimension size in dino head')
-    group.add_argument('--dino-bottleneck-size', type=int, default=256,
-                       help='Bottle neck dimension in dino head ')
-    group.add_argument('--dino-freeze-last-layer', type=float, default=1,
-                       help='Freezing last layer weights')
-    group.add_argument('--dino-norm-last-layer', action='store_true',
-                       help='Disable Norm in last layer.')
-    group.add_argument('--dino-warmup-teacher-temp', type=float, default=0.04,
-                       help='warump teacher temperature')
-    group.add_argument('--dino-teacher-temp', type=float, default=0.07,
-                       help='teacher temperature')
-    group.add_argument('--dino-warmup-teacher-temp-epochs', type=int, default=30,
-                       help='warmup teacher temperaure epochs')
+        # Pipeline parallelism unsupported.
+        assert args.pipeline_model_parallel_size == 1, \
+            "retro currently does not support pipeline parallelism."
 
-    return parser
+    if args.decoupled_lr is not None or args.decoupled_min_lr is not None:
+        assert not args.use_legacy_models, \
+            '--decoupled-lr and --decoupled-min-lr is not supported in legacy models.'
+        assert not args.use_dist_ckpt, "Distributed checkpointing does not work with decoupled LR yet."
 
-def _add_experimental_args(parser):
-    group = parser.add_argument_group(title='experimental')
+    ## meg-ds start
+    args.curriculum_learning_legacy = False
+    args.compression_training = False
+    args.mos = False
+    args.kd = False
 
-    group.add_argument('--spec', type=str, default=None, nargs=2,
-                       help='Specify the <module_location function_name> pair '
-                       'that returns a spec to customize a model, transformer '
-                       'block, or transformer layer, depending on the use case. '
-                       'For more details, see the model class, '
-                       '`transformer_block.py`, or `transformer_layer.py`')
+    # FlashAttention
+    args.use_flash_attn = args.use_flash_attn or args.use_flash_attn_v1 or args.use_flash_attn_triton or args.use_flash_attn_v2
 
-    return parser
+    # AML
+    if args.aml_data_download_path is not None:
+        data_paths = []
+        for path in args.data_path:
+            data_paths.append(f"{args.aml_data_download_path}/{path}")
+        args.data_path = data_paths
 
-def _add_zero_args(parser):
-    """Text generate arguments."""
+    # GQA
+    if not args.use_dataset_only:
+        if args.group_query_attention:
+            args.num_key_value_heads = args.num_query_groups
+        if args.num_key_value_heads is None:
+            args.num_key_value_heads = args.num_attention_heads
+        assert args.num_attention_heads % args.num_key_value_heads == 0, \
+            f"num_attention_heads must be divisible by num_key_value_heads (got `num_attention_heads`: {args.num_attention_heads} " \
+            f"and `num_key_value_heads`: {args.num_key_value_heads})."
+    ## meg-ds end
 
-    group = parser.add_argument_group('ZeRO configurations', 'configurations')
-    group.add_argument("--zero-stage", type=int, default=1.0)
-    group.add_argument('--zero-reduce-scatter', action='store_true',
-                       help='Use reduce scatter if specified')
-    group.add_argument('--zero-contigious-gradients', action='store_true',
-                       help='Use contigious memory optimizaiton if specified')
-    group.add_argument("--zero-reduce-bucket-size", type=int, default=0.0)
-    group.add_argument("--zero-allgather-bucket-size", type=int, default=0.0)
-    group.add_argument('--remote-device', type=str, default='none', choices=['none', 'cpu', 'nvme'],
-                      help='Remote device for ZeRO-3 initialized parameters.')
-    group.add_argument('--use-pin-memory', action='store_true',
-                     help='Use pinned CPU memory for ZeRO-3 initialized model parameters.')
-    return parser
+    # Legacy RoPE arguments
+    if args.use_rotary_position_embeddings:
+        args.position_embedding_type = 'rope'
+    if args.rotary_interleaved and args.apply_rope_fusion:
+        raise RuntimeError('--rotary-interleaved does not work with rope_fusion.')
+    if args.rotary_interleaved and args.use_legacy_models:
+        raise RuntimeError('--rotary-interleaved is not supported in legacy models.')
 
-def _add_memoryopt_args(parser):
-    """Memory optimization arguments."""
+    # Would just need to add 'NoPE' as a position_embedding_type to support this, but for now
+    # don't allow it to keep things simple
+    if not args.add_position_embedding and args.position_embedding_type != 'rope':
+        raise RuntimeError('--no-position-embedding is deprecated, use --position-embedding-type')
 
-    group = parser.add_argument_group('Memory optimizations', 'configurations')
-    group.add_argument("--scattered-embeddings", action='store_true',
-                       help='Save memory by scattering embedding activations. '
-                            'Introduces dropout differences across MP configurations.')
-    group.add_argument("--split-transformers", action='store_true',
-                       help='Save memory by splitting transformer layers into two parts, '
-                       'allowing for more frequent activation checkpoint savings.')
-    group.add_argument("--memory-centric-tiled-linear", action="store_true",
-                       help='Save memory by tiling with deepspeed.zero.TiledLinear.')
-    group.add_argument("--tile-factor", type=int, default=1,
-                       help='Make all linear layers the same size of [hidden/tile_factor, hidden/tile_factor]. '
-                            'Must be enabled with --memory-centric-tiled-linear. '
-                            'Example A: if tile_factor=1, the qkv layer [hidden, 3* hidden] would be converted into [1,3] tiles of size [hidden,hidden]. '
-                            'Example B: if tile_factor=2, the intermediate layer [4*hidden, hidden] will be converted into [8, 2] tiles of size [hidden/2, hidden/2]. '
-                            'Default is 1.')
+    # MoE Spec check
+    if args.num_experts == 0:
+        args.num_experts = None
+    if args.num_experts is not None:
+        assert args.spec is None, "Model Spec must be None when using MoEs"
 
-    return parser
+    # Context parallel
+    # if args.context_parallel_size > 1:
+    #     assert not args.use_legacy_models, "Context parallelism is not supported in legacy models."
 
-def _add_activation_checkpoint_args(parser):
-    group = parser.add_argument_group('Activation Checkpointing',
-                                      'Checkpointing Configurations')
-    group.add_argument('--deepspeed-activation-checkpointing', action='store_true',
-                       help='uses activation checkpointing from deepspeed')
-    group.add_argument('--partition-activations', action='store_true',
-                       help='partition Activations across GPUs before checkpointing.')
-    group.add_argument('--contigious-checkpointing', action='store_true',
-                       help='Contigious memory checkpointing for activatoins.')
-    group.add_argument('--checkpoint-in-cpu', action='store_true',
-                       help='Move the activation checkpoints to CPU.')
-    group.add_argument('--synchronize-each-layer', action='store_true',
-                       help='does a synchronize at the beginning and end of each checkpointed layer.')
-    group.add_argument('--profile-backward', action='store_true',
-                       help='Enables backward pass profiling for checkpointed layers.')
-    return parser
+    # Expert parallelism check
+    if args.expert_model_parallel_size  > 1:
+        assert args.num_experts is not None, "num_experts must be non None to use expert model parallelism"
+        assert args.num_experts % args.expert_model_parallel_size == 0, \
+            "Number of experts should be a multiple of expert model parallel_size."
+        assert not args.fp16, \
+            "Expert parallelism is not supported with fp16 training."
 
+    # Distributed checkpointing checks
+    if args.use_dist_ckpt and args.use_legacy_models:
+        raise RuntimeError('--use-dist-ckpt is not supported in legacy models.')
+
+    # Data blend checks
+    assert args.mock_data + \
+           bool(args.data_path) + \
+           any([args.train_data_path, args.valid_data_path, args.test_data_path]) \
+           <= 1, "A single data source must be provided in training mode, else None"
+
+    if args.use_tp_pp_dp_mapping:
+        assert args.context_parallel_size * args.expert_model_parallel_size <= 1, \
+            "context_parallel and expert_model_parallel can't be used with tp-pp-dp mapping."
+
+    # Deterministic mode
+    if args.deterministic_mode:
+        assert not args.use_flash_attn, 'Flash attention can not be used in deterministic mode.'
+
+        all_reduce_choices = ["Tree", "Ring", "CollnetDirect", "CollnetChain", "^NVLS"]
+        assert os.getenv("NCCL_ALGO", -1) != -1 and os.getenv("NCCL_ALGO") in all_reduce_choices, \
+            f"NCCL_ALGO must be one of {all_reduce_choices}."
+
+    # Update the printed args to reflect that `apply_query_key_layer_scaling` also controls `attention_softmax_in_fp32`
+    if args.apply_query_key_layer_scaling:
+        args.attention_softmax_in_fp32 = True
+
+    # Checkpointing
+    if args.ckpt_fully_parallel_save_deprecated and args.rank == 0:
+        print('--ckpt-fully-parallel-save flag is deprecated and has no effect.'
+              ' Use --no-ckpt-fully-parallel-save to disable parallel save.')
+    if (
+        args.use_dist_ckpt
+        and not args.ckpt_fully_parallel_save
+        and args.use_distributed_optimizer
+        and args.rank == 0
+    ):
+        print('Warning: With non-parallel ckpt save and DistributedOptimizer,'
+              ' it will be impossible to resume training with different parallelism.'
+              ' Consider removing flag --no-ckpt-fully-parallel-save.')
+
+    if args.two_stage_p2p and args.sequence_parallel:
+        raise RuntimeError('--two-stage-p2p only support sequence_parallel off.')
+
+    if args.create_attention_mask_in_dataloader and args.rank == 0:
+        print('WARNING: create_attention_mask_in_dataloader is True, do you really need it?!')
 
-def _add_distillation_args(parser):
-    group = parser.add_argument_group('Knowledge distillation',
-                                      'Distillation Configurations')
-    
-    group.add_argument('--num-layers-teacher', type=int, default=None,
-                       help='Number of the teacher transformer layers.')                  
-    group.add_argument('--num-experts-teacher', type=int, nargs='+', default=[1,],
-                        help='number of teacher experts list, MoE related.')
-    group.add_argument('--hidden-size-teacher', type=int, default=None,
-                       help='Tansformer teacher hidden size.')
-    group.add_argument('--num-attention-heads-teacher', type=int, default=None,
-                       help='Number of teacher transformer attention heads.') 
-
-    group.add_argument('--mos', action='store_true',
-                       help='Enable Mixture-of-Students via knolwedge distillation.')
-    group.add_argument('--kd', action='store_true',
-                       help='Enable knolwedge distillation.')
-    group.add_argument('--kd-alpha-ce', default=1, type=float)
-    group.add_argument('--kd-beta-ce', default=1, type=float)
-    group.add_argument('--kd-temp', default=1.0, type=float)
-    group.add_argument('--reset-iteration', action='store_true',
-                    help='Reset the iteration count.')
-    
-    group.add_argument('--load-teacher', type=str, default=None,
-                       help='Directory containing a teacher model checkpoint.')
+    # Print arguments.
+    _print_args("arguments", args)
 
-    return parser
+    if args.pp_delay:
+        if not args.overlap_p2p_comm:
+            args.pp_delay = False
+
+    return args
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/training/checkpointing.py b/toolbox/Megatron-DeepSpeed/megatronspeed/training/checkpointing.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8077928aff78241752bda5291b499d12fb9d8d5
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/training/checkpointing.py
@@ -0,0 +1,626 @@
+"""Input/output checkpointing."""
+
+from logging import getLogger
+import os
+import random
+import sys
+import numpy as np
+from time import time
+from functools import wraps
+
+import torch
+
+from megatron.core import mpu, tensor_parallel, dist_checkpointing
+from megatron.core.dist_checkpointing.mapping import ShardedObject
+from megatron.core.dist_checkpointing.serialization import get_default_load_sharded_strategy
+from megatron.core.dist_checkpointing.strategies.fully_parallel import \
+    FullyParallelSaveStrategyWrapper, FullyParallelLoadStrategyWrapper
+from megatron.core.num_microbatches_calculator import update_num_microbatches
+from megatron.training.async_utils import schedule_async_save
+from megatron.training.global_vars import get_args, get_one_logger, get_tokenizer
+from megatron.training.utils import unwrap_model, print_rank_0, append_to_progress_log, is_last_rank
+from megatron.core.dist_checkpointing.serialization import \
+    get_default_save_sharded_strategy
+from megatron.training.one_logger_utils import on_save_checkpoint_start, on_save_checkpoint_success
+
+from megatron.training.checkpointing import (
+    set_checkpoint_version,
+    get_checkpoint_version,
+    get_checkpoint_name,
+    get_distributed_optimizer_checkpoint_name,
+    get_checkpoint_tracker_filename,
+    checkpoint_exists,
+    read_metadata,
+    get_rng_state,
+    _load_base_checkpoint,
+    fix_query_key_value_ordering,
+    ensure_directory_exists,
+    find_checkpoint_rank_0,
+    logger
+)
+
+from deepspeed.checkpoint import (
+    ORIGINAL_VOCAB_SIZE,
+    PADDED_VOCAB_SIZE,
+    UNIVERSAL_CHECKPOINT_INFO,
+    UNIVERSAL_CHECKPOINT_VERSION_KEY,
+    UNIVERSAL_CHECKPOINT_VERSION_VALUE,
+)
+
+# [ModelOpt]: Import
+try:
+    from modelopt.torch.opt.plugins import (
+        save_modelopt_state,
+        save_sharded_modelopt_state,
+        restore_modelopt_state,
+        restore_sharded_modelopt_state,
+    )
+    has_nvidia_modelopt = True
+except Exception:
+    has_nvidia_modelopt = False
+
+_CHECKPOINT_VERSION = None
+
+def check_checkpoint_args(checkpoint_args):
+    """Ensure fixed arguments for a model are the same for the input
+    arguments and the one retrieved from checkpoint."""
+    args = get_args()
+
+    def _compare(arg_name, old_arg_name=None, default=None):
+        if old_arg_name is not None:
+            ckpt_arg_name = old_arg_name
+        else:
+            ckpt_arg_name = arg_name
+        if default is not None:
+            checkpoint_value = getattr(checkpoint_args, ckpt_arg_name, default)
+        else:
+            checkpoint_value = getattr(checkpoint_args, ckpt_arg_name)
+        args_value = getattr(args, arg_name)
+        error_message = '{} value from checkpoint ({}) is not equal to the ' \
+                        'input argument value ({}).'.format(
+                            arg_name, checkpoint_value, args_value)
+        assert checkpoint_value == args_value, error_message
+
+    _compare('num_layers')
+    _compare('hidden_size')
+    _compare('num_attention_heads')
+    _compare('add_position_embedding', default=True)
+    if args.vocab_file:
+        _compare('max_position_embeddings')
+        if not args.universal_checkpoint:
+            _compare('make_vocab_size_divisible_by')
+        if not args.use_dist_ckpt or not args.universal_checkpoint:
+            _compare('padded_vocab_size')
+        _compare('tokenizer_type')
+    if args.data_parallel_random_init:
+        _compare('data_parallel_random_init')
+    if get_checkpoint_version() < 3.0 and not args.universal_checkpoint:
+        _compare('tensor_model_parallel_size',
+                 old_arg_name='model_parallel_size')
+    if get_checkpoint_version() >= 3.0 and (not args.use_dist_ckpt or not args.universal_checkpoint):
+        _compare('tensor_model_parallel_size')
+        _compare('pipeline_model_parallel_size')
+
+def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floating_point_operations_so_far=0, checkpointing_context=None,
+                    pipeline_rank=None,expert_rank=None, tensor_rank=None, pipeline_parallel=None, expert_parallel=None):
+    """Save a model checkpoint.
+
+    Checkpointing context is used to persist some checkpointing state
+    throughout a single job. Must be initialized externally (not used if None).
+    """
+    start_ckpt = time()
+    args = get_args()
+
+    # Prepare E2E metrics at start of save checkpoint
+    productive_metrics = on_save_checkpoint_start(args.async_save)
+
+    # Only rank zero of the data parallel writes to the disk.
+    if not args.deepspeed:
+        model = unwrap_model(model)
+
+    ckpt_format = args.dist_ckpt_format if args.use_dist_ckpt else 'torch'
+    print_rank_0('saving checkpoint at iteration {:7d} to {} in {} format'.format(
+        iteration, args.save, ckpt_format))
+
+    # Collect rng state across data parallel ranks.
+    rng_state = get_rng_state(args.use_dist_ckpt)
+
+    # Checkpoint name.
+    checkpoint_name = get_checkpoint_name(args.save, iteration, release=False, pipeline_parallel=pipeline_parallel,
+        tensor_rank=tensor_rank, pipeline_rank=pipeline_rank, expert_parallel=expert_parallel, expert_rank=expert_rank, return_base_dir=args.use_dist_ckpt)
+
+    # Save distributed optimizer's custom parameter state.
+    if args.use_distributed_optimizer and not args.no_save_optim and optimizer is not None and not args.use_dist_ckpt:
+        optim_checkpoint_name = \
+            get_distributed_optimizer_checkpoint_name(checkpoint_name)
+        ensure_directory_exists(optim_checkpoint_name)
+        optimizer.save_parameter_state(optim_checkpoint_name)
+
+    async_save_request = None
+    if args.async_save:
+        if not args.use_dist_ckpt:
+            raise NotImplementedError('Async checkpoint save not implemented for legacy checkpoints')
+        elif args.dist_ckpt_format != 'torch_dist':
+            raise NotImplementedError(f'Async checkpoint save not implemented for {args.dist_ckpt_format} distributed checkpoint format')
+
+    # Collect args, model, RNG.
+    if not torch.distributed.is_initialized() \
+            or mpu.get_data_modulo_expert_parallel_rank(with_context_parallel=True) == 0 \
+            or args.use_dist_ckpt or args.deepspeed:
+
+        optim_sd_kwargs = {}
+        if args.use_dist_ckpt and args.use_distributed_optimizer:
+            optim_sd_kwargs['sharding_type'] = ('fully_sharded_model_space'
+                                                if args.ckpt_fully_parallel_save
+                                                else 'dp_zero_gather_scatter')
+            print_rank_0(f'Storing distributed optimizer sharded state of type {optim_sd_kwargs["sharding_type"]}')
+        state_dict = generate_state_dict(args, model, optimizer, opt_param_scheduler, rng_state,
+                                         args.use_dist_ckpt, iteration, optim_sd_kwargs=optim_sd_kwargs)
+
+        if not args.deepspeed:
+            state_dict['num_floating_point_operations_so_far'] = num_floating_point_operations_so_far
+            if args.use_dist_ckpt:
+                if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
+                    ensure_directory_exists(checkpoint_name, check_parent=False)
+                validate_sharding_integrity = True
+                save_strategy = (checkpointing_context or {}).get('save_strategy',
+                                                                get_default_save_sharded_strategy(args.dist_ckpt_format))
+                if args.ckpt_assume_constant_structure and args.dist_ckpt_format == 'torch_dist':
+                    save_strategy.use_cached_ckpt_structure = args.ckpt_assume_constant_structure
+                if args.ckpt_fully_parallel_save:
+                    if checkpointing_context is not None and 'save_strategy' in checkpointing_context:
+                        # Already saved once before - don't need to rerun sharding validation
+                        validate_sharding_integrity = not args.ckpt_assume_constant_structure
+                    else:
+                        save_strategy = FullyParallelSaveStrategyWrapper(save_strategy, mpu.get_data_parallel_group(with_context_parallel=True),
+                                                                        args.ckpt_assume_constant_structure)
+                # Store save strategy for future checkpoint saves
+                if checkpointing_context is not None:
+                    checkpointing_context['save_strategy'] = save_strategy
+                end_ckpt = time()
+                if not torch.distributed.is_initialized():
+                    logger.debug(f"takes {end_misc - start_misc} to finalize ckpt save ")
+                else:
+                    logger.debug(f"rank: {torch.distributed.get_rank()}, takes {end_ckpt - start_ckpt} to prepare state dict for ckpt ")
+                async_save_request = dist_checkpointing.save(state_dict, checkpoint_name, save_strategy,
+                                                            async_sharded_save=args.async_save)
+
+                # [ModelOpt]: save sharded modelopt_state
+                if has_nvidia_modelopt:
+                    save_sharded_modelopt_state(model, checkpoint_name, (args.dist_ckpt_format, 1))
+            else:
+                # [ModelOpt]: Inject modelopt_state into state_dict
+                if has_nvidia_modelopt:
+                    save_modelopt_state(model, state_dict)
+
+                # Save.
+                ensure_directory_exists(checkpoint_name)
+                torch.save(state_dict, checkpoint_name)
+    
+    if args.deepspeed:
+        #megatron model uses state_dict_for_save_checkpointing instead of the standard state_dict
+        #state_dict is used by deepspeed for module saving so it needs to point to the right function
+        if args.no_pipeline_parallel:
+            original_state_dict = model[0].module.state_dict
+            def state_dict_for_save_checkpoint_deepspeed(destination=None, prefix='', keep_vars=False):
+                return model[0].module.state_dict_for_save_checkpoint(prefix=prefix, keep_vars=keep_vars)
+            model[0].module.state_dict = state_dict_for_save_checkpoint_deepspeed
+
+        # Saving is a collective communication
+        checkpoint_name = get_checkpoint_name(args.save, iteration)
+
+        # Trim off the filename and mp_rank_* directory.
+        for _ in range(3):
+            checkpoint_name = os.path.dirname(checkpoint_name)
+        model[0].save_checkpoint(checkpoint_name, client_state=state_dict)
+
+        if args.no_pipeline_parallel:
+            model[0].module.state_dict = original_state_dict
+
+    if not args.deepspeed:
+        start_misc = time()
+        if not args.async_save:
+            assert async_save_request is None
+            # Wait so everyone is done (necessary)
+            if torch.distributed.is_initialized():
+                torch.distributed.barrier()
+
+        # And update the latest iteration
+        if not torch.distributed.is_initialized() \
+            or torch.distributed.get_rank() == 0 \
+            or (getattr(args, 'data_cache_local', None) and torch.distributed.get_rank() % torch.cuda.device_count() == 0):
+            tracker_filename = get_checkpoint_tracker_filename(args.save)
+
+            def iter_finalize_fn():
+                with open(tracker_filename, 'w') as f:
+                    f.write(str(iteration))
+                print_rank_0('  successfully saved checkpoint from iteration {:7d} to {}'
+                            .format(iteration, args.save))
+                if args.log_progress and args.async_save:
+                    append_to_progress_log(f'Saved async checkpoint\tIteration: {iteration}',
+                                        barrier=False)
+
+            if args.async_save:
+                assert async_save_request is not None
+                async_save_request.add_finalize_fn(iter_finalize_fn)
+            else:
+                iter_finalize_fn()
+
+        # Additional callback for one_logger (last rank)
+        if not torch.distributed.is_initialized() \
+        or is_last_rank():
+            def onelogger_finalize_fn():
+                on_save_checkpoint_success(productive_metrics, args.async_save)
+            if args.async_save:
+                assert async_save_request is not None
+                async_save_request.add_finalize_fn(onelogger_finalize_fn)
+            else:
+                onelogger_finalize_fn()
+
+        if args.async_save:
+            schedule_async_save(async_save_request)
+            print_rank_0('  scheduled an async checkpoint save at iteration {:7d} to {}' \
+                        .format(iteration, args.save))
+
+        # Wait so everyone is done (not necessary)
+        if torch.distributed.is_initialized():
+            torch.distributed.barrier()
+
+        end_misc = time()
+        if not torch.distributed.is_initialized():
+            logger.debug(f"takes {end_misc - start_misc} to finalize ckpt save ")
+        else:
+            logger.debug(f"rank: {torch.distributed.get_rank()}, takes {end_misc - start_misc} to finalize ckpt save ")
+    else:
+        # Wait so everyone is done (necessary)
+        if torch.distributed.is_initialized():
+            torch.distributed.barrier()
+
+        print_rank_0('  successfully saved checkpoint at iteration {:7d} to {}' \
+                    .format(iteration, args.save))
+
+        # And update the latest iteration
+        if not torch.distributed.is_initialized() \
+            or torch.distributed.get_rank() == 0 \
+            or (getattr(args, 'data_cache_local', None) and torch.distributed.get_rank() % torch.cuda.device_count() == 0):
+            tracker_filename = get_checkpoint_tracker_filename(args.save)
+            with open(tracker_filename, 'w') as f:
+                f.write(str(iteration))
+
+        # Wait so everyone is done (not necessary)
+        if torch.distributed.is_initialized():
+            torch.distributed.barrier()
+
+def generate_state_dict(args, model, optimizer, opt_param_scheduler,
+                        rng_state, use_dist_ckpt=False, iteration=None,
+                        optim_sd_kwargs=None):
+    # Arguments, iteration, and model.
+    state_dict = {}
+    state_dict['args'] = args
+    state_dict['checkpoint_version'] = 3.0
+    if iteration is not None:
+        state_dict['iteration'] = iteration
+    state_dict['tokens'] = args.consumed_train_tokens
+
+    if args.deepspeed:
+        state_dict[UNIVERSAL_CHECKPOINT_INFO] = _universal_checkpoint_info(model)
+
+    # DeepSpeed saves the model/optimizer/scheduler
+    if not args.deepspeed:
+        if len(model) == 1:
+            state_dict['model'] = (model[0].sharded_state_dict()
+                                if use_dist_ckpt else
+                                model[0].state_dict_for_save_checkpoint())
+        else:
+            for i in range(len(model)):
+                mpu.set_virtual_pipeline_model_parallel_rank(i)
+                state_dict['model%d' % i] = (
+                    model[i].sharded_state_dict()
+                    if use_dist_ckpt else
+                    model[i].state_dict_for_save_checkpoint())
+        # Optimizer stuff.
+        if not args.no_save_optim:
+            if optimizer is not None:
+                state_dict['optimizer'] = (optimizer.sharded_state_dict(state_dict, **(optim_sd_kwargs or {}))
+                                        if use_dist_ckpt else
+                                        optimizer.state_dict())
+            if opt_param_scheduler is not None:
+                state_dict['opt_param_scheduler'] = \
+                    opt_param_scheduler.state_dict()
+    # RNG states.
+    if not args.no_save_rng:
+        state_dict["rng_state"] = rng_state
+    return state_dict
+
+# Not used! num_key_value_heads can be replaced with megatron-lm num_query_groups.
+def load_args_from_checkpoint_wrapper(load_args_from_checkpoint):
+    @wraps(load_args_from_checkpoint)
+    def wrapper(args, load_arg='load', exit_on_missing_checkpoint=False):
+        args, checkpoint_args = load_args_from_checkpoint(args, load_arg, exit_on_missing_checkpoint)
+
+        def _set_arg(arg_name, old_arg_name=None, force=False):
+            if not force and getattr(args, arg_name, None) is not None:
+                return
+
+            if old_arg_name is not None:
+                checkpoint_value = getattr(checkpoint_args, old_arg_name, None)
+            else:
+                checkpoint_value = getattr(checkpoint_args, arg_name, None)
+
+            if checkpoint_value is not None:
+                print_rank_0(f"Setting {arg_name} to {checkpoint_value} from checkpoint")
+                setattr(args, arg_name, checkpoint_value)
+            else:
+                print_rank_0(f"Checkpoint did not provide arguments {arg_name}")
+
+        _set_arg('num_key_value_heads')
+
+        return args, checkpoint_args
+    return wrapper
+
+
+def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', strict=True, load_only_weights=False):
+    """Load a model checkpoint and return the iteration.
+    strict (bool): whether to strictly enforce that the keys in
+        :attr:`state_dict` of the checkpoint match the names of
+        parameters and buffers in model.
+    """
+    args = get_args()
+    load_dir = getattr(args, load_arg)
+
+    # Finetuning directories
+    pretrained_dir = getattr(args,'pretrained_checkpoint', None)
+    if pretrained_dir is not None and not checkpoint_exists(load_dir):
+        print_rank_0(f'Checkpoint file not found in load directory {load_dir} attempting to finetune with checkpoint in {pretrained_dir}')
+        load_dir = pretrained_dir
+        if not checkpoint_exists(load_dir):
+            raise FileNotFoundError("No checkpoint found in load directory or pretrained directory")
+        args.finetune = True
+
+    if args.deepspeed:
+        if args.finetune:
+            loaded_dir, state_dict = model[0].load_checkpoint(load_dir,
+                load_module_strict=strict, load_optimizer_states=False,
+                load_lr_scheduler_states=False, load_module_only=True,
+                tag=args.load_tag)
+        else:
+            loaded_dir, state_dict = model[0].load_checkpoint(load_dir,
+                load_module_strict=strict, tag=args.load_tag)
+        if loaded_dir is None:
+            print_rank_0('WARNING: could not find the metadata file {} '.format(
+                load_dir))
+            print_rank_0('    will not load any checkpoints and will start from '
+                        'random')
+            return 0, 0
+        release = False        
+    else:
+        model = unwrap_model(model)
+
+        load_kwargs = {}
+        is_dist_ckpt = False
+        if args.auto_detect_ckpt_format or args.use_dist_ckpt:
+            state_dict, checkpoint_name, release = _load_base_checkpoint(load_dir, rank0=True, exit_on_missing_checkpoint=args.exit_on_missing_checkpoint)
+            is_dist_ckpt = dist_checkpointing.check_is_distributed_checkpoint(checkpoint_name)
+            if is_dist_ckpt:
+                ckpt_tp_pp = (state_dict['args'].tensor_model_parallel_size, state_dict['args'].pipeline_model_parallel_size)
+                run_tp_pp = (mpu.get_tensor_model_parallel_world_size(), mpu.get_pipeline_model_parallel_world_size())
+                mismatch_msg = "(TP, PP) mismatch after resume ({} vs {} from checkpoint)".format(ckpt_tp_pp, run_tp_pp)
+
+                # Determine if RNG state will be loaded
+                if (ckpt_tp_pp == run_tp_pp and not release and not args.finetune and not args.no_load_rng
+                        and not getattr(state_dict['args'], 'no_save_rng', False)):
+                    gen_sd_rng_state = get_rng_state(True)  # we can load the rng state
+                else:
+                    gen_sd_rng_state = None
+                    if ckpt_tp_pp != run_tp_pp:
+                        print_rank_0("{}: RNG state will be ignored".format(mismatch_msg))
+
+                optim_sd_kwargs = dict(is_loading=True)
+                # Determine if optimizer state will be loaded
+                if (not release and not args.finetune and not args.no_load_optim
+                        and not getattr(state_dict['args'], 'no_save_optim', False)):
+                    gen_sd_optim = optimizer
+                    gen_sd_opt_param_scheduler = opt_param_scheduler
+
+                    if args.use_distributed_optimizer:
+                        optim_sd_kwargs['sharding_type'] = ('fully_sharded_model_space'
+                                                            if getattr(state_dict['args'], 'ckpt_fully_parallel_save', False)
+                                                            else 'dp_zero_gather_scatter')
+                        # This is for backwards-compatibility. Can be removed once 'fully_sharded_bucket_space' loading is removed
+                        for maybe_dist_opt_optim_state in (state_dict['optimizer'], *state_dict['optimizer'].values()):
+                            if 'param_state_sharding_type' in maybe_dist_opt_optim_state:
+                                if maybe_dist_opt_optim_state['param_state_sharding_type'] == 'fully_sharded_bucket_space':
+                                    print_rank_0('Detected deprecated `fully_sharded_bucket_space` DistributedOptimizer checkpoint format')
+                                    optim_sd_kwargs['sharding_type'] = maybe_dist_opt_optim_state['param_state_sharding_type']
+                                break
+
+                        if ckpt_tp_pp != run_tp_pp and optim_sd_kwargs['sharding_type'] != 'fully_sharded_model_space':
+                            raise RuntimeError(f"{mismatch_msg}: not supported for DistributedOptimizer with sharding type {optim_sd_kwargs['sharding_type']}."
+                                            f" Please use `--ckpt-fully-parallel-save` flag during checkpoint saving.")
+                else:
+                    gen_sd_optim = None
+                    gen_sd_opt_param_scheduler = None
+                load_kwargs['sharded_state_dict'] = generate_state_dict(args, model, gen_sd_optim, gen_sd_opt_param_scheduler,
+                                                                        gen_sd_rng_state, True, optim_sd_kwargs=optim_sd_kwargs)
+                load_kwargs['exit_on_missing_checkpoint'] = args.exit_on_missing_checkpoint
+
+        state_dict, checkpoint_name, release = _load_base_checkpoint(load_dir, rank0=False, **load_kwargs)
+
+        # Checkpoint not loaded.
+        if state_dict is None:
+            # Iteration and num_floating_point_operations_so_far default to 0.
+            return 0, 0
+    checkpoint_name = get_checkpoint_name(load_dir, state_dict['iteration'], release)
+
+    # Set checkpoint version.
+    set_checkpoint_version(state_dict.get('checkpoint_version', 0))
+
+    # Set iteration.
+    if args.finetune or release or load_only_weights:
+        iteration = 0
+        # Make DeepSpeed engine aware of this reset of iteration
+        model[0].global_steps = 0
+    else:
+        try:
+            iteration = state_dict['iteration']
+            if 'tokens' in state_dict:
+                args.consumed_train_tokens = state_dict['tokens']
+        except KeyError:
+            try:  # Backward compatible with older checkpoints
+                iteration = state_dict['total_iters']
+            except KeyError:
+                print_rank_0('A metadata file exists but unable to load '
+                             'iteration from checkpoint {}, exiting'.format(checkpoint_name))
+                sys.exit()
+    num_floating_point_operations_so_far = state_dict.get('num_floating_point_operations_so_far', 0)
+
+    # Check arguments.
+    if not load_only_weights:
+        assert args.consumed_train_samples == 0
+        assert args.consumed_valid_samples == 0
+        if 'args' in state_dict and not args.finetune:
+            checkpoint_args = state_dict['args']
+            check_checkpoint_args(checkpoint_args)
+            args.consumed_train_samples = getattr(checkpoint_args,
+                                                'consumed_train_samples', 0)
+            update_num_microbatches(consumed_samples=args.consumed_train_samples)
+            args.consumed_valid_samples = getattr(checkpoint_args,
+                                                'consumed_valid_samples', 0)
+        else:
+            print_rank_0('could not find arguments in the checkpoint ...')
+
+    # [ModelOpt]: loading modelopt_state (sharded or not)
+    if has_nvidia_modelopt:
+        if args.use_dist_ckpt:
+            restore_sharded_modelopt_state(model, checkpoint_name)
+        else:
+            restore_modelopt_state(model, state_dict)
+
+    # Model.
+    if not args.deepspeed:
+        strict = False if args.retro_add_retriever else strict
+        if len(model) == 1:
+            model[0].load_state_dict(state_dict['model'], strict=strict)
+        else:
+            for i in range(len(model)):
+                mpu.set_virtual_pipeline_model_parallel_rank(i)
+                model[i].load_state_dict(state_dict['model%d' % i], strict=strict)
+
+    # Fix up query/key/value matrix ordering if needed.
+    checkpoint_version = get_checkpoint_version()
+    print_rank_0(f' checkpoint version {checkpoint_version}')
+    fix_query_key_value_ordering(model, checkpoint_version)
+
+    # Optimizer.
+    if not args.deepspeed:
+        if not release and not args.finetune and not args.no_load_optim:
+            try:
+                # Load state dict.
+                if optimizer is not None:
+                    optimizer.load_state_dict(state_dict['optimizer'])
+
+                # Load distributed optimizer's custom parameter state.
+                # For distributed checkpoint it's already loaded in load_state_dict above
+                if args.use_distributed_optimizer and not is_dist_ckpt:
+                    tracker_filename = get_checkpoint_tracker_filename(load_dir)
+                    iteration, release = read_metadata(tracker_filename)
+                    model_checkpoint_name = \
+                        get_checkpoint_name(load_dir, iteration, release)
+                    optim_checkpoint_name = \
+                        get_distributed_optimizer_checkpoint_name(
+                            model_checkpoint_name)
+                    optimizer.load_parameter_state(optim_checkpoint_name)
+
+                # Load scheduler.
+                if opt_param_scheduler is not None:
+                    if 'lr_scheduler' in state_dict: # backward compatbility
+                        opt_param_scheduler.load_state_dict(state_dict['lr_scheduler'])
+                    else:
+                        opt_param_scheduler.load_state_dict(state_dict['opt_param_scheduler'])
+            except KeyError:
+                print_rank_0('Unable to load optimizer from checkpoint {}. '
+                            'Specify --no-load-optim or --finetune to prevent '
+                            'attempting to load the optimizer state, '
+                            'exiting ...'.format(checkpoint_name))
+                sys.exit()
+        else:
+            if (args.fp16 or args.bf16) and optimizer is not None:
+                optimizer.reload_model_params()
+
+    # rng states.
+    if not release and not args.finetune and not args.no_load_rng:
+        try:
+            if 'rng_state' in state_dict:
+                # access rng_state for data parallel rank
+                if args.data_parallel_random_init:
+                    rng_state = state_dict['rng_state'][mpu.get_data_parallel_rank()]
+                else:
+                    rng_state = state_dict['rng_state'][0]
+                random.setstate(rng_state['random_rng_state'])
+                np.random.set_state(rng_state['np_rng_state'])
+                torch.set_rng_state(rng_state['torch_rng_state'])
+                torch.cuda.set_rng_state(rng_state['cuda_rng_state'])
+                # Check for empty states array
+                if not rng_state['rng_tracker_states']:
+                    raise KeyError
+                tensor_parallel.get_cuda_rng_tracker().set_states(
+                    rng_state['rng_tracker_states'])
+            else:  # backward compatability
+                random.setstate(state_dict['random_rng_state'])
+                np.random.set_state(state_dict['np_rng_state'])
+                torch.set_rng_state(state_dict['torch_rng_state'])
+                torch.cuda.set_rng_state(state_dict['cuda_rng_state'])
+                # Check for empty states array
+                if not state_dict['rng_tracker_states']:
+                    raise KeyError
+                tensor_parallel.get_cuda_rng_tracker().set_states(
+                    state_dict['rng_tracker_states'])
+        except KeyError:
+            print_rank_0('Unable to load rng state from checkpoint {}. '
+                         'Specify --no-load-rng or --finetune to prevent '
+                         'attempting to load the rng state, '
+                         'exiting ...'.format(checkpoint_name))
+            sys.exit()
+
+        if args.universal_checkpoint:
+            # TLDR: unique rng is needed for dropout to be really random on TP ranks
+            #
+            # Each tp-rank stores its model-parallel-rng states info.
+            # This is required to e.g. have different dropout patterns on different tp ranks that operate on
+            # slices of attention_probs tensor.
+            #
+            # When loading from universal checkpoint, we use mp_rank_<mp>_model_states.pt checkpoint files
+            # to restore the model-parallel-rng (<mp> is {tp-rank, pp-rank} combination).
+            # However, if the loaded checkpoint mp configuration does not match the current mp configuration,
+            # we can not use it to restore model-parallel-rng info.
+            #
+            # In the case of mp configuration change, we reconfigure the model-parallel-rng states s.t. each
+            # tp-rank will have a unique state. In order to ensure that subsequent loads from universal will
+            # not cause the model-parallel-rng states to be repeated, we add the iteration number to the base seed.
+            ckp_args = state_dict['args']
+            if ((args.tensor_model_parallel_size != ckp_args.tensor_model_parallel_size)
+                    or (args.pipeline_model_parallel_size != ckp_args.pipeline_model_parallel_size)):
+                print_rank_0(' loading universal checkpoint with modified mp configuration '
+                             '-> reconfigure tp seed')
+                tensor_parallel.model_parallel_reconfigure_tp_seed(args.seed + iteration)
+
+    # Some utilities want to load a checkpoint without distributed being initialized
+    if torch.distributed.is_initialized():
+        torch.distributed.barrier()
+
+    print_rank_0(f'  successfully loaded checkpoint from {load_dir} '
+                 f'[ t {mpu.get_tensor_model_parallel_rank()}, '
+                 f'p {mpu.get_pipeline_model_parallel_rank()} ] '
+                 f'at iteration {iteration}')
+
+    return iteration, num_floating_point_operations_so_far
+
+def _universal_checkpoint_info(model):
+    args = get_args()
+    tokenizer = get_tokenizer()
+    info = dict()
+    info[UNIVERSAL_CHECKPOINT_VERSION_KEY] = UNIVERSAL_CHECKPOINT_VERSION_VALUE
+    info[ORIGINAL_VOCAB_SIZE] = tokenizer.vocab_size
+    info[PADDED_VOCAB_SIZE] = args.padded_vocab_size
+    info.update(model[0].universal_checkpoint_info())
+    return info
diff --git a/toolbox/Megatron-DeepSpeed/megatronspeed/training/global_vars.py b/toolbox/Megatron-DeepSpeed/megatronspeed/training/global_vars.py
new file mode 100644
index 0000000000000000000000000000000000000000..35abccb4d0eb8c6d45f1d3d00cef497885e25557
--- /dev/null
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/training/global_vars.py
@@ -0,0 +1,16 @@
+"""Megatron global variables."""
+
+import os
+import sys
+import torch
+
+_GLOBAL_RLHF_ARGS = None
+
+def get_rlhf_args():
+    '''Return rlhf arguments.'''
+    return _GLOBAL_RLHF_ARGS
+
+
+def set_rlhf_args(rlhf_args):
+    global _GLOBAL_RLHF_ARGS
+    _GLOBAL_RLHF_ARGS = rlhf_args
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/initialize.py b/toolbox/Megatron-DeepSpeed/megatronspeed/training/initialize.py
similarity index 66%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/initialize.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/training/initialize.py
index 8999e2f693275ed9c099e51314bd29125e49b256..5cbe7ecf0bce0d18fa2f21a33ebb3850cfb78583 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/initialize.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/training/initialize.py
@@ -1,7 +1,5 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
 """Megatron initialization."""
-
+import logging
 import random
 import os
 import time
@@ -10,17 +8,34 @@ import numpy as np
 import torch
 from datetime import timedelta
 
+from megatron.legacy import fused_kernels
+from megatron.training import get_adlr_autoresume
+from megatron.training import get_args
+from megatron.training import get_tensorboard_writer
+from megatron.core import mpu, tensor_parallel
+from megatron.training.arguments import parse_args, validate_args
+from megatron.training.yaml_arguments import validate_yaml
+from megatron.training.checkpointing import load_args_from_checkpoint
+from megatron.training.global_vars import set_global_variables
+from megatron.legacy.model.transformer import bias_dropout_add_fused_train
+from megatron.legacy.model.fused_bias_gelu import bias_gelu
+from megatron.core import ixte_extensions
+from megatron.training.initialize import (
+    setup_logging,
+    _set_random_seed,
+    _init_autoresume,
+    _initialize_tp_communicators,
+    _initialize_ixte
+)
+from megatronspeed.core.pipeline_parallel.deepspeed_zbh1_engine import _exec_backward_only_pass, _exec_weight_pass
+from megatronspeed.core.pipeline_parallel.deepspeed_zbh1_schedule import BackwardOnlyPass, WeightPass, ZeroBubbleH1Pipeline
+
 from deepspeed.accelerator import get_accelerator
-from megatron_ds import fused_kernels
-from megatron_ds import get_adlr_autoresume
-from megatron_ds import get_args
-from megatron_ds import get_tensorboard_writer
-from megatron_ds.core import mpu, tensor_parallel
-from megatron_ds.arguments import parse_args, validate_args
-from megatron_ds.checkpointing import load_args_from_checkpoint
-from megatron_ds.global_vars import set_global_variables
-from megatron_ds.model.transformer import bias_dropout_add_fused_train
-from megatron_ds.model.fused_bias_gelu import bias_gelu
+import deepspeed
+from deepspeed.ops.op_builder.builder import OpBuilder
+
+logger = logging.getLogger(__name__)
+
 
 def initialize_megatron(
     extra_args_provider=None,
@@ -53,12 +68,19 @@ def initialize_megatron(
         assert args.load is not None, "--use-checkpoints-args requires --load argument"
         load_args_from_checkpoint(args)
 
-    validate_args(args, args_defaults)
+    if args.yaml_cfg is not None:
+        args = validate_yaml(args, args_defaults)
+    else:
+        validate_args(args, args_defaults)
+
 
     # set global args, build tokenizer, and set adlr-autoresume,
     # tensorboard-writer, and timers.
     set_global_variables(args)
 
+    # set logging level
+    setup_logging()
+
     # torch.distributed initialization
     def finish_mpu_init():
         args = get_args()
@@ -88,15 +110,21 @@ def initialize_megatron(
         # Megatron's MPU is the master. Complete initialization right away.
         finish_mpu_init()
 
+        # Initialize memory buffers.
+        _initialize_mem_buffs()
+
         # Autoresume.
         _init_autoresume()
 
         # Compile dependencies.
         _compile_dependencies()
 
-        if args.tp_comm_overlap:
-           _initialize_tp_communicators()
+        if args.tp_comm_overlap and not ixte_extensions._USE_IXTE:
+            _initialize_tp_communicators()
 
+        if not args.deepspeed or (args.deepspeed and args.no_pipeline_parallel):
+            if ixte_extensions._USE_IXTE:
+                _initialize_ixte()
         # No continuation function
         return None
 
@@ -109,21 +137,19 @@ def _compile_dependencies():
     # Compile dataset C++ code.
     # =========================
     # TODO: move this to ninja
-
-    if args.use_dataset_only:
-        return
-    if torch.distributed.get_rank() % get_accelerator().device_count() == 0:
+    # LOCAL_RANK only setted when use torchrun
+    if not torch.distributed.is_initialized() or int(os.environ["LOCAL_RANK"]) == 0:
         if args.deepspeed:
             start_time = time.time()
             print('> compiling dataset index builder ...')
-            from megatron_ds.data.dataset_utils import compile_helper
+            from megatronspeed.legacy.data.dataset_utils import compile_helper
             compile_helper()
             print('>>> done with dataset index builder. Compilation time: {:.3f} '
                 'seconds'.format(time.time() - start_time), flush=True)
         else:
             start_time = time.time()
             print("> compiling dataset index builder ...")
-            from megatron_ds.core.datasets.utils import compile_helpers
+            from megatron.core.datasets.utils import compile_helpers
 
             compile_helpers()
             print(
@@ -132,6 +158,8 @@ def _compile_dependencies():
                 flush=True,
             )
 
+    if args.use_dataset_only:
+        return
     # ==================
     # Load fused kernels
     # ==================
@@ -184,36 +212,31 @@ def _compile_dependencies():
             flush=True,
         )
 
-def _initialize_tp_communicators():
-    """ initializing the communicators with user buffers for high-performance tensor-model-parallel 
-        communication overlap """
-
-    try:
-       import yaml
-
-       import transformer_engine
-       from transformer_engine.pytorch import module as te_module
-
-    except ImportError:
-       raise RuntimeError("Tensor Parallel Communication/GEMM Overlap optimization needs 'yaml' and "
-             "'transformer_engine' packages") 
-
-    args = get_args()
-
-    if args.tp_comm_overlap_cfg is not None:
-       with open(args.tp_comm_overlap_cfg,"r") as stream:    
-          ub_cfgs = yaml.safe_load(stream)
-    else:
-       ub_cfgs = {}
-
-    input_shape = [args.seq_length * args.micro_batch_size , args.hidden_size]
-
-    #We create a MPI process group, which is needed to bootstrap the pipelined 
-    #tensor-model-parallel communication overlap
-    torch.distributed.new_group(backend='mpi')
-
-    te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size, 
-                                 use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs,)
+def setup_deepspeed_random_and_activation_checkpointing(args):
+    '''Optional DeepSpeed Activation Checkpointing features.
+    Gives access to partition activations, contiguous memory optimizations
+    and cpu checkpointing.
+    Activation checkpoint requires keep track of the random states
+    and setting the random seed for each MP process. Megatron uses
+    mpu.get_cuda_rng_tracker and mpu.model_parallel_cuda_manual_seed
+    for keeping track of the random states and setting the random seeds.
+    Since they are used in places outside of activation checkpointing,
+    we overwrite them to maintain consistency.
+    This must be called before all the calls to mpu.model_parallel_cuda_manual_seed
+    '''
+    num_layers = args.num_layers // args.checkpoint_num_layers
+    num_layers = num_layers if args.num_layers % args.checkpoint_num_layers == 0 else num_layers + 1
+    if args.split_transformers:
+        num_layers *= 2
+
+    deepspeed.checkpointing.configure(
+        mpu,
+        partition_activations=args.partition_activations,
+        contiguous_checkpointing=args.contigious_checkpointing,
+        num_checkpoints=num_layers,
+        checkpoint_in_cpu=args.checkpoint_in_cpu,
+        synchronize=args.synchronize_each_layer,
+        profile=args.profile_backward)
 
 def _initialize_distributed():
     """Initialize torch.distributed and core model parallel."""
@@ -245,7 +268,19 @@ def _initialize_distributed():
             else:
                 args.local_rank = device
             torch.cuda.set_device(device)
+
+    if args.enable_zbh1_pipeline:
+        deepspeed.runtime.pipe.schedule.TrainSchedule = ZeroBubbleH1Pipeline
+        deepspeed.runtime.pipe.engine.PipelineEngine._INSTRUCTION_MAP.update(
+            {
+                BackwardOnlyPass: _exec_backward_only_pass,
+                WeightPass: _exec_weight_pass,
+            }
+        )
         # Call the init process
+    if args.deepspeed or args.ds_inference:
+        deepspeed.init_distributed()
+    else:
         torch.distributed.init_process_group(
             backend=args.distributed_backend,
             world_size=args.world_size,
@@ -259,15 +294,24 @@ def _initialize_distributed():
         if mpu.model_parallel_is_initialized():
             print("model parallel is already initialized")
         else:
+            if args.ds_sequence_parallel_size > 1 and args.sequence_parallel:
+                raise RuntimeError(
+                    f"sequence_parallel_size > 1 enables DeepSpeed's sequence parallel, "
+                    f"which is not compatible with Megatron-LM's sequence parallel. "
+                    f"Remove --sequence_parallel to use DeepSpeed's sequence parallel."
+                )
+
             mpu.initialize_model_parallel(
                 args.tensor_model_parallel_size,
                 args.pipeline_model_parallel_size,
-                args.ds_sequence_parallel_size,
                 args.virtual_pipeline_model_parallel_size,
                 args.pipeline_model_parallel_split_rank,
+                args.ds_sequence_parallel_size,
                 context_parallel_size=args.context_parallel_size,
                 expert_model_parallel_size=args.expert_model_parallel_size,
+                distributed_timeout_minutes=args.distributed_timeout_minutes,
                 nccl_communicator_config_path=args.nccl_communicator_config_path,
+                order='tp-cp-ep-dp-pp' if not args.use_tp_pp_dp_mapping else 'tp-pp-dp',
             )
             if args.rank == 0:
                 print(
@@ -278,70 +322,16 @@ def _initialize_distributed():
                     f"> initialized pipeline model parallel with size "
                     f"{mpu.get_pipeline_model_parallel_world_size()}"
                 )
-                print(
-                    f"> initialized context parallel with size "
-                    f"{mpu.get_context_parallel_world_size()}"
-                )
 
+    if args.deepspeed and args.deepspeed_activation_checkpointing:
+        setup_deepspeed_random_and_activation_checkpointing(args)
 
-def _init_autoresume():
-    """Set autoresume start time."""
-    autoresume = get_adlr_autoresume()
-    if autoresume:
-        torch.distributed.barrier()
-        autoresume.init()
-        torch.distributed.barrier()
-
-
-def _set_random_seed(seed_, data_parallel_random_init=False):
-    """Set random seed for reproducability."""
-    if seed_ is not None and seed_ > 0:
-        # Ensure that different pipeline MP stages get different seeds.
-        seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank())
-        # Ensure different data parallel ranks get different seeds
-        if data_parallel_random_init:
-            seed = seed + (10 * mpu.get_data_parallel_rank())
-        random.seed(seed)
-        np.random.seed(seed)
-        torch.manual_seed(seed)
-        if torch.cuda.device_count() > 0:
-            tensor_parallel.model_parallel_cuda_manual_seed(seed)
-    else:
-        raise ValueError("Seed ({}) should be a positive integer.".format(seed))
-
-
-def write_args_to_tensorboard():
-    """Write arguments to tensorboard."""
+def _initialize_mem_buffs():
+    """Initialize manually allocated static memory."""
     args = get_args()
-    writer = get_tensorboard_writer()
-    if writer:
-        for arg in vars(args):
-            writer.add_text(arg, str(getattr(args, arg)), global_step=args.iteration)
-
-
-def set_jit_fusion_options():
-    """Set PyTorch JIT layer fusion options."""
-    # flags required to enable jit fusion kernels
-    TORCH_MAJOR = int(torch.__version__.split(".")[0])
-    TORCH_MINOR = int(torch.__version__.split(".")[1])
-    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10):
-        # nvfuser
-        torch._C._jit_set_profiling_executor(True)
-        torch._C._jit_set_profiling_mode(True)
-        torch._C._jit_override_can_fuse_on_cpu(False)
-        torch._C._jit_override_can_fuse_on_gpu(False)
-        torch._C._jit_set_texpr_fuser_enabled(False)
-        torch._C._jit_set_nvfuser_enabled(True)
-        torch._C._debug_set_autodiff_subgraph_inlining(False)
-    else:
-        # legacy pytorch fuser
-        torch._C._jit_set_profiling_mode(False)
-        torch._C._jit_set_profiling_executor(False)
-        torch._C._jit_override_can_fuse_on_cpu(True)
-        torch._C._jit_override_can_fuse_on_gpu(True)
-
-    _warmup_jit_function()
-
+    # Initialize memory for checkpointed activations.
+    if args.distribute_checkpointed_activations:
+        tensor_parallel.init_checkpointed_activations_memory_buffer()
 
 def _warmup_jit_function():
     """Compilie JIT functions before the main training steps"""
@@ -361,7 +351,7 @@ def _warmup_jit_function():
     )
     input = torch.rand(
         (
-            args.seq_length,
+            args.seq_length // args.ds_sequence_parallel_size,
             args.micro_batch_size,
             args.ffn_hidden_size // args.tensor_model_parallel_size,
         ),
@@ -382,12 +372,12 @@ def _warmup_jit_function():
     else:
         seq_length = args.seq_length
     input = torch.rand(
-        (seq_length, args.micro_batch_size, args.hidden_size),
+        (seq_length // args.ds_sequence_parallel_size, args.micro_batch_size, args.hidden_size),
         dtype=dtype,
         device="cuda",
     )
     residual = torch.rand(
-        (seq_length, args.micro_batch_size, args.hidden_size),
+        (seq_length // args.ds_sequence_parallel_size, args.micro_batch_size, args.hidden_size),
         dtype=dtype,
         device="cuda",
     )
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/memory.py b/toolbox/Megatron-DeepSpeed/megatronspeed/training/memory.py
similarity index 99%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/memory.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/training/memory.py
index a5fef75baa749d557da227bbccf706501ffdd10f..cc1f844b4c013b319b7e1ecfe2743748d6232e4a 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/memory.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/training/memory.py
@@ -3,7 +3,6 @@
 
 import torch
 
-
 # A dictionary of all the memory buffers allocated.
 _MEM_BUFFS = dict()
 
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/tokenizer/__init__.py b/toolbox/Megatron-DeepSpeed/megatronspeed/training/tokenizer/__init__.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/tokenizer/__init__.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/training/tokenizer/__init__.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/tokenizer/bert_tokenization.py b/toolbox/Megatron-DeepSpeed/megatronspeed/training/tokenizer/bert_tokenization.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/tokenizer/bert_tokenization.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/training/tokenizer/bert_tokenization.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/tokenizer/gpt2_tokenization.py b/toolbox/Megatron-DeepSpeed/megatronspeed/training/tokenizer/gpt2_tokenization.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/tokenizer/gpt2_tokenization.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/training/tokenizer/gpt2_tokenization.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/tokenizer/tokenization_utils.py b/toolbox/Megatron-DeepSpeed/megatronspeed/training/tokenizer/tokenization_utils.py
similarity index 100%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/tokenizer/tokenization_utils.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/training/tokenizer/tokenization_utils.py
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/tokenizer/tokenizer.py b/toolbox/Megatron-DeepSpeed/megatronspeed/training/tokenizer/tokenizer.py
similarity index 50%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/tokenizer/tokenizer.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/training/tokenizer/tokenizer.py
index 79fa4f2e307bd6323bfbfebee10a3bcc0d9980c4..3b10eff264fabde4c7b729210c472e1e5a3f8ed6 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/tokenizer/tokenizer.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/training/tokenizer/tokenizer.py
@@ -2,13 +2,20 @@
 
 """Megatron tokenizers."""
 
-from abc import ABC
-from abc import abstractmethod
 import os
 
 from transformers import AutoTokenizer
-from .bert_tokenization import FullTokenizer as FullBertTokenizer
-from .gpt2_tokenization import GPT2Tokenizer
+from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
+from megatron.training.tokenizer.gpt2_tokenization import GPT2Tokenizer
+from megatron.training.tokenizer.tokenizer import (
+    _vocab_size_with_padding,
+    _BertWordPieceTokenizer,
+    _GPT2BPETokenizer,
+    _SentencePieceTokenizer,
+    _GPTSentencePieceTokenizer,
+    _Llama2Tokenizer,
+    create_llama3_tokenizer
+)
 
 from typing import (
     AbstractSet,
@@ -79,508 +86,6 @@ def build_tokenizer(args):
 
     return tokenizer
 
-
-def _vocab_size_with_padding(orig_vocab_size, args):
-    """Pad vocab size so it is divisible by model parallel size and
-    still having GPU friendly size."""
-
-    after = orig_vocab_size
-    multiple = args.make_vocab_size_divisible_by * \
-        args.tensor_model_parallel_size
-    while (after % multiple) != 0:
-        after += 1
-    if args.rank == 0:
-        print(' > padded vocab (size: {}) with {} dummy tokens '
-              '(new size: {})'.format(
-                  orig_vocab_size, after - orig_vocab_size, after), flush=True)
-    return after
-
-
-class AbstractTokenizer(ABC):
-    """Abstract class for tokenizer."""
-
-    def __init__(self, name):
-        self.name = name
-        super().__init__()
-
-    @property
-    @abstractmethod
-    def vocab_size(self):
-        pass
-
-    @property
-    @abstractmethod
-    def vocab(self):
-        """Dictionary from vocab text token to id token."""
-        pass
-
-    @property
-    @abstractmethod
-    def inv_vocab(self):
-        """Dictionary from vocab id token to text token."""
-        pass
-
-    @abstractmethod
-    def tokenize(self, text):
-        pass
-
-    def detokenize(self, token_ids):
-        raise NotImplementedError('detokenizer is not implemented for {} '
-                                  'tokenizer'.format(self.name))
-
-    @property
-    def cls(self):
-        raise NotImplementedError('CLS is not provided for {} '
-                                  'tokenizer'.format(self.name))
-
-    @property
-    def sep(self):
-        raise NotImplementedError('SEP is not provided for {} '
-                                  'tokenizer'.format(self.name))
-
-    @property
-    def pad(self):
-        raise NotImplementedError('PAD is not provided for {} '
-                                  'tokenizer'.format(self.name))
-
-    @property
-    def eod(self):
-        raise NotImplementedError('EOD is not provided for {} '
-                                  'tokenizer'.format(self.name))
-
-    @property
-    def mask(self):
-        raise NotImplementedError('MASK is not provided for {} '
-                                  'tokenizer'.format(self.name))
-
-
-class _BertWordPieceTokenizer(AbstractTokenizer):
-    """Original BERT wordpiece tokenizer."""
-
-    def __init__(self, vocab_file, lower_case=True, vocab_extra_ids=0):
-        if lower_case:
-            name = 'BERT Lower Case'
-        else:
-            name = 'BERT Upper Case'
-        super().__init__(name)
-        self.tokenizer = FullBertTokenizer(vocab_file, do_lower_case=lower_case)
-        self.cls_id = self.tokenizer.vocab['[CLS]']
-        self.sep_id = self.tokenizer.vocab['[SEP]']
-        self.pad_id = self.tokenizer.vocab['[PAD]']
-        self.mask_id = self.tokenizer.vocab['[MASK]']
-        self._additional_special_tokens = []
-
-        # (dsachan) Add BOS and EOS tokens
-        SPECIAL_TOKENS = {'eos_token': '[EOS]',
-                          'bos_token': '[BOS]'}
-        self._bos_token = '[BOS]'
-        self.add_token(self._bos_token)
-        self._bos_token_id = self.vocab.get(self._bos_token)
-
-        self._eos_token = '[EOS]'
-        self.add_token(self._eos_token)
-        self._eos_token_id = self.vocab.get(self._eos_token)
-
-        # (dsachan) Add additional special tokens
-        # These can be used as sentinel tokens in T5 model inputs
-        additional_special_tokens = []
-        additional_special_tokens.extend(
-            ["<extra_id_{}>".format(i) for i in range(vocab_extra_ids)])
-        self.add_additional_special_tokens(additional_special_tokens)
-
-    def add_token(self, token):
-        if token not in self.vocab:
-            self.inv_vocab[self.vocab_size] = token
-            # self.vocab_size comes from len(vocab)
-            # and it will increase as we add elements
-            self.vocab[token] = self.vocab_size
-
-    def add_additional_special_tokens(self, tokens_list):
-        setattr(self, "additional_special_tokens", tokens_list)
-        for value in tokens_list:
-            self.add_token(value)
-
-    @property
-    def vocab_size(self):
-        return self.tokenizer.vocab_size()
-
-    @property
-    def vocab(self):
-        return self.tokenizer.vocab
-
-    @property
-    def inv_vocab(self):
-        return self.tokenizer.inv_vocab
-
-    def tokenize(self, text):
-        text_tokens = self.tokenizer.tokenize(text)
-        return self.tokenizer.convert_tokens_to_ids(text_tokens)
-
-    def decode(self, ids):
-        tokens = self.tokenizer.convert_ids_to_tokens(ids)
-        return self.tokenizer.convert_tokens_to_string(tokens)
-
-    def decode_token_ids(self, token_ids):
-        tokens = self.tokenizer.convert_ids_to_tokens(token_ids)
-        exclude_list = ['[PAD]', '[CLS]']
-        non_pads = [t for t in tokens if t not in exclude_list]
-
-        result = ""
-        for s in non_pads:
-            if s.startswith("##"):
-                result += s[2:]
-            else:
-                result += " " + s
-
-        return result
-
-    @property
-    def cls(self):
-        return self.cls_id
-
-    @property
-    def sep(self):
-        return self.sep_id
-
-    @property
-    def pad(self):
-        return self.pad_id
-
-    @property
-    def mask(self):
-        return self.mask_id
-
-    @property
-    def bos_token(self):
-        """ Beginning of sentence token id """
-        return self._bos_token
-
-    @property
-    def eos_token(self):
-        """ End of sentence token id """
-        return self._eos_token
-
-    @property
-    def additional_special_tokens(self):
-        """ All the additional special tokens you may want to use (list of strings)."""
-        return self._additional_special_tokens
-
-    @property
-    def bos_token_id(self):
-        """ Id of the beginning of sentence token in the vocabulary."""
-        return self._bos_token_id
-
-    @property
-    def eos_token_id(self):
-        """ Id of the end of sentence token in the vocabulary."""
-        return self._eos_token_id
-
-    @property
-    def additional_special_tokens_ids(self):
-        """ Ids of all the additional special tokens in the vocabulary (list of integers)."""
-        return [self.vocab.get(token) for token in self._additional_special_tokens]
-
-    @additional_special_tokens.setter
-    def additional_special_tokens(self, value):
-        self._additional_special_tokens = value
-
-
-class _GPT2BPETokenizer(AbstractTokenizer):
-    """Original GPT2 BPE tokenizer."""
-
-    def __init__(self, vocab_file, merge_file):
-        name = 'GPT2 BPE'
-        super().__init__(name)
-
-        self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
-                                       special_tokens=[], max_len=None)
-        self.eod_id = self.tokenizer.encoder['<|endoftext|>']
-
-    @property
-    def vocab_size(self):
-        return len(self.tokenizer.encoder)
-
-    @property
-    def vocab(self):
-        return self.tokenizer.encoder
-
-    @property
-    def inv_vocab(self):
-        return self.tokenizer.decoder
-
-    def tokenize(self, text):
-        return self.tokenizer.encode(text)
-
-    def detokenize(self, token_ids):
-        return self.tokenizer.decode(token_ids)
-
-    @property
-    def eod(self):
-        return self.eod_id
-
-
-class _SentencePieceTokenizer(AbstractTokenizer):
-    """SentencePieceTokenizer-Megatron wrapper"""
-
-    def __init__(self, model_file, vocab_extra_ids=0):
-        name = 'SentencePieceTokenizer'
-        super().__init__(name)
-
-        import sentencepiece
-        self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_file)
-        self._initalize(vocab_extra_ids)
-
-    def _populate_vocab(self):
-        self._vocab = {}
-        self._inv_vocab = {}
-
-        for i in range(len(self.tokenizer)):
-            t = self.tokenizer.id_to_piece(i)
-            self._inv_vocab[i] = t
-            self._vocab[t] = i
-
-    def _initalize(self, vocab_extra_ids):
-        self._populate_vocab()
-        self._special_tokens = {}
-        self._inv_special_tokens = {}
-
-        self._t5_tokens = []
-
-        def _add_special_token(t):
-            if t not in self._vocab:
-                next_id = len(self._vocab)
-                self._vocab[t] = next_id
-                self._inv_vocab[next_id] = t
-            self._special_tokens[t] = self._vocab[t]
-            self._inv_special_tokens[self._vocab[t]] = t
-
-        _add_special_token('<CLS>')
-        self._cls_id = self._vocab['<CLS>']
-        _add_special_token('<SEP>')
-        self._sep_id = self._vocab['<SEP>']
-        _add_special_token('<EOD>')
-        self._eod_id = self._vocab['<EOD>']
-        _add_special_token('<MASK>')
-        self._mask_id = self._vocab['<MASK>']
-
-        pad_id = self.tokenizer.pad_id()
-        try:
-            pad_token = self.tokenizer.id_to_piece(pad_id)
-        except IndexError:
-            pad_token = '<PAD>'
-        _add_special_token(pad_token)
-        self._pad_id = self._vocab[pad_token]
-
-        bos_id = self.tokenizer.bos_id()
-        try:
-            bos_token = self.tokenizer.id_to_piece(bos_id)
-        except IndexError:
-            bos_token = '<BOS>'
-        _add_special_token(bos_token)
-        self._bos_id = self._vocab[bos_token]
-
-        eos_id = self.tokenizer.eos_id()
-        try:
-            eos_token = self.tokenizer.id_to_piece(eos_id)
-        except IndexError:
-            eos_token = '<EOS>'
-        _add_special_token(eos_token)
-        self._eos_id = self._vocab[eos_token]
-
-        for i in range(vocab_extra_ids):
-            t = "<extra_id_{}>".format(i)
-            _add_special_token(t)
-            self._t5_tokens += [t]
-
-    @property
-    def vocab_size(self):
-        return len(self._vocab)
-
-    @property
-    def vocab(self):
-        return self._vocab
-
-    @property
-    def inv_vocab(self):
-        return self._inv_vocab
-
-    @property
-    def decoder(self):
-        return self._inv_vocab
-
-    @property
-    def encoder(self):
-        return self._vocab
-
-    # From:
-    # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L89
-    def tokenize(self, text):
-        ids = []
-        idx = 0
-
-        while 1:
-            indices = {}
-            for token in self._special_tokens:
-                try:
-                    indices[token] = text[idx:].index(token)
-                except ValueError:
-                    continue
-            if len(indices) == 0:
-                break
-
-            next_token = min(indices, key=indices.get)
-            next_idx = idx + indices[next_token]
-
-            ids.extend(self.tokenizer.encode_as_ids(text[idx:next_idx]))
-            ids.append(self._special_tokens[next_token])
-            idx = next_idx + len(next_token)
-
-        ids.extend(self.tokenizer.encode_as_ids(text[idx:]))
-        return ids
-
-    # From:
-    # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L125
-    def detokenize(self, ids):
-        text = ""
-        last_i = 0
-
-        for i, id in enumerate(ids):
-            if id in self._inv_special_tokens:
-                text += self.tokenizer.decode_ids(ids[last_i:i]) + " "
-                text += self._inv_special_tokens[id] + " "
-                last_i = i + 1
-
-        text += self.tokenizer.decode_ids(ids[last_i:])
-        return text
-
-    @property
-    def cls(self):
-        return self._cls_id
-
-    @property
-    def sep(self):
-        return self._sep_id
-
-    @property
-    def pad(self):
-        return self._pad_id
-
-    @property
-    def bos_token_id(self):
-        return self._bos_id
-
-    @property
-    def bos(self):
-        return self._bos_id
-
-    @property
-    def eod(self):
-        return self._eod_id
-
-    @property
-    def eos_token_id(self):
-        return self._eos_id
-
-    @property
-    def eos(self):
-        return self._eos_id
-
-    @property
-    def mask(self):
-        return self._mask_id
-
-    @property
-    def additional_special_tokens_ids(self):
-        return [self.vocab[k] for k in self._t5_tokens]
-
-class _GPTSentencePieceTokenizer(_SentencePieceTokenizer):
-    """SentencePieceTokenizer-Megatron wrapper"""
-
-    def __init__(self, model_file,):
-        super().__init__(model_file, vocab_extra_ids=0)
-
-    def _initalize(self, vocab_extra_ids):
-        self._populate_vocab()
-
-        self._pad_id = self.tokenizer.pad_id()
-        self._bos_id = self.tokenizer.bos_id()
-        self._eos_id = self.tokenizer.eos_id()
-
-    def tokenize(self, text):
-        return self.tokenizer.encode_as_ids(text)
-
-    def detokenize(self, ids):
-        return self.tokenizer.decode_ids(ids)
-
-    @property
-    def cls(self):
-        return -1
-
-    @property
-    def sep(self):
-        return -1
-
-    @property
-    def mask(self):
-        return -1
-
-    @property
-    def eod(self):
-        return self._eos_id
-
-    @property
-    def additional_special_tokens_ids(self):
-        return None
-
-class _Llama2Tokenizer(_SentencePieceTokenizer):
-    """SentencePieceTokenizer-Megatron wrapper"""
-
-    def __init__(self, model_file,):
-        super().__init__(model_file, vocab_extra_ids=0)
-
-    def _initalize(self, vocab_extra_ids):
-        self._populate_vocab()
-
-        # BOS / EOS token IDs
-        self.n_words: int = self.tokenizer.vocab_size()
-        self.bos_id: int = self.tokenizer.bos_id()
-        self.eos_id: int = self.tokenizer.eos_id()
-        self.pad_id: int = self.tokenizer.pad_id()
-        assert self.tokenizer.vocab_size() == self.tokenizer.get_piece_size()
-
-    def tokenize(self, s: str, bos=True, eos=False):
-        '''Default args for text completion, not chat/dialog.'''
-        assert type(s) is str
-        t = self.tokenizer.encode(s)
-        if bos:
-            t = [self.bos_id] + t
-        if eos:
-            t = t + [self.eos_id]
-        return t
-
-    def detokenize(self, ids):
-        return self.tokenizer.decode_ids(ids)
-
-    @property
-    def cls(self):
-        return -1
-
-    @property
-    def sep(self):
-        return -1
-
-    @property
-    def mask(self):
-        return -1
-
-    @property
-    def eod(self):
-        return self.eos_id
-
-    @property
-    def additional_special_tokens_ids(self):
-        return None
-
 class _NullTokenizer:
     def __init__(self, vocab_size):
         vocab_size = int(vocab_size)
@@ -613,7 +118,7 @@ class _NullTokenizer:
     def additional_special_tokens_ids(self):
         return None
 
-class _AquilaTokenizer(AbstractTokenizer):
+class _AquilaTokenizer(MegatronTokenizer):
     """Aquila tokenizer."""
 
     def __init__(self, vocab_file, merge_file, special_tokens_file):
@@ -661,7 +166,7 @@ class _AquilaTokenizer(AbstractTokenizer):
         return self.pad_id
     
 
-class _HFTokenizer(AbstractTokenizer):
+class _HFTokenizer(MegatronTokenizer):
     """HF Tokenizer"""
     def __init__(self, tokenizer_name_or_path,max_seq_len):
         name = tokenizer_name_or_path
@@ -766,6 +271,8 @@ class _HFTokenizer(AbstractTokenizer):
 # from logging import getLogger
 # logger = getLogger(__name__)
 
+from collections import OrderedDict
+
 class _Llama3Tokenizer:
     """
     Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
@@ -826,6 +333,8 @@ class _Llama3Tokenizer:
         }
         # logger.info(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}")
 
+        self.unique_identifiers = OrderedDict()
+
     def encode(
         self,
         s: str,
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/training.py b/toolbox/Megatron-DeepSpeed/megatronspeed/training/training.py
similarity index 71%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/training.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/training/training.py
index 2880c4e37e44a4d77232875c013ee21ead322fa7..430fd4be81028bc7835799d33658a4f6493f816f 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/training.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/training/training.py
@@ -1,85 +1,89 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
 """Pretrain utilities."""
 
-import gc
+import dataclasses
 from datetime import datetime
-import math
+import gc
 import logging
+import math
+import os
 import sys
-from .log_handler import CustomHandler
+import json
+try:
+    import wandb
+except (ImportError, ModuleNotFoundError):
+    wandb = None
+from megatron.training.log_handler import CustomHandler
 # Make default logging level INFO, but filter out all log messages not from MCore.
 logging.basicConfig(handlers=[CustomHandler()], level=logging.INFO)
-from .theoretical_memory_usage import report_theoretical_memory
+from megatron.training.theoretical_memory_usage import report_theoretical_memory
 import time
-import json
-import os
 # The earliest we can measure the start time.
 _TRAIN_START_TIME = time.time()
 import torch
-from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
-
-from megatron_ds import get_args
-from megatron_ds import get_signal_handler
-from megatron_ds import get_timers
-from megatron_ds import get_tensorboard_writer
-from megatron_ds import get_wandb_writer
-from megatron_ds import get_current_global_batch_size
-from megatron_ds import get_num_microbatches
-from megatron_ds import is_last_rank
-from megatron_ds import update_num_microbatches
-from megatron_ds.core import mpu, tensor_parallel
-from megatron_ds.core.utils import get_model_config
-from megatron_ds import print_rank_0
-from megatron_ds import print_rank_last
-from megatron_ds.checkpointing import load_checkpoint
-from megatron_ds.checkpointing import save_checkpoint
-from megatron_ds.model import Float16Module
-from megatron_ds.model import GPTModel
-from megatron_ds.core.distributed import DistributedDataParallel as DDP
-from megatron_ds.core.distributed import finalize_model_grads
-from megatron_ds.core.enums import ModelType
-from megatron_ds.optimizer import get_megatron_optimizer
-from megatron_ds.initialize import initialize_megatron
-from megatron_ds.initialize import write_args_to_tensorboard
-from megatron_ds.initialize import set_jit_fusion_options
-from megatron_ds.optimizer_param_scheduler import OptimizerParamScheduler
-from megatron_ds.model import DistributedDataParallel as LocalDDP
-from megatron_ds.utils import check_adlr_autoresume_termination
-from megatron_ds.utils import unwrap_model
-from megatron_ds.data.data_samplers import build_pretraining_data_loader
-from megatron_ds.utils import calc_params_l2_norm
-from megatron_ds.core.pipeline_parallel import get_forward_backward_func
-from megatron_ds.utils import report_memory, throughput_calculator, checkpoint_throughput_calculator, update_rotary_pos_emb
-# from megatron.model.vision.knn_monitor import compute_feature_bank
-from megatron_ds.arguments import core_transformer_config_from_args
+from collections import OrderedDict
+from enum import Enum
+
+from megatron.core import mpu, tensor_parallel
+from megatron.core.utils import check_param_hashes_across_dp_replicas, get_model_config, StragglerDetector
+from megatron.training.checkpointing import load_checkpoint
+from megatron.training.checkpointing import save_checkpoint
+from megatron.legacy.model import Float16Module
+from megatron.core.distributed import DistributedDataParallelConfig
+from megatron.core.distributed import DistributedDataParallel as DDP
+from megatron.core.distributed import finalize_model_grads
+from megatron.core.enums import ModelType
+from megatron.core.optimizer import get_megatron_optimizer, OptimizerConfig
+from megatron.training.initialize import initialize_megatron
+from megatron.training.initialize import write_args_to_tensorboard
+from megatron.training.initialize import set_jit_fusion_options
+from megatron.training.optimizer_param_scheduler import OptimizerParamScheduler
+from megatron.legacy.data.data_samplers import build_pretraining_data_loader
+from megatron.core.transformer.moe.moe_utils import track_moe_metrics
+from megatron.core.pipeline_parallel import get_forward_backward_func
+from megatron.core.num_microbatches_calculator import (
+    get_current_global_batch_size,
+    get_num_microbatches,
+    update_num_microbatches)
+
+from megatron.training.async_utils import maybe_finalize_async_save
+from megatron.training.utils import (
+    calc_params_l2_norm,
+    check_adlr_autoresume_termination,
+    is_last_rank,
+    print_rank_0,
+    print_rank_last,
+    report_memory,
+    unwrap_model,
+    append_to_progress_log,
+)
+from megatron.training.global_vars import (
+    get_args,
+    get_signal_handler,
+    get_timers,
+    get_tensorboard_writer,
+    get_wandb_writer,
+    get_one_logger)
+from megatron.training import one_logger_utils
+from megatron.training.training import (
+    print_datetime,
+    num_floating_point_operations,
+    update_train_iters,
+    build_train_valid_test_data_iterators,
+    get_optimizer_param_scheduler,
+    build_train_valid_test_datasets,
+    save_checkpoint_and_time
+)
+
+from megatronspeed.training.utils import is_rank_0, throughput_calculator, checkpoint_throughput_calculator, update_rotary_pos_emb
+from megatronspeed.core import parallel_state
 
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 from deepspeed.compression.compress import init_compression, redundancy_clean
 from deepspeed.runtime.data_pipeline.data_routing.helper import convert_to_random_ltd
-from megatron_ds.model.transformer import ParallelTransformerLayer
 
 from deepspeed import comm as dist
 
-try:
-    import wandb
-except (ImportError, ModuleNotFoundError):
-    wandb = None
-
-
-def execCmd(cmd):
-    r = os.popen(cmd)
-    text = r.read()
-    r.close()
-    return text
-
-def print_datetime(string):
-    """Note that this call will sync across all ranks."""
-    torch.distributed.barrier()
-    time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-    print_rank_0('[' + string + '] datetime: {} '.format(time_str))
-
 '''
 Since v0.9.0, deepspeed.initialize() has forbidden simultaneous setting of args.deepspeed_config (Path) and ds_config dict.
 So, we use ds_config dict which is the more flexible option. 
@@ -99,26 +103,6 @@ def _create_ds_config_dict():
     args.deepspeed_config = None 
 
     return ds_config_dict
-    
-
-def num_floating_point_operations(args, batch_size):
-    if not args.group_query_attention:
-        args.num_query_groups = args.num_attention_heads
-    return (
-        60
-        * batch_size
-        * args.seq_length
-        * args.num_layers
-        * args.hidden_size
-        * args.hidden_size
-        * (
-            1
-            + (args.num_query_groups / (5 * args.num_attention_heads))
-            + (args.seq_length / (5 * args.hidden_size))
-            + (args.padded_vocab_size / (10 * args.num_layers * args.hidden_size))
-        )
-    )
-
 
 def pretrain(train_valid_test_dataset_provider,
              model_provider,
@@ -137,7 +121,7 @@ def pretrain(train_valid_test_dataset_provider,
         3) call train_val_test_data_provider to get train/val/test datasets.
         4) train the modle using the forward_step_func.
 
-    Arguments:
+    Args:
         train_valid_test_dataset_provider: a function that takes the size of
             train/valid/test dataset and returns `train, valid, test` datasets.
         model_provider: a function that returns a vanilla version of the
@@ -161,21 +145,38 @@ def pretrain(train_valid_test_dataset_provider,
     # Initalize and get arguments, timers, and Tensorboard writer.
     initialize_megatron(extra_args_provider=extra_args_provider,
                         args_defaults=args_defaults, external_args=external_args)
+
+    args = get_args()
+    timers = get_timers()
+    # import pdb
+    # if torch.distributed.get_rank() == 0:
+    #     pdb.set_trace()
+
+    if args.log_progress:
+        append_to_progress_log("Starting job")
+
     # Set pytorch JIT layer fusion options and warmup JIT functions.
-    if get_accelerator().device_name() == 'cuda':
-        set_jit_fusion_options()
+    set_jit_fusion_options()
 
     # Adjust the startup time so it reflects the largest value.
     # This will be closer to what scheduler will see (outside of
     # image ... launches.
     global _TRAIN_START_TIME
-    start_time_tensor = torch.cuda.DoubleTensor([_TRAIN_START_TIME])
+    start_time_tensor = torch.tensor([_TRAIN_START_TIME],
+                                     dtype=torch.float,
+                                     device='cuda')
     torch.distributed.all_reduce(start_time_tensor,
                                  op=torch.distributed.ReduceOp.MIN)
     _TRAIN_START_TIME = start_time_tensor.item()
+
+    app_metrics = {}
+    app_metrics['app_start_time'] = round(_TRAIN_START_TIME * 1000.0)
+    app_metrics['app_model_init_start_time'] = round(_TRAIN_START_TIME * 1000.0)
+
     print_rank_0('time to initialize megatron (seconds): {:.3f}'.format(
         time.time() - _TRAIN_START_TIME))
     print_datetime('after megatron is initialized')
+    app_metrics['app_model_init_finish_time'] = one_logger_utils.get_timestamp_in_ms()
 
     args = get_args()
     timers = get_timers()
@@ -194,20 +195,24 @@ def pretrain(train_valid_test_dataset_provider,
         if "compression_training" in args.deepspeed_config_dict:
             args.compression_training = True
 
+    # Track E2E metrics on pretrain start
+    one_logger_utils.on_pretrain_start()
+
     # Model, optimizer, and learning rate.
     timers('model-and-optimizer-setup', log_level=0).start(barrier=True)
+    app_metrics['app_build_optimizer_start_time'] = one_logger_utils.get_timestamp_in_ms()
     model, optimizer, opt_param_scheduler = setup_model_and_optimizer(
-        model_provider, model_type, teacher=False, data_post_process=data_post_process,
+        model_provider, model_type, data_post_process=data_post_process,
         build_train_valid_test_datasets_provider=train_valid_test_dataset_provider)
+
     timers('model-and-optimizer-setup').stop()
     print_datetime('after model, optimizer, and learning rate '
                    'scheduler are built')
-    if args.deepspeed:
-        config = core_transformer_config_from_args(args)
-    else:
-        config = get_model_config(model[0])
+    app_metrics['app_build_optimizer_finish_time'] = one_logger_utils.get_timestamp_in_ms()
+    config = get_model_config(model[0])
 
     # Data stuff.
+    app_metrics['app_build_dataiters_start_time'] = one_logger_utils.get_timestamp_in_ms()
     timers('train/valid/test-data-iterators-setup', log_level=0).start(
         barrier=True)
     if args.virtual_pipeline_model_parallel_size is not None:
@@ -239,41 +244,53 @@ def pretrain(train_valid_test_dataset_provider,
             train_data_iterator = None
     timers('train/valid/test-data-iterators-setup').stop()
     print_datetime('after dataloaders are built')
+    app_metrics['app_build_dataiters_finish_time'] = one_logger_utils.get_timestamp_in_ms()
+
+    # Track if training is enabled. Can only be done once args.do_train is assigned after dataloader is built.
+    one_logger_utils.track_config_flags(args.train_iters, args.skip_train, args.do_train,
+                                        args.do_valid, args.do_test, args.dataloader_type,
+                                        args.retro_project_dir, args.retro_cyclic_train_iters)
 
-    # args.teacher_model is used as global variable to pass the teacher model
-    # for knowledge distillation. Users do not need to set it in the command
-    # line to use kd, but users do need to provide teacher model configurations
-    # like args.num_layers_teacher as described in setup_teacher_model()
-    args.teacher_model = None
-    if args.mos or args.kd: # Set up teacher model
-        args.teacher_model = setup_teacher_model(args, model_provider)
+    # Context used for persisting some state between checkpoint saves.
+    checkpointing_context = {}
 
     # Print setup timing.
     print_rank_0('done with setup ...')
     timers.log(['model-and-optimizer-setup',
                 'train/valid/test-data-iterators-setup'], barrier=True)
 
+    one_logger = get_one_logger()
+    one_logger and one_logger.log_metrics(app_metrics)
+
     if not args.skip_train:
         print_rank_0('training ...')
 
-        if args.dataloader_type == 'cyclic' and args.retro_add_retriever:
+        if args.dataloader_type == 'cyclic' and args.retro_project_dir:
+            assert args.retro_cyclic_train_iters is not None
             args.train_iters = args.retro_cyclic_train_iters
             print_rank_0("retro cyclic train iters : %d" % args.train_iters)
 
         iteration = 0
         if args.do_train and args.train_iters > 0:
-            iteration = train(forward_step_func,
-                              model, optimizer, opt_param_scheduler,
-                              train_data_iterator, valid_data_iterator,
-                              process_non_loss_data_func, config)
+            iteration, num_floating_point_operations_so_far = train(
+                forward_step_func,
+                model, optimizer, opt_param_scheduler,
+                train_data_iterator, valid_data_iterator,
+                process_non_loss_data_func, config, checkpointing_context)
 
         print_datetime('after training is done')
         # Clean the model
         if args.compression_training:
             model = [redundancy_clean(model[0], args.deepspeed_config_dict, mpu)]
 
-        if args.save and iteration != 0:
-            save_checkpoint(iteration, model, optimizer, opt_param_scheduler)
+        if args.save and iteration != 0 and iteration % args.save_interval != 0:
+            save_checkpoint(iteration, model, optimizer, opt_param_scheduler,
+                            num_floating_point_operations_so_far, checkpointing_context)
+
+        one_logger and one_logger.log_metrics({
+            'app_train_loop_finish_time': one_logger_utils.get_timestamp_in_ms()
+        })
+
     else:
         print_rank_0('skipping training (--skip-train is on) ...')
 
@@ -291,67 +308,16 @@ def pretrain(train_valid_test_dataset_provider,
         evaluate_and_print_results(prefix, forward_step_func,
                                    test_data_iterator, model,
                                    iteration, process_non_loss_data_func, config,
-                                   verbose=True, write_to_tensorboard=not args.skip_train)
-
+                                   verbose=True, write_to_tensorboard=not args.skip_train, test=True)
 
-def update_train_iters(args):
-
-    # For iteration-based training, we don't need to do anything
-    if args.train_iters:
-        return
+    maybe_finalize_async_save(blocking=True)
 
-    # Constant batch size with sample-based training.
-    if args.rampup_batch_size is None:
-        args.train_iters = args.train_samples // args.global_batch_size
+    one_logger and one_logger.log_metrics({
+        'app_finish_time': one_logger_utils.get_timestamp_in_ms()
+    })
+    one_logger_utils.finish()
 
-    else:
-        # Sample based training with rampup batch size.
-        iterations = 0
-        consumed_samples = 0
-        # Rampup phase.
-        while consumed_samples <= int(args.rampup_batch_size[2]):
-            update_num_microbatches(consumed_samples, consistency_check=False)
-            consumed_samples += get_current_global_batch_size()
-            iterations += 1
-        # Reset
-        update_num_microbatches(0, consistency_check=False)
-        # Constant phase
-        # Note that we throw away any partial last batch.
-        iterations += (args.train_samples - consumed_samples) // \
-                      args.global_batch_size
-        args.train_iters = iterations
-
-    print_rank_0('setting training iterations to {}'.format(args.train_iters))
-
-
-def setup_teacher_model(args, model_provider):        
-    
-    print_rank_0('***>>>>> Student model checkpoint iteration:{}'.format(args.iteration))
-    iteration_stuent = args.iteration
-    num_layers_student = args.num_layers
-    num_experts_student = args.num_experts
-    hidden_size_student = args.hidden_size
-    num_attention_heads_student = args.num_attention_heads
-    load_student = args.load
-
-    print_rank_0('***>>>>> Setting up the teacher model')
-
-    args.num_layers = args.num_layers_teacher
-    args.num_experts = args.num_experts_teacher
-    args.hidden_size = args.hidden_size_teacher
-    args.num_attention_heads = args.num_attention_heads_teacher
-    args.load = args.load_teacher
-    teacher_model, _, _ = load_model_weights_only(model_provider)
-    print_rank_0('***>>>>> Teacher model:{}'.format(teacher_model))
-
-    args.num_layers = num_layers_student
-    args.num_experts = num_experts_student
-    args.hidden_size = hidden_size_student
-    args.num_attention_heads = num_attention_heads_student
-    args.load = load_student
-    args.iteration = iteration_stuent
-
-    return teacher_model
+    return model
 
 def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True):
     """Build the model."""
@@ -407,12 +373,6 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap
     if not isinstance(model, list):
         model = [model]
 
-    # Disallow training and inference with Transformer Engine
-    # for non-GPT models
-    args.allow_transformer_engine = all([type(m) == GPTModel for m in model])
-    # assert args.allow_transformer_engine or args.transformer_impl == 'local', \
-    #     'Transformer Engine is only approved for GPT models'
-
     # Set tensor model parallel attributes if not set.
     # Only parameters that are already tensor model parallel have these
     # attributes set for them. We should make sure the default attributes
@@ -443,12 +403,16 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap
 
     if wrap_with_ddp:
         config = get_model_config(model[0])
+        ddp_config = DistributedDataParallelConfig(
+            grad_reduce_in_fp32=args.accumulate_allreduce_grads_in_fp32,
+            overlap_grad_reduce=args.overlap_grad_reduce,
+            use_distributed_optimizer=args.use_distributed_optimizer,
+            check_for_nan_in_grad=args.check_for_nan_in_loss_and_grad,
+            bucket_size=args.ddp_bucket_size,
+            average_in_collective=args.ddp_average_in_collective)
         model = [DDP(config,
+                     ddp_config,
                      model_chunk,
-                     data_parallel_group=mpu.get_data_parallel_group(with_context_parallel=True),
-                     accumulate_allreduce_grads_in_fp32=args.accumulate_allreduce_grads_in_fp32,
-                     overlap_grad_reduce=args.overlap_grad_reduce,
-                     use_distributed_optimizer=args.use_distributed_optimizer,
                      # Turn off bucketing for model_chunk 2 onwards, since communication for these
                      # model chunks is overlapped with compute anyway.
                      disable_bucketing=(model_chunk_idx > 0))
@@ -461,56 +425,6 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap
 
     return model
 
-
-def get_optimizer_param_scheduler(optimizer):
-    """Build the learning rate scheduler."""
-    args = get_args()
-
-    # Iteration-based training.
-    if args.train_iters:
-        if args.lr_decay_iters is None:
-            args.lr_decay_iters = args.train_iters
-        lr_decay_steps = args.lr_decay_iters * args.global_batch_size
-        wd_incr_steps = args.train_iters * args.global_batch_size
-        if args.lr_warmup_fraction is not None:
-            lr_warmup_steps = args.lr_warmup_fraction * lr_decay_steps
-        else:
-            lr_warmup_steps = args.lr_warmup_iters * args.global_batch_size
-    # Sample-based training.
-    elif args.train_samples:
-        # We need to set training iters for later use. Technically
-        # we need to adjust the training samples too (due to last
-        # batch being incomplete) but we leave it as is for now.
-        update_train_iters(args)
-        if args.lr_decay_samples is None:
-            args.lr_decay_samples = args.train_samples
-        lr_decay_steps = args.lr_decay_samples
-        wd_incr_steps = args.train_samples
-        if args.lr_warmup_fraction is not None:
-            lr_warmup_steps = args.lr_warmup_fraction * lr_decay_steps
-        else:
-            lr_warmup_steps = args.lr_warmup_samples
-    else:
-        raise Exception(
-            'either train-iters or train-samples should be provided.')
-
-    opt_param_scheduler = OptimizerParamScheduler(
-        optimizer,
-        init_lr=args.lr_warmup_init,
-        max_lr=args.lr,
-        min_lr=args.min_lr,
-        lr_warmup_steps=lr_warmup_steps,
-        lr_decay_steps=lr_decay_steps,
-        lr_decay_style=args.lr_decay_style,
-        start_wd=args.start_weight_decay,
-        end_wd=args.end_weight_decay,
-        wd_incr_steps=wd_incr_steps,
-        wd_incr_style=args.weight_decay_incr_style,
-        use_checkpoint_opt_param_scheduler=args.use_checkpoint_opt_param_scheduler,
-        override_opt_param_scheduler=args.override_opt_param_scheduler)
-
-    return opt_param_scheduler
-
 def load_model_weights_only(model_provider_func):
     """Setup model and optimizer."""
     args = get_args()
@@ -538,44 +452,27 @@ def load_model_weights_only(model_provider_func):
 
     print_datetime('before load checkpoint')
     if args.load is not None:
-        iteration = load_checkpoint(model, optimizer, lr_scheduler, strict=True, load_only_weights=True)
+        iteration, _ = load_checkpoint(model, optimizer, lr_scheduler, strict=True, load_only_weights=True)
 
     print_datetime('after load checkpoint weights')
 
     return model, optimizer, lr_scheduler
 
-
 def setup_model_and_optimizer(model_provider_func,
                               model_type,
                               no_wd_decay_cond=None,
                               scale_lr_cond=None,
                               lr_mult=1.0,
-                              teacher=False,
                               data_post_process=None,
                               build_train_valid_test_datasets_provider=None):
     """Setup model and optimizer."""
     args = get_args()
+    timers = get_timers()
+    one_logger = get_one_logger()
 
     model = get_model(model_provider_func, model_type)
-    # unwrapped_model = unwrap_model(model)
 
     # initialize the compression here
-    student_global_steps = 0
-    if args.kd or args.mos:
-        model, _, _, _ = deepspeed.initialize(
-                model=model[0],
-                args=args,
-                mpu=mpu if args.no_pipeline_parallel else None,
-                config=args.deepspeed_config_dict,
-            )
-        model = [model]
-        if args.load is not None:
-            args.iteration = load_checkpoint(model, None, None, strict=False)
-        else:
-            args.iteration = 0
-        student_global_steps = model[0].global_steps
-        print_rank_0('***>>>>> Student model, global step:{}'.format(student_global_steps))
-
     if args.compression_training:
         model, _, _, _ = deepspeed.initialize(
             model=model[0],
@@ -586,18 +483,20 @@ def setup_model_and_optimizer(model_provider_func,
         model = [model]
         model = [init_compression(model[0].module, args.deepspeed_config_dict, mpu)]
 
-    unwrapped_model = unwrap_model(model,
-                                   (torchDDP, LocalDDP, DDP, Float16Module))
+    unwrapped_model = unwrap_model(model)
 
+    kwargs = {}
+    for f in dataclasses.fields(OptimizerConfig):
+        if hasattr(args, f.name):
+            kwargs[f.name] = getattr(args, f.name)
+    config = OptimizerConfig(**kwargs)
+    config.timers = timers
     if args.inference:
         optimizer = None
         opt_param_scheduler = None
     else:
-        if teacher:
-            optimizer = None
-        else:
-            optimizer = get_megatron_optimizer(model, no_wd_decay_cond,
-                                               scale_lr_cond, lr_mult)
+        optimizer = get_megatron_optimizer(config, model, no_wd_decay_cond,
+                                        scale_lr_cond, lr_mult)
         # opt_param_scheduler is the old lr_scheduler plus weight decay scheduling
         opt_param_scheduler = get_optimizer_param_scheduler(optimizer)
 
@@ -657,25 +556,24 @@ def setup_model_and_optimizer(model_provider_func,
             assert model.grid.get_data_parallel_rank() == mpu.get_data_parallel_rank()
         model = [model]
 
-    # Compression has its own checkpoint loading path (e.g, loading both teacher and student models). So if compression is enabled, we skip the following checkpoint loading.
-    no_post_init_checkpoint_loading = args.kd or args.mos
-    if not no_post_init_checkpoint_loading:
-        if args.load is not None:
-            timers = get_timers()
-            timers('load-checkpoint', log_level=0).start(barrier=True)
-            args.iteration = load_checkpoint(model, optimizer, opt_param_scheduler)
-            timers('load-checkpoint').stop(barrier=True)
-            timers.log(['load-checkpoint'])
-        else:
-            args.iteration = 0
+    if args.load is not None or args.pretrained_checkpoint is not None:
+        one_logger and one_logger.log_metrics({
+            'load_checkpoint_start_time': one_logger_utils.get_timestamp_in_ms()
+        })
+        timers('load-checkpoint', log_level=0).start(barrier=True)
+        args.iteration, args.num_floating_point_operations_so_far = load_checkpoint(
+            model, optimizer, opt_param_scheduler)
+        timers('load-checkpoint').stop(barrier=True)
+        timers.log(['load-checkpoint'])
+        one_logger and one_logger.log_metrics({
+            'load_checkpoint_finish_time': one_logger_utils.get_timestamp_in_ms(),
+            'load_checkpoint_time': timers('load-checkpoint').active_time()
+        })
     else:
-        model[0].global_steps = student_global_steps
+        args.iteration = 0
+        args.num_floating_point_operations_so_far = 0
 
-    # We only support local DDP with multiple micro-batches.
-    if len(model) > 1 or mpu.get_pipeline_model_parallel_world_size() > 1:
-        assert args.DDP_impl == 'local'
-
-    # get model without FP16 and/or TorchDDP wrappers
+    # get model without FP16 and/or DDP wrappers
     if args.iteration == 0 and len(unwrapped_model) == 1 \
         and hasattr(unwrapped_model[0], 'init_state_dict_from_bert'):
         print_rank_0("Initializing ICT from pretrained BERT model")
@@ -685,8 +583,6 @@ def setup_model_and_optimizer(model_provider_func,
 
     return model, optimizer, opt_param_scheduler
 
-
-
 def train_step(forward_step_func, data_iterator,
                model, optimizer, opt_param_scheduler, config):
     """Single training step."""
@@ -698,19 +594,22 @@ def train_step(forward_step_func, data_iterator,
         num_zeros_in_grad = 0
         assert isinstance(model[0], deepspeed.PipelineEngine)
         loss = model[0].train_batch(data_iter=data_iterator)
+        additional_losses = model[0].get_additional_losses()
+        loss_key = 'lm loss' if additional_losses is None else 'loss'  # use "lm loss" for backward compatibility
+        loss_dict = OrderedDict({loss_key: loss})
+        if additional_losses is not None:
+            loss_dict.update(additional_losses)
         grad_norm = model[0].get_global_grad_norm()
-        return {'lm loss' : loss}, skipped_iter, grad_norm, num_zeros_in_grad
+        return loss_dict, skipped_iter, grad_norm, num_zeros_in_grad
 
     # Set grad to zero.
-    for model_chunk in model:
-        # If using distributed optimizer, don't zero buffer here; zeroing of buffer is
-        # handled automatically by the optimizer after all-gathers finish.
-        # Otherwise, zero the buffer.
-        model_chunk.zero_grad_buffer(zero_buffer=(not args.use_distributed_optimizer))
-    optimizer.zero_grad()
+    if not args.deepspeed:
+        for model_chunk in model:
+            model_chunk.zero_grad_buffer()
+        optimizer.zero_grad()
 
     # Forward pass.
-    forward_backward_func = get_forward_backward_func()
+    forward_backward_func = get_forward_backward_func(config)
     losses_reduced = forward_backward_func(
         forward_step_func=forward_step_func,
         data_iterator=data_iterator,
@@ -719,52 +618,82 @@ def train_step(forward_step_func, data_iterator,
         seq_length=args.seq_length,
         micro_batch_size=args.micro_batch_size,
         decoder_seq_length=args.decoder_seq_length,
-        forward_only=False)
+        forward_only=False,
+        config=config)
 
     # Empty unused memory.
     if args.empty_unused_memory_level >= 1:
         torch.cuda.empty_cache()
 
     # Vision gradients.
-    if args.vision_pretraining and args.vision_pretraining_type == "dino":
+    if getattr(args, 'vision_pretraining', False) and args.vision_pretraining_type == "dino":
         unwrapped_model = unwrap_model(model[0])
         unwrapped_model.cancel_gradients_last_layer(args.curr_iteration)
 
     # Update parameters.
     timers('optimizer', log_level=1).start(barrier=args.barrier_with_L1_time)
-    update_successful, grad_norm, num_zeros_in_grad = optimizer.step(args, timers)
+    if args.deepspeed:
+        increment = get_num_microbatches() * \
+                    args.micro_batch_size * \
+                    args.data_parallel_size
+        model[0].step(lr_kwargs={'increment': increment})
+        update_successful = model[0].was_step_applied()
+    else:
+        update_successful, grad_norm, num_zeros_in_grad = optimizer.step()
     timers('optimizer').stop()
 
     # Vision momentum.
-    if args.vision_pretraining and args.vision_pretraining_type == "dino":
+    if getattr(args, 'vision_pretraining', False) and args.vision_pretraining_type == "dino":
         unwrapped_model = unwrap_model(model[0])
         unwrapped_model.update_momentum(args.curr_iteration)
 
     # Update learning rate.
-    if update_successful:
-        increment = get_num_microbatches() * \
-                    args.micro_batch_size * \
-                    args.data_parallel_size
-        opt_param_scheduler.step(increment=increment)
+    if args.deepspeed:
         skipped_iter = 0
-    else:
-        skipped_iter = 1
-
-    # Empty unused memory.
-    if args.empty_unused_memory_level >= 2:
-        torch.cuda.empty_cache()
-
-    if mpu.is_pipeline_last_stage(ignore_virtual=True):
-        # Average loss across microbatches.
+        grad_norm = None
+        num_zeros_in_grad = None
         loss_reduced = {}
         for key in losses_reduced[0]:
             losses_reduced_for_key = [x[key] for x in losses_reduced]
             loss_reduced[key] = sum(losses_reduced_for_key) / len(losses_reduced_for_key)
         return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad
+    else:
+        if update_successful:
+            increment = get_num_microbatches() * \
+                        args.micro_batch_size * \
+                        args.data_parallel_size
+            opt_param_scheduler.step(increment=increment)
+            skipped_iter = 0
+        else:
+            skipped_iter = 1
+
+        # Empty unused memory.
+        if args.empty_unused_memory_level >= 2:
+            torch.cuda.empty_cache()
+
+        if mpu.is_pipeline_last_stage(ignore_virtual=True):
+            # Average loss across microbatches.
+            loss_reduced = {}
+            for key in losses_reduced[0].keys():
+                numerator = 0
+                denominator = 0
+                for x in losses_reduced:
+                    val = x[key]
+                    # there is one dict per microbatch. in new reporting, we average
+                    # over the total number of tokens across the global batch.
+                    if isinstance(val, tuple) or isinstance(val, list):
+                        numerator += val[0]
+                        denominator += val[1]
+                    else:
+                        # legacy behavior. we average over the number of microbatches,
+                        # and so the denominator is 1.
+                        numerator += val
+                        denominator += 1
+                loss_reduced[key] = numerator / denominator
+            return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad
     return {}, skipped_iter, grad_norm, num_zeros_in_grad
 
-
-def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
+def training_log(loss_dict, total_loss_dict, learning_rate, decoupled_learning_rate, iteration,
                  loss_scale, report_memory_flag, skipped_iter,
                  grad_norm, params_norm, num_zeros_in_grad,
                  model=None, optimizer=None):
@@ -773,6 +702,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
     timers = get_timers()
     writer = get_tensorboard_writer()
     wandb_writer = get_wandb_writer()
+    one_logger = get_one_logger()
 
     # 获取 Iluvatar 设备判断
     # IS_BI_V150 = "BI-V150" in execCmd("ixsmi -L")
@@ -797,7 +727,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
     for key in loss_dict:
         if not skipped_iter:
             total_loss_dict[key] = total_loss_dict.get(
-                key, torch.cuda.FloatTensor([0.0])) + loss_dict[key]
+                key, torch.tensor([0.0], dtype=torch.float, device='cuda')) + loss_dict[key]
         else:
             value = loss_dict[key].float().sum().item()
             is_nan = value == float('inf') or \
@@ -838,6 +768,9 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
     batch_size = args.micro_batch_size * args.data_parallel_size * \
         get_num_microbatches()
 
+    # Track app tag & app tag ID
+    one_logger_utils.track_app_tag(batch_size, args.world_size, args.seq_length)
+
     total_iterations = total_loss_dict[advanced_iters_key] + \
                        total_loss_dict[skipped_iters_key]
 
@@ -853,6 +786,8 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                              iteration)
         if args.log_learning_rate_to_tensorboard:
             writer.add_scalar('learning-rate', learning_rate, iteration)
+            if args.decoupled_lr is not None:
+                writer.add_scalar('decoupled-learning-rate', decoupled_learning_rate, iteration)
             writer.add_scalar('learning-rate vs samples', learning_rate,
                               args.consumed_train_samples)
             if wandb_writer:
@@ -916,6 +851,9 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                 mem_stats["allocation.all.current"],
                 iteration,
             )
+    if args.num_experts is not None:
+        moe_loss_scale = 1 / get_num_microbatches()
+        track_moe_metrics(moe_loss_scale, iteration, writer, wandb_writer, total_loss_dict, args.moe_per_layer_logging)
 
     if iteration % args.log_interval == 0:
         elapsed_time = timers('interval-time').elapsed(barrier=True)
@@ -954,14 +892,21 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                     f'loss/{k}': v for k, v in loss_dict.items()
                 }
                 wandb_metrics |= {'loss/iteration': iteration}
-        if writer:
-            if args.log_timers_to_tensorboard:
-                writer.add_scalar('iteration-time/iteration-time',
+
+        throughput = num_floating_point_operations(args, batch_size) / (
+            elapsed_time_per_iteration * 10**12 * args.world_size)
+
+        one_logger_utils.track_e2e_metrics(args.log_throughput, throughput)
+
+        if args.log_timers_to_tensorboard:
+            if writer:
+                writer.add_scalar('iteration-time',
                                   elapsed_time_per_iteration, iteration)
             if wandb_writer:
                 wandb_writer.log({'iteration-time': elapsed_time_per_iteration},
                                  iteration)
-        log_string = ' iteration {:8d}/{:8d} |'.format(
+        log_string = f" [{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]"
+        log_string += ' iteration {:8d}/{:8d} |'.format(
             iteration, args.train_iters)
         log_string += ' consumed samples: {:12d} |'.format(
             args.consumed_train_samples)
@@ -984,7 +929,15 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                     writer.add_scalar('throughput', throughput, iteration)
                 if wandb_writer:
                     wandb_writer.log({'throughput': throughput}, iteration)
-        log_string += ' learning rate: {:.3E} |'.format(learning_rate)
+        assert learning_rate is not None
+        # Decoupled_learning_rate should be not None only on first and last pipeline stage.
+        log_string += ' learning rate: {:.6E} |'.format(learning_rate)
+        if args.decoupled_lr is not None and (mpu.is_pipeline_first_stage(ignore_virtual=True) or
+                                              mpu.is_pipeline_last_stage(ignore_virtual=True)):
+            assert decoupled_learning_rate is not None
+            log_string += ' decoupled learning rate: {:.6E} |'.format(decoupled_learning_rate)
+        else:
+            assert decoupled_learning_rate is None
         log_string += ' global batch size: {:5d} |'.format(batch_size)
         if wandb is not None and getattr(wandb, 'run', None) is not None:
             wandb_metrics |= {
@@ -1008,7 +961,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                       float(max(1, total_loss_dict[advanced_iters_key]))
                 if avg > 0.0:
                     log_string += ' {}: {:.6E} |'.format(key, avg)
-                total_loss_dict[key] = torch.cuda.FloatTensor([0.0])
+                total_loss_dict[key] = torch.tensor([0.0], dtype=torch.float, device='cuda')
         if wandb is not None and getattr(wandb, 'run', None) is not None:
             wandb.log(wandb_metrics)
         if loss_scale is not None:
@@ -1037,13 +990,15 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         global tflops_10
         global tps_per_device
         global times
-        if iteration == args.log_interval:
+        log_initialized = False
+        if not log_initialized:
             elapsed_time_per_iteration_10 = 0.0
             tokens_per_second_10 = 0.0
             tflops_10 = 0.0
             times = 0
             tps_per_device = 0.0
-        if iteration >= 4:
+            log_initialized = True
+        if iteration >= 1:
             elapsed_time_per_iteration_10 += elapsed_time_per_iteration * 1000.0
             tokens_per_second_10 += batch_size * total_iterations * args.seq_length / elapsed_time
             tflops_10 += tflops
@@ -1067,23 +1022,16 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
 
     return report_memory_flag
 
-
-def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler):
-    timers = get_timers()
-    # Extra barrier is added to make sure
-    # all ranks report the max time.
-    timers('save-checkpoint', log_level=0).start(barrier=True)
-    save_checkpoint(iteration, model, optimizer, opt_param_scheduler)
-    timers('save-checkpoint').stop(barrier=True)
-    timers.log(['save-checkpoint'])
-
-
 def train(forward_step_func, model, optimizer, opt_param_scheduler,
           train_data_iterator, valid_data_iterator,
-          process_non_loss_data_func, config):
+          process_non_loss_data_func, config, checkpointing_context):
     """Train the model function."""
     args = get_args()
     timers = get_timers()
+    one_logger = get_one_logger()
+
+    if torch.distributed.get_rank() == 0:
+        print("config: ", config)
 
     # Write args to tensorboard
     write_args_to_tensorboard()
@@ -1098,7 +1046,16 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
     # Iterations.
     iteration = args.iteration
 
-    # Translate args to core configuration
+    # Track E2E metrics at the start of training
+    one_logger_utils.on_train_start(iteration=iteration, consumed_train_samples=args.consumed_train_samples,
+                                    train_samples=args.train_samples, seq_length=args.seq_length,
+                                    train_iters=args.train_iters, save=args.save, async_save=args.async_save,
+                                    log_throughput=args.log_throughput,
+                                    num_floating_point_operations_so_far=args.num_floating_point_operations_so_far)
+
+    num_floating_point_operations_so_far = args.num_floating_point_operations_so_far
+
+    # Setup some training config params
     if not args.deepspeed:
         config.grad_scale_func = optimizer.scale_loss
     config.timers = timers
@@ -1118,7 +1075,8 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
                                   for model_index in range(len(model))]
         if len(model) == 1:
             config.param_sync_func = config.param_sync_func[0]
-    config.finalize_model_grads_func = finalize_model_grads
+    if not args.deepspeed:
+        config.finalize_model_grads_func = finalize_model_grads
 
     timers('interval-time', log_level=0).start(barrier=True)
     print_datetime('before the start of training step')
@@ -1133,6 +1091,41 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
         gc.disable()
         gc.collect()
 
+    # Singleton Initialization
+    if args.log_straggler:
+        global stimer
+        world = torch.distributed.get_world_size()
+        rank = torch.distributed.get_rank()
+        mmcnt = args.straggler_minmax_count
+        stimer.configure(world, rank,
+                mmcnt = mmcnt,
+                enabled = not args.disable_straggler_on_startup,
+                port = args.straggler_ctrlr_port)
+    total_flops = 0.0
+
+    num_microbatches = get_num_microbatches()
+    eval_duration = 0.0
+    eval_iterations = 0
+
+    def get_e2e_base_metrics():
+        """Get base metrics values for one-logger to calculate E2E tracking metrics.
+        """
+        return {
+            'iteration': iteration,
+            'train_duration': timers('interval-time').active_time(),
+            'eval_duration': eval_duration,
+            'eval_iterations': eval_iterations,
+            'total_flops': total_flops,
+            'num_floating_point_operations_so_far': num_floating_point_operations_so_far,
+            'consumed_train_samples': args.consumed_train_samples,
+            'world_size': args.world_size,
+            'seq_length': args.seq_length
+        }
+    # Cache into one-logger for callback
+    if one_logger:
+        with one_logger.get_context_manager():
+            one_logger.store_set('get_e2e_base_metrics', get_e2e_base_metrics)
+
     while iteration < args.train_iters:
         if args.profile and \
            iteration == args.profile_step_start and \
@@ -1140,7 +1133,22 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
             torch.cuda.cudart().cudaProfilerStart()
             torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()
 
-        update_num_microbatches(args.consumed_train_samples)
+        maybe_finalize_async_save(False)
+
+        # Update number of microbatches first without consistency check to decide if a
+        # checkpoint should be saved. If the number of microbatches is different
+        # from the previous iteration, save a checkpoint. Then run consistency check
+        # to make sure training configuration is still valid.
+        update_num_microbatches(args.consumed_train_samples, consistency_check=False)
+        if get_num_microbatches() != num_microbatches and iteration != 0:
+            assert get_num_microbatches() > num_microbatches, \
+                "number of microbatches should be increasing due to batch size rampup"
+            save_checkpoint_and_time(iteration, model, optimizer,
+                                     opt_param_scheduler,
+                                     num_floating_point_operations_so_far,
+                                     checkpointing_context)
+        num_microbatches = get_num_microbatches()
+        update_num_microbatches(args.consumed_train_samples, consistency_check=True)
         if args.deepspeed:
             # inform deepspeed of any batch size changes
             global_batch_size = mpu.get_data_parallel_world_size() * \
@@ -1155,6 +1163,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
                 if args.use_rotary_position_embeddings:
                     update_rotary_pos_emb(curriculum_seqlen)
             args.curriculum_seqlen = curriculum_seqlen
+
         args.curr_iteration = iteration
         loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
             train_step(forward_step_func,
@@ -1165,18 +1174,17 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
                        config)
         iteration += 1
         args.iteration = iteration
-        new_samples = mpu.get_data_parallel_world_size() * \
-                                       args.micro_batch_size * \
-                                       get_num_microbatches()
-        args.consumed_train_samples += new_samples
+        batch_size = mpu.get_data_parallel_world_size() * \
+                     args.micro_batch_size * \
+                     get_num_microbatches()
+        args.consumed_train_samples += batch_size
+        num_fp_ops = num_floating_point_operations(args, batch_size)
+        num_floating_point_operations_so_far += num_fp_ops
+        total_flops += num_fp_ops
         # This actual_seq_length is used for actual consumed tokens calculation, flops calculation, and logging.
         args.actual_seq_length = args.seq_length
         if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning:
             args.actual_seq_length = args.curriculum_seqlen
-        if args.random_ltd:
-            args.random_ltd_reserved_length = model[0].random_ltd_scheduler.get_current_seq()
-            if args.random_ltd_reserved_length < args.actual_seq_length:
-                args.actual_seq_length = (args.actual_seq_length * (args.num_layers - args.random_ltd_layer_num) + args.random_ltd_reserved_length * args.random_ltd_layer_num) // args.num_layers
         if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning:
             if hasattr(args, 'data_efficiency_curriculum_learning_numel'):
                 act_mbsz = args.data_efficiency_curriculum_learning_numel / args.curriculum_seqlen
@@ -1184,10 +1192,10 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
                 args.consumed_train_tokens += mpu.get_data_parallel_world_size() * \
                         get_num_microbatches() * act_token
             else:
-                args.consumed_train_tokens += new_samples * args.actual_seq_length
+                args.consumed_train_tokens += batch_size * args.actual_seq_length
         else:
-            args.consumed_train_tokens += new_samples * args.actual_seq_length
-        
+            args.consumed_train_tokens += batch_size * args.actual_seq_length
+
         # Logging.
         if args.deepspeed:
             if hasattr(model[0].optimizer, 'cur_scale'):
@@ -1199,13 +1207,38 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
         params_norm = None
         if args.log_params_norm:
             params_norm = calc_params_l2_norm(model)
+
+        learning_rate = None
+        decoupled_learning_rate = None
+        for param_group in optimizer.param_groups:
+            if param_group['is_decoupled_lr']:
+                decoupled_learning_rate = param_group['lr']
+            else:
+                learning_rate = param_group['lr']
         report_memory_flag = training_log(loss_dict, total_loss_dict,
-                                          optimizer.param_groups[0]['lr'],
+                                          learning_rate,
+                                          decoupled_learning_rate,
                                           iteration, loss_scale,
                                           report_memory_flag, skipped_iter,
                                           grad_norm, params_norm, num_zeros_in_grad,
                                           model, optimizer)
 
+        # StragglerDetector
+        if iteration % args.log_interval == 0 and args.log_straggler:
+            stimer.report(total_flops, args.log_interval)
+            total_flops = 0.0
+
+        if args.check_weight_hash_across_dp_replicas_interval is not None and \
+                iteration % args.check_weight_hash_across_dp_replicas_interval == 0:
+            if args.use_distributed_optimizer and args.overlap_param_gather:
+                optimizer.disable_pre_hook()
+            assert check_param_hashes_across_dp_replicas(model), \
+                "Parameter hashes not matching across DP replicas"
+            torch.distributed.barrier()
+            print_rank_0(f">>> Weight hashes match after {iteration} iterations...")
+            if args.use_distributed_optimizer and args.overlap_param_gather:
+                optimizer.enable_pre_hook()
+
         # Autoresume
         if args.adlr_autoresume and \
            (iteration % args.adlr_autoresume_interval == 0):
@@ -1216,17 +1249,27 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
         if args.eval_interval and iteration % args.eval_interval == 0 and \
            args.do_valid:
             timers('interval-time').stop()
+            if args.use_distributed_optimizer and args.overlap_param_gather:
+                optimizer.disable_pre_hook()
             if args.manual_gc and args.manual_gc_eval:
                 # Collect all objects.
                 gc.collect()
             prefix = 'iteration {}'.format(iteration)
+            timers('eval-time', log_level=0).start(barrier=True)
             evaluate_and_print_results(prefix, forward_step_func,
                                        valid_data_iterator, model,
                                        iteration, process_non_loss_data_func,
                                        config, False)
+            eval_duration += timers('eval-time').elapsed()
+            eval_iterations += args.eval_iters
+            timers('eval-time').stop()
+            one_logger_utils.track_e2e_metrics()
+
             if args.manual_gc and args.manual_gc_eval:
                 # Collect only the objects created and used in evaluation.
                 gc.collect(generation=0)
+            if args.use_distributed_optimizer and args.overlap_param_gather:
+                optimizer.enable_pre_hook()
             timers('interval-time', log_level=0).start(barrier=True)
 
         # Checkpointing
@@ -1235,31 +1278,36 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
             signal_handler = get_signal_handler()
             if any(signal_handler.signals_received()):
                 save_checkpoint_and_time(iteration, model, optimizer,
-                                         opt_param_scheduler)
+                                         opt_param_scheduler,
+                                         num_floating_point_operations_so_far,
+                                         checkpointing_context)
                 print_datetime('exiting program after receiving SIGTERM.')
                 exit = True
                 break
 
         if args.save and args.save_interval and \
            iteration % args.save_interval == 0:
-            timers('interval-time').stop()
             save_checkpoint_and_time(iteration, model, optimizer,
-                                     opt_param_scheduler)
+                                     opt_param_scheduler,
+                                     num_floating_point_operations_so_far,
+                                     checkpointing_context)
             saved_checkpoint = True
-            timers('interval-time', log_level=0).start(barrier=True)
 
         # Exiting based on duration
         if args.exit_duration_in_mins:
             train_time = (time.time() - _TRAIN_START_TIME) / 60.0
-            done_cuda = torch.cuda.IntTensor(
-                [train_time > args.exit_duration_in_mins])
+            done_cuda = torch.tensor(
+                [train_time > args.exit_duration_in_mins],
+                dtype=torch.int, device='cuda')
             torch.distributed.all_reduce(
                 done_cuda, op=torch.distributed.ReduceOp.MAX)
             done = done_cuda.item()
             if done:
                 if not saved_checkpoint:
                     save_checkpoint_and_time(iteration, model, optimizer,
-                                             opt_param_scheduler)
+                                             opt_param_scheduler,
+                                             num_floating_point_operations_so_far,
+                                             checkpointing_context)
                 print_datetime('exiting program after {} minutes'.format(train_time))
                 exit = True
                 break
@@ -1268,7 +1316,9 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
         if args.exit_interval and iteration % args.exit_interval == 0:
             if args.save and not saved_checkpoint:
                 save_checkpoint_and_time(iteration, model, optimizer,
-                                         opt_param_scheduler)
+                                         opt_param_scheduler,
+                                         num_floating_point_operations_so_far,
+                                         checkpointing_context)
             torch.distributed.barrier()
             print_datetime('exiting program at iteration {}'.format(iteration))
             exit = True
@@ -1283,7 +1333,9 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
             if args.manual_gc_interval != 0 and iteration % args.manual_gc_interval == 0:
                 gc.collect()
 
-    # Flush TensorBoard and WandB writers.
+    one_logger_utils.track_e2e_metrics()
+
+    # Flush TensorBoard, WandB writers and one-logger
     writer = get_tensorboard_writer()
     if writer:
         writer.flush()
@@ -1291,12 +1343,17 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
     if wandb_writer:
         wandb_writer.finish()
 
+    # Close out pre-hooks if using distributed optimizer and overlapped param gather.
+    if args.use_distributed_optimizer and args.overlap_param_gather:
+        optimizer.disable_pre_hook()
+
+    maybe_finalize_async_save(True)
+
     # If any exit conditions (signal handler, duration, iterations) have been reached, exit.
     if exit:
         sys.exit()
 
-    return iteration
-
+    return iteration, num_floating_point_operations_so_far
 
 def evaluate(forward_step_func,
              data_iterator,
@@ -1311,6 +1368,7 @@ def evaluate(forward_step_func,
     timers('evaluate', log_level=0).start(barrier=True)
 
     if args.vision_pretraining and args.vision_pretraining_type == "dino":
+        from megatron.legacy.model.vision.knn_monitor import compute_feature_bank
         compute_feature_bank(model)
 
     # Turn on evaluation mode which disables dropout.
@@ -1345,7 +1403,7 @@ def evaluate(forward_step_func,
             if verbose:
                 print_rank_0(f'Evaluating iter {iteration}/{args.eval_iters}')
 
-            forward_backward_func = get_forward_backward_func()
+            forward_backward_func = get_forward_backward_func(config)
             # Don't care about timing during evaluation
             config.timers = None
             if args.deepspeed and args.ds_pipeline_enabled:
@@ -1358,7 +1416,7 @@ def evaluate(forward_step_func,
                     forward_step_func=forward_step_func,
                     data_iterator=data_iterator,
                     model=model,
-                    num_microbatches=get_num_microbatches(),
+                    num_microbatches=eval_num_microbatches,
                     seq_length=args.seq_length,
                     micro_batch_size=args.micro_batch_size,
                     decoder_seq_length=args.decoder_seq_length,
@@ -1373,15 +1431,23 @@ def evaluate(forward_step_func,
                 # Reduce across processes.
                 for loss_dict in loss_dicts:
                     for key in loss_dict:
-                        total_loss_dict[key] = total_loss_dict.get(
-                            key, torch.cuda.FloatTensor([0.0])) + loss_dict[key]
+                        if key not in total_loss_dict:
+                            total_loss_dict[key] = torch.tensor([0.0, 0.0], dtype=torch.float).cuda()
+                        val = loss_dict[key]
+                        if isinstance(val, tuple) or isinstance(val, list):
+                            total_loss_dict[key][0] += val[0]
+                            total_loss_dict[key][1] += val[1]
+                        else:
+                            total_loss_dict[key][0] += val
+                            total_loss_dict[key][1] += 1
 
             args.consumed_valid_samples += eval_batch_size
 
             if args.exit_duration_in_mins:
                 train_time = (time.time() - _TRAIN_START_TIME) / 60.0
-                done_cuda = torch.cuda.IntTensor(
-                    [train_time > args.exit_duration_in_mins])
+                done_cuda = torch.tensor(
+                    [train_time > args.exit_duration_in_mins],
+                    dtype=torch.int, device='cuda')
                 torch.distributed.all_reduce(
                     done_cuda, op=torch.distributed.ReduceOp.MAX)
                 done = done_cuda.item()
@@ -1407,11 +1473,21 @@ def evaluate(forward_step_func,
         model_module.train()
 
     for key in total_loss_dict:
-        total_loss_dict[key] /= args.eval_iters * eval_num_microbatches
+        numerator, denominator = total_loss_dict[key]
+        total_loss_dict[key] = numerator / denominator
 
     timers('evaluate').stop()
     timers.log(['evaluate'])
 
+    if args.curriculum_learning_legacy and not args.no_pipeline_parallel:
+        # roll back to actual curriculum seqlen at the end of eval.
+        args.curriculum_seqlen = args.curriculum_scheduler.update_difficulty( \
+            args.iteration + 1)
+        if args.curriculum_seqlen < args.seq_length:
+            if args.use_rotary_position_embeddings:
+                update_rotary_pos_emb(args.curriculum_seqlen)
+            model[0].reset_activation_shape()
+
     return total_loss_dict, collected_non_loss_data, False
 
 def evaluate_and_print_results(prefix, forward_step_func,
@@ -1463,38 +1539,6 @@ def evaluate_and_print_results(prefix, forward_step_func,
     print_rank_last(string)
     print_rank_last('-' * length)
 
-
-def cyclic_iter(iter):
-    while True:
-        for x in iter:
-            yield x
-
-
-def build_train_valid_test_datasets(build_train_valid_test_datasets_provider):
-    """Build pretraining datasets."""
-
-    args = get_args()
-
-    # Number of train/valid/test samples.
-    if args.train_samples:
-        train_samples = args.train_samples
-    else:
-        train_samples = args.train_iters * args.global_batch_size
-    eval_iters = (args.train_iters // args.eval_interval + 1) * \
-                 args.eval_iters
-    test_iters = args.eval_iters
-    train_val_test_num_samples = [train_samples,
-                                  eval_iters * args.global_batch_size,
-                                  test_iters * args.global_batch_size]
-    print_rank_0(' > datasets target sizes (minimum size):')
-    print_rank_0('    train:      {}'.format(train_val_test_num_samples[0]))
-    print_rank_0('    validation: {}'.format(train_val_test_num_samples[1]))
-    print_rank_0('    test:       {}'.format(train_val_test_num_samples[2]))
-
-    # Build the datasets.
-    return build_train_valid_test_datasets_provider(train_val_test_num_samples)
-
-
 def build_train_valid_test_data_loaders(
         build_train_valid_test_datasets_provider):
     """Build pretraining data loaders."""
@@ -1519,7 +1563,9 @@ def build_train_valid_test_data_loaders(
     is_distributed = getattr(build_train_valid_test_datasets_provider, "is_distributed", False)
 
     # Construct the data pipeline
-    if is_distributed or mpu.get_tensor_model_parallel_rank() == 0:
+    ds_sequence_parallel = parallel_state.get_sequence_parallel_world_size() > 1 or args.force_ds_sequence_parallel
+    rank_in_parallel_group = parallel_state.get_sequence_parallel_rank() if ds_sequence_parallel else mpu.get_tensor_model_parallel_rank()
+    if is_distributed or rank_in_parallel_group == 0:
 
         # Build datasets.
         train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
@@ -1538,10 +1584,11 @@ def build_train_valid_test_data_loaders(
         do_train = train_dataloader is not None and args.train_iters > 0
         do_valid = valid_dataloader is not None and args.eval_iters > 0
         do_test = test_dataloader is not None and args.eval_iters > 0
-        flags = torch.cuda.LongTensor(
-            [int(do_train), int(do_valid), int(do_test)])
+        flags = torch.tensor(
+            [int(do_train), int(do_valid), int(do_test)],
+            dtype=torch.long, device='cuda')
     else:
-        flags = torch.cuda.LongTensor([0, 0, 0])
+        flags = torch.tensor([0, 0, 0], dtype=torch.long, device='cuda')
 
     torch.distributed.broadcast(flags, 0)
 
@@ -1550,39 +1597,3 @@ def build_train_valid_test_data_loaders(
     args.do_test = getattr(args, "do_test", False) or flags[2].item()
 
     return train_dataloader, valid_dataloader, test_dataloader
-
-
-def build_train_valid_test_data_iterators(
-        build_train_valid_test_datasets_provider):
-    """Build pretraining data iterators."""
-
-    args = get_args()
-
-    # Build loaders.
-    train_dataloader, valid_dataloader, test_dataloader = \
-        build_train_valid_test_data_loaders(
-            build_train_valid_test_datasets_provider)
-
-    # Build iterators.
-    dl_type = args.dataloader_type
-    assert dl_type in ['single', 'cyclic']
-
-    if train_dataloader is not None:
-        train_data_iterator = iter(train_dataloader) if dl_type == 'single' \
-                              else iter(cyclic_iter(train_dataloader))
-    else:
-        train_data_iterator = None
-
-    if valid_dataloader is not None:
-        valid_data_iterator = iter(valid_dataloader) if dl_type == 'single' \
-                              else iter(cyclic_iter(valid_dataloader))
-    else:
-        valid_data_iterator = None
-
-    if test_dataloader is not None:
-        test_data_iterator = iter(test_dataloader) if dl_type == 'single' \
-                             else iter(cyclic_iter(test_dataloader))
-    else:
-        test_data_iterator = None
-
-    return train_data_iterator, valid_data_iterator, test_data_iterator
diff --git a/toolbox/Megatron-DeepSpeed/megatron_ds/utils.py b/toolbox/Megatron-DeepSpeed/megatronspeed/training/utils.py
similarity index 54%
rename from toolbox/Megatron-DeepSpeed/megatron_ds/utils.py
rename to toolbox/Megatron-DeepSpeed/megatronspeed/training/utils.py
index c687302f0d43e683ae839d38aec373884437bfd2..57f44c580d33e21a09a9bc3cde33b089048a83ea 100644
--- a/toolbox/Megatron-DeepSpeed/megatron_ds/utils.py
+++ b/toolbox/Megatron-DeepSpeed/megatronspeed/training/utils.py
@@ -1,34 +1,21 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
 """General utilities."""
 
 import sys
 import os
+
 import torch
+from torch.nn.parallel import DistributedDataParallel as torchDDP
 
 from deepspeed.accelerator import get_accelerator
-try:
-    from apex.multi_tensor_apply import multi_tensor_applier
-except ImportError:
-    multi_tensor_applier = None
 
-try:
-    import amp_C
-except ImportError:
-    amp_C = None
-
-from megatron_ds import (
+from megatron.core import mpu
+from megatron.training import (
     get_args,
     get_adlr_autoresume,
-    get_num_microbatches
 )
-from megatron_ds.core import DistributedDataParallel as DDP
-from megatron_ds.core import mpu
-from megatron_ds.core.tensor_parallel import param_is_not_tensor_parallel_duplicate
-from megatron_ds.model import Float16Module
-from megatron_ds.model.module import param_is_not_shared
-from megatron_ds.model.rotary_pos_embedding import RotaryEmbedding
-
+from megatron.core.num_microbatches_calculator import get_num_microbatches
+from megatron.training.utils import print_rank_0
+from megatronspeed.legacy.model.rotary_pos_embedding import RotaryEmbedding
 
 def update_rotary_pos_emb(seq_length):
     args = get_args()
@@ -46,138 +33,6 @@ def update_rotary_pos_emb(seq_length):
     args.rotary_pos_emb = rotary_pos_emb
 
 
-ALL_MODULE_WRAPPER_CLASSNAMES = (DDP, Float16Module)
-
-
-def unwrap_model(model, module_instances=ALL_MODULE_WRAPPER_CLASSNAMES):
-    return_list = True
-    if not isinstance(model, list):
-        model = [model]
-        return_list = False
-    unwrapped_model = []
-    for model_module in model:
-        while isinstance(model_module, module_instances):
-            model_module = model_module.module
-        unwrapped_model.append(model_module)
-    if not return_list:
-        return unwrapped_model[0]
-    return unwrapped_model
-
-
-def calc_params_l2_norm(model):
-    """Calculate l2 norm of parameters """
-    args = get_args()
-    if not isinstance(model, list):
-        model = [model]
-    # Remove duplicate params.
-    params_data = []
-    for model_ in model:
-        for param in model_.parameters():
-            is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
-            if mpu.get_expert_model_parallel_rank() > 0:
-                if not getattr(param, 'allreduce', True) and is_not_tp_duplicate:
-                    assert param_is_not_shared(param)
-                    params_data.append(param.data.float() if args.bf16 else param.data)
-            else:
-                is_not_shared = param_is_not_shared(param)
-                if is_not_shared and is_not_tp_duplicate:
-                    params_data.append(param.data.float() if args.bf16 else param.data)
-
-    # Check the availability of apex
-    assert multi_tensor_applier is not None and amp_C is not None, \
-        "apex is not available, please install it from https://github.com/NVIDIA/apex"
-
-    # Calculate norm
-    dummy_overflow_buf = torch.cuda.IntTensor([0])
-    norm, _ = multi_tensor_applier(
-        amp_C.multi_tensor_l2norm,
-        dummy_overflow_buf,
-        [params_data],
-        False # no per-parameter norm
-    )
-    norm_2 = norm * norm
-    if mpu.get_expert_model_parallel_world_size() == 1:
-        # Sum across all model-parallel GPUs(tensor + pipeline).
-        torch.distributed.all_reduce(norm_2,
-                                     op=torch.distributed.ReduceOp.SUM,
-                                     group=mpu.get_model_parallel_group())
-    else:
-        # Sum across tensor, pipeline and expert model-parallel GPUs.
-        torch.distributed.all_reduce(norm_2,
-                                     op=torch.distributed.ReduceOp.SUM,
-                                     group=mpu.get_tensor_and_expert_parallel_group())
-        torch.distributed.all_reduce(norm_2,
-                                     op=torch.distributed.ReduceOp.SUM,
-                                     group=mpu.get_pipeline_model_parallel_group())
-    return norm_2.item() ** 0.5
-
-
-def average_losses_across_data_parallel_group(losses):
-    """Reduce a tensor of losses across all GPUs."""
-    averaged_losses = torch.cat(
-        [loss.clone().detach().view(1) for loss in losses])
-    torch.distributed.all_reduce(averaged_losses,
-                                 group=mpu.get_data_parallel_group())
-    averaged_losses = averaged_losses / \
-        torch.distributed.get_world_size(group=mpu.get_data_parallel_group())
-
-    return averaged_losses
-
-
-def report_memory(name):
-    """Simple GPU memory report."""
-    mega_bytes = 1024.0 * 1024.0
-    string = name + ' memory (MB)'
-    string += ' | allocated: {}'.format(
-        torch.cuda.memory_allocated() / mega_bytes)
-    string += ' | max allocated: {}'.format(
-        torch.cuda.max_memory_allocated() / mega_bytes)
-    string += ' | reserved: {}'.format(
-        torch.cuda.memory_reserved() / mega_bytes)
-    string += ' | max reserved: {}'.format(
-        torch.cuda.max_memory_reserved() / mega_bytes)
-    if mpu.get_data_parallel_rank() == 0:
-        print("[Rank {}] {}".format(torch.distributed.get_rank(), string),
-              flush=True)
-
-
-def print_params_min_max_norm(optimizer, iteration):
-    """Print min, max, and norm of all parameters."""
-    index = 0
-    rank = torch.distributed.get_rank()
-    string = 'iteration, rank, index, tensor-model-parallel, min, max, norm\n'
-    optimizer_ = optimizer.optimizer
-    for param_group in optimizer_.param_groups:
-        for param in param_group['params']:
-            index += 1
-            min_ = param.data.min()
-            max_ = param.data.max()
-            norm = torch.linalg.norm(param.data)
-            string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format(
-                iteration, rank, index, int(param.tensor_model_parallel))
-            string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm)
-    print(string, flush=True)
-
-
-def check_adlr_autoresume_termination(iteration, model,
-                                      optimizer, opt_param_scheduler):
-    """Check for autoresume signal and exit if it is received."""
-    from megatron_ds.checkpointing import save_checkpoint
-
-    args = get_args()
-    autoresume = get_adlr_autoresume()
-    # Add barrier to ensure consistnecy.
-    torch.distributed.barrier()
-    if autoresume.termination_requested():
-        if args.save:
-            save_checkpoint(iteration, model, optimizer, opt_param_scheduler)
-        print_rank_0(">>> autoresume termination request found!")
-        if torch.distributed.get_rank() == 0:
-            autoresume.request_resume()
-        print_rank_0(">>> training terminated. Returning")
-        sys.exit(0)
-
-
 def get_ltor_masks_and_position_ids(data,
                                     eod_token,
                                     reset_position_ids,
@@ -194,6 +49,7 @@ def get_ltor_masks_and_position_ids(data,
         att_mask_batch = micro_batch_size
     else:
         att_mask_batch = 1
+    attention_mask = None
     if not skip_mask:
         attention_mask = torch.tril(torch.ones(
             (att_mask_batch, seq_length, seq_length), device=data.device)).view(
@@ -241,61 +97,11 @@ def get_ltor_masks_and_position_ids(data,
     return attention_mask, loss_mask, position_ids
 
 
-def get_batch_on_this_cp_rank(batch):
-    """ Slice batch input along sequence dimension into multiple chunks,
-        which are parallelized across GPUs in a context parallel group.
-    """
-
-    # With causal masking, each token only attends to its prior tokens. Simply split
-    # sequence into CP chunks can result in severe load imbalance. That's to say, chunks
-    # at the end of sequence have bigger workload than others. To address this issue,
-    # we split sequence into 2*CP ranks. Assuming CP=2, we then get 4 chunks, chunk_0
-    # and chunk_3 are assigned to GPU0, chunk_1 and chunk_2 are assigned to GPU1, so
-    # that we can get balanced workload among GPUs in a context parallel group.
-    args = get_args()
-    cp_size = args.context_parallel_size
-    if cp_size > 1:
-        cp_rank = mpu.get_context_parallel_rank()
-        for key, val in batch.items():
-            seq_dim = 1 if key != 'attention_mask' else 2
-            val = val.view(
-                *val.shape[0:seq_dim],
-                2 * cp_size,
-                val.shape[seq_dim] // (2 * cp_size),
-                *val.shape[(seq_dim + 1) :],
-            )
-            index = torch.tensor([cp_rank, (2 * cp_size - cp_rank - 1)], device=val.device)
-            val = val.index_select(seq_dim, index)
-            val = val.view(*val.shape[0:seq_dim], -1, *val.shape[(seq_dim + 2) :])
-            batch[key] = val
-
-    return batch
-
-
-def print_rank_0(message):
-    """If distributed is initialized, print only on rank 0."""
-    if torch.distributed.is_initialized():
-        if torch.distributed.get_rank() == 0:
-            print(message, flush=True)
-    else:
-        print(message, flush=True)
-
-def is_last_rank():
-    return torch.distributed.get_rank() == (
-        torch.distributed.get_world_size() - 1)
-
-def print_rank_last(message):
-    """If distributed is initialized, print only on last rank."""
-    if torch.distributed.is_initialized():
-        if is_last_rank():
-            print(message, flush=True)
-    else:
-        print(message, flush=True)
-
 def is_aml():
     # Are we running inside an Azure Machine Learning (AML) environment?
     return 'AZUREML_EXPERIMENT_ID' in os.environ
 
+
 def is_rank_0():
     """Check whether it is rank 0. For AML, check if it is rank 0 of a node"""
     if torch.distributed.is_initialized():
@@ -443,3 +249,4 @@ def dump_weights(preamble, iteration, model, optimizer, tensor=None):
                 p = model[0].module.tied_modules.embed.word_embeddings.weight._hp_param
                 fh.write(f"{get_fingerprint(p)} module.tied_modules.embed.word_embeddings.weight._hp_param {p.shape}\n")
 
+
diff --git a/toolbox/Megatron-DeepSpeed/pretrain_bert.py b/toolbox/Megatron-DeepSpeed/pretrain_bert.py
deleted file mode 100644
index f5c553029c432073de6a0c75278e46e43daa23d6..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/pretrain_bert.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
-
-"""Pretrain BERT"""
-
-from functools import partial
-
-import torch
-import torch.nn.functional as F
-
-from megatron.training import get_args
-from megatron.training import get_tokenizer
-from megatron.training import print_rank_0
-from megatron.training import get_timers
-from megatron.core import tensor_parallel
-from megatron.core.enums import ModelType
-import megatron.legacy.model
-from megatron.core.models.bert.bert_model import BertModel
-from megatron.training import pretrain
-from megatron.training.utils import average_losses_across_data_parallel_group
-from megatron.training.arguments import core_transformer_config_from_args
-from megatron.core.transformer.spec_utils import import_module
-from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec, bert_layer_local_spec
-from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
-from megatron.core.datasets.bert_dataset import BERTMaskedWordPieceDataset, BERTMaskedWordPieceDatasetConfig
-from megatron.core.datasets.utils import get_blend_from_list
-from megatron.core import mpu, tensor_parallel
-
-
-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
-
-    print_rank_0('building BERT model ...')
-
-    args = get_args()
-    config = core_transformer_config_from_args(args)
-    num_tokentypes = 2 if args.bert_binary_head else 0
-
-    if args.use_legacy_models:
-        model = megatron.legacy.model.BertModel(
-            config=config,
-            num_tokentypes=num_tokentypes,
-            add_binary_head=args.bert_binary_head,
-            parallel_output=True,
-            pre_process=pre_process,
-            post_process=post_process)
-    else:
-        if args.spec is None:
-            transformer_layer_spec = bert_layer_with_transformer_engine_spec #default spec
-        elif args.spec[0] == 'local':
-            print_rank_0('Using Local spec for transformer layers')
-            transformer_layer_spec = bert_layer_local_spec
-        else :
-            transformer_layer_spec = import_module(args.spec)
-
-        model = BertModel(
-            config=config,
-            transformer_layer_spec=transformer_layer_spec,
-            vocab_size=args.padded_vocab_size,
-            max_sequence_length=args.max_position_embeddings,
-            num_tokentypes=num_tokentypes,
-            add_binary_head=args.bert_binary_head,
-            share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
-            parallel_output=True,
-            pre_process=pre_process,
-            post_process=post_process)
-
-    return model
-
-
-def get_batch(data_iterator):
-    """Build the batch."""
-
-    # Items and their type.
-    keys = ['text', 'types', 'labels',
-            'is_random', 'loss_mask', 'padding_mask']
-    datatype = torch.int64
-
-    # Broadcast data.
-    if data_iterator is not None:
-        data = next(data_iterator)
-    else:
-        data = None
-    data_b = tensor_parallel.broadcast_data(keys, data, datatype)
-
-    # Unpack.
-    tokens = data_b['text'].long()
-    types = data_b['types'].long()
-    sentence_order = data_b['is_random'].long()
-    loss_mask = data_b['loss_mask'].float()
-    lm_labels = data_b['labels'].long()
-    padding_mask = data_b['padding_mask'].long()
-
-    return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask
-
-
-def loss_func(loss_mask, sentence_order, output_tensor):
-    lm_loss_, sop_logits = output_tensor
-
-    lm_loss_ = lm_loss_.float()
-    loss_mask = loss_mask.float()
-    lm_loss = torch.sum(
-        lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()
-
-    if sop_logits is not None:
-        sop_loss = F.cross_entropy(sop_logits.view(-1, 2).float(),
-                                   sentence_order.view(-1),
-                                   ignore_index=-1)
-        sop_loss = sop_loss.float()
-        loss = lm_loss + sop_loss
-        averaged_losses = average_losses_across_data_parallel_group(
-            [lm_loss, sop_loss])
-        return loss, {'lm loss': averaged_losses[0],
-                      'sop loss': averaged_losses[1]}
-    else:
-        loss = lm_loss
-        averaged_losses = average_losses_across_data_parallel_group(
-            [lm_loss])
-        return loss, {'lm loss': averaged_losses[0]}
-
-
-def forward_step(data_iterator, model):
-    """Forward step."""
-    args = get_args()
-    timers = get_timers()
-
-    # Get the batch.
-    timers('batch-generator', log_level=2).start()
-    tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = get_batch(
-        data_iterator)
-    timers('batch-generator').stop()
-
-    if not args.bert_binary_head:
-        types = None
-
-    # Forward pass through the model.
-    output_tensor = model(tokens, padding_mask,
-                          tokentype_ids=types, lm_labels=lm_labels)
-
-    return output_tensor, partial(loss_func, loss_mask, sentence_order)
-
-
-def train_valid_test_datasets_provider(train_val_test_num_samples):
-    """Build train, valid, and test datasets."""
-    args = get_args()
-
-    tokenizer = get_tokenizer()
-
-    config = BERTMaskedWordPieceDatasetConfig(
-        random_seed=args.seed,
-        sequence_length=args.seq_length,
-        blend=get_blend_from_list(args.data_path),
-        blend_per_split=[
-            get_blend_from_list(args.train_data_path),
-            get_blend_from_list(args.valid_data_path),
-            get_blend_from_list(args.test_data_path)
-        ],
-        split=args.split,
-        path_to_cache=args.data_cache_path,
-        tokenizer=tokenizer,
-        masking_probability=args.mask_prob,
-        short_sequence_probability=args.short_seq_prob,
-        masking_max_ngram=3,
-        masking_do_full_word=True,
-        masking_do_permutation=False,
-        masking_use_longer_ngrams=False,
-        masking_use_geometric_distribution=False,
-        classification_head=args.bert_binary_head,
-    )
-
-    print_rank_0('> building train, validation, and test datasets '
-                 'for BERT ...')
-
-    train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
-        BERTMaskedWordPieceDataset,
-        train_val_test_num_samples,
-        lambda: mpu.get_tensor_model_parallel_rank() == 0,
-        config,
-    ).build()
-
-    print_rank_0("> finished creating BERT datasets ...")
-
-    return train_ds, valid_ds, test_ds
-
-
-if __name__ == "__main__":
-
-    # Temporary for transition to core datasets
-    train_valid_test_datasets_provider.is_distributed = True
-
-    pretrain(train_valid_test_datasets_provider, model_provider,
-             ModelType.encoder_or_decoder,
-             forward_step, args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
diff --git a/toolbox/Megatron-DeepSpeed/pretrain_ict.py b/toolbox/Megatron-DeepSpeed/pretrain_ict.py
deleted file mode 100644
index 205588b5e9572ee4970925727ce0974c007e3259..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/pretrain_ict.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
-
-"""Pretrain BERT for Inverse Cloze Task"""
-
-from functools import partial
-import math
-
-import torch
-import torch.distributed as dist
-import torch.nn.functional as F
-
-from megatron.training import get_args
-from megatron.training import print_rank_0
-from megatron.training import get_timers
-from megatron.core import mpu
-from megatron.core.enums import ModelType
-from megatron.legacy.data.biencoder_dataset_utils import get_ict_batch
-from megatron.legacy.data.dataset_utils import build_train_valid_test_datasets
-from megatron.legacy.model.biencoder_model import biencoder_model_provider
-from megatron.training import pretrain
-from megatron.training.utils import average_losses_across_data_parallel_group
-
-
-def pretrain_ict_model_provider(pre_process=True, post_process=True):
-    args = get_args()
-
-    model = biencoder_model_provider(
-                only_context_model=False,
-                only_query_model=False,
-                biencoder_shared_query_context_model=\
-                args.biencoder_shared_query_context_model,
-                pre_process=pre_process, post_process=post_process)
-
-    return model
-
-def get_group_world_size_rank():
-
-    group = mpu.get_data_parallel_group()
-    rank = torch.distributed.get_rank(group=group)
-    world_size = torch.distributed.get_world_size(group=group)
-
-    return group, rank, world_size
-
-
-class AllgatherFromDataParallelRegion(torch.autograd.Function):
-
-    @staticmethod
-    def forward(ctx, input_):
-        assert input_.dim() == 2
-        group, rank, world_size = get_group_world_size_rank()
-
-        tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
-        tensor_list[rank] = input_
-        torch.distributed.all_gather(tensor_list, input_, group=group)
-
-        output = torch.cat(tensor_list, dim=0).contiguous()
-
-        return output
-
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        group, rank, world_size = get_group_world_size_rank()
-
-        assert grad_output.shape[0] % world_size == 0
-        dim_size = grad_output.shape[0] // world_size
-        output_list = torch.split(grad_output, dim_size, dim=0)
-
-        # get chunk from this rank
-        output = output_list[rank].contiguous()
-        return output
-
-def loss_func(output_tensor):
-    args = get_args()
-    query_logits, context_logits = output_tensor
-
-    micro_batch_size = query_logits.shape[0]
-    # recall we assert that tensor_model_parallel_size == 1
-    assert mpu.get_tensor_model_parallel_world_size() == 1, \
-        "Model parallel size > 1 not supported for ICT"
-
-    global_batch_size = dist.get_world_size() * micro_batch_size
-    all_query_logits = AllgatherFromDataParallelRegion.apply(query_logits)
-    all_context_logits = AllgatherFromDataParallelRegion.apply(context_logits)
-
-    # scores are inner products between query and context embeddings
-    retrieval_scores = torch.matmul(all_query_logits,
-                        torch.transpose(all_context_logits, 0, 1))
-    # scaling the retriever scores
-    if args.retriever_score_scaling:
-        retrieval_scores = retrieval_scores / math.sqrt(args.hidden_size)
-
-    softmax_scores = F.log_softmax(retrieval_scores, dim=1)
-    sorted_vals, sorted_indices = torch.topk(softmax_scores,
-                                    k=softmax_scores.shape[1], sorted=True)
-
-    def topk_accuracy(k):
-        return torch.cuda.FloatTensor([sum([int(i in sorted_indices[i, :k]) \
-            for i in range(global_batch_size)]) / global_batch_size])
-
-    topk_accs = [topk_accuracy(int(k)) for k in args.retriever_report_topk_accuracies]
-
-    labels = torch.arange(global_batch_size).long().cuda()
-    loss = F.nll_loss(softmax_scores, labels, reduction='mean')
-    reduced_losses = average_losses_across_data_parallel_group([loss, *topk_accs])
-
-    # Scale the retrieval loss
-    loss = loss * mpu.get_data_parallel_world_size()
-
-    # create stats_dict with retrieval loss and all specified top-k accuracies
-    topk_acc_dict = {'top{}_acc'.format(k): v * 100 for k, v in \
-                        zip(args.retriever_report_topk_accuracies, reduced_losses[1:])}
-    stats_dict = dict(loss=reduced_losses[0], **topk_acc_dict)
-    return loss, stats_dict
-
-
-
-def forward_step(data_iterator, model):
-    """Forward step."""
-    args = get_args()
-    timers = get_timers()
-
-    # Get the batch.
-    timers('batch-generator', log_level=2).start()
-    query_tokens, query_mask, \
-    context_tokens, context_mask, context_indices = get_ict_batch(data_iterator)
-    timers('batch-generator').stop()
-
-    # Query and Context Types
-    query_types = torch.cuda.LongTensor(*query_tokens.shape).fill_(0)
-    context_types = torch.cuda.LongTensor(*context_tokens.shape).fill_(0)
-
-    # Forward model.
-    output_tensor = model(query_tokens, query_mask, query_types, context_tokens,
-                        context_mask, context_types)
-
-    return output_tensor, partial(loss_func)
-
-def train_valid_test_datasets_provider(train_val_test_num_samples):
-    """Build train, valid and test datasets."""
-    args = get_args()
-    print_rank_0('> building train, validation, and test datasets '
-                 'for BERT ICT...')
-
-    train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
-        data_prefix=args.data_path,
-        splits_string=args.split,
-        train_valid_test_num_samples=train_val_test_num_samples,
-        max_seq_length=args.seq_length,
-        masked_lm_prob=args.mask_prob,
-        short_seq_prob=args.short_seq_prob,
-        seed=args.seed,
-        binary_head=False,
-        dataset_type='ict')
-    print_rank_0("> finished creating BERT ICT datasets ...")
-
-    return train_ds, valid_ds, test_ds
-
-
-if __name__ == "__main__":
-    print_rank_0("WARNING : This script is DEPRECATED. Will be removed in mcore release 0.9")
-    pretrain(train_valid_test_datasets_provider,
-             pretrain_ict_model_provider,
-             ModelType.encoder_or_decoder,
-             forward_step,
-             args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
diff --git a/toolbox/Megatron-DeepSpeed/pretrain_retro.py b/toolbox/Megatron-DeepSpeed/pretrain_retro.py
deleted file mode 100644
index a0d8f9d9221103a026e005b8585d2f7f3166ad4b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/pretrain_retro.py
+++ /dev/null
@@ -1,244 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-
-"""Pretrain Retro."""
-
-from functools import partial
-import torch
-
-from megatron.training import get_args
-from megatron.training import get_timers
-from megatron.training import get_tokenizer
-from megatron.training import print_rank_0
-from megatron.training.arguments import core_transformer_config_from_args
-from megatron.core import tensor_parallel
-from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
-from megatron.core.datasets.utils import get_blend_from_list
-from megatron.core.datasets.retro.query.retro_dataset import get_retro_datasets
-from megatron.core.datasets.retro.query.multi_split_gpt_dataset import MultiSplitGPTDataset, MultiSplitGPTDatasetConfig
-from megatron.core.enums import ModelType
-from megatron.core.models.retro import get_retro_decoder_block_spec, RetroConfig, RetroModel
-from megatron.core.models.retro.utils import get_all_true_mask
-from megatron.training import pretrain
-from megatron.training.utils import get_ltor_masks_and_position_ids
-from pretrain_gpt import (
-    is_dataset_built_on_rank,
-    loss_func,
-    model_provider as default_model_provider,
-    train_valid_test_datasets_provider as gpt_train_valid_test_datasets_provider,
-)
-
-
-def get_retro_config():
-    return core_transformer_config_from_args(get_args(), RetroConfig)
-
-
-def core_model_provider(pre_process=True, post_process=True):
-    """Build the model using Megatron-Core."""
-
-    args = get_args()
-    config = get_retro_config()
-
-    # NOTE: Experimental customization feature
-    if args.spec is not None:
-        block_spec = import_module(args.spec)()
-    else:
-        block_spec = get_retro_decoder_block_spec(config, use_transformer_engine=True)
-
-    print_rank_0('building GPT model ...')
-    model = RetroModel(
-        config=config,
-        transformer_layer_spec=block_spec,
-        vocab_size=args.padded_vocab_size,
-        max_sequence_length=args.max_position_embeddings,
-        pre_process=pre_process,
-        post_process=post_process,
-        fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
-        parallel_output=True,
-        share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
-        position_embedding_type=args.position_embedding_type,
-        rotary_percent=args.rotary_percent
-    )
-    return model
-
-
-def model_provider(pre_process=True, post_process=True):
-    """Build the model.
-
-    Select between two different model classes:
-      1. Default model (uses megatron.legacy.models/gpt_model.py).
-      2. Core model (uses megatron/core/models/retro/model.py).
-    """
-
-    args = get_args()
-    if not args.use_legacy_models and args.retro_add_retriever:
-        provider = core_model_provider
-    else:
-        provider = default_model_provider
-    model = provider(pre_process=pre_process, post_process=post_process)
-    return model
-
-
-def get_batch(data_iterator):
-    """Generate a batch"""
-
-    args = get_args()
-    tokenizer = get_tokenizer()
-    config = get_retro_config()
-
-    # Items and their type.
-    keys = ['text']
-    if args.retro_add_retriever:
-        keys.append('neighbor_tokens')
-    datatype = torch.int64
-
-    # Broadcast data.
-    if data_iterator is not None:
-        data = next(data_iterator)
-    else:
-        data = None
-
-    data_b = tensor_parallel.broadcast_data(keys, data, datatype)
-
-    # Unpack.
-    tokens_ = data_b['text'].long()
-    labels = tokens_[:, 1:].contiguous()
-    tokens = tokens_[:, :-1].contiguous()
-
-    # Get the masks and postition ids.
-    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
-        tokens,
-        tokenizer.eod,
-        args.reset_position_ids,
-        args.reset_attention_mask,
-        args.eod_mask_loss)
-
-    if args.retro_add_retriever:
-        # note: [bs * l * k, r]
-        # note: 2x == neighbor, continuation
-        neighbor_tokens = data_b['neighbor_tokens'] \
-            .view(-1, config.retro_retrieved_length).long()
-        _, _, neighbor_position_ids = get_ltor_masks_and_position_ids(
-            neighbor_tokens,
-            tokenizer.eod,
-            args.reset_position_ids,
-            args.reset_attention_mask,
-            args.eod_mask_loss)
-        neighbor_attention_mask = get_all_true_mask(
-            (1, 1, config.retro_retrieved_length, config.retro_retrieved_length),
-            neighbor_tokens.device)
-        return tokens, labels, loss_mask, attention_mask, position_ids, \
-               neighbor_tokens, neighbor_attention_mask, neighbor_position_ids
-
-    else:
-        return tokens, labels, loss_mask, attention_mask, position_ids
-
-
-def forward_step(data_iterator, model):
-    """Forward step."""
-    args = get_args()
-    timers = get_timers()
-
-    # Get the batch.
-    timers('batch-generator').start()
-    if args.retro_add_retriever:
-        tokens, labels, loss_mask, attention_mask, position_ids, \
-            neighbor_tokens, neighbor_attention_mask, neighbor_position_ids = \
-                get_batch(data_iterator)
-    else:
-        tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
-            data_iterator)
-        neighbor_tokens, neighbor_attention_mask, neighbor_position_ids = \
-            None, None, None
-    timers('batch-generator').stop()
-
-    # Model call.
-    if args.use_legacy_models:
-        forward_kwargs = {
-            "retriever_input_ids" : neighbor_tokens,
-            "retriever_position_ids" : neighbor_position_ids,
-            "retriever_attn_mask" : neighbor_attention_mask,
-        }
-    else:
-        if args.retro_add_retriever:
-            forward_kwargs = {
-                "context_input_ids" : neighbor_tokens,
-                "context_position_ids" : neighbor_position_ids,
-                "context_mask" : neighbor_attention_mask,
-            }
-        else:
-            forward_kwargs = {}
- 
-    output_tensor = model(tokens, position_ids, attention_mask,
-                          labels=labels, **forward_kwargs)
-
-    return output_tensor, partial(loss_func, loss_mask)
-
-
-def train_valid_test_datasets_provider(train_valid_test_num_samples):
-    """Build train, valid, and test datasets."""
-    args = get_args()
-
-    # Dataset config.
-    retro_config = get_retro_config()
-    data_config = MultiSplitGPTDatasetConfig(
-        random_seed=args.seed,
-        sequence_length=args.seq_length,
-        blend=get_blend_from_list(args.data_path),
-        blend_per_split=[
-            get_blend_from_list(args.train_data_path),
-            get_blend_from_list(args.valid_data_path),
-            get_blend_from_list(args.test_data_path)
-        ],
-        split=args.split,
-        split_preprocessing=retro_config.retro_split_preprocessing,
-        path_to_cache=args.data_cache_path,
-        return_document_ids=False,
-        tokenizer=get_tokenizer(),
-        reset_position_ids=args.reset_position_ids,
-        reset_attention_mask=args.reset_attention_mask,
-        eod_mask_loss=args.eod_mask_loss,
-    )
-
-    # GPT datasets.
-    print_rank_0(" > multi-split gpt datasets.")
-    train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
-        MultiSplitGPTDataset,
-        train_valid_test_num_samples,
-        is_dataset_built_on_rank,
-        data_config,
-    ).build()
-
-    gpt_datasets = {
-        "train" : (train_ds, train_valid_test_num_samples[0]),
-        "valid" : (valid_ds, train_valid_test_num_samples[1]),
-        "test"  : (test_ds, train_valid_test_num_samples[2]),
-    }
-
-    # Retro datasets.
-    if args.retro_add_retriever:
-        return get_retro_datasets(
-            config=retro_config,
-            gpt_datasets=gpt_datasets,
-            sample_length=args.seq_length,
-            eod_token_id=get_tokenizer().eod,
-        )
-
-    # Multi-split GPT datasets.
-    else:
-        return (
-            gpt_datasets["train"][0],
-            gpt_datasets["valid"][0],
-            gpt_datasets["test"][0],
-        )
-
-
-if __name__ == "__main__":
-
-    # Temporary for transition to core datasets.
-    train_valid_test_datasets_provider.is_distributed = True
-
-    pretrain(train_valid_test_datasets_provider,
-             model_provider,
-             ModelType.retro_decoder,
-             forward_step,
-             args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
diff --git a/toolbox/Megatron-DeepSpeed/pretrain_t5.py b/toolbox/Megatron-DeepSpeed/pretrain_t5.py
deleted file mode 100644
index e9702c30721b899b181d841047a7224fe0bb73b1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/pretrain_t5.py
+++ /dev/null
@@ -1,263 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
-
-"""Pretrain T5"""
-
-from functools import partial
-from typing import Union
-
-import torch
-
-from megatron.training import (
-    get_args,
-    get_timers,
-    get_tokenizer,
-    print_rank_0
-)
-from megatron.core import mpu, tensor_parallel
-from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
-from megatron.core.datasets.t5_dataset import (
-    T5MaskedWordPieceDataset,
-    T5MaskedWordPieceDatasetConfig,
-)
-from megatron.core.enums import ModelType
-from megatron.core.models.T5 import T5Model
-from megatron.training import pretrain
-from megatron.training.arguments import core_transformer_config_from_args
-from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
-from megatron.core.datasets.t5_dataset import T5MaskedWordPieceDataset, T5MaskedWordPieceDatasetConfig
-from megatron.core.datasets.utils import get_blend_from_list
-from megatron.core.models.T5.t5_spec import (get_t5_encoder_with_transformer_engine_block_spec,
-                                            get_t5_decoder_with_transformer_engine_block_spec,
-                                            get_t5_encoder_with_local_block_spec,
-                                            get_t5_decoder_with_local_block_spec)
-from megatron.legacy.model import T5Model as LegacyT5Model
-
-"""
-Pipeline parallelism for T5
-(Caveat: currently, mcore T5 model has not supported pipeline-parallelism)
-===========================
-
-T5 is a model architecture with both encoder and decoder blocks.
-Consequently, pipeline parallelism is implemented slightly differently
-compared to architectures like GPT and BERT.
-
-In particular, when pipeline_model_parallel_world_size > 1, each stage
-either executes an encoder block or a decoder block. The
---pipeline-model-parallel-split-rank argument controls the rank at which
-the split happens: all ranks lower than this argument execute the
-encoder block, and all ranks equal to or higher than this argument value
-execute the decoder block.
-
-In the encoder section of the model, only one tensor is sent downstream:
-the intermediate encoder_hidden_state. In the decoder section of the
-model, two tensors are sent downstream in the forward pass: the fully
-computed encoder_hidden_state, and the intermediate decoder_hidden_state.
-
-In particular, these are the shapes of the tensors sent between
-different workers:
-    If rank is in decoder section:
-        intermediate decoder_hidden_state (pre-transpose),
-        complete encoder_hidden_state (post-transpose).
-    If rank is at boundary between encoder and decoder sections:
-        complete encoder_hidden_state (post-transpose).
-    If rank is in encoder section:
-        intermediate encoder_hidden_state (pre-transpose).
-
-Additionally, we have code in the backward_step function in schedules.py
-to accumulate the encoder_hidden_state gradient across skip connections
-(encoder_hidden_state fed in as input to each layer in the decoder).
-"""
-
-
-def model_provider(
-    pre_process=True, post_process=True, add_encoder=True, add_decoder=True
-) -> Union[LegacyT5Model, T5Model]:
-    """Builds the model.
-
-    Args:
-        pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True.
-        post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True.
-        add_encoder (bool, optional): Defaults to True
-        add_decoder (bool, optional): Defaults to True
-    Returns:
-        T5Model: The returned T5 model
-    """
-
-    args = get_args()
-    config = core_transformer_config_from_args(args)
-    if args.use_legacy_models:
-        model = LegacyT5Model(
-            config=config,
-            num_tokentypes=0,
-            parallel_output=True,
-            pre_process=pre_process,
-            post_process=post_process,
-            add_encoder=add_encoder,
-            add_decoder=add_decoder,
-        )
-    else:
-        if args.transformer_impl == "local":
-            en_block_spec = get_t5_encoder_with_local_block_spec(args.encoder_num_layers)
-            de_block_spec = get_t5_decoder_with_local_block_spec(args.decoder_num_layers)
-        elif args.transformer_impl == "transformer_engine":
-            en_block_spec = get_t5_encoder_with_transformer_engine_block_spec(
-                args.encoder_num_layers
-            )
-            de_block_spec = get_t5_decoder_with_transformer_engine_block_spec(
-                args.decoder_num_layers
-            )
-        print_rank_0('building T5 model ...')
-        model = T5Model(
-            config=config,
-            transformer_encoder_layer_spec=en_block_spec,
-            transformer_decoder_layer_spec=de_block_spec,
-            vocab_size=args.padded_vocab_size,
-            max_sequence_length=args.max_position_embeddings,
-            pre_process=pre_process,
-            post_process=post_process,
-            fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
-            parallel_output=True,
-            share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
-            position_embedding_type=args.position_embedding_type,
-            rotary_percent=args.rotary_percent,
-        )
-
-    return model
-
-
-def get_batch(data_iterator):
-    """Build the batch."""
-
-    keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', 'enc_mask', 'dec_mask', 'enc_dec_mask']
-    datatype = torch.int64
-
-    # Broadcast data.
-    if data_iterator is not None:
-        data = next(data_iterator)
-    else:
-        data = None
-    data_b = tensor_parallel.broadcast_data(keys, data, datatype)
-
-    # Unpack.
-    tokens_enc = data_b['text_enc'].long()
-    tokens_dec = data_b['text_dec'].long()
-    labels = data_b['labels'].long()
-    loss_mask = data_b['loss_mask'].float()
-
-    enc_mask = data_b['enc_mask'] < 0.5
-    dec_mask = data_b['dec_mask'] < 0.5
-    enc_dec_mask = data_b['enc_dec_mask'] < 0.5
-
-    return tokens_enc, tokens_dec, loss_mask, labels, enc_mask, dec_mask, enc_dec_mask
-
-
-def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):
-    """Loss function.
-
-    Args:
-        loss_mask (torch.Tensor): Used to mask out some portions of the loss
-        output_tensor (torch.Tensor): The tensor with the losses
-
-    Returns:
-        the loss scalar for this micro-batch
-        the number of non-padded tokens in this microbatch
-        a dict containing reporting metrics on the loss and number of tokens across
-            the data parallel ranks
-    """
-    lm_loss_ = output_tensor.float()
-    total_tokens = loss_mask.sum()
-
-    lm_loss = torch.sum(lm_loss_.view(-1) * loss_mask.reshape(-1))
-    lm_loss = torch.cat([lm_loss.view(1), total_tokens.view(1)])
-
-    reporting_loss = lm_loss.clone().detach()
-    torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group())
-
-    num_tokens = lm_loss[1].clone().detach().to(torch.int)
-    return lm_loss[0], num_tokens, {'lm loss': (reporting_loss[0], reporting_loss[1])}
-
-
-def forward_step(data_iterator, model: T5Model):
-    """Forward training step.
-
-    Args:
-        data_iterator : Input data iterator
-        model (T5Model): The T5 Model
-    """
-
-    args = get_args()
-    timers = get_timers()
-
-    # Get the batch.
-    timers('batch generator', log_level=2).start()
-    tokens_enc, tokens_dec, loss_mask, lm_labels, enc_mask, dec_mask, enc_dec_mask = get_batch(
-        data_iterator
-    )
-    timers('batch generator').stop()
-
-    # Forward model lm_labels
-    output_tensor = model(
-        tokens_enc, tokens_dec, enc_mask, dec_mask, enc_dec_mask, lm_labels=lm_labels
-    )
-
-    return output_tensor, partial(loss_func, loss_mask)
-
-
-def train_valid_test_datasets_provider(train_val_test_num_samples: int):
-    """Build the train test and validation datasets.
-
-    Args:
-        train_val_test_num_samples : A list containing the number of samples in train test and validation.
-    """
-    args = get_args()
-
-    tokenizer = get_tokenizer()
-
-    config = T5MaskedWordPieceDatasetConfig(
-        random_seed=args.seed,
-        sequence_length=args.encoder_seq_length,
-        sequence_length_decoder=args.decoder_seq_length,
-        blend=get_blend_from_list(args.data_path),
-        blend_per_split=[
-            get_blend_from_list(args.train_data_path),
-            get_blend_from_list(args.valid_data_path),
-            get_blend_from_list(args.test_data_path)
-        ],
-        split=args.split,
-        path_to_cache=args.data_cache_path,
-        tokenizer=tokenizer,
-        masking_probability=args.mask_prob,
-        short_sequence_probability=args.short_seq_prob,
-        masking_max_ngram=10,
-        masking_do_full_word=True,
-        masking_do_permutation=False,
-        masking_use_longer_ngrams=False,
-        masking_use_geometric_distribution=True,
-    )
-
-    print_rank_0('> building train, validation, and test datasets for T5 ...')
-
-    train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
-        T5MaskedWordPieceDataset,
-        train_val_test_num_samples,
-        lambda: mpu.get_tensor_model_parallel_rank() == 0,
-        config,
-    ).build()
-
-    print_rank_0("> finished creating T5 datasets ...")
-
-    return train_ds, valid_ds, test_ds
-
-
-if __name__ == "__main__":
-
-    # Temporary for transition to core datasets
-    train_valid_test_datasets_provider.is_distributed = True
-
-    pretrain(
-        train_valid_test_datasets_provider,
-        model_provider,
-        ModelType.encoder_and_decoder,
-        forward_step,
-        args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'},
-    )
diff --git a/toolbox/Megatron-DeepSpeed/pretrain_vision_classify.py b/toolbox/Megatron-DeepSpeed/pretrain_vision_classify.py
deleted file mode 100644
index 8d9b28baeb92ca861e47593af8a4107fdc6e87e4..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/pretrain_vision_classify.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
-
-"""Pretrain VIT"""
-
-import torch
-import torch.nn.functional as F
-from functools import partial
-from megatron.training import get_args, get_timers, print_rank_0
-from megatron.core.enums import ModelType
-from megatron.legacy.data.vit_dataset import build_train_valid_datasets
-from megatron.legacy.model.vision.classification import VitClassificationModel
-from megatron.legacy.model.vision.classification import MitClassificationModel
-from megatron.training import pretrain
-from megatron.training.utils import average_losses_across_data_parallel_group
-from megatron.training.arguments import core_transformer_config_from_args
-
-
-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
-
-    args = get_args()
-    config = core_transformer_config_from_args(args)
-    if args.vision_backbone_type == 'vit':
-        print_rank_0("building VIT model ...")
-        model = VitClassificationModel(config=config,
-                                       num_classes=args.num_classes,
-                                       pre_process=pre_process,
-                                       post_process=post_process)
-    elif args.vision_backbone_type == 'mit':
-        print_rank_0("building MIT model ...")
-        model = MitClassificationModel(num_classes=args.num_classes,
-                                       pre_process=pre_process,
-                                       post_process=post_process)
-    else:
-        raise Exception('{} vision backbone is not supported.'.format(
-                              args.vision_backbone_type))
-    return model
-
-
-def get_batch(data_iterator):
-    """Build the batch."""
-    data = next(data_iterator)
-
-    # only data parallelism; no need for broadcast
-    images = data[0].cuda()
-    labels = data[1].cuda()
-
-    return images, labels
-
-
-def loss_func(labels, output_tensor):
-    logits = output_tensor.contiguous().float()
-    loss = F.cross_entropy(logits, labels)
-
-    outputs = torch.argmax(logits, -1)
-    correct = (outputs == labels).float()
-    accuracy = torch.mean(correct)
-
-    averaged_loss = average_losses_across_data_parallel_group([loss, accuracy])
-
-    return loss, {"loss": averaged_loss[0], "accuracy": averaged_loss[1]}
-
-
-def forward_step(data_iterator, model):
-    """Forward step."""
-    timers = get_timers()
-
-    # Get the batch.
-    timers("batch-generator", log_level=2).start()
-    (
-        images,
-        labels,
-    ) = get_batch(data_iterator)
-    timers("batch-generator").stop()
-
-    # Forward model. lm_labels
-    output_tensor = model(images)
-
-    return output_tensor, partial(loss_func, labels)
-
-def train_valid_test_datasets_provider(train_val_test_num_samples):
-    """Build train, valid, and test datasets."""
-    args = get_args()
-
-    print_rank_0(
-        "> building train, validation, and test datasets " "for VIT ..."
-    )
-    train_ds, valid_ds = build_train_valid_datasets(
-        data_path=args.data_path,
-        image_size=(args.img_h, args.img_w)
-    )
-    print_rank_0("> finished creating VIT datasets ...")
-
-    return train_ds, valid_ds, None
-
-
-if __name__ == "__main__":
-
-    pretrain(
-        train_valid_test_datasets_provider,
-        model_provider,
-        ModelType.encoder_or_decoder,
-        forward_step,
-        args_defaults={'dataloader_type': 'cyclic', 'vision_pretraining': True}
-    )
diff --git a/toolbox/Megatron-DeepSpeed/pretrain_vision_dino.py b/toolbox/Megatron-DeepSpeed/pretrain_vision_dino.py
deleted file mode 100644
index f75280c42d70449dedf8b12ae012a25769ac8c03..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/pretrain_vision_dino.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
-
-import torch
-import torch.nn.functional as F
-import torch.nn as nn
-import numpy as np
-import torch.distributed as dist
-from functools import partial
-from megatron.training import get_args, get_timers, print_rank_0
-from megatron.core.enums import ModelType
-from megatron.legacy.data.vit_dataset import build_train_valid_datasets
-from megatron.legacy.model.vision.dino import DINOPretrainModel
-from megatron.legacy.model.vision.knn_monitor import knn_predict, get_feature_bank
-from megatron.training import pretrain
-from megatron.training.utils import average_losses_across_data_parallel_group, unwrap_model
-from megatron.training.arguments import core_transformer_config_from_args
-
-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
-    config = core_transformer_config_from_args(get_args())
-    return DINOPretrainModel(config, pre_process=pre_process, post_process=post_process)
-
-def get_batch(data_iterator):
-    """Build the batch."""
-    data = next(data_iterator)
-
-    # only data parallelism; no need for broadcast
-    if isinstance(data[0], list):
-        images = [aug.cuda() for aug in data[0]]
-    else:
-        images = data[0].cuda()
-    labels = data[1].cuda()
-
-    return images, labels
-
-
-def loss_func(model, labels, output_tensor, collect_data=False):
-    args = get_args()
-
-    model = unwrap_model(model)
-    if model.training:
-        student_output, teacher_output = output_tensor
-        loss = model.dino_loss(student_output, teacher_output, args.curr_iteration)
-        averaged_loss = average_losses_across_data_parallel_group([loss])
-        return loss, {"loss": averaged_loss[0]}
-    else:
-        _, teacher_feature = output_tensor
-        feature_bank, feature_labels, classes = get_feature_bank()
-        feature = F.normalize(teacher_feature.float(), dim=1)
-
-        knn_accs = []
-        for k in [10, 20, 100, 200]:
-            pred_labels = knn_predict(feature, feature_bank,
-                                      feature_labels, classes, k, 0.07)
-            knn_acc = (pred_labels[:, 0] == labels).float().mean()
-            knn_accs.append(knn_acc)
-
-        averaged_loss = average_losses_across_data_parallel_group(knn_accs)
-        return 0, {"knn_acc_10": averaged_loss[0],
-                   "knn_acc_20": averaged_loss[1],
-                   "knn_acc_100": averaged_loss[2],
-                   "knn_acc_200": averaged_loss[3]}
-
-
-def forward_step(data_iterator, model):
-    """Forward step."""
-    timers = get_timers()
-
-    # Get the batch.
-    timers("batch-generator", log_level=2).start()
-    (
-        images,
-        labels,
-    ) = get_batch(data_iterator)
-    timers("batch-generator").stop()
-
-    return model(images), partial(loss_func, model, labels)
-
-
-def train_valid_test_datasets_provider(train_val_test_num_samples):
-    """Build train, valid, and test datasets."""
-    args = get_args()
-
-    print_rank_0(
-        "> building train, validation, and test datasets " "for VIT ..."
-    )
-    train_ds, valid_ds = build_train_valid_datasets(
-        data_path=args.data_path,
-        image_size=(args.img_h, args.img_w)
-    )
-    print_rank_0("> finished creating VIT datasets ...")
-
-    return train_ds, valid_ds, None
-
-
-if __name__ == "__main__":
-
-    pretrain(
-        train_valid_test_datasets_provider,
-        model_provider,
-        ModelType.encoder_or_decoder,
-        forward_step,
-        args_defaults={'dataloader_type': 'cyclic', 'vision_pretraining': True}
-    )
-
diff --git a/toolbox/Megatron-DeepSpeed/pretrain_vision_inpaint.py b/toolbox/Megatron-DeepSpeed/pretrain_vision_inpaint.py
deleted file mode 100644
index 8570baab5b40bbb85bfa312b80238c75b2b1259f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/pretrain_vision_inpaint.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
-
-"""Pretrain VIT"""
-
-import torch
-import torch.nn.functional as F
-from functools import partial
-from megatron.training import get_args, get_timers, print_rank_0, print_rank_last
-from megatron.core.enums import ModelType
-from megatron.legacy.data.vit_dataset import build_train_valid_datasets
-from megatron.legacy.model.vision.inpainting import VitInpaintingModel
-from megatron.legacy.model.vision.inpainting import MitInpaintingModel
-from megatron.training import pretrain
-from megatron.training.utils import average_losses_across_data_parallel_group
-from tasks.vision.segmentation.metrics import SSIM, PSNR
-from megatron.training.arguments import core_transformer_config_from_args
-
-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
-    args = get_args()
-    config = core_transformer_config_from_args(args)
-    if args.vision_backbone_type == 'vit':
-        model = VitInpaintingModel(config=config,
-                                   pre_process=pre_process,
-                                   post_process=post_process)
-    elif args.vision_backbone_type == 'mit':
-        model = MitInpaintingModel(config=config,
-                                   pre_process=pre_process,
-                                   post_process=post_process)
-    else:
-        raise Exception('{} vision backbone is not supported.'.format(
-                              args.vision_backbone_type))
-    return model
-
-
-def get_batch(data_iterator):
-    """Build the batch."""
-    data = next(data_iterator)
-
-    # only data parallelism; no need for broadcast
-    images = data[0][0].cuda()
-    masks = data[0][1].cuda()
-    return images, masks
-
-
-def loss_func(images, masks, masked_images, outputs, non_loss_data=False):
-    outputs = outputs.contiguous().float()
-    masks_flip = 1-masks
-    flip_masked_outputs = outputs.masked_fill(masks_flip.bool(), 0)
-    flip_masked_images = images.masked_fill(masks_flip.bool(), 0)
-
-    ssim_fun = SSIM()
-    psnr_fun = PSNR()
-
-    if not non_loss_data:
-        mask_count = torch.count_nonzero(masks)
-        loss = F.mse_loss(
-            flip_masked_outputs,
-            flip_masked_images.float(),
-            reduction="sum"
-        )
-        loss = loss/mask_count
-        ssim = ssim_fun(flip_masked_outputs, flip_masked_images.float())
-        psnr = psnr_fun(flip_masked_outputs, flip_masked_images.float())
-
-        averaged_loss = average_losses_across_data_parallel_group(
-            [loss, psnr, ssim]
-        )
-
-        return loss, {"loss": averaged_loss[0],
-                      "psnr": averaged_loss[1],
-                      'ssim': averaged_loss[2]}
-    else:
-        synth_images = masked_images.float() + flip_masked_outputs
-        ssim = ssim_fun(synth_images, images.float())
-        psnr = psnr_fun(synth_images, images.float())
-        return torch.cat((images, masked_images, synth_images), dim=2), ssim, psnr
-
-
-def forward_step(data_iterator, model):
-    """Forward step."""
-    timers = get_timers()
-
-    # Get the batch.
-    timers("batch-generator", log_level=2).start()
-    (
-        images,
-        masks,
-    ) = get_batch(data_iterator)
-    timers("batch-generator").stop()
-
-    masked_images = images.masked_fill(masks.bool(), 0)
-    outputs = model(masked_images)
-
-    # Forward mode
-    return outputs, partial(loss_func, images, masks, masked_images)
-
-
-def process_non_loss_data(data, iteration, writer):
-    psnr_sum = 0
-    ssim_sum = 0
-    for (output_tb, ssim, psnr) in data:
-        output_tb[output_tb < 0] = 0
-        output_tb[output_tb > 1] = 1
-        writer.add_images("gt-input-output-vald", output_tb,
-                          global_step=iteration, walltime=None,
-                          dataformats='NCHW')
-        psnr_sum = psnr_sum + psnr.item()
-        ssim_sum = ssim_sum + ssim.item()
-    psnr = psnr_sum/len(data)
-    ssim = ssim_sum/len(data)
-    writer.add_scalar('PSNR generate value-validation', psnr, iteration)
-    writer.add_scalar('SSIM generate value-validation', ssim, iteration)
-
-
-def train_valid_test_datasets_provider(train_val_test_num_samples):
-    """Build train, valid, and test datasets."""
-    args = get_args()
-
-    print_rank_0(
-        "> building train, validation, and test datasets " "for VIT ..."
-    )
-    train_ds, valid_ds = build_train_valid_datasets(
-        data_path=args.data_path,
-        image_size=(args.img_h, args.img_w)
-    )
-    print_rank_0("> finished creating VIT datasets ...")
-
-    return train_ds, valid_ds, None
-
-
-if __name__ == "__main__":
-
-    pretrain(
-        train_valid_test_datasets_provider,
-        model_provider,
-        ModelType.encoder_or_decoder,
-        forward_step,
-        process_non_loss_data,
-        args_defaults={'dataloader_type': 'cyclic', 'vision_pretraining': True}
-    )
diff --git a/toolbox/Megatron-DeepSpeed/report_theoretical_memory.py b/toolbox/Megatron-DeepSpeed/report_theoretical_memory.py
deleted file mode 100644
index c95a0203c8a40b28dca4e14b00164a00ba5f78e9..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/report_theoretical_memory.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-"""Computes theoretical memory footprint for model training without instantiating
-a model and running training iterations on GPU(s)."""
-
-from megatron.training import get_args
-from megatron.training.initialize import initialize_megatron
-from megatronspeed.theoretical_memory_usage import report_theoretical_memory
-
-if __name__ == "__main__":
-    initialize_megatron(allow_no_cuda=True, skip_mpu_initialization=True)
-    args = get_args()
-
-    report_theoretical_memory(args, verbose=True)
diff --git a/toolbox/Megatron-DeepSpeed/requirments_rlhf.txt b/toolbox/Megatron-DeepSpeed/requirments_rlhf.txt
deleted file mode 100644
index 32240710601858f56d54411d289cb25ecf68a250..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/requirments_rlhf.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-pybind11
-transformers
-accelerate
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/setup.py b/toolbox/Megatron-DeepSpeed/setup.py
deleted file mode 100644
index 1e0e6aecf36ff6daa04c4c0ee62861159033f7fb..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/setup.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from setuptools import setup, find_packages
-
-"""Setup for pip package."""
-
-import importlib.util
-import os
-import setuptools
-
-spec = importlib.util.spec_from_file_location('package_info', 'megatronspeed/core/package_info.py')
-package_info = importlib.util.module_from_spec(spec)
-spec.loader.exec_module(package_info)
-
-
-__contact_emails__ = package_info.__contact_emails__
-__contact_names__ = package_info.__contact_names__
-__description__ = package_info.__description__
-__download_url__ = package_info.__download_url__
-__homepage__ = package_info.__homepage__
-__keywords__ = package_info.__keywords__
-__license__ = package_info.__license__
-__package_name__ = package_info.__package_name__
-__repository_url__ = package_info.__repository_url__
-__version__ = package_info.__version__
-
-
-if os.path.exists('megatronspeed/core/README.md'):
-    with open("megatronspeed/core/README.md", "r", encoding='utf-8') as fh:
-        long_description = fh.read()
-    long_description_content_type = "text/markdown"
-
-else:
-    long_description = 'See ' + __homepage__
-    long_description_content_type = "text/plain"
-
-
-###############################################################################
-#                             Dependency Loading                              #
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #
-
-def req_file(filename, folder="megatronspeed/core"):
-    with open(os.path.join(folder, filename), encoding='utf-8') as f:
-        content = f.readlines()
-    # you may also want to remove whitespace characters
-    # Example: `\n` at the end of each line
-    return [x.strip() for x in content]
-
-install_requires = req_file("requirements.txt")
-
-###############################################################################
-
-if "MEGATRONSPEED_VERSION_IDENTIFIER" in os.environ:
-        __version__ += "+" + str(os.environ['MEGATRONSPEED_VERSION_IDENTIFIER'])
-
-setuptools.setup(
-    name=__package_name__,
-    # Versions should comply with PEP440.  For a discussion on single-sourcing
-    # the version across setup.py and the project code, see
-    # https://packaging.python.org/en/latest/single_source_version.html
-    version=__version__,
-    description=__description__,
-    long_description=long_description,
-    long_description_content_type=long_description_content_type,
-    # The project's main homepage.
-    url=__repository_url__,
-    download_url=__download_url__,
-    # Author details
-    author=__contact_names__,
-    author_email=__contact_emails__,
-    # maintainer Details
-    maintainer=__contact_names__,
-    maintainer_email=__contact_emails__,
-    # The licence under which the project is released
-    license=__license__,
-    classifiers=[
-        # How mature is this project? Common values are
-        #  1 - Planning
-        #  2 - Pre-Alpha
-        #  3 - Alpha
-        #  4 - Beta
-        #  5 - Production/Stable
-        #  6 - Mature
-        #  7 - Inactive
-        'Development Status :: 5 - Production/Stable',
-        # Indicate who your project is intended for
-        'Intended Audience :: Developers',
-        'Intended Audience :: Science/Research',
-        'Intended Audience :: Information Technology',
-        # Indicate what your project relates to
-        'Topic :: Scientific/Engineering',
-        'Topic :: Scientific/Engineering :: Mathematics',
-        'Topic :: Scientific/Engineering :: Image Recognition',
-        'Topic :: Scientific/Engineering :: Artificial Intelligence',
-        'Topic :: Software Development :: Libraries',
-        'Topic :: Software Development :: Libraries :: Python Modules',
-        'Topic :: Utilities',
-        # Pick your license as you wish (should match "license" above)
-        'License :: OSI Approved :: BSD License',
-        # Supported python versions
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.8',
-        'Programming Language :: Python :: 3.9',
-        # Additional Setting
-        'Environment :: Console',
-        'Natural Language :: English',
-        'Operating System :: OS Independent',
-    ],
-    packages=setuptools.find_packages(include=["megatronspeed", "megatronspeed.*"]),
-    install_requires=install_requires,
-
-    # Add in any packaged data.
-    include_package_data=True,
-    # PyPI package information.
-    keywords=__keywords__,
-)
diff --git a/toolbox/Megatron-DeepSpeed/tasks/data_utils.py b/toolbox/Megatron-DeepSpeed/tasks/data_utils.py
deleted file mode 100644
index 914acf10c3ff738151a7cc2a5a1c8e4d7707533d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/data_utils.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-""" Tasks data utility."""
-
-import re
-import numpy as np
-
-
-def clean_text(text):
-    """Remove new lines and multiple spaces and adjust end of sentence dot."""
-
-    text = text.replace("\n", " ")
-    text = re.sub(r'\s+', ' ', text)
-    for _ in range(3):
-        text = text.replace(' . ', '. ')
-
-    return text
-
-
-def build_sample(ids, types, paddings, label, unique_id):
-    """Convert to numpy and return a sample consumed by the batch producer."""
-
-    ids_np = np.array(ids, dtype=np.int64)
-    types_np = np.array(types, dtype=np.int64)
-    paddings_np = np.array(paddings, dtype=np.int64)
-    sample = ({'text': ids_np,
-               'types': types_np,
-               'padding_mask': paddings_np,
-               'label': int(label),
-               'uid': int(unique_id)})
-
-    return sample
-
-
-def build_tokens_types_paddings_from_text(text_a, text_b,
-                                          tokenizer, max_seq_length):
-    """Build token types and paddings, trim if needed, and pad if needed."""
-
-    text_a_ids = tokenizer.tokenize(text_a)
-    text_b_ids = None
-    if text_b is not None:
-        text_b_ids = tokenizer.tokenize(text_b)
-
-    return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids,
-                                                max_seq_length, tokenizer.cls,
-                                                tokenizer.sep, tokenizer.pad)
-
-
-def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length,
-                                         cls_id, sep_id, pad_id):
-    """Build token types and paddings, trim if needed, and pad if needed."""
-
-    ids = []
-    types = []
-    paddings = []
-
-    # [CLS].
-    ids.append(cls_id)
-    types.append(0)
-    paddings.append(1)
-
-    # A.
-    len_text_a = len(text_a_ids)
-    ids.extend(text_a_ids)
-    types.extend([0] * len_text_a)
-    paddings.extend([1] * len_text_a)
-
-    # [SEP].
-    ids.append(sep_id)
-    types.append(0)
-    paddings.append(1)
-
-    # B.
-    if text_b_ids is not None:
-        len_text_b = len(text_b_ids)
-        ids.extend(text_b_ids)
-        types.extend([1] * len_text_b)
-        paddings.extend([1] * len_text_b)
-
-    # Cap the size.
-    trimmed = False
-    if len(ids) >= max_seq_length:
-        max_seq_length_m1 = max_seq_length - 1
-        ids = ids[0:max_seq_length_m1]
-        types = types[0:max_seq_length_m1]
-        paddings = paddings[0:max_seq_length_m1]
-        trimmed = True
-
-    # [SEP].
-    if (text_b_ids is not None) or trimmed:
-        ids.append(sep_id)
-        if text_b_ids is None:
-            types.append(0)
-        else:
-            types.append(1)
-        paddings.append(1)
-
-    # Padding.
-    padding_length = max_seq_length - len(ids)
-    if padding_length > 0:
-        ids.extend([pad_id] * padding_length)
-        types.extend([pad_id] * padding_length)
-        paddings.extend([0] * padding_length)
-
-    return ids, types, paddings
diff --git a/toolbox/Megatron-DeepSpeed/tasks/ensemble_classifier.py b/toolbox/Megatron-DeepSpeed/tasks/ensemble_classifier.py
deleted file mode 100644
index c2333b70154b5761b47bcb7cdf50e11c3d500dda..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/ensemble_classifier.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import os
-import argparse
-import collections
-
-import numpy as np
-import torch
-
-
-def process_files(args):
-    all_predictions = collections.OrderedDict()
-    all_labels = collections.OrderedDict()
-    all_uid = collections.OrderedDict()
-    for path in args.paths:
-        path = os.path.join(path, args.prediction_name)
-        try:
-            data = torch.load(path)
-            for dataset in data:
-                name, d = dataset
-                predictions, labels, uid = d
-                if name not in all_predictions:
-                    all_predictions[name] = np.array(predictions)
-                    if args.labels is None:
-                        args.labels = [i for i in range(all_predictions[name].shape[1])]
-                    if args.eval:
-                        all_labels[name] = np.array(labels)
-                    all_uid[name] = np.array(uid)
-                else:
-                    all_predictions[name] += np.array(predictions)
-                    assert np.allclose(all_uid[name], np.array(uid))
-        except Exception as e:
-            print(e)
-            continue
-    return all_predictions, all_labels, all_uid
-
-
-def get_threshold(all_predictions, all_labels, one_threshold=False):
-    if one_threshold:
-        all_predictons = {'combined': np.concatenate(list(all_predictions.values()))}
-        all_labels = {'combined': np.concatenate(list(all_predictions.labels()))}
-    out_thresh = []
-    for dataset in all_predictions:
-        preds = all_predictions[dataset]
-        labels = all_labels[dataset]
-        out_thresh.append(calc_threshold(preds, labels))
-    return out_thresh
-
-
-def calc_threshold(p, l):
-    trials = [(i) * (1. / 100.) for i in range(100)]
-    best_acc = float('-inf')
-    best_thresh = 0
-    for t in trials:
-        acc = ((apply_threshold(p, t).argmax(-1) == l).astype(float)).mean()
-        if acc > best_acc:
-            best_acc = acc
-            best_thresh = t
-    return best_thresh
-
-
-def apply_threshold(preds, t):
-    assert (np.allclose(preds.sum(-1), np.ones(preds.shape[0])))
-    prob = preds[:, -1]
-    thresholded = (prob >= t).astype(int)
-    preds = np.zeros_like(preds)
-    preds[np.arange(len(thresholded)), thresholded.reshape(-1)] = 1
-    return preds
-
-
-def threshold_predictions(all_predictions, threshold):
-    if len(threshold) != len(all_predictions):
-        threshold = [threshold[-1]] * (len(all_predictions) - len(threshold))
-    for i, dataset in enumerate(all_predictions):
-        thresh = threshold[i]
-        preds = all_predictions[dataset]
-        all_predictions[dataset] = apply_threshold(preds, thresh)
-    return all_predictions
-
-
-def postprocess_predictions(all_predictions, all_labels, args):
-    for d in all_predictions:
-        all_predictions[d] = all_predictions[d] / len(args.paths)
-
-    if args.calc_threshold:
-        args.threshold = get_threshold(all_predictions, all_labels, args.one_threshold)
-        print('threshold', args.threshold)
-
-    if args.threshold is not None:
-        all_predictions = threshold_predictions(all_predictions, args.threshold)
-
-    return all_predictions, all_labels
-
-
-def write_predictions(all_predictions, all_labels, all_uid, args):
-    all_correct = 0
-    count = 0
-    for dataset in all_predictions:
-        preds = all_predictions[dataset]
-        preds = np.argmax(preds, -1)
-        if args.eval:
-            correct = (preds == all_labels[dataset]).sum()
-            num = len(all_labels[dataset])
-            accuracy = correct / num
-            count += num
-            all_correct += correct
-            accuracy = (preds == all_labels[dataset]).mean()
-            print(accuracy)
-        if not os.path.exists(os.path.join(args.outdir, dataset)):
-            os.makedirs(os.path.join(args.outdir, dataset))
-        outpath = os.path.join(
-            args.outdir, dataset, os.path.splitext(
-                args.prediction_name)[0] + '.tsv')
-        with open(outpath, 'w') as f:
-            f.write('id\tlabel\n')
-            f.write('\n'.join(str(uid) + '\t' + str(args.labels[p])
-                              for uid, p in zip(all_uid[dataset], preds.tolist())))
-    if args.eval:
-        print(all_correct / count)
-
-
-def ensemble_predictions(args):
-    all_predictions, all_labels, all_uid = process_files(args)
-    all_predictions, all_labels = postprocess_predictions(all_predictions, all_labels, args)
-    write_predictions(all_predictions, all_labels, all_uid, args)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--paths', required=True, nargs='+',
-                        help='paths to checkpoint directories used in ensemble')
-    parser.add_argument('--eval', action='store_true',
-                        help='compute accuracy metrics against labels (dev set)')
-    parser.add_argument('--outdir',
-                        help='directory to place ensembled predictions in')
-    parser.add_argument('--prediction-name', default='test_predictions.pt',
-                        help='name of predictions in checkpoint directories')
-    parser.add_argument('--calc-threshold', action='store_true',
-                        help='calculate threshold classification')
-    parser.add_argument('--one-threshold', action='store_true',
-                        help='use on threshold for all subdatasets')
-    parser.add_argument('--threshold', nargs='+', default=None, type=float,
-                        help='user supplied threshold for classification')
-    parser.add_argument('--labels', nargs='+', default=None,
-                        help='whitespace separated list of label names')
-    args = parser.parse_args()
-    ensemble_predictions(args)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/toolbox/Megatron-DeepSpeed/tasks/eval_harness/download.py b/toolbox/Megatron-DeepSpeed/tasks/eval_harness/download.py
deleted file mode 100644
index 27519020b1f3f4e9c2f591a2197de6f11fcf499b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/eval_harness/download.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# This code is originally from https://github.com/bigscience-workshop/Megatron-DeepSpeed
-# under the license https://huggingface.co/spaces/bigscience/license
-
-# Downloads the specified taks in the evaluation harness
-# This is particularly useful when running in environments where the GPU nodes 
-# do not have internet access. This way we can pre-download them and use the cached data-set during evaluation.
-
-from lm_eval import tasks
-from lm_eval.tasks import ALL_TASKS
-import argparse
-import os
-
-
-parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False)
-parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks to download.')
-args = parser.parse_args()
-
-def main():
-    task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',')
-    tasks.get_task_dict(task_list)
-
-if __name__ == '__main__':
-    main()
-
-
-    
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tasks/eval_harness/evaluate.py b/toolbox/Megatron-DeepSpeed/tasks/eval_harness/evaluate.py
deleted file mode 100644
index ddc1447222208bcc7f1775d976570dd3fd6123f7..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/eval_harness/evaluate.py
+++ /dev/null
@@ -1,453 +0,0 @@
-# This code is originally from https://github.com/bigscience-workshop/Megatron-DeepSpeed
-# under the license https://huggingface.co/spaces/bigscience/license
-
-from functools import reduce
-from logging import logMultiprocessing
-import os
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir,os.path.pardir)))
-
-from lm_eval.models.gpt2 import GPT2LM
-from lm_eval import evaluator, tasks, utils
-from lm_eval.base import CacheHook
-from tqdm import tqdm
-import torch.nn.functional as F
-
-from lm_eval.tasks import ALL_TASKS
-from pretrain_gpt import model_provider
-import numpy as np
-import time
-
-import torch
-from megatron_ds import get_args
-from megatron_ds import print_rank_0
-from megatron_ds import get_tokenizer
-from megatron_ds.core.enums import ModelType
-from megatron_ds.core import mpu
-from megatron_ds.training import setup_model_and_optimizer, get_model
-from megatron_ds.core.tensor_parallel.mappings import gather_from_tensor_model_parallel_region
-
-from megatron_ds.utils import get_ltor_masks_and_position_ids, unwrap_model
-from megatron_ds.p2p_communication import recv_forward, send_forward
-import pickle
-import json
-
-from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
-from megatron_ds.model.distributed import DistributedDataParallel as LocalDDP
-from megatron_ds.model.module import Float16Module
-from deepspeed.runtime.pipe import schedule
-from deepspeed.accelerator import get_accelerator
-
-class EvalHarnessAdaptor(GPT2LM):
-    def __init__(self, model, tokenizer):
-        args = get_args()
-        self.args = args
-        self.model = model
-        self.tokenizer = tokenizer
-        self.VOCAB_SIZE = tokenizer.vocab_size
-        self.EOT_TOKEN_ID = tokenizer.eod
-
-        self._max_length = args.seq_length
-
-        # For ds we split into mini batches and then micro batches to keep pipelining api happy.
-        # With Megatron we just go to micro_batches directly
-        self._batch_size = args.micro_batch_size
-
-        self.cache_hook = CacheHook(None)
-        self.is_main = args.rank == 0
-        self.is_local_main = args.local_rank == 0
-        self._device = get_accelerator().current_device_name()
-        self.is_model_parallel = mpu.get_tensor_model_parallel_world_size() > 1
-        self.is_pipe_parallel = mpu.get_pipeline_model_parallel_world_size() > 1
-        self.is_data_parallel = mpu.get_data_parallel_world_size() > 1
-        self.adaptive_seq_len = args.adaptive_seq_len
-        if self.is_data_parallel and args.moe_expert_parallel_size == 1: # For MoE model, allow a "fake data parallel" in order to partition model into multiple gpus
-            raise NotImplementedError("Data parallelism is currently not supported for evaluation")
-
-        self.is_last_stage = True if not self.is_pipe_parallel else mpu.is_pipeline_last_stage()  # only the last stage of the pipeline model will receive the logits
-
-    @property
-    def max_length(self):
-        return self._max_length
-
-    @property
-    def batch_size(self):
-        return self._batch_size
-
-    @property
-    def device(self):
-        return self._device
-
-
-    def loglikelihood(self, requests):
-        new_reqs = []
-        for context, continuation in requests:
-            if context == "":
-                # end of text as context
-                context_enc = [self.EOT_TOKEN_ID]
-            else:
-                context_enc = self.tokenizer_encode(context)
-
-            continuation_enc = self.tokenizer_encode(continuation)
-
-            new_reqs.append(((context, continuation), context_enc, continuation_enc))
-
-        return self._loglikelihood_tokens(new_reqs)
-
-    def loglikelihood_rolling(self, requests):
-        # TODO: Implement caching once we've confirmed the perplexity implementation
-        # TODO: automatic batch size detection for vectorization
-
-        loglikelihoods = []
-        with torch.no_grad():
-            for string, in tqdm(requests):
-                rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
-                    token_list=self.tokenizer_encode(string),
-                    prefix_token=self.EOT_TOKEN_ID,
-                    max_seq_len=self.max_length,
-                    context_len=1,
-                )))
-
-                rolling_token_windows = [(None,) + x for x in rolling_token_windows]
-
-                # TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that
-                string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
-
-                # discard is_greedy
-                string_nll = [x[0] for x in string_nll]
-
-                string_nll = sum(string_nll)
-                loglikelihoods.append(string_nll)
-
-        return loglikelihoods
-
-    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
-        disable_tqdm = disable_tqdm if self.is_main else True
-        res = []
-        res_len = 0  # storing the result length for later
-        self.model.eval()
-        with torch.no_grad():
-            def _collate(x):
-                toks = x[1] + x[2]
-                return (-len(toks), tuple(toks))
-
-            reord = utils.Reorderer(requests, _collate)
-            for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
-                inps, contlens, inplens, padding_length = [], [], [], None
-                for _, context_enc, continuation_enc in chunk:
-                    # when too long to fit in context, truncate from the left
-                    inp = torch.tensor(
-                        (context_enc + continuation_enc)[-(self.max_length + 1):][:-1]
-                        , dtype=torch.long).to(self.device)
-                    inplen, = inp.shape
-
-                    cont = continuation_enc
-
-                    # since in _collate we make sure length is descending, the longest is always the first one.
-                    padding_length = padding_length if padding_length is not None else inplen
-                    if not self.adaptive_seq_len:
-                        padding_length = self.max_length
-                    # pad to length
-                    inp = torch.cat([
-                        inp,  # [seq]
-                        torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device)  # [padding_length - seq]
-                    ], dim=0)
-
-                    inps.append(inp.unsqueeze(0))
-
-                    contlens.append(cont)
-                    inplens.append(inplen)
-
-                logits = self._model_call(torch.cat(inps, dim=0))
-                res_len += len(chunk)
-                if logits is not None:
-                    multi_logits = F.log_softmax(logits, dim=-1).cpu()  # [batch, seq, vocab]
-
-                    for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens):
-                        contlen = len(cont_toks)
-                        logits = logits[inplen - contlen:inplen].unsqueeze(0)  # [1, seq, vocab]
-                        greedy_tokens = logits.argmax(dim=-1)
-                        # cont_toks :: [1, seq]
-                        cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0)
-                        max_equal = (greedy_tokens == cont_toks).all()
-                        # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
-
-                        logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1)  # [1, seq]
-                        answer = (float(logits.sum()), bool(max_equal))
-                        # partial caching
-                        if cache_key is not None:
-                            self.cache_hook.add_partial("loglikelihood", cache_key, answer)
-                        res.append(answer)
-
-        if not mpu.is_pipeline_last_stage():
-            # @HACK: To make the eval harness happy on threads that don't have access to the results.
-            #        We just randomly generate some data.
-            res = [(np.random.rand(), np.random.rand()>0.5) for _ in requests]
-
-        return reord.get_original(res)
-
-    def create_model_inputs(self, tokens):
-        args = get_args()
-
-        attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
-            tokens,
-            self.EOT_TOKEN_ID,
-            args.reset_position_ids,
-            args.reset_attention_mask,
-            args.eod_mask_loss)
-
-        return (tokens, position_ids, attention_mask), (tokens, loss_mask)
-
-    def _model_call(self, inps):
-        args = get_args()
-
-        if args.deepspeed:
-            if args.no_pipeline_parallel:
-                # self.model.set_batch_fn(self.create_model_inputs)
-                # round up to multiple of micro_batch_size
-                new_size = ((len(inps) + args.micro_batch_size-1)  // args.micro_batch_size) * args.micro_batch_size
-                padded = F.pad(inps, (0, 0, 0, new_size-len(inps)), value = 0)
-                # dummy data iterator for pipelining.
-                data_iterator = list((torch.stack(inp) for inp in utils.chunks(padded, args.micro_batch_size)))
-                self.model.micro_batches = len(data_iterator)
-                # output = self.model.eval_batch(iter(data_iterator), compute_loss = False, reduce_output = None)
-                output = []
-                for tokens in data_iterator:
-                    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
-                                                                tokens,
-                                                                self.EOT_TOKEN_ID,
-                                                                args.reset_position_ids,
-                                                                args.reset_attention_mask,
-                                                                args.eod_mask_loss)
-                    a_output, *other_losses = self.model(tokens,
-                        position_ids,
-                        attention_mask,
-                        tokentype_ids=None)
-                    output.append(a_output)
-
-                if output is not None:
-                    output = torch.cat(output, 0)[:len(inps)]
-                else:
-                    output = None
-
-                # hack #2 for adaptive_seq_len to work as total_loss gets appended to and shapes aren't the same
-                if args.adaptive_seq_len:
-                    self.model.total_loss = None
-            else:
-                self.model.set_batch_fn(self.create_model_inputs)
-                # round up to multiple of micro_batch_size
-                new_size = ((len(inps) + args.micro_batch_size-1)  // args.micro_batch_size) * args.micro_batch_size
-                padded = F.pad(inps, (0, 0, 0, new_size-len(inps)), value = 0)
-                # dummy data iterator for pipelining.
-                data_iterator = list((torch.stack(inp) for inp in utils.chunks(padded, args.micro_batch_size)))
-                self.model.micro_batches = len(data_iterator)
-                output = self.model.eval_batch(iter(data_iterator), compute_loss = False, reduce_output = None)
-
-
-                if output is not None:
-                    output = torch.cat(output, 0)[:len(inps)]
-                else:
-                    output = None
-
-                # hack #2 for adaptive_seq_len to work as total_loss gets appended to and shapes aren't the same
-                if args.adaptive_seq_len:
-                    self.model.total_loss = None
-        else:
-            # Since the shape of the micro-batch will change
-            # We need set the correct shapes here
-            # So that latter pipeline stages knows which shapes to expect.
-            # Otherwise we will deadlock.
-
-            args.micro_batch_size = len(inps)
-            args.seq_length = len(inps[0])
-            args.max_position_embeddings = args.seq_length
-
-            input_tensor = recv_forward()
-
-            # Forward pass through the model.
-            unwrapped_model = unwrap_model(self.model, (torchDDP, LocalDDP, Float16Module))
-            unwrapped_model.set_input_tensor(input_tensor)
-            output = self.model(*self.create_model_inputs(inps)[0])
-            send_forward(output)
-
-        if mpu.is_pipeline_last_stage():
-            return gather_from_tensor_model_parallel_region(output)[..., :self.tokenizer.vocab_size]
-        else:
-            return None
-
-    def tokenizer_encode(self, text):
-        """Tokenize text *without* adding special tokens."""
-        # Splitting this into its own method in case we need to handle special cases for different tokenizers
-        from megatron_ds.tokenizer.gpt2_tokenization import GPT2Tokenizer
-        if isinstance(self.tokenizer.tokenizer, GPT2Tokenizer):
-            return self.tokenizer.tokenizer.encode(text)
-        else:
-            return self.tokenizer.tokenizer.encode(text, add_special_tokens=False)
-
-
-from megatron_ds.initialize import initialize_megatron
-import megatron_ds
-
-from tools.convert_checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint
-from tools.convert_checkpoint.deepspeed_to_megatron import _create_rank_checkpoint
-
-def override_args(args, override_args, skip_keys, skip_if_specified_keys):
-    for k, v in vars(override_args).items():
-        if k in skip_keys:
-            continue
-        if k in skip_if_specified_keys and getattr(args, k) is not None:
-            continue
-        setattr(args, k, v)
-
-
-# Note(Hesslow):
-# The model loading is a bit convoluted.
-# We want to parse out the model arguments from the checkpoint and use those to initialize megatron-ds.
-#
-# However megatron-ds expects its arguments on the command line.
-# And at that point we don't know them.
-#
-# Instead we use Jasons way: we load the arguments form the checkpoint and then override _parse_args to return whatever args we want.
-#
-# If the checkpoint is old, some new arguments may have been introduced and the code will expect these arguments to exist.
-# In order to support this we _first_ parse the arguments normally, and then override them with the arguments from the checkpoint.
-# Keeping the default-value of newer arguments.
-#
-# We then use the megatron deepspeed converter to load the deepspeed checkpoints as if they we're megatron checkpoints.
-def load_ds_checkpoint_and_setup_megatron(extra_args_provider):
-    # parse the megatorn args. But wait with initalizing megatron_ds.
-    # avoid printing the arguments, since they will later be overridden.
-<<<<<<< HEAD
-    _print_args = megatron.arguments._print_args
-    megatron.arguments._print_args = lambda *_args, **kwarg: None
-    args = parse_args(extra_args_provider=extra_args_provider)
-=======
-    _print_args = megatron_ds.arguments._print_args
-    megatron_ds.arguments._print_args = lambda *_args, **kwarg: None
-    args = _parse_args(extra_args_provider)
->>>>>>> 1339997... update megatron to megatron_ds
-
-    ds_checkpoint = DeepSpeedCheckpoint(args.load,
-                                        tp_degree=args.tensor_model_parallel_size,
-                                        pp_degree=args.pipeline_model_parallel_size,
-                                        no_pp=args.no_pipeline_parallel)
-
-
-    cp_args = ds_checkpoint.get_args()
-    # Merge the current args with the checkpoint args.
-    skip_keys = ['world_size', 'rank', 'local_rank','device_count', 'micro_batch_size','global_batch_size', 'batch_size', 'tensorboard_dir', 'deepspeed', 'deepspeed_config',
-                     'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'moe_expert_parallel_size', 'moe_token_dropping', 'load', 'rampup_batch_size', 'iteration', 'inference', 'random_ltd']
-
-    skip_if_specified = ['merge_file', 'vocab_file']
-
-    if args.eval_fp32:
-        cp_args.fp16 = False
-        cp_args.bf16 = False
-        cp_args.params_dtype = torch.float32
-
-    cp_args.tokenizer_type = 'GPT2BPETokenizer'
-
-    override_args(args, cp_args, skip_keys, skip_if_specified)
-
-    # stop megatron from reparsing the arguments.
-<<<<<<< HEAD
-    megatron.arguments.parse_args = lambda *_args, **kwarg: args
-    megatron.global_vars._ensure_var_is_not_initialized = lambda *_args, **kwarg: None
-    megatron.global_vars._GLOBAL_ARGS = args
-=======
-    megatron_ds.global_vars._parse_args = lambda *_args, **kwarg: args
-    megatron_ds.global_vars._GLOBAL_ARGS = args
->>>>>>> 1339997... update megatron to megatron_ds
-
-    initialize_megatron(extra_args_provider=extra_args_provider)
-    megatron.global_vars._GLOBAL_ARGS = args
-    torch.distributed.barrier()
-
-    # Initializing megatron will update eg. tokenizer size. Override again.
-    override_args(args, cp_args, skip_keys, skip_if_specified)
-
-    # print final arguments.
-    _print_args("eval_harness arguments", args)
-    if args.deepspeed:
-
-        # Hack #3:
-        # Loading pipelined models in deepspeed with different TP than it was originally trained on fails
-        # due to a sanity check, that makes sure that all state_dicts that we merge contains attention layers.
-        # This, however, is not true for pipelining when we will merge the state_dict for the embeddings which
-        # which does not contain these attention-specific keys.
-        #
-        # Deepspeed does however manage to load the model if we just turn off this sanity check.
-        import deepspeed
-        deepspeed.runtime.state_dict_factory.MegatronSDLoader.sanity_check = lambda self, ckpt_file_name: None
-
-
-        cp_path = args.load
-        args.load = None
-        model, _, _ = setup_model_and_optimizer(model_provider, ModelType.encoder_or_decoder)
-        model = model[0]
-        zero_enabled = model._config.zero_enabled
-        model._config.zero_enabled = False
-        _, _ = model.load_checkpoint(cp_path, tag = '.', load_optimizer_states=False, load_lr_scheduler_states=False, load_module_only=True)
-        model._config.zero_enabled = zero_enabled
-    else:
-        model = get_model(model_provider)[0]
-        # Initialize megatron model using the parsed state dict.
-        sd = _create_rank_checkpoint(ds_checkpoint, None, mpu.get_tensor_model_parallel_rank(), mpu.get_pipeline_model_parallel_rank(), True)
-
-        model.load_state_dict(sd['model'], strict=True)
-
-    if args.eval_fp32:
-        model = model.float()
-
-    torch.distributed.barrier()
-    return model
-
-def tasks_args(parser):
-    """Provide extra arguments required for tasks."""
-    group = parser.add_argument_group(title='Evaluation options')
-    group.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks.')
-    group.add_argument('--results_path', type=str, default = "./results.json", help='Path to where the results will be stored.')
-    group.add_argument('--adaptive_seq_len',  default = False, action='store_true',
-                       help='Should the sequence length be adapted to the batch during evaluation, if in fp16 the results will be slightly different due to numerical errors but greatly speed up evaluation.')
-    group.add_argument('--num_fewshot', type=int, default = 0, help='Number of few-shot prompts.')
-    group.add_argument('--eval_fp32',  default = False, action='store_true', help='Should the evaluation run in fp32')
-    return parser
-
-<<<<<<< HEAD
-from megatron.arguments import parse_args
-=======
-from megatron_ds.global_vars import _parse_args
->>>>>>> 1339997... update megatron to megatron_ds
-
-def main():
-    start = time.time()
-    model = load_ds_checkpoint_and_setup_megatron(extra_args_provider=tasks_args)
-
-    args = get_args()
-    if args.deepspeed and args.adaptive_seq_len:
-        # adaptive_seq_len hack #1:
-        # CL automatically enables reset_activation_shape() which allows us to change input shapes
-        # and it also reshapes the attenion scores in attention_mask_func
-        args.curriculum_learning_legacy = 1
-
-    task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',')
-    task_dict = tasks.get_task_dict(task_list)
-
-    model.module.activation_checkpoint_interval = 0
-    model._compute_loss = False
-    model.fwd_outputs = []
-
-    tokenizer = get_tokenizer()
-    adaptor = EvalHarnessAdaptor(model, tokenizer)
-    results = evaluator.evaluate(adaptor, task_dict, False, args.num_fewshot, None)
-
-    if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0:
-        print(json.dumps(results, indent=2))
-        with open(args.results_path, 'w') as outfile:
-            json.dump(results, outfile, indent = 4)
-    end = time.time()
-    print("evaluation of {} ends in {:.2f} sec, or {:.2f} min, or {:.2f} hr".format(args.task_list, end-start, (end-start)/60.0, (end-start)/3600.0))
-
-if __name__ == '__main__':
-    main()
diff --git a/toolbox/Megatron-DeepSpeed/tasks/eval_harness/report-to-csv.py b/toolbox/Megatron-DeepSpeed/tasks/eval_harness/report-to-csv.py
deleted file mode 100644
index e624d8bed7aec75f3de614f9bf86672fc6a8a690..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/eval_harness/report-to-csv.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/env python
-
-# This code is originally from https://github.com/bigscience-workshop/Megatron-DeepSpeed
-# under the license https://huggingface.co/spaces/bigscience/license
-
-# this script converts results.json:
-#
-#   "results": {
-#     "arc_challenge": {
-#       "acc": 0.24232081911262798,
-#       "acc_stderr": 0.01252159329580012,
-#       "acc_norm": 0.2764505119453925,
-#       "acc_norm_stderr": 0.013069662474252425
-#     },
-#
-# into a format expected by a spreadsheet, which is:
-#
-#   task          metric   value    err
-#   arc_challenge acc      xxx      yyy
-#   arc_challenge acc_norm xxx      yyy
-#   arc_challenge f1       xxx      yyy
-#
-# usage:
-# report-to-csv.py results.json
-
-
-import sys
-import json
-import io
-import csv
-
-results_file = sys.argv[1]
-
-csv_file = results_file.replace("json", "csv")
-
-print(f"Converting {results_file} to {csv_file}")
-
-with io.open(results_file, 'r', encoding='utf-8') as f:
-    results = json.load(f)
-
-with io.open(csv_file, 'w', encoding='utf-8') as f:
-
-    writer = csv.writer(f)
-    writer.writerow(["task", "metric", "value", "err", "version"])
-
-    versions = results["versions"]
-
-    for k,v in sorted(results["results"].items()):
-        if k not in versions:
-            versions[k] = -1
-
-        if "acc" in v:
-            writer.writerow([k, "acc", v["acc"], v["acc_stderr"], versions[k]])
-        if "acc_norm" in v:
-            writer.writerow([k, "acc_norm", v["acc_norm"], v["acc_norm_stderr"], versions[k]])
-        if "f1" in v:
-            writer.writerow([k, "f1", v["f1"], v["f1_stderr"] if "f1_stderr" in v else "", versions[k]])
-        # if "ppl" in v:
-        #     writer.writerow([k, "ppl", v["ppl"], v["ppl_stderr"], versions[k]])
-        # if "em" in v:
-        #     writer.writerow([k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else "", versions[k]])
diff --git a/toolbox/Megatron-DeepSpeed/tasks/eval_utils.py b/toolbox/Megatron-DeepSpeed/tasks/eval_utils.py
deleted file mode 100644
index a2d62b4809daa21c0abd3cb8fc7c4e0bdcaff773..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/eval_utils.py
+++ /dev/null
@@ -1,247 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Evaluation utilities."""
-
-import os
-import time
-from functools import partial
-
-import torch
-
-from megatron_ds import get_args
-from megatron_ds import print_rank_last, is_last_rank
-from megatron_ds.core import mpu
-from megatron_ds.schedules import get_forward_backward_func
-from tasks.finetune_utils import build_data_loader
-from tasks.finetune_utils import process_batch
-from deepspeed.accelerator import get_accelerator
-
-
-def accuracy_func_provider(single_dataset_provider):
-    """Provide function that calculates accuracies."""
-    args = get_args()
-
-    # Build dataloaders.
-    datapaths = args.valid_data
-    dataloaders = []
-    for datapath in datapaths:
-        dataset = single_dataset_provider(datapath)
-        dataloader = build_data_loader(
-            dataset, args.orig_micro_batch_size, num_workers=args.num_workers,
-            drop_last=(mpu.get_data_parallel_world_size() > 1))
-        dataloaders.append((dataset.dataset_name, dataloader))
-
-    def metrics_func(model, epoch, output_predictions=False):
-        print_rank_last('calculating metrics ...')
-        correct = 0
-        total = 0
-        if output_predictions:
-            assert mpu.get_data_parallel_world_size() == 1
-            named_predictions = []
-            names = 'predictions'
-        for name, dataloader in dataloaders:
-            output = calculate_correct_answers(name, model, dataloader,
-                                               epoch, output_predictions)
-            if not output_predictions:
-                correct_ans, total_count = output
-            else:
-                correct_ans, total_count, predictions = output
-                named_predictions.append((name, predictions))
-                names += '_' + name
-            correct += correct_ans
-            total += total_count
-        if is_last_rank():
-            percent = 0
-            if total > 0:
-                percent = float(correct) * 100.0 / float(total)
-            print(' >> |epoch: {}| overall: correct / total = {} / {} = '
-                  '{:.4f} %'.format(epoch, correct, total, percent))
-
-        if output_predictions and is_last_rank():
-            assert args.load is not None
-            filename = os.path.join(args.load, names + '.pt')
-            torch.save(named_predictions, filename)
-
-    return metrics_func
-
-
-def calculate_correct_answers(name, model, dataloader,
-                              epoch, output_predictions):
-    """Calculate correct over total answers and return prediction if the
-    `output_predictions` is true."""
-    args = get_args()
-    forward_backward_func = get_forward_backward_func()
-    start_time = time.time()
-    for m in model:
-        m.eval()
-    saved_micro_batch_size = args.micro_batch_size
-    saved_global_batch_size = args.global_batch_size
-
-    ds = dataloader.dataset
-    if hasattr(ds, 'sample_multiplier'):
-        # If our dataset as a sample_multiplier attribute that means
-        # each "sample" from the dataset actually has multiple samples
-        # that will collapse into the batch dimension (for example in
-        # the RACE dataset that has several options), we need to
-        # account for that when setting the micro batch size.
-        sample_multiplier = ds.sample_multiplier
-    else:
-        sample_multiplier = 1
-    micro_batch_size_times_data_parallel = args.orig_micro_batch_size * args.data_parallel_size
-    num_micro_batches = args.orig_global_batch_size // micro_batch_size_times_data_parallel
-
-    def loss_func(output_predictions, labels, output_tensor):
-        args = get_args()
-        logits = output_tensor
-
-        loss_dict = {}
-        # Add output predictions.
-        if output_predictions:
-            assert False
-            loss_dict['softmaxes'] = torch.nn.Softmax(dim=-1)(
-                logits.float()).data.cpu().numpy().tolist()
-            loss_dict['labels'] = labels.data.cpu().numpy().tolist()
-            loss_dict['ids'] = batch['uid'].cpu().numpy().tolist()
-        # Compute the correct answers.
-        if args.finetune and args.task == 'CoLA':
-            predicted = torch.argmax(logits, dim=-1)
-            loss_dict['labels'] = labels.data.cpu().numpy().tolist()
-            loss_dict['predicted'] = predicted.data.cpu().numpy().tolist()
-        elif args.finetune and args.task == 'STS-B':
-            predicted = torch.squeeze(logits)
-            loss_dict['labels'] = labels.data.cpu().numpy().tolist()
-            loss_dict['predicted'] = predicted.data.cpu().numpy().tolist()
-        else:
-            predicted = torch.argmax(logits, dim=-1)
-            corrects = (predicted == labels)
-            # Add to the counters.
-            loss_dict['total'] = labels.size(0)
-            loss_dict['correct'] = corrects.sum().item()
-
-        return 0, loss_dict
-
-    # defined inside to capture output_predictions
-    def correct_answers_forward_step(batch, model):
-        try:
-            batch_ = next(batch)
-        except BaseException:
-            batch_ = batch
-        tokens, types, labels, attention_mask = process_batch(batch_)
-
-        # Forward model.
-        args = get_args()
-        output_tensor = model(tokens, attention_mask, tokentype_ids=types)
-
-        return output_tensor, partial(loss_func, output_predictions, labels)
-
-    with torch.no_grad():
-        # For all the batches in the dataset.
-        total = 0
-        correct = 0
-        labels = []
-        predicted = []
-        if output_predictions:
-            # This option is only possible when data parallel size is 1.
-            assert mpu.get_data_parallel_world_size() == 1
-            softmaxes = []
-            labels = []
-            ids = []
-        for _, batch in enumerate(dataloader):
-            # For evaluation only mode we use drop_last = False to get all the
-            # samples, which means we might not have a full batch, so we
-            # adjust batch_size here to actual batch size of data
-            actual_batch_size = len(batch['label'])
-            # ... applying sample_multiplier if necessary
-            args.micro_batch_size = actual_batch_size * sample_multiplier
-            args.global_batch_size = actual_batch_size * sample_multiplier * num_micro_batches
-
-            loss_dicts = forward_backward_func(correct_answers_forward_step, batch, model,
-                                               optimizer=None, timers=None, forward_only=True)
-
-            for loss_dict in loss_dicts:
-                if output_predictions:
-                    softmaxes.extend(loss_dict['softmaxes'])
-                    labels.extend(loss_dict['labels'])
-                    ids.extend(loss_dict['ids'])
-                if args.finetune and args.task in ['CoLA', 'STS-B']:
-                    labels.extend(loss_dict['labels'])
-                    predicted.extend(loss_dict['predicted'])
-                else:
-                    total += loss_dict['total']
-                    correct += loss_dict['correct']
-
-
-    for m in model:
-        m.train()
-    args.micro_batch_size = saved_micro_batch_size
-    args.global_batch_size = saved_global_batch_size
-
-    # Reduce.
-    if mpu.is_pipeline_last_stage():
-        if args.finetune and args.task in ['CoLA', 'STS-B']:
-            if args.task == 'CoLA':
-                labels = get_accelerator().LongTensor(labels)
-                predicted = get_accelerator().LongTensor(predicted)
-                labels_gather = [torch.zeros(len(labels), dtype=torch.long,
-                    device=labels.device) for _ in range(mpu.get_data_parallel_world_size())]
-                predicted_gather = [torch.zeros(len(predicted), dtype=torch.long,
-                    device=predicted.device) for _ in range(mpu.get_data_parallel_world_size())]
-            else:
-                labels = get_accelerator().FloatTensor(labels)
-                predicted = get_accelerator().FloatTensor(predicted)
-                labels_gather = [torch.zeros(len(labels), dtype=torch.float,
-                    device=labels.device) for _ in range(mpu.get_data_parallel_world_size())]
-                predicted_gather = [torch.zeros(len(predicted), dtype=torch.float,
-                    device=predicted.device) for _ in range(mpu.get_data_parallel_world_size())]
-            torch.distributed.all_gather(labels_gather, labels,
-                group=mpu.get_data_parallel_group())
-            torch.distributed.all_gather(predicted_gather, predicted,
-                group=mpu.get_data_parallel_group())
-
-            labels_gather = sum([x.data.cpu().numpy().tolist() for x in labels_gather], [])
-            predicted_gather = sum([x.data.cpu().numpy().tolist() for x in predicted_gather], [])
-
-            # Print on screen.
-            if args.task == 'CoLA':
-                from sklearn.metrics import matthews_corrcoef
-                mcc = matthews_corrcoef(labels_gather, predicted_gather)
-                elapsed_time = time.time() - start_time
-                print_rank_last(' > |epoch: {}| metrics for {}: mcc '
-                                '= {} , elapsed time (sec): {:.3f}'.format(
-                                    epoch, name, mcc, elapsed_time))
-            else:
-                from scipy.stats import pearsonr, spearmanr
-                pearson_corr = pearsonr(predicted_gather, labels_gather)[0]
-                spearman_corr = spearmanr(predicted_gather, labels_gather)[0]
-                corr = (pearson_corr + spearman_corr) / 2
-                elapsed_time = time.time() - start_time
-                print_rank_last(' > |epoch: {}| metrics for {}: pearson '
-                                '= {} spearmanr = {} corr = {} elapsed time (sec): {:.3f}'.format(
-                                    epoch, name, pearson_corr, spearman_corr,
-                                    corr, elapsed_time))
-
-            if output_predictions:
-                return 0, 0, ()
-            return 0, 0
-        else:
-            unreduced = get_accelerator().LongTensor([correct, total])
-            torch.distributed.all_reduce(unreduced,
-                                         group=mpu.get_data_parallel_group())
-
-            # Print on screen.
-
-            correct_ans = unreduced[0].item()
-            total_count = unreduced[1].item()
-            percent = float(correct_ans) * 100.0 / float(total_count)
-            elapsed_time = time.time() - start_time
-            print_rank_last(' > |epoch: {}| metrics for {}: correct / total '
-                            '= {} / {} = {:.4f} %, elapsed time (sec): {:.3f}'.format(
-                                epoch, name, correct_ans, total_count,
-                                percent, elapsed_time))
-
-            if output_predictions:
-                return correct_ans, total_count, (softmaxes, labels, ids)
-            return correct_ans, total_count
-    if output_predictions:
-        return 0, 0, ()
-    return 0, 0
diff --git a/toolbox/Megatron-DeepSpeed/tasks/finetune_utils.py b/toolbox/Megatron-DeepSpeed/tasks/finetune_utils.py
deleted file mode 100644
index 0549c3ba6840b29d9bd910027aeb9e7cff992f3e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/finetune_utils.py
+++ /dev/null
@@ -1,351 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Finetune utilities."""
-
-from functools import partial
-import sys
-import torch
-
-from megatron_ds import get_args, get_num_microbatches
-from megatron_ds import print_rank_0
-from megatron_ds import get_timers
-from megatron_ds.core import mpu
-from megatron_ds.core.enums import ModelType
-from megatron_ds.checkpointing import load_checkpoint
-from megatron_ds.checkpointing import save_checkpoint
-from megatron_ds.training import evaluate_and_print_results
-from megatron_ds.training import setup_model_and_optimizer
-from megatron_ds.training import train_step
-from megatron_ds.training import training_log
-from megatron_ds.utils import average_losses_across_data_parallel_group
-from megatron_ds.utils import calc_params_l2_norm
-from megatron_ds.utils import check_adlr_autoresume_termination
-from deepspeed.accelerator import get_accelerator
-
-def process_batch(batch):
-    """Process batch and produce inputs for the model."""
-    args = get_args()
-
-    tokens = batch['text'].long().to(get_accelerator().device_name()).contiguous()
-    types = batch['types'].long().to(get_accelerator().device_name()).contiguous()
-    labels = batch['label'].long().to(get_accelerator().device_name()).contiguous()
-    attention_mask = batch['padding_mask'].float().to(get_accelerator().device_name()).contiguous()
-    if args.fp16:
-        attention_mask = attention_mask.half()
-
-    return tokens, types, labels, attention_mask
-
-
-def cross_entropy_loss_func(labels, output_tensor):
-    logits = output_tensor
-
-    # Cross-entropy loss.
-    loss_func = torch.nn.CrossEntropyLoss()
-    loss = loss_func(logits.contiguous().float(), labels)
-
-    # Reduce loss for logging.
-    averaged_loss = average_losses_across_data_parallel_group([loss])
-
-    return loss, {'lm loss': averaged_loss[0]}
-
-
-def _cross_entropy_forward_step(batch, model):
-    """Simple forward step with cross-entropy loss."""
-    timers = get_timers()
-
-    # Get the batch.
-    timers('batch-generator', log_level=2).start()
-    try:
-        batch_ = next(batch)
-    except BaseException:
-        batch_ = batch
-    tokens, types, labels, attention_mask = process_batch(batch_)
-    timers('batch-generator').stop()
-
-    # Forward model.
-    output_tensor = model(tokens, attention_mask, tokentype_ids=types)
-
-    return output_tensor, partial(cross_entropy_loss_func, labels)
-
-def process_batch_mse(batch):
-    """Process batch and produce inputs for the model."""
-    args = get_args()
-
-    tokens = batch['text'].long().to(get_accelerator().device_name()).contiguous()
-    types = batch['types'].long().to(get_accelerator().device_name()).contiguous()
-    labels = batch['label'].float().to(get_accelerator().device_name()).contiguous()
-    attention_mask = batch['padding_mask'].float().to(get_accelerator().device_name()).contiguous()
-    if args.fp16:
-        attention_mask = attention_mask.half()
-
-    return tokens, types, labels, attention_mask
-
-def mse_loss_func(labels, output_tensor):
-    logits = output_tensor
-
-    # Cross-entropy loss.
-    loss_func = torch.nn.MSELoss()
-    loss = loss_func(logits.contiguous().float().view(-1), labels.view(-1))
-
-    # Reduce loss for logging.
-    averaged_loss = average_losses_across_data_parallel_group([loss])
-
-    return loss, {'lm loss': averaged_loss[0]}
-
-def mse_forward_step(batch, model):
-    """Simple forward step with cross-entropy loss."""
-    timers = get_timers()
-
-    # Get the batch.
-    timers('batch-generator').start()
-    try:
-        batch_ = next(batch)
-    except BaseException:
-        batch_ = batch
-    tokens, types, labels, attention_mask = process_batch_mse(batch_)
-    timers('batch-generator').stop()
-
-    # Forward model.
-    output_tensor = model(tokens, attention_mask, tokentype_ids=types)
-
-    return output_tensor, partial(mse_loss_func, labels)
-
-def build_data_loader(dataset, micro_batch_size, num_workers, drop_last,
-        task_collate_fn=None):
-    """Data loader. Note that batch-size is the local (per GPU) batch-size."""
-
-    # Sampler.
-    world_size = mpu.get_data_parallel_world_size()
-    rank = mpu.get_data_parallel_rank()
-    sampler = torch.utils.data.distributed.DistributedSampler(
-        dataset, num_replicas=world_size, rank=rank)
-
-    # Data loader. Note that batch size is the per GPU batch size.
-    data_loader = torch.utils.data.DataLoader(dataset,
-                                              batch_size=micro_batch_size,
-                                              sampler=sampler,
-                                              shuffle=False,
-                                              num_workers=num_workers,
-                                              drop_last=drop_last,
-                                              pin_memory=True,
-                                              collate_fn=task_collate_fn)
-
-    return data_loader
-
-
-def _build_infinite_size_dataloader(dataloader):
-    """Build a looped dataloader with infinite size."""
-
-    iterator = dataloader.__iter__()
-    while True:
-        try:
-            yield iterator.__next__()
-        except StopIteration:
-            iterator = dataloader.__iter__()
-
-
-def _build_train_valid_dataloaders(train_dataset, valid_dataset, 
-    task_collate_fn=None):
-    """Traing and validation dataloaders."""
-    args = get_args()
-
-    print_rank_0('building train and validation dataloaders ...')
-    # Training dataset.
-    train_dataloader = build_data_loader(train_dataset, args.micro_batch_size,
-                                         args.num_workers, not args.keep_last,
-                                         task_collate_fn)
-    # Set the training iterations.
-    args.train_iters_per_epoch = len(train_dataloader)
-    args.train_iters = args.epochs * args.train_iters_per_epoch
-    # Validation dataset. For this dataset, we do not need to set up
-    # shuffling so we can just use a simple infinite loop.
-    valid_dataloader_ = build_data_loader(valid_dataset, args.micro_batch_size,
-                                          args.num_workers, not args.keep_last,
-                                          task_collate_fn)
-    valid_dataloader = _build_infinite_size_dataloader(valid_dataloader_)
-
-    # Now that we've built the data loaders, set batch_size arguments
-    # to the actual batch size the model will see for this dataset.
-    # This is necessary so pipeline transfers know what size they are
-    # and the LR schedule, which is based on samples seen, gets set
-    # correctly.
-    args.orig_micro_batch_size = args.micro_batch_size
-    args.orig_global_batch_size = args.global_batch_size
-    if hasattr(train_dataset, 'sample_multiplier'):
-        # If our dataset as a sample_multiplier attribute that means
-        # each "sample" from the dataset actually has multiple samples
-        # that will collapse into the batch dimension (for example in
-        # the RACE dataset that has several options), we need to
-        # account for that when setting the micro batch size.
-        args.micro_batch_size *= train_dataset.sample_multiplier
-        args.global_batch_size *= train_dataset.sample_multiplier
-
-    return train_dataloader, valid_dataloader
-
-
-def _train(model, optimizer, opt_param_scheduler, forward_step,
-           train_dataloader, valid_dataloader, end_of_epoch_callback):
-    """Train the model."""
-    args = get_args()
-    timers = get_timers()
-
-    assert get_num_microbatches() == 1, "finetuning with gradient accumulation doesn't currently work"
-
-    # Turn on training mode which enables dropout.
-    for m in model:
-        m.train()
-
-    # Tracking loss.
-    losses_dict_sum = {}
-
-    # Starting epoch and iteration
-    start_epoch = args.iteration // args.train_iters_per_epoch
-    start_iteration = args.iteration % args.train_iters_per_epoch
-    iteration = args.iteration
-
-    # Memory reporting flag.
-    report_memory_flag = True
-
-    # For each remaining epoch
-    timers('interval-time', log_level=0).start(barrier=True)
-    for epoch in range(start_epoch, args.epochs):
-        print_rank_0('working on epoch {} ...'.format(epoch + 1))
-
-        # Set the data loader epoch to shuffle the index iterator.
-        train_dataloader.sampler.set_epoch(args.seed + epoch)
-
-        # For all the batches in the dataset.
-        for iteration_, batch in enumerate(train_dataloader):
-
-            # Ignore the iterations before starting value
-            if iteration_ < start_iteration:
-                continue
-            # Set to zero so the next epoch does not skip any batches.
-            start_iteration = 0
-
-            # Train for one step.
-            out = train_step(forward_step, batch, model, optimizer, opt_param_scheduler)
-
-            losses_dict, skipped_iter, grad_norm, num_zeros_in_grad = out
-            iteration += 1
-
-            # Logging.
-            params_norm = None
-            if args.log_params_norm:
-                params_norm = calc_params_l2_norm(model)
-            if args.deepspeed:
-                loss_scale = model[0].optimizer.cur_scale
-            else:
-                loss_scale = optimizer.get_loss_scale().item()
-            report_memory_flag = training_log(losses_dict, losses_dict_sum,
-                                              optimizer.param_groups[0]['lr'],
-                                              iteration, loss_scale,
-                                              report_memory_flag, skipped_iter,
-                                              grad_norm, params_norm, num_zeros_in_grad)
-
-            # Autoresume
-            if args.adlr_autoresume and \
-               (iteration % args.adlr_autoresume_interval == 0):
-                check_adlr_autoresume_termination(iteration, model,
-                                                  optimizer, opt_param_scheduler)
-
-            # Checkpointing
-            saved_checkpoint = False
-            if args.save and args.save_interval and \
-               iteration % args.save_interval == 0:
-                save_checkpoint(iteration, model, optimizer, opt_param_scheduler)
-                saved_checkpoint = True
-
-            # Evaluation
-            if args.eval_interval and iteration % args.eval_interval == 0:
-                prefix = 'iteration {}'.format(iteration)
-                evaluate_and_print_results(prefix, forward_step,
-                                           valid_dataloader, model,
-                                           iteration, None, False)
-
-            # Exiting based on iterations
-            if args.exit_interval and iteration % args.exit_interval == 0:
-                if not saved_checkpoint:
-                    save_checkpoint(iteration, model, optimizer, opt_param_scheduler)
-                torch.distributed.barrier()
-                print_rank_0('exiting program at iteration {}'.format(iteration))
-                sys.exit()
-
-        # Checkpointing at the end of each epoch.
-        if args.save:
-            save_checkpoint(iteration, model, optimizer, opt_param_scheduler)
-
-        # Callback at the end of each epoch.
-        if end_of_epoch_callback is not None:
-            end_of_epoch_callback(model, epoch)
-
-
-def finetune(train_valid_datasets_provider, model_provider,
-             model_type=ModelType.encoder_or_decoder,
-             forward_step=_cross_entropy_forward_step,
-             end_of_epoch_callback_provider=None,
-             task_collate_fn=None):
-    """Main finetune function used across all tasks."""
-    args = get_args()
-    timers = get_timers()
-
-    assert args.rampup_batch_size is None, \
-        'batch size scaling is not supported for finetuning'
-
-    # Train and validation data loaders.
-    timers('train/valid/test dataset/dataloder', log_level=0).start()
-    if args.epochs > 0:
-        train_dataset, valid_dataset = train_valid_datasets_provider()
-        train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
-            train_dataset, valid_dataset, task_collate_fn)
-    else:
-        args.train_iters = 0
-    timers('train/valid/test dataset/dataloder').stop()
-
-    # Build calback function.
-    timers('callback function', log_level=0).start()
-    end_of_epoch_callback = None
-    if end_of_epoch_callback_provider is not None:
-        end_of_epoch_callback = end_of_epoch_callback_provider()
-    timers('callback function').stop()
-
-    # Build model, optimizer and learning rate scheduler.
-    timers('model and optimizer', log_level=0).start()
-    model, optimizer, opt_param_scheduler = setup_model_and_optimizer(model_provider, model_type)
-    timers('model and optimizer').stop()
-
-    # If pretrained checkpoint is provided and we have not trained for
-    # any iteration (i.e., iteration is zero), then load the pretrained
-    # checkpoint.
-    timers('pretrained checkpoint', log_level=0).start(barrier=True)
-    if args.iteration == 0 and args.pretrained_checkpoint is not None:
-        original_load = args.load
-        args.load = args.pretrained_checkpoint
-        original_rng = args.no_load_rng
-        args.no_load_rng = True
-        _ = load_checkpoint(model, None, None)
-        args.load = original_load
-        args.no_load_rng = original_rng
-        # This is critical when only model is loaded. We should make sure
-        # main parameters are also updated. When DeepSpeed is enabled,
-        # DeepSpeed engine will handle this.
-        if not args.deepspeed:
-            optimizer.reload_model_params()
-    timers('pretrained checkpoint').stop()
-
-    # Print setup timing.
-    print_rank_0('done with setups ...')
-    timers.log(['train/valid/test dataset/dataloder', 'callback function',
-                'model and optimizer', 'pretrained checkpoint'], barrier=True)
-    print_rank_0('training ...')
-
-    # Finetune the model.
-    if args.epochs > 0:
-        _train(model, optimizer, opt_param_scheduler, forward_step,
-               train_dataloader, valid_dataloader, end_of_epoch_callback)
-    # Or just evaluate.
-    else:
-        if end_of_epoch_callback is not None:
-            print_rank_0('evaluation only mode, setting epoch to -1')
-            end_of_epoch_callback(model, epoch=-1, output_predictions=True)
-    print_rank_0('done :-)')
diff --git a/toolbox/Megatron-DeepSpeed/tasks/glue/cola.py b/toolbox/Megatron-DeepSpeed/tasks/glue/cola.py
deleted file mode 100644
index f6fb9bb1e28ef1b8ae9a79bc597ffd5038c2203a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/glue/cola.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""CoLA dataset."""
-
-from megatron_ds import print_rank_0
-from tasks.data_utils import clean_text
-from .data import GLUEAbstractDataset
-
-
-LABELS = [0, 1]
-
-
-class CoLADataset(GLUEAbstractDataset):
-
-    def __init__(self, name, datapaths, tokenizer, max_seq_length,
-                 test_label=0):
-        self.test_label = test_label
-        super().__init__('CoLA', name, datapaths,
-                         tokenizer, max_seq_length)
-
-    def process_samples_from_single_path(self, filename):
-        """"Implement abstract method."""
-        print_rank_0(' > Processing {} ...'.format(filename))
-
-        samples = []
-        total = 0
-        first = True
-        is_test = False
-        with open(filename, 'r') as f:
-            for line in f:
-                row = line.strip().split('\t')
-                if first:
-                    first = False
-                    if len(row) == 2:
-                        is_test = True
-                        print_rank_0('   reading {} and {} columns and '
-                                     'setting labels to {}'.format(
-                                         row[0].strip(), row[1].strip(),
-                                         self.test_label))
-                        continue
-
-                if is_test:
-                    assert len(row) == 2, 'expected length 2: {}'.format(row)
-                    uid = int(row[0].strip())
-                    text_a = clean_text(row[1].strip())
-                    text_b = None
-                    label = self.test_label
-                    assert len(text_a) > 0
-                else:
-                    if len(row) == 4:
-                        uid = total
-                        text_a = clean_text(row[3].strip())
-                        text_b = None
-                        label = int(row[1].strip())
-                    else:
-                        print_rank_0('***WARNING*** index error, '
-                                     'skipping: {}'.format(row))
-                        continue
-                    if len(text_a) == 0:
-                        print_rank_0('***WARNING*** zero length a, '
-                                     'skipping: {}'.format(row))
-                        continue
-                assert label in LABELS
-                assert uid >= 0
-
-                sample = {'uid': uid,
-                          'text_a': text_a,
-                          'text_b': text_b,
-                          'label': label}
-                total += 1
-                samples.append(sample)
-
-                if total % 50000 == 0:
-                    print_rank_0('  > processed {} so far ...'.format(total))
-
-        print_rank_0(' >> processed {} samples.'.format(len(samples)))
-        return samples
diff --git a/toolbox/Megatron-DeepSpeed/tasks/glue/data.py b/toolbox/Megatron-DeepSpeed/tasks/glue/data.py
deleted file mode 100644
index 15b6bd689faff1947d0871c00d14664e7e375d10..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/glue/data.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""GLUE dataset."""
-
-from abc import ABC
-from abc import abstractmethod
-
-from torch.utils.data import Dataset
-
-from megatron_ds import print_rank_0
-from tasks.data_utils import build_sample
-from tasks.data_utils import build_tokens_types_paddings_from_text
-
-
-class GLUEAbstractDataset(ABC, Dataset):
-    """GLUE base dataset class."""
-
-    def __init__(self, task_name, dataset_name, datapaths,
-                 tokenizer, max_seq_length):
-        # Store inputs.
-        self.task_name = task_name
-        self.dataset_name = dataset_name
-        self.tokenizer = tokenizer
-        self.max_seq_length = max_seq_length
-        print_rank_0(' > building {} dataset for {}:'.format(self.task_name,
-                                                             self.dataset_name))
-        # Process the files.
-        string = '  > paths:'
-        for path in datapaths:
-            string += ' ' + path
-        print_rank_0(string)
-        self.samples = []
-        for datapath in datapaths:
-            self.samples.extend(self.process_samples_from_single_path(datapath))
-        print_rank_0('  >> total number of samples: {}'.format(
-            len(self.samples)))
-
-    def __len__(self):
-        return len(self.samples)
-
-    def __getitem__(self, idx):
-        raw_sample = self.samples[idx]
-        ids, types, paddings = build_tokens_types_paddings_from_text(
-            raw_sample['text_a'], raw_sample['text_b'],
-            self.tokenizer, self.max_seq_length)
-        sample = build_sample(ids, types, paddings,
-                              raw_sample['label'], raw_sample['uid'])
-        return sample
-
-    @abstractmethod
-    def process_samples_from_single_path(self, datapath):
-        """Abstract method that takes a single path / filename and
-        returns a list of dataset samples, each sample being a dict of
-            {'text_a': string, 'text_b': string, 'label': int, 'uid': int}
-        """
-        pass
diff --git a/toolbox/Megatron-DeepSpeed/tasks/glue/finetune.py b/toolbox/Megatron-DeepSpeed/tasks/glue/finetune.py
deleted file mode 100644
index d6b42e134b28d10c959444dd036b0c64dbe78e33..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/glue/finetune.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""GLUE finetuning/evaluation."""
-
-from megatron_ds import get_args
-from megatron_ds import print_rank_0
-from megatron_ds import get_tokenizer
-from megatron_ds.model.classification import Classification
-from tasks.eval_utils import accuracy_func_provider
-from tasks.finetune_utils import finetune, mse_forward_step
-from megatron_ds.arguments import core_transformer_config_from_args
-
-
-def glue_classification(num_classes, Dataset,
-                        name_from_datapath_func):
-
-    def train_valid_datasets_provider():
-        """Build train and validation dataset."""
-        args = get_args()
-        tokenizer = get_tokenizer()
-
-        train_dataset = Dataset('training', args.train_data,
-                                tokenizer, args.seq_length)
-        valid_dataset = Dataset('validation', args.valid_data,
-                                tokenizer, args.seq_length)
-
-        return train_dataset, valid_dataset
-
-    def model_provider(pre_process=True, post_process=True):
-        """Build the model."""
-        args = get_args()
-        config = core_transformer_config_from_args()
-
-        print_rank_0('building classification model for {} ...'.format(
-            args.task))
-        model = Classification(config=config, num_classes=num_classes, num_tokentypes=2,
-                               pre_process=pre_process, post_process=post_process)
-
-        return model
-
-    def metrics_func_provider():
-        """Privde metrics callback function."""
-        def single_dataset_provider(datapath):
-            args = get_args()
-            tokenizer = get_tokenizer()
-
-            name = name_from_datapath_func(datapath)
-            return Dataset(name, [datapath], tokenizer, args.seq_length)
-        return accuracy_func_provider(single_dataset_provider)
-
-    args = get_args()
-    """Finetune/evaluate."""
-    if args.task == 'STS-B':
-        finetune(train_valid_datasets_provider, model_provider,
-                forward_step=mse_forward_step,
-                end_of_epoch_callback_provider=metrics_func_provider)
-    else:
-        finetune(train_valid_datasets_provider, model_provider,
-                end_of_epoch_callback_provider=metrics_func_provider)
-
-
-def main():
-    args = get_args()
-
-    if args.task == 'MNLI':
-
-        num_classes = 3
-        from tasks.glue.mnli import MNLIDataset as Dataset
-
-        def name_from_datapath(datapath):
-            return datapath.split('MNLI')[-1].strip(
-                '.tsv').strip('/').replace('_', '-')
-
-    elif args.task == 'QQP':
-
-        num_classes = 2
-        from tasks.glue.qqp import QQPDataset as Dataset
-
-        def name_from_datapath(datapath):
-            return datapath.split('QQP')[-1].strip(
-                '.tsv').strip('/').replace('_', '-')
-    elif args.task == 'QNLI':
-
-        num_classes = 2
-        from tasks.glue.qnli import QNLIDataset as Dataset
-
-        def name_from_datapath(datapath):
-            return datapath.split('QNLI')[-1].strip(
-                '.tsv').strip('/').replace('_', '-')
-    elif args.task == 'SST-2':
-
-        num_classes = 2
-        from tasks.glue.sst2 import SST2Dataset as Dataset
-
-        def name_from_datapath(datapath):
-            return datapath.split('SST-2')[-1].strip(
-                '.tsv').strip('/').replace('_', '-')
-    elif args.task == 'CoLA':
-
-        num_classes = 2
-        from tasks.glue.cola import CoLADataset as Dataset
-
-        def name_from_datapath(datapath):
-            return datapath.split('CoLA')[-1].strip(
-                '.tsv').strip('/').replace('_', '-')
-    elif args.task == 'STS-B':
-
-        num_classes = 1
-        from tasks.glue.stsb import STSBDataset as Dataset
-
-        def name_from_datapath(datapath):
-            return datapath.split('STS-B')[-1].strip(
-                '.tsv').strip('/').replace('_', '-')
-    elif args.task == 'MRPC':
-
-        num_classes = 2
-        from tasks.glue.mrpc import MRPCDataset as Dataset
-
-        def name_from_datapath(datapath):
-            return datapath.split('MRPC')[-1].strip(
-                '.tsv').strip('/').replace('_', '-')
-    elif args.task == 'RTE':
-
-        num_classes = 2
-        from tasks.glue.rte import RTEDataset as Dataset
-
-        def name_from_datapath(datapath):
-            return datapath.split('RTE')[-1].strip(
-                '.tsv').strip('/').replace('_', '-')
-    else:
-        raise NotImplementedError('GLUE task {} is not implemented.'.format(
-            args.task))
-
-    glue_classification(num_classes, Dataset, name_from_datapath)
diff --git a/toolbox/Megatron-DeepSpeed/tasks/glue/mnli.py b/toolbox/Megatron-DeepSpeed/tasks/glue/mnli.py
deleted file mode 100644
index 2a1da03211cc845d8dda33b7cc3f5c90ab0c668c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/glue/mnli.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""MNLI dataset."""
-
-from megatron_ds import print_rank_0
-from tasks.data_utils import clean_text
-from .data import GLUEAbstractDataset
-
-
-LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2}
-
-
-class MNLIDataset(GLUEAbstractDataset):
-
-    def __init__(self, name, datapaths, tokenizer, max_seq_length,
-                 test_label='contradiction'):
-        self.test_label = test_label
-        super().__init__('MNLI', name, datapaths,
-                         tokenizer, max_seq_length)
-
-    def process_samples_from_single_path(self, filename):
-        """"Implement abstract method."""
-        print_rank_0(' > Processing {} ...'.format(filename))
-
-        samples = []
-        total = 0
-        first = True
-        is_test = False
-        with open(filename, 'r') as f:
-            for line in f:
-                row = line.strip().split('\t')
-                if first:
-                    first = False
-                    if len(row) == 10:
-                        is_test = True
-                        print_rank_0(
-                            '   reading {}, {} and {} columns and setting '
-                            'labels to {}'.format(
-                                row[0].strip(), row[8].strip(),
-                                row[9].strip(), self.test_label))
-                    else:
-                        print_rank_0('    reading {} , {}, {}, and {} columns '
-                                     '...'.format(
-                                         row[0].strip(), row[8].strip(),
-                                         row[9].strip(), row[-1].strip()))
-                    continue
-
-                text_a = clean_text(row[8].strip())
-                text_b = clean_text(row[9].strip())
-                unique_id = int(row[0].strip())
-                label = row[-1].strip()
-                if is_test:
-                    label = self.test_label
-
-                assert len(text_a) > 0
-                assert len(text_b) > 0
-                assert label in LABELS
-                assert unique_id >= 0
-
-                sample = {'text_a': text_a,
-                          'text_b': text_b,
-                          'label': LABELS[label],
-                          'uid': unique_id}
-                total += 1
-                samples.append(sample)
-
-                if total % 50000 == 0:
-                    print_rank_0('  > processed {} so far ...'.format(total))
-
-        print_rank_0(' >> processed {} samples.'.format(len(samples)))
-        return samples
diff --git a/toolbox/Megatron-DeepSpeed/tasks/glue/mrpc.py b/toolbox/Megatron-DeepSpeed/tasks/glue/mrpc.py
deleted file mode 100644
index 06fee04727326016ce3db13a0bd9083558763821..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/glue/mrpc.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""MRPC dataset."""
-
-from megatron_ds import print_rank_0
-from tasks.data_utils import clean_text
-from .data import GLUEAbstractDataset
-
-
-LABELS = [0, 1]
-
-
-class MRPCDataset(GLUEAbstractDataset):
-
-    def __init__(self, name, datapaths, tokenizer, max_seq_length,
-                 test_label=0):
-        self.test_label = test_label
-        super().__init__('MRPC', name, datapaths,
-                         tokenizer, max_seq_length)
-
-    def process_samples_from_single_path(self, filename):
-        """"Implement abstract method."""
-        print_rank_0(' > Processing {} ...'.format(filename))
-
-        samples = []
-        total = 0
-        first = True
-        is_test = False
-        with open(filename, 'r') as f:
-            for line in f:
-                row = line.strip().split('\t')
-                if first:
-                    first = False
-                    if row[0].strip() == 'index':
-                        is_test = True
-                        print_rank_0('   reading {}, {}, and {} columns and '
-                                     'setting labels to {}'.format(
-                                         row[0].strip(), row[3].strip(),
-                                         row[4].strip(), self.test_label))
-                    else:
-                        assert len(row) == 5
-                        print_rank_0('    reading {}, {}, and {} columns'
-                                     ' ...'.format(
-                                         row[0].strip(), row[3].strip(),
-                                         row[4].strip()))
-                    continue
-
-                if is_test:
-                    assert len(row) == 5, 'expected length 5: {}'.format(row)
-                    uid = int(row[0].strip())
-                    text_a = clean_text(row[3].strip())
-                    text_b = clean_text(row[4].strip())
-                    label = self.test_label
-                    assert len(text_a) > 0
-                    assert len(text_b) > 0
-                else:
-                    if len(row) == 5:
-                        uid = total
-                        text_a = clean_text(row[3].strip())
-                        text_b = clean_text(row[4].strip())
-                        label = int(row[0].strip())
-                    else:
-                        print_rank_0('***WARNING*** index error, '
-                                     'skipping: {}'.format(row))
-                        continue
-                    if len(text_a) == 0:
-                        print_rank_0('***WARNING*** zero length a, '
-                                     'skipping: {}'.format(row))
-                        continue
-                    if len(text_b) == 0:
-                        print_rank_0('***WARNING*** zero length b, '
-                                     'skipping: {}'.format(row))
-                        continue
-                assert label in LABELS
-                assert uid >= 0
-
-                sample = {'uid': uid,
-                          'text_a': text_a,
-                          'text_b': text_b,
-                          'label': label}
-                total += 1
-                samples.append(sample)
-
-                if total % 50000 == 0:
-                    print_rank_0('  > processed {} so far ...'.format(total))
-
-        print_rank_0(' >> processed {} samples.'.format(len(samples)))
-        return samples
diff --git a/toolbox/Megatron-DeepSpeed/tasks/glue/qnli.py b/toolbox/Megatron-DeepSpeed/tasks/glue/qnli.py
deleted file mode 100644
index 71f1ecfdb25311f1f0460642dde4e39c38691790..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/glue/qnli.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""QNLI dataset."""
-
-from megatron_ds import print_rank_0
-from tasks.data_utils import clean_text
-from .data import GLUEAbstractDataset
-
-
-LABELS = {'entailment': 0, 'not_entailment': 1}
-
-
-class QNLIDataset(GLUEAbstractDataset):
-
-    def __init__(self, name, datapaths, tokenizer, max_seq_length,
-                 test_label='entailment'):
-        self.test_label = test_label
-        super().__init__('QNLI', name, datapaths,
-                         tokenizer, max_seq_length)
-
-    def process_samples_from_single_path(self, filename):
-        """"Implement abstract method."""
-        print_rank_0(' > Processing {} ...'.format(filename))
-
-        samples = []
-        total = 0
-        first = True
-        is_test = False
-        with open(filename, 'r') as f:
-            for line in f:
-                row = line.strip().split('\t')
-                if first:
-                    first = False
-                    if len(row) == 3:
-                        is_test = True
-                        print_rank_0('   reading {}, {}, and {} columns and '
-                                     'setting labels to {}'.format(
-                                         row[0].strip(), row[1].strip(),
-                                         row[2].strip(), self.test_label))
-                    else:
-                        assert len(row) == 4
-                        print_rank_0('    reading {}, {}, {}, and {} columns'
-                                     ' ...'.format(
-                                         row[0].strip(), row[1].strip(),
-                                         row[2].strip(), row[3].strip()))
-                    continue
-
-                if is_test:
-                    assert len(row) == 3, 'expected length 3: {}'.format(row)
-                    uid = int(row[0].strip())
-                    text_a = clean_text(row[1].strip())
-                    text_b = clean_text(row[2].strip())
-                    label = self.test_label
-                    assert len(text_a) > 0
-                    assert len(text_b) > 0
-                else:
-                    if len(row) == 4:
-                        uid = int(row[0].strip())
-                        text_a = clean_text(row[1].strip())
-                        text_b = clean_text(row[2].strip())
-                        label = row[-1].strip()
-                    else:
-                        print_rank_0('***WARNING*** index error, '
-                                     'skipping: {}'.format(row))
-                        continue
-                    if len(text_a) == 0:
-                        print_rank_0('***WARNING*** zero length a, '
-                                     'skipping: {}'.format(row))
-                        continue
-                    if len(text_b) == 0:
-                        print_rank_0('***WARNING*** zero length b, '
-                                     'skipping: {}'.format(row))
-                        continue
-                assert label in LABELS
-                assert uid >= 0
-
-                sample = {'uid': uid,
-                          'text_a': text_a,
-                          'text_b': text_b,
-                          'label': LABELS[label]}
-                total += 1
-                samples.append(sample)
-
-                if total % 50000 == 0:
-                    print_rank_0('  > processed {} so far ...'.format(total))
-
-        print_rank_0(' >> processed {} samples.'.format(len(samples)))
-        return samples
diff --git a/toolbox/Megatron-DeepSpeed/tasks/glue/qqp.py b/toolbox/Megatron-DeepSpeed/tasks/glue/qqp.py
deleted file mode 100644
index 38ca12b21d5eb4c7623c6b272d0fdfb8b2c94b7f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/glue/qqp.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""QQP dataset."""
-
-from megatron_ds import print_rank_0
-from tasks.data_utils import clean_text
-from .data import GLUEAbstractDataset
-
-
-LABELS = [0, 1]
-
-
-class QQPDataset(GLUEAbstractDataset):
-
-    def __init__(self, name, datapaths, tokenizer, max_seq_length,
-                 test_label=0):
-        self.test_label = test_label
-        super().__init__('QQP', name, datapaths,
-                         tokenizer, max_seq_length)
-
-    def process_samples_from_single_path(self, filename):
-        """"Implement abstract method."""
-        print_rank_0(' > Processing {} ...'.format(filename))
-
-        samples = []
-        total = 0
-        first = True
-        is_test = False
-        with open(filename, 'r') as f:
-            for line in f:
-                row = line.strip().split('\t')
-                if first:
-                    first = False
-                    if len(row) == 3:
-                        is_test = True
-                        print_rank_0('   reading {}, {}, and {} columns and '
-                                     'setting labels to {}'.format(
-                                         row[0].strip(), row[1].strip(),
-                                         row[2].strip(), self.test_label))
-                    else:
-                        assert len(row) == 6
-                        print_rank_0('    reading {}, {}, {}, and {} columns'
-                                     ' ...'.format(
-                                         row[0].strip(), row[3].strip(),
-                                         row[4].strip(), row[5].strip()))
-                    continue
-
-                if is_test:
-                    assert len(row) == 3, 'expected length 3: {}'.format(row)
-                    uid = int(row[0].strip())
-                    text_a = clean_text(row[1].strip())
-                    text_b = clean_text(row[2].strip())
-                    label = self.test_label
-                    assert len(text_a) > 0
-                    assert len(text_b) > 0
-                else:
-                    if len(row) == 6:
-                        uid = int(row[0].strip())
-                        text_a = clean_text(row[3].strip())
-                        text_b = clean_text(row[4].strip())
-                        label = int(row[5].strip())
-                    else:
-                        print_rank_0('***WARNING*** index error, '
-                                     'skipping: {}'.format(row))
-                        continue
-                    if len(text_a) == 0:
-                        print_rank_0('***WARNING*** zero length a, '
-                                     'skipping: {}'.format(row))
-                        continue
-                    if len(text_b) == 0:
-                        print_rank_0('***WARNING*** zero length b, '
-                                     'skipping: {}'.format(row))
-                        continue
-                assert label in LABELS
-                assert uid >= 0
-
-                sample = {'uid': uid,
-                          'text_a': text_a,
-                          'text_b': text_b,
-                          'label': label}
-                total += 1
-                samples.append(sample)
-
-                if total % 50000 == 0:
-                    print_rank_0('  > processed {} so far ...'.format(total))
-
-        print_rank_0(' >> processed {} samples.'.format(len(samples)))
-        return samples
diff --git a/toolbox/Megatron-DeepSpeed/tasks/glue/rte.py b/toolbox/Megatron-DeepSpeed/tasks/glue/rte.py
deleted file mode 100644
index 6abb7ad2251d62bef69e61da85cf680a1d923d79..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/glue/rte.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""RTE dataset."""
-
-from megatron_ds import print_rank_0
-from tasks.data_utils import clean_text
-from .data import GLUEAbstractDataset
-
-
-LABELS = {'entailment': 0, 'not_entailment': 1}
-
-
-class RTEDataset(GLUEAbstractDataset):
-
-    def __init__(self, name, datapaths, tokenizer, max_seq_length,
-                 test_label='entailment'):
-        self.test_label = test_label
-        super().__init__('RTE', name, datapaths,
-                         tokenizer, max_seq_length)
-
-    def process_samples_from_single_path(self, filename):
-        """"Implement abstract method."""
-        print_rank_0(' > Processing {} ...'.format(filename))
-
-        samples = []
-        total = 0
-        first = True
-        is_test = False
-        with open(filename, 'r') as f:
-            for line in f:
-                row = line.strip().split('\t')
-                if first:
-                    first = False
-                    if len(row) == 3:
-                        is_test = True
-                        print_rank_0('   reading {}, {}, and {} columns and '
-                                     'setting labels to {}'.format(
-                                         row[0].strip(), row[1].strip(),
-                                         row[2].strip(), self.test_label))
-                    else:
-                        assert len(row) == 4
-                        print_rank_0('    reading {}, {}, {}, and {} columns'
-                                     ' ...'.format(
-                                         row[0].strip(), row[1].strip(),
-                                         row[2].strip(), row[3].strip()))
-                    continue
-
-                if is_test:
-                    assert len(row) == 3, 'expected length 3: {}'.format(row)
-                    uid = int(row[0].strip())
-                    text_a = clean_text(row[1].strip())
-                    text_b = clean_text(row[2].strip())
-                    label = self.test_label
-                    assert len(text_a) > 0
-                    assert len(text_b) > 0
-                else:
-                    if len(row) == 4:
-                        uid = int(row[0].strip())
-                        text_a = clean_text(row[1].strip())
-                        text_b = clean_text(row[2].strip())
-                        label = row[-1].strip()
-                    else:
-                        print_rank_0('***WARNING*** index error, '
-                                     'skipping: {}'.format(row))
-                        continue
-                    if len(text_a) == 0:
-                        print_rank_0('***WARNING*** zero length a, '
-                                     'skipping: {}'.format(row))
-                        continue
-                    if len(text_b) == 0:
-                        print_rank_0('***WARNING*** zero length b, '
-                                     'skipping: {}'.format(row))
-                        continue
-                assert label in LABELS
-                assert uid >= 0
-
-                sample = {'uid': uid,
-                          'text_a': text_a,
-                          'text_b': text_b,
-                          'label': LABELS[label]}
-                total += 1
-                samples.append(sample)
-
-                if total % 50000 == 0:
-                    print_rank_0('  > processed {} so far ...'.format(total))
-
-        print_rank_0(' >> processed {} samples.'.format(len(samples)))
-        return samples
diff --git a/toolbox/Megatron-DeepSpeed/tasks/glue/sst2.py b/toolbox/Megatron-DeepSpeed/tasks/glue/sst2.py
deleted file mode 100644
index 7e5a1e5bc152b07e41fb34c6bcafc339c83f5da2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/glue/sst2.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""SST-2 dataset."""
-
-from megatron_ds import print_rank_0
-from tasks.data_utils import clean_text
-from .data import GLUEAbstractDataset
-
-
-LABELS = [0, 1]
-
-
-class SST2Dataset(GLUEAbstractDataset):
-
-    def __init__(self, name, datapaths, tokenizer, max_seq_length,
-                 test_label=0):
-        self.test_label = test_label
-        super().__init__('SST-2', name, datapaths,
-                         tokenizer, max_seq_length)
-
-    def process_samples_from_single_path(self, filename):
-        """"Implement abstract method."""
-        print_rank_0(' > Processing {} ...'.format(filename))
-
-        samples = []
-        total = 0
-        first = True
-        is_test = False
-        with open(filename, 'r') as f:
-            for line in f:
-                row = line.strip().split('\t')
-                if first:
-                    first = False
-                    if row[0].strip() == 'index':
-                        is_test = True
-                        print_rank_0('   reading {} and {} columns and '
-                                     'setting labels to {}'.format(
-                                         row[0].strip(), row[1].strip(),
-                                         self.test_label))
-                    else:
-                        assert len(row) == 2
-                        print_rank_0('    reading {} and {} columns'
-                                     ' ...'.format(
-                                         row[0].strip(), row[1].strip()))
-                    continue
-
-                if is_test:
-                    assert len(row) == 2, 'expected length 2: {}'.format(row)
-                    uid = int(row[0].strip())
-                    text_a = clean_text(row[1].strip())
-                    text_b = None
-                    label = self.test_label
-                    assert len(text_a) > 0
-                else:
-                    if len(row) == 2:
-                        uid = total
-                        text_a = clean_text(row[0].strip())
-                        text_b = None
-                        label = int(row[-1].strip())
-                    else:
-                        print_rank_0('***WARNING*** index error, '
-                                     'skipping: {}'.format(row))
-                        continue
-                    if len(text_a) == 0:
-                        print_rank_0('***WARNING*** zero length a, '
-                                     'skipping: {}'.format(row))
-                        continue
-                assert label in LABELS
-                assert uid >= 0
-
-                sample = {'uid': uid,
-                          'text_a': text_a,
-                          'text_b': text_b,
-                          'label': label}
-                total += 1
-                samples.append(sample)
-
-                if total % 50000 == 0:
-                    print_rank_0('  > processed {} so far ...'.format(total))
-
-        print_rank_0(' >> processed {} samples.'.format(len(samples)))
-        return samples
diff --git a/toolbox/Megatron-DeepSpeed/tasks/glue/stsb.py b/toolbox/Megatron-DeepSpeed/tasks/glue/stsb.py
deleted file mode 100644
index a8d3fe35fd5d47621efdc68471273550ac4f0ac5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/glue/stsb.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""STS-B dataset."""
-
-from megatron_ds import print_rank_0
-from tasks.data_utils import clean_text
-from .data import GLUEAbstractDataset
-
-
-LABELS = [None]
-
-
-class STSBDataset(GLUEAbstractDataset):
-
-    def __init__(self, name, datapaths, tokenizer, max_seq_length,
-                 test_label=0.0):
-        self.test_label = test_label
-        super().__init__('STS-B', name, datapaths,
-                         tokenizer, max_seq_length)
-
-    def process_samples_from_single_path(self, filename):
-        """"Implement abstract method."""
-        print_rank_0(' > Processing {} ...'.format(filename))
-
-        samples = []
-        total = 0
-        first = True
-        is_test = False
-        with open(filename, 'r') as f:
-            for line in f:
-                row = line.strip().split('\t')
-                if first:
-                    first = False
-                    if len(row) == 9:
-                        is_test = True
-                        print_rank_0('   reading {}, {}, and {} columns and '
-                                     'setting labels to {}'.format(
-                                         row[0].strip(), row[7].strip(),
-                                         row[8].strip(), self.test_label))
-                    else:
-                        assert len(row) == 10
-                        print_rank_0('    reading {}, {}, {}, and {} columns'
-                                     ' ...'.format(
-                                         row[0].strip(), row[7].strip(),
-                                         row[8].strip(), row[-1].strip()))
-                    continue
-
-                if is_test:
-                    assert len(row) == 9, 'expected length 9: {}'.format(row)
-                    uid = int(row[0].strip())
-                    text_a = clean_text(row[7].strip())
-                    text_b = clean_text(row[8].strip())
-                    label = self.test_label
-                    assert len(text_a) > 0
-                    assert len(text_b) > 0
-                else:
-                    if len(row) == 10:
-                        uid = int(row[0].strip())
-                        text_a = clean_text(row[7].strip())
-                        text_b = clean_text(row[8].strip())
-                        label = float(row[-1].strip())
-                    else:
-                        print_rank_0('***WARNING*** index error, '
-                                     'skipping: {}'.format(row))
-                        continue
-                    if len(text_a) == 0:
-                        print_rank_0('***WARNING*** zero length a, '
-                                     'skipping: {}'.format(row))
-                        continue
-                    if len(text_b) == 0:
-                        print_rank_0('***WARNING*** zero length b, '
-                                     'skipping: {}'.format(row))
-                        continue
-                assert uid >= 0
-
-                sample = {'uid': uid,
-                          'text_a': text_a,
-                          'text_b': text_b,
-                          'label': label}
-                total += 1
-                samples.append(sample)
-
-                if total % 50000 == 0:
-                    print_rank_0('  > processed {} so far ...'.format(total))
-
-        print_rank_0(' >> processed {} samples.'.format(len(samples)))
-        return samples
diff --git a/toolbox/Megatron-DeepSpeed/tasks/main.py b/toolbox/Megatron-DeepSpeed/tasks/main.py
deleted file mode 100644
index 2e640197e6943de3615ffa3630871fefcbc19166..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/main.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Main tasks functionality."""
-
-import os
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir)))
-
-from megatron_ds import get_args
-from megatron_ds.initialize import initialize_megatron
-
-
-def get_tasks_args(parser):
-    """Provide extra arguments required for tasks."""
-    group = parser.add_argument_group(title='tasks')
-
-    group.add_argument('--task', type=str, required=True,
-                       help='Task name.')
-    group.add_argument('--epochs', type=int, default=None,
-                       help='Number of finetunning epochs. Zero results in '
-                       'evaluation only.')
-    group.add_argument('--pretrained-checkpoint', type=str, default=None,
-                       help='Pretrained checkpoint used for finetunning.')
-    group.add_argument('--keep-last', action='store_true',
-                       help='Keep the last batch (maybe incomplete) in'
-                       'the data loader')
-    group.add_argument('--train-data', nargs='+', default=None,
-                       help='Whitespace separated paths or corpora names '
-                       'for training.')
-    group.add_argument('--valid-data', nargs='*', default=None,
-                       help='path(s) to the validation data.')
-    group.add_argument('--overlapping-eval', type=int, default=32,
-                       help='Sliding window for overlapping evaluation.')
-    group.add_argument('--strict-lambada', action='store_true',
-                       help='Use more difficult formulation of lambada.')
-    # Retriever args
-    group.add_argument('--qa-data-dev', type=str, default=None,
-                       help='Path to the QA dataset dev file.')
-    group.add_argument('--qa-data-test', type=str, default=None,
-                       help='Path to the QA dataset test file.')
-
-    # Faiss arguments for retriever
-    group.add_argument('--faiss-use-gpu', action='store_true',
-                       help='Whether create the FaissMIPSIndex on GPU')
-    group.add_argument('--faiss-match', type=str, default='string', \
-                        choices=['regex', 'string'], help="Answer matching '\
-                        'logic type")
-    group.add_argument('--faiss-topk-retrievals', type=int, default=100,
-                       help='Number of blocks to use as top-k during retrieval')
-
-    # finetune for retriever
-    group.add_argument('--eval-micro-batch-size', type=int, default=None,
-                       help='Eval Batch size per model instance (local batch '
-                            'size). Global batch size is local batch size '
-                            'times data parallel size.')
-    group.add_argument('--train-with-neg', action='store_true',
-                       help='Whether to use negative examples during model '
-                        'training')
-    group.add_argument('--train-hard-neg', type=int, default=0,
-                       help='Number of hard negative exmaples to use during '
-                        'training')
-
-
-    # parameters for Av.rank validation method
-    # Following options/arguments have been taken directly from DPR codebase
-    group.add_argument('--val-av-rank-hard-neg', type=int, default=30,
-                        help='Av.rank validation: how many hard negatives to'
-                        ' take from each question pool')
-    group.add_argument('--val-av-rank-other-neg', type=int, default=30,
-                        help='Av.rank validation: how many other negatives to'
-                        ' take from each question pool')
-
-
-    return parser
-
-
-if __name__ == '__main__':
-
-    initialize_megatron(extra_args_provider=get_tasks_args)
-
-    args = get_args()
-
-    if args.num_layers_per_virtual_pipeline_stage is not None:
-        print("Interleaved pipeline schedule is not yet supported for downstream tasks.")
-        exit()
-
-    if args.task == 'RACE':
-        from race.finetune import main
-    elif args.task in ['MNLI', 'QQP', 'QNLI', 'SST-2', 'CoLA', 'STS-B', 'MRPC', 'RTE']:
-        from glue.finetune import main
-    elif args.task in ['LAMBADA', 'WIKITEXT103']:
-        from zeroshot_gpt.evaluate import main
-    elif args.task in ['ICT-ZEROSHOT-NQ', 'RETRIEVER-EVAL']:
-        from orqa.evaluate_orqa import main
-    elif args.task in ['RET-FINETUNE-NQ']:
-        from orqa.supervised.finetune import main
-    else:
-        raise NotImplementedError('Task {} is not implemented.'.format(
-            args.task))
-
-    main()
diff --git a/toolbox/Megatron-DeepSpeed/tasks/msdp/README.md b/toolbox/Megatron-DeepSpeed/tasks/msdp/README.md
deleted file mode 100644
index 27c8728eca146aea44c627a99d5f80184b6fbf84..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/msdp/README.md
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Multi-Stage Prompting for Knowledgeable Dialogue Generation
-
-Below we present the steps to run our multi-stage dialogue prompting (MSDP) framework.
-
-## Multi-Stage Dialogue Prompting
-
-### Data Preparation
-1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/)
-2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datatsets.
-
-### Stage-1: Prompting for Knowledge Generation
-1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation.
-2. We provide the [`evaluation script`](../../examples/msdp/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of the knowledge generation.
-
-### Stage-2: Prompting for Response Generation
-1. We provide the script to [`prepare the input file`](../../examples/msdp/prep_resp_gen.sh) for the response generation (based on the previously generated knowledge file).
-2. We provide the script to perform the [`second-stage prompting`](../../examples/msdp/prompt_resp_gen.sh) for the response generation.
-3.  We provide the [`evaluation script`](../../examples/msdp/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation.
diff --git a/toolbox/Megatron-DeepSpeed/tasks/msdp/evaluate.py b/toolbox/Megatron-DeepSpeed/tasks/msdp/evaluate.py
deleted file mode 100644
index 89593e056bcfa8529012fc0ca70216d0f75bfebb..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/msdp/evaluate.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Model evaluation"""
-
-from megatron_ds import get_args
-from megatron_ds import print_rank_0
-from tasks.msdp.metrics import F1Metric
-from tqdm import tqdm
-
-
-def evaluate_f1(guess_file, answer_file):
-    """Evaluating F1 Score"""
-
-    guess_list = []
-    print_rank_0('reading %s' % guess_file)
-    with open(guess_file, "r") as f:
-        for i, line in enumerate(tqdm(f)):
-            line = line.strip()
-            if "<|endoftext|>" in line:
-                line = line.replace("<|endoftext|>", "")
-            guess_list.append(line)
-
-    answer_list = []
-    print_rank_0('reading %s' % answer_file)
-    with open(answer_file, "r") as f:
-        for i, line in enumerate(tqdm(f)):
-            line = line.strip()
-            if line == "no_passages_used":
-                line = ""
-            answer_list.append(line)
-
-    assert len(guess_list) == len(answer_list), \
-        "lengths of guess and answer are different!"
-
-    precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list)
-    print_rank_0('Precision: %.4f; recall: %.4f; f1: %.4f' % (precision, recall, f1))
-
-    print_rank_0('done :-)')
-
-
-def main():
-    args = get_args()
-    
-    evaluate_f1(args.guess_file, args.answer_file)
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/msdp/main.py b/toolbox/Megatron-DeepSpeed/tasks/msdp/main.py
deleted file mode 100644
index 1b1586df2cb1b1c6079628e2dd266d1e5a614fb8..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/msdp/main.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Run multi-stage dialogue prompting (MSDP)."""
-
-import os
-import sys
-sys.path.append(os.path.abspath(os.path.join(
-    os.path.join(os.path.dirname(__file__), os.path.pardir), os.path.pardir)))
-from megatron_ds import get_args
-from megatron_ds.initialize import initialize_megatron
-
-
-def get_tasks_args(parser):
-    """Provide extra arguments required for tasks."""
-    group = parser.add_argument_group(title='tasks')
-
-    # parameters for the knowledgeable dialogue generation
-    group.add_argument('--task', type=str, required=True,
-                       help='Task name.')
-    group.add_argument("--sample-input-file", type=str, default=None,
-                       help='Get input from file instead of interactive mode, '
-                       'each line is an input.')
-    group.add_argument("--sample-output-file", type=str, default=None,
-                       help='Output file got from --sample-input-file')
-    group.add_argument('--prompt-file', type=str, default=None,
-                       help='prompting file')
-    group.add_argument('--prompt-type', type=str, default=None, 
-                       choices=['knowledge', 'response'],
-                       help='prompt type (knowledge or response)')
-    group.add_argument('--num-prompt-examples', type=int, default=10,
-                       help='number of prompt examples')
-    group.add_argument('--guess-file', type=str, default=None,
-                       help='datapath for generated sentences')
-    group.add_argument('--answer-file', type=str, default=None,
-                       help='datapath for golden sentences')
-    group.add_argument('--out-seq-length', type=int, default=100,
-                       help='output sequence length')
-    group.add_argument('--api-prompt', default=False, action="store_true",
-                       help='setup model api for prompting')
-    group.add_argument('--megatron-api-url', type=str, default=None,
-                       help='url of the megatron api')
-
-    return parser
-
-
-if __name__ == '__main__':
-
-    initialize_megatron(extra_args_provider=get_tasks_args)
-
-    args = get_args()
-
-    if args.num_layers_per_virtual_pipeline_stage is not None:
-        print("Interleaved pipeline schedule is not yet supported for downstream tasks.")
-        exit()
-
-    if args.task == 'MSDP-PROMPT':
-        from tasks.msdp.prompt import main
-
-    elif args.task == 'MSDP-EVAL-F1':
-        from tasks.msdp.evaluate import main
-
-    else:
-        raise NotImplementedError('Task {} is not implemented.'.format(
-            args.task))
-
-    main()
diff --git a/toolbox/Megatron-DeepSpeed/tasks/msdp/metrics.py b/toolbox/Megatron-DeepSpeed/tasks/msdp/metrics.py
deleted file mode 100644
index fc7ce5dd5ad986f4d985eb962629627ac2f834a3..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/msdp/metrics.py
+++ /dev/null
@@ -1,77 +0,0 @@
-
-# The following code is adapted from
-# https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/metrics.py, 
-# which is licensed under the MIT license. More details on the license can be 
-# found at https://github.com/facebookresearch/ParlAI/blob/master/LICENSE.
-
-"""Provides standard metric evaluations for dialog."""
-
-from collections import Counter
-from typing import List
-import numpy as np
-import re
-
-re_art = re.compile(r'\b(a|an|the)\b')
-re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']')
-
-
-def normalize_answer(s):
-    """
-    Lower text and remove punctuation, articles and extra whitespace.
-    """
-    s = s.lower()
-    s = re_punc.sub(' ', s)
-    s = re_art.sub(' ', s)
-    s = ' '.join(s.split())
-    return s
-
-
-class F1Metric:
-    """
-    Helper class which computes token-level F1.
-    """
-
-    @staticmethod
-    def _prec_recall_f1_score(pred_items, gold_items):
-        """
-        Compute precision, recall and f1 given a set of gold and prediction items.
-        :param pred_items: iterable of predicted values
-        :param gold_items: iterable of gold values
-        :return: tuple (p, r, f1) for precision, recall, f1
-        """
-        common = Counter(gold_items) & Counter(pred_items)
-        num_same = sum(common.values())
-        if num_same == 0:
-            return 0, 0, 0
-        precision = 1.0 * num_same / len(pred_items)
-        recall = 1.0 * num_same / len(gold_items)
-        f1 = (2 * precision * recall) / (precision + recall)
-        return precision, recall, f1
-
-    @staticmethod
-    def compute_each_pair(guess: str, answer: str):
-        if answer == "":
-            return None, None, None
-        if guess == "":
-            return 0, 0, 0
-        g_tokens = normalize_answer(guess).split()
-        a_tokens = normalize_answer(answer).split()
-
-        precision, recall, f1 = F1Metric._prec_recall_f1_score(g_tokens, a_tokens)
-        return precision, recall, f1
-        
-    @staticmethod
-    def compute_all_pairs(guesses: List[str], answers: List[str]):
-        # additional augment:
-        assert len(guesses) == len(answers)
-        
-        precision_list, recall_list, f1_list = [], [], []
-        for guess, answer in zip(guesses, answers):
-            precision, recall, f1 = F1Metric.compute_each_pair(guess, answer)
-            if precision is None or recall is None or f1 is None:
-                continue
-            precision_list.append(precision)
-            recall_list.append(recall)
-            f1_list.append(f1)
-        
-        return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list)
diff --git a/toolbox/Megatron-DeepSpeed/tasks/msdp/preprocessing.py b/toolbox/Megatron-DeepSpeed/tasks/msdp/preprocessing.py
deleted file mode 100644
index d904c9d0d51d32a3f05b0a62199f3db0403d281b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/msdp/preprocessing.py
+++ /dev/null
@@ -1,582 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Preprocessing for Wizard of Wikipedia and Wizard of Internet datasets"""
-
-import torch
-import argparse
-from nltk import word_tokenize
-from tqdm import tqdm
-import numpy as np
-import json
-
-def get_args():
-    parser = argparse.ArgumentParser(description="Preprocessing")
-
-    parser.add_argument("--func", type=str, default=None,
-                        help="choose to run which function")
-    parser.add_argument("--raw_file", type=str, default=None,
-                        help="path of the input file")
-    parser.add_argument("--processed_file", type=str, default=None,
-                        help="path of the output file")
-    parser.add_argument("--knwl_ref_file", type=str, default=None,
-                        help="path of the knowledge reference file")
-    parser.add_argument("--resp_ref_file", type=str, default=None,
-                        help="path of the knowledge reference file")
-    parser.add_argument("--knwl_gen_file", type=str, default=None,
-                        help="path of the generated knowledge file")
-    parser.add_argument("--test_file", type=str, default=None,
-                        help="path of the test file")
-    parser.add_argument("--train_file", type=str, default=None,
-                        help="path of the train file")
-    parser.add_argument("--model_file", type=str, default=None,
-                        help="path of the model file")
-    parser.add_argument("--data_type", type=str, default=None,
-                        help="data types, choose one out of three types: \
-                              wow_seen, wow_unseen, and woi")
-    parser.add_argument("--seed", type=int, default=1234,
-                        help="random seed")
-
-    args = parser.parse_args()
-    return args
-
-
-def process_wow_dataset(raw_file, processed_file, knwl_ref_file, resp_ref_file):
-    """
-      This is a function used for processing the wizard of wikipedia (wow) dataset
-      Expected processed format:
-      topic \t dialogue context \t golden knowledge \t golden response
-    """
-
-    # loading the raw data
-    print("> Loading data from %s" % raw_file)
-    with open(raw_file, "r") as fr:
-        dialog_data = json.load(fr)
-    
-    print("> Processing data ...")
-    fproc = open(processed_file, "w")
-    fknwl = open(knwl_ref_file, "w") if knwl_ref_file else None
-    fresp = open(resp_ref_file, "w") if resp_ref_file else None
-    
-    for i, sample in enumerate(tqdm(dialog_data)):
-        # get all the dialog data for a single dialog sample
-        dialog = sample["dialog"]
-        
-        turn_list = []  # collect the dialog history
-        # processing for each single dialog sample
-        for j, turn in enumerate(dialog):
-            # text of each turn
-            text = turn["text"]
-            if not (text.endswith("?") or text.endswith(".") or text.endswith("!")):
-                text = text + "."
-            
-            if j == 0:
-                # first turn
-                turn_list.append(text)
-                continue
-
-            speaker = turn["speaker"].lower()
-            if "wizard" in speaker:
-                checked_sentence = list(turn["checked_sentence"].values())  # knowledge
-                checked_passage = list(turn["checked_passage"].values())    # topic
-                
-                assert len(checked_sentence) <= 1
-
-                # get the ground truth knowledge
-                if len(checked_sentence) > 0:
-                    checked_sentence = checked_sentence[0]
-                else:
-                    checked_sentence = "no_passages_used"
-
-                if len(checked_passage) == 1:
-                    checked_passage = checked_passage[0]
-                else:
-                    checked_passage = "no_passages_used"
-
-                # get the topic
-                if checked_passage != "no_passages_used":
-                    topic = checked_passage
-                else:
-                    topic = sample["chosen_topic"]
-                
-                dialog_context = " [SEP] ".join(turn_list)
-                knowledge = checked_sentence
-                response = text
-                # add the response into the dialog history
-                turn_list.append(response)
-
-                # write to the output files
-                fproc.write(topic + "\t" + dialog_context + "\t" + \
-                                knowledge + "\t" + response + "\n")
-                
-                if fknwl:
-                    fknwl.write(knowledge + "\n")
-                if fresp:
-                    # tokenize for evaluation
-                    response = " ".join(word_tokenize(response))
-                    fresp.write(response + "\n")
-
-            else:
-                assert "apprentice" in speaker
-                turn_list.append(text)
-
-    fproc.close()
-    if fknwl:
-        fknwl.close()
-    if fresp:
-        fresp.close()
-
-
-def process_woi_dataset(raw_file, processed_file, knwl_ref_file, resp_ref_file):
-    """
-      This is a function used for processing the wizard of internet (woi) dataset
-      Expected processed format:
-      topic \t dialogue context \t golden knowledge \t golden response
-    """
-    
-    print("> Processing %s" % raw_file)
-    fproc = open(processed_file, "w")
-    fknwl = open(knwl_ref_file, "w") if knwl_ref_file else None
-    fresp = open(resp_ref_file, "w") if resp_ref_file else None
-    
-    with open(raw_file, "r") as fr:
-        for i, line in tqdm(enumerate(fr)):
-            # read line by line, each line uses json format
-            line = line.strip()
-            item_dict = json.loads(line)
-
-            # item_dict is a dictionary
-            # its key is the data id, and its value contains all the data content
-            item_dict = item_dict.values()
-            item_dict = list(item_dict)[0]  # len(item_dict) == 1
-            
-            # get the whole dialog data for a single dialog sample
-            dialog_data = item_dict['dialog_history']
-            length = len(dialog_data)
-            
-            turn_list = []  # collect the dialog history
-            search_text = ""
-            for i in range(length):
-                item = dialog_data[i]
-                action = item['action']
-
-                if action == "Wizard => SearchAgent":
-                    search_text = item['text']
-
-                elif action == "Wizard => Apprentice":
-                    if len(turn_list) == 0:
-                        # first turn
-                        turn = item['text']
-                        turn_list.append(turn)
-                        continue
-
-                    # get the relevant content
-                    contents = item["context"]["contents"]
-                    selects = item["context"]["selected_contents"]
-                    flag = selects[0][0]
-                    selects = selects[1:]
-                    assert len(selects) == len(contents)
-                    
-                    # get the topic
-                    if flag:
-                        # no knowledge sentence is used for the response
-                        topic = "no_topic"
-                        knwl_sent = "no_passages_used"
-                    else:
-                        # we consider the search text as the topic
-                        topic = search_text
-                        # get the knowledge sentence
-                        knwl_sent = ""
-                        for content, select in zip(contents, selects):
-                            content = content['content']
-                            assert len(content) == len(select)
-                            for c, s in zip(content, select):
-                                if s:
-                                    knwl_sent = c
-                                    break
-
-                    if knwl_sent == "":
-                        # no knowledge is used for the response
-                        topic = "no_topic"
-                        knwl_sent = "no_passages_used"
-
-                    # get dialogue context, knowledge, and response 
-                    dialog_context = " [SEP] ".join(turn_list)
-                    response = item['text']
-
-                    # processing
-                    topic = topic.replace("\n", "").replace("\r", \
-                                "").replace("\t", "")
-                    dialog_context = dialog_context.replace("\n", "").replace("\r", \
-                                "").replace("\t", "")
-                    knwl_sent = knwl_sent.replace("\n", "").replace("\r", \
-                                "").replace("\t", "")
-                    response = response.replace("\n", "").replace("\r", \
-                                "").replace("\t", "")
-                    
-                    if topic != "no_topic":
-                        # write to the ouput files
-                        fproc.write(topic + "\t" + dialog_context + "\t" + \
-                                        knwl_sent + "\t" + response + "\n")
-                        if fknwl:
-                            fknwl.write(knwl_sent + "\n")
-                        if fresp:
-                            # tokenize for evaluation
-                            response = " ".join(word_tokenize(response))
-                            fresp.write(response + "\n")
-
-                    turn_list.append(response)
-
-                elif action == "Apprentice => Wizard":
-                    turn = item['text']
-                    turn_list.append(turn)
-
-                else:
-                    assert action == "SearchAgent => Wizard", \
-                            "Please check whether you have used the correct data!"
-
-    fproc.close()
-    if fknwl:
-        fknwl.close()
-    if fresp:
-        fresp.close()
-
-
-def get_database(test_datapath, train_datapath, data_type):
-    """Get the database by topics"""
-
-    assert data_type in ["wow_seen", "wow_unseen", "woi"], \
-                "Please input a correct data type!!"
-
-    # get test data topic dictionary
-    print("> reading test data from %s" % test_datapath)
-    test_topics = {}
-    with open(test_datapath, "r") as f:
-        for i, line in enumerate(f):
-            line = line.strip()
-            splits = line.split("\t")
-            topic = splits[0]
-            test_topics[topic] = True
-
-    print("> reading data from %s" % train_datapath)
-    train_data_by_topic = {}
-    dialog_data_by_topic = {}
-    dialog_examples = []
-    with open(train_datapath, "r") as f:
-        for i, line in enumerate(f):
-            line = line.strip()
-            splits = line.split("\t")
-            topic = splits[0]
-            turns = splits[1].split(" [SEP] ")[-3:]
-            knowledge = splits[2]
-            response = splits[3]
-            # filtering data samples
-            if knowledge == "no_passages_used":
-                # when no knowledge is used
-                continue
-            if data_type != "wow_seen" and ("(" in knowledge or ")" in knowledge):
-                # when bracket exists in the knowledge
-                continue
-            if data_type != "wow_seen" and topic not in knowledge:
-                # when topic does not exist in the knowledge
-                continue
-
-            # get the instance
-            last_turn = turns[-1]
-            instance = "( " + last_turn + " ) " + topic + " => " + knowledge
-            
-            # construct dialog example
-            dialog_example = ""
-            if data_type != "wow_seen":
-                dialog_example += "( " + topic + " ) "
-            for i, turn in enumerate(turns):
-                if i != 0:
-                    dialog_example += " "
-                dialog_example += turn
-            
-            # check overlaps
-            if topic in test_topics:
-                if topic not in train_data_by_topic:
-                    train_data_by_topic[topic] = [instance]
-                else:
-                    train_data_by_topic[topic].append(instance)
-                
-                if topic not in dialog_data_by_topic:
-                    dialog_data_by_topic[topic] = [dialog_example]
-                else:
-                    dialog_data_by_topic[topic].append(dialog_example)
-            
-            else:
-                # filtering data samples
-                if len(knowledge.split()) > 20:
-                    # knowledge is too long
-                    continue
-                if knowledge.startswith("It") or knowledge.startswith("it") or \
-                   knowledge.startswith("This") or knowledge.startswith("this"):
-                    continue
-                
-            # append all the data into dialogue examples list
-            dialog_examples.append((topic, dialog_example, instance))
-
-    return train_data_by_topic, dialog_data_by_topic, dialog_examples
-
-
-emb_dict = {}
-def select_prompts_based_on_similarity(
-        query, dialog_list, prompt_list, topic, tokenizer, encoder, topk):
-    """Select samples based on the similarity"""
-
-    with torch.no_grad():
-        # get the query embeddings
-        query_ids = tokenizer.encode(query)
-        query_ids = torch.LongTensor([query_ids]).cuda()
-        query_emb = encoder(input_ids=query_ids).pooler_output
-        query_emb = query_emb[0]
-        
-        # calculate embeddings for the samples in the database
-        if topic in emb_dict:
-            example_embeddings = emb_dict[topic]
-            example_embeddings = example_embeddings.cuda()
-        else:
-            for idx, example in enumerate(dialog_list):
-                example_ids = tokenizer.encode(example)
-                example_ids = torch.LongTensor([example_ids]).cuda()
-                example_emb = encoder(input_ids=example_ids).pooler_output
-                if idx == 0:
-                    example_embeddings = example_emb
-                else:
-                    example_embeddings = torch.cat(
-                        (example_embeddings, example_emb), dim=0)
-            emb_dict[topic] = example_embeddings.cpu()
-
-        # compare the similarity and select the topk samples
-        similarity_list = example_embeddings.matmul(query_emb)
-        _, indices = torch.topk(similarity_list, k=topk)
-    
-    indices = indices.tolist()
-    indices = indices[::-1] # reverse the order
-    selected_prompts = []
-    for index in indices:
-        # index = index.item()
-        selected_prompts.append(prompt_list[index])
-
-    return selected_prompts
-
-
-def prompt_selection_for_knowledge_generation(
-        test_datapath, train_datapath, model_path, output_prompt_path, data_type):
-    """Selecting prompts for the knowledge generation"""
-
-    print("> Selecting prompts for the knowledge generation")
-
-    train_data_by_topic, dialog_data_by_topic, dialog_examples = \
-                            get_database(test_datapath, train_datapath, data_type)
-    
-    from transformers import DPRQuestionEncoderTokenizer
-    print("> loading tokenizer and encoder")
-    tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
-                    'facebook/dpr-question_encoder-single-nq-base')
-    encoder = torch.load(model_path).cuda()
-
-    print("> getting dialog embeddings")
-    with torch.no_grad():
-        for idx, example in tqdm(enumerate(dialog_examples)):
-            dialog = example[1]
-            dialog_ids = tokenizer.encode(dialog)
-            dialog_ids = torch.LongTensor([dialog_ids]).cuda()
-            dialog_emb = encoder(input_ids=dialog_ids).pooler_output
-
-            if idx == 0:
-                dialog_embeddings = dialog_emb
-            else:
-                dialog_embeddings = torch.cat((dialog_embeddings, dialog_emb), dim=0)
-
-    print("> reading test data from %s" % test_datapath)
-    prompt_list_for_each_sample = []
-    with open(test_datapath, "r") as f:
-        for i, line in tqdm(enumerate(f)):
-            line = line.strip()
-
-            splits = line.split("\t")
-            topic = splits[0]
-            turns = splits[1].split(" [SEP] ")[-3:]
-
-            # get the query sentence
-            query_sent = ""
-            if data_type != "seen":
-                query_sent += "( " + topic + " ) "
-            for i, turn in enumerate(turns):
-                if i != 0:
-                    query_sent += " "
-                query_sent += turn
-
-            if topic not in train_data_by_topic:
-                # get the query embedding
-                query_ids = tokenizer.encode(query_sent)
-                query_ids = torch.LongTensor([query_ids]).cuda()
-                query_emb = encoder(input_ids=query_ids).pooler_output
-                query_emb = query_emb[0]
-
-                # calculate the similarity
-                similarity_list = dialog_embeddings.matmul(query_emb)
-                _, indices = torch.sort(similarity_list)
-                indices = indices.tolist()
-                selected_topics = {}
-                selected_prompts = []
-                num_prompt = 0
-                for index in indices:
-                    example = dialog_examples[index]
-                    topic_temp = example[0]
-                    if topic_temp not in selected_topics:
-                        selected_topics[topic_temp] = True
-                        selected_prompts.append(example[2])
-                        num_prompt += 1
-                        if num_prompt == 10:
-                            break
-                
-                # get the selected samples
-                example_list = selected_prompts[::-1]
-                key = topic + " " + turns[-1]
-                prompt_list_for_each_sample.append({key: example_list})
-
-            else:
-                num_data_sample = min(len(train_data_by_topic[topic]), 10)
-                total_example_list = train_data_by_topic[topic]
-                
-                dialog_list = dialog_data_by_topic[topic]
-                assert len(dialog_list) == len(train_data_by_topic[topic])
-
-                # calculate the similarity
-                example_list = select_prompts_based_on_similarity(
-                                query_sent, dialog_list, total_example_list, 
-                                topic, tokenizer, encoder, topk=num_data_sample)
-                
-                key = topic + " " + turns[-1]
-                prompt_list_for_each_sample.append({key: example_list})
-
-    print("writing to %s" % output_prompt_path)
-    with open(output_prompt_path, "w") as f:
-        for instance in tqdm(prompt_list_for_each_sample):
-            json.dump(instance, f)
-            f.write("\n")
-
-
-def prompt_selection_for_response_generation(input_path, output_path, seed):
-    """Selecting prompts for the response generation"""
-
-    print("> Selecting prompts for the response generation")
-    print("> set random seed")
-    np.random.seed(seed)
-
-    prompt_example_list = []
-    print("> reading data from %s" % input_path)
-    with open(input_path, "r") as f:
-        for i, line in tqdm(enumerate(f)):
-            line = line.strip()
-            splits = line.split("\t")
-
-            # get the topic, context, knowledge and response
-            topic = splits[0]
-            dialog_context = splits[1]
-            knowledge = splits[2]
-            response = splits[3]
-            turns = dialog_context.split(" [SEP] ")[-3:]
-            if knowledge == "no_passages_used":
-                continue
-
-            # calculate the overlap ratio
-            from nltk import word_tokenize
-            knowledge_sent_token_list = word_tokenize(knowledge)
-            knowledge_sent_token_dict = {token: True for token in knowledge_sent_token_list}
-            knowledge_len = len(knowledge_sent_token_list)
-            response_token_list = word_tokenize(response)
-            response_len = len(response_token_list)
-            num_overlap_token = 0
-            accumulator = 0
-            for token in response_token_list:
-                if token in knowledge_sent_token_dict:
-                    accumulator += 1
-                else:
-                    if accumulator >= 10:
-                        num_overlap_token += accumulator
-                    accumulator = 0
-            if accumulator >= 10:
-                num_overlap_token += accumulator
-            
-            # filtering the data based on the ratio
-            if num_overlap_token > response_len * 0.9 or num_overlap_token < response_len * 0.6:
-                continue
-            if num_overlap_token < knowledge_len * 0.8:
-                continue
-            
-            last_turn = " ".join(word_tokenize(turns[-1]))
-            knowledge = " ".join(word_tokenize(knowledge))
-            response = " ".join(word_tokenize(response))
-            prompt_example = ""
-            # add dialog context
-            prompt_example += "Topic: " + topic + ". "
-            prompt_example += "User says: " + last_turn + " "
-            prompt_example += "We know that: " + knowledge + " "
-            prompt_example += "System replies: " + response
-            
-            prompt_example_list.append(prompt_example)
-        
-    # shuffle the prompt examples
-    np.random.shuffle(prompt_example_list)
-    
-    print("> writing to %s" % output_path)
-    with open(output_path, "w") as f:
-        # f.write("Generate the System's response based on the knowledge sentence:\n")
-        for i in tqdm(range(20)):
-            example = prompt_example_list[i]
-            f.write(example + "\n")
-
-
-def prepare_input_for_response_generation(test_file, knwl_gen_file, processed_file):
-    """Preparing inputs for the response generation"""
-
-    print("> Reading knowledge file from %s" % knwl_gen_file)
-    # get the knowledge list
-    with open(knwl_gen_file, "r") as f:
-        knowledge_list = f.readlines()
-    
-    print("> Processing ...")
-    with open(test_file, "r") as fr:
-        with open(processed_file, "w") as fw:
-            for line_num, line in enumerate(tqdm(fr)):
-                line = line.strip()
-                splits = line.split("\t")
-                # prepare topic, context, knowledge and response
-                topic = splits[0]
-                dialog_context = splits[1]
-                response = splits[3]
-                knowledge = knowledge_list[line_num]
-                knowledge = knowledge.strip()
-                if "<|endoftext|>" in knowledge:
-                    knowledge = knowledge.replace("<|endoftext|>", "")
-
-                # write to the output file
-                fw.write(topic + "\t" + dialog_context + "\t" \
-                                     + knowledge + "\t" + response + "\n")
-
-
-if __name__ == "__main__":
-
-    args = get_args()
-    if args.func == "process_wow_dataset":
-        process_wow_dataset(args.raw_file, args.processed_file, args.knwl_ref_file, args.resp_ref_file)
-
-    elif args.func == "process_woi_dataset":
-        process_woi_dataset(args.raw_file, args.processed_file, args.knwl_ref_file, args.resp_ref_file)
-
-    elif args.func == "get_knwl_gen_prompts":
-        prompt_selection_for_knowledge_generation(
-            args.test_file, args.train_file, args.model_file, 
-            args.processed_file, args.data_type)
-    
-    elif args.func == "get_resp_gen_prompts":
-        prompt_selection_for_response_generation(
-            args.train_file, args.processed_file, args.seed)
-
-    elif args.func == "prepare_input":
-        prepare_input_for_response_generation(
-            args.test_file, args.knwl_gen_file, args.processed_file)
diff --git a/toolbox/Megatron-DeepSpeed/tasks/msdp/prompt.py b/toolbox/Megatron-DeepSpeed/tasks/msdp/prompt.py
deleted file mode 100644
index 3f9a5bb02cb97b9c3ed256fc428ddf793018769a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/msdp/prompt.py
+++ /dev/null
@@ -1,313 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Prompting the pretrained language model to generate knowledge/response"""
-
-import json
-import torch
-import requests
-from nltk import word_tokenize
-from megatron_ds import get_args
-from megatron_ds import print_rank_0
-from megatron_ds import get_tokenizer
-from megatron_ds.core import mpu
-from megatron_ds.model import GPTModel
-from megatron_ds.training import get_model
-from megatron_ds.arguments import core_transformer_config_from_args
-from megatron_ds.checkpointing import load_checkpoint
-from megatron_ds.initialize import initialize_megatron
-from megatron_ds.text_generation import generate_and_post_process
-
-
-def call_model_api(inputs, tokens_to_generate):
-    """Calling the model api to get the output generations"""
-
-    args = get_args()
-
-    # The following is an example of using the Megatron API
-    # You can also implement your own API function to place this part
-    headers = {'Content-Type': 'application/json; charset=UTF-8'}
-    data = {"prompts": [inputs], "tokens_to_generate": tokens_to_generate, "top_k": 1}
-    data_json = json.dumps(data)
-    outputs = requests.put(args.megatron_api_url, headers=headers, data=data_json).json()["text"][0]
-
-    input_len = len(inputs)
-    outputs = outputs[input_len:]
-    outputs = outputs.split("\n")[0].strip()
-
-    return outputs
-
-
-def read_prompts(prompt_path, prompt_type, n_example):
-    """Read prompt data"""
-
-    if prompt_type == "knowledge":
-        # prompts for the knowledge generation
-        prompt_examples_dict = {}
-        # read prompt_path
-        with open(prompt_path, "r") as f:
-            for i, line in enumerate(f):
-                line = line.strip()
-                line_dict = json.loads(line)
-                key = list(line_dict.keys())[0]
-
-                if key not in prompt_examples_dict:
-                    prompt_examples = line_dict[key]
-                    prompt = ""
-                    for instance in prompt_examples:
-                        instance = instance.strip()
-                        prompt += instance + " \n"
-                    prompt_examples_dict[key] = prompt
-
-        return prompt_examples_dict
-
-    else:
-        # prompts for the response generation
-        # read prompt_path
-        prompt = ""
-        with open(prompt_path, "r") as f:
-            prompt_examples = f.readlines()
-            prompt_examples = prompt_examples[:n_example]
-            for instance in prompt_examples:
-                instance = instance.strip()
-                prompt += instance + " \n"
-
-        return prompt
-
-
-def generate_samples_by_calling_api():
-    """ Generate outputs by calling"""
-    args = get_args()
-    assert args.prompt_type in ["knowledge", "response"], \
-                "Please input a correct prompt type!"
-
-    if args.prompt_type == "knowledge":
-        # read knowledge generation prompts
-        knwl_gen_prompt_dict = read_prompts(
-            args.prompt_file, args.prompt_type, args.num_prompt_examples)
-
-    else:
-        resp_gen_prompt = read_prompts(
-            args.prompt_file, args.prompt_type, args.num_prompt_examples)
-
-    # read the test data
-    fname = open(args.sample_input_file, "r")
-    test_sample_list = fname.readlines()
-    # create output file
-    fname_out = open(args.sample_output_file, "w")
-
-    # call the api to get the output generations
-    for test_sample in test_sample_list:
-        test_sample = test_sample.strip()
-        splits = test_sample.split("\t")
-        topic = splits[0]
-
-        # prepare the inputs for the api
-        if args.prompt_type == "knowledge":
-            ## inputs = prompt + current test
-            # get the prompt
-            turns = splits[1].split(" [SEP] ")
-            last_turn = turns[-1]
-            key = topic + " " + last_turn
-            inputs = knwl_gen_prompt_dict[key]
-
-            # add current test
-            inputs += "( " + last_turn + " ) " + topic + " =>"
-
-        else:
-            # inputs = prompt + current test
-            # get the prompt
-            inputs = resp_gen_prompt
-
-            # add current test
-            turns = splits[1].split(" [SEP] ")
-            knowledge = splits[2]
-            last_turn = turns[-1]
-            last_turn = " ".join(word_tokenize(last_turn))
-            knowledge = " ".join(word_tokenize(knowledge))
-            knowledge = knowledge.strip()
-            last_turn = last_turn.strip()
-            inputs += "Topic: " + topic + ". "
-            inputs += "User says: " + last_turn + " "
-            inputs += "We know that: " + knowledge + " "
-            inputs += "System replies:"
-
-        # get the output generations from the api,
-        # and write to the output file
-        generations = call_model_api(inputs, args.out_seq_length)
-        fname_out.write(generations)
-        fname_out.write("\n")
-
-    fname.close()
-    fname_out.close()
-
-
-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
-
-    config = core_transformer_config_from_args(get_args())
-
-    print_rank_0('building GPT model ...')
-    model = GPTModel(
-        config=config,
-        num_tokentypes=0,
-        parallel_output=True,
-        pre_process=pre_process,
-        post_process=post_process
-    )
-    return model
-
-
-def generate_samples_by_prompting_input_from_file(model):
-    """Prompt a pretrained language model to generate knowledge/response"""
-
-    # get tokenizer
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    # Read the sample file and open the output file.
-    assert args.sample_input_file is not None, \
-        'sample input file is not provided.'
-    if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
-        fname = open(args.sample_input_file, "r")
-        all_raw_text = fname.readlines()
-        input_count = len(all_raw_text)
-        if args.sample_output_file is None:
-            sample_output_file = args.sample_input_file + ".out"
-            print('`sample-output-file` not specified, setting '
-                    'it to {}'.format(sample_output_file))
-        else:
-            sample_output_file = args.sample_output_file
-
-        fname_out = open(sample_output_file, "w")
-
-    # only two prompt types (i.e., knowledge and response) are allowed
-    assert args.prompt_type in ["knowledge", "response"], \
-                "Please input a correct prompt type!"
-
-    # Read the prompt file
-    if args.prompt_type == "knowledge":
-        # read the prompts for the knowledge generation
-        prompt_examples_dict = {}
-        with open(args.prompt_file, "r") as f:
-            for i, line in enumerate(f):
-                line = line.strip()
-                line_dict = json.loads(line)
-                key = list(line_dict.keys())[0]
-
-                # get the prompt examples based on the key
-                if key not in prompt_examples_dict:
-                    prompt_examples = line_dict[key]
-                    prompt = ""
-                    for instance in prompt_examples:
-                        instance = instance.strip()
-                        prompt += instance + " \n"
-                    prompt_examples_dict[key] = prompt
-
-    else:
-        # read the prompts for the response generation
-        # prompts are fixed for all test samples
-        with open(args.prompt_file, "r") as f:
-            prompt_examples = f.readlines()
-            prompt_examples = prompt_examples[:args.num_prompt_examples]
-
-            prompt = ""
-            for instance in prompt_examples:
-                instance = instance.strip()
-                prompt += instance + " \n"
-
-    input_pos = 0
-    model.eval()
-    # perform prompting
-    with torch.no_grad():
-        while True:
-            raw_text_len = 0
-            if mpu.is_pipeline_first_stage() \
-               and mpu.get_tensor_model_parallel_rank() == 0:
-                input_str = all_raw_text[input_pos]
-                input_str = input_str.strip()
-                splits = input_str.split("\t")
-                topic = splits[0]
-
-                if args.prompt_type == "knowledge":
-                    # first add the prompt into the raw_text
-                    turns = splits[1].split(" [SEP] ")
-                    last_turn = turns[-1]
-                    key = topic + " " + last_turn
-                    raw_text = prompt_examples_dict[key]
-
-                    # construct inputs for knowledge generation
-                    # then add the constructed inputs into the raw_text
-                    raw_text += "( " + last_turn + " ) " + topic + " =>"
-
-                else:
-                    # first add the prompt into the raw_text
-                    raw_text = prompt
-
-                    # construct inputs for response generation
-                    # then add the constructed inputs into the raw_text
-                    turns = splits[1].split(" [SEP] ")
-                    knowledge = splits[2]
-                    last_turn = turns[-1]
-                    last_turn = " ".join(word_tokenize(last_turn))
-                    knowledge = " ".join(word_tokenize(knowledge))
-                    knowledge = knowledge.strip()
-                    last_turn = last_turn.strip()
-                    raw_text += "Topic: " + topic + ". "
-                    raw_text += "User says: " + last_turn + " "
-                    raw_text += "We know that: " + knowledge + " "
-                    raw_text += "System replies:"
-
-                input_pos += 1
-                raw_text_len = len(raw_text)
-
-            else:
-                raw_text = "EMPTY TEXT"
-
-            if input_pos % 100 == 0:
-                print_rank_0("input_pos: %d" % input_pos)
-
-            outputs = generate_and_post_process(
-                        model=model,
-                        prompts=[raw_text],
-                        tokens_to_generate=args.out_seq_length,
-                        top_k_sampling=1)
-            prompts_plus_generations = outputs[0]
-            prompts_plus_generations = prompts_plus_generations[0]
-
-            # write the generated output to the output file
-            if mpu.get_tensor_model_parallel_rank() == 0:
-                if mpu.is_pipeline_first_stage():
-
-                    generations = prompts_plus_generations[raw_text_len:]
-                    generations = generations.split("\n")[0]
-                    generations = generations.strip()
-                    fname_out.write(generations)
-                    fname_out.write("\n")
-
-            raw_text = None
-            if input_pos == input_count:
-                return
-
-
-def main():
-
-    args = get_args()
-    if args.api_prompt:
-        # obtain the generations by calling the api
-        generate_samples_by_calling_api()
-        return
-
-    if args.num_layers_per_virtual_pipeline_stage is not None:
-        print("Interleaved pipeline schedule is not yet supported for text generation.")
-        exit()
-
-    # Set up model and load checkpoint.
-    model = get_model(model_provider, wrap_with_ddp=False)
-    if args.load is not None:
-        _ = load_checkpoint(model, None, None)
-
-    assert len(model) == 1, "Above condition should have caught this"
-    model = model[0]
-
-    # perform the prompting
-    generate_samples_by_prompting_input_from_file(model)
diff --git a/toolbox/Megatron-DeepSpeed/tasks/orqa/README.md b/toolbox/Megatron-DeepSpeed/tasks/orqa/README.md
deleted file mode 100644
index a8e8f8e6fabcca14aacc3776a062f753b1253d27..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/orqa/README.md
+++ /dev/null
@@ -1,36 +0,0 @@
-## End-to-End Training of Neural Retrievers for Open-Domain Question Answering
-
-Below we present the steps to run unsupervised and supervised trainining and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408).
-
-## Retriever Training
-
-#### Unsupervised pretraining
-1. Use `tools/preprocess_data.py` to preprocess the dataset for Inverse Cloze Task (ICT), which we call unsupervised pretraining. This script takes as input a corpus in loose JSON format and creates fixed-size blocks of text as the fundamental units of data. For a corpus like Wikipedia, this will mean multiple sentences per block and multiple blocks per document. Run [`tools/preprocess_data.py`](../../tools/preprocess_data.py) to construct one or more indexed datasets with the `--split-sentences` argument to make sentences the basic unit. We construct two datasets, one with the title of every document and another with the body.
-
-<pre>
-python tools/preprocess_data.py \
-    --input /path/to/corpus.json \
-    --json-keys text title \
-    --split-sentences \
-    --tokenizer-type BertWordPieceLowerCase \
-    --vocab-file /path/to/vocab.txt \
-    --output-prefix corpus_indexed \
-    --workers 10
-</pre>
-
-2. The [`examples/pretrain_ict.sh`](../../examples/pretrain_ict.sh) script runs a single GPU 217M parameter biencoder model for ICT retriever training. Single GPU training is primarily intended for debugging purposes, as the code is developed for distributed training. The script uses a pretrained BERT model and we use a total of batch size of 4096 for the ICT training.
-
-3. Evaluate the pretrained ICT model using [`examples/evaluate_retriever_nq.sh`](../../examples/evaluate_retriever_nq.sh) for [Google's Natural Questions Open dataset](https://arxiv.org/pdf/1906.00300.pdf).
-
-#### Supervised finetuning
-
-1. Use the above pretrained ICT model to finetune using [Google's Natural Questions Open dataset](https://github.com/google-research/language/tree/master/language/orqa). The script [`examples/finetune_retriever_distributed.sh`](../../examples/finetune_retriever_distributed.sh) provides an example for how to perform the training. Our finetuning process includes retriever score scaling and longer training (80 epochs) on top [DPR training](https://arxiv.org/abs/2004.04906).
-
-2. Evaluate the finetuned model using the same evaluation script as mentioned above for the unsupervised model.
-
-More details on the retriever are available in [our paper](https://arxiv.org/abs/2101.00408).
-
-## Reader Training
-
-The reader component will be available soon.
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/orqa/evaluate_orqa.py b/toolbox/Megatron-DeepSpeed/tasks/orqa/evaluate_orqa.py
deleted file mode 100644
index cde7c73d116642ddc17727967b8bdcaa6b7896cb..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/orqa/evaluate_orqa.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Main tasks functionality."""
-
-from megatron_ds import get_args, print_rank_0
-from megatron_ds.indexer import IndexBuilder
-from tasks.orqa.evaluate_utils import ORQAEvaluator
-
-def main():
-    """
-    Main program
-    """
-
-    args = get_args()
-
-    """
-    Create a BlockData data structure by running an IndexBuilder over an
-    ICT Dataset and then evaluate on NQ task
-    """
-
-    print_rank_0("Starting index builder!")
-
-    index_builder = IndexBuilder()
-    index_builder.build_and_save_index()
-    print_rank_0("Build and save indices: done!")
-
-
-    print_rank_0("Starting evaluations!")
-
-    # Set up the model and evaluator
-    evaluator = ORQAEvaluator()
-
-    # Run evaluation
-    if args.qa_data_dev is not None:
-        evaluator.evaluate(args.qa_data_dev, "DEV")
-
-    if args.qa_data_test is not None:
-        evaluator.evaluate(args.qa_data_test, "TEST")
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/orqa/evaluate_utils.py b/toolbox/Megatron-DeepSpeed/tasks/orqa/evaluate_utils.py
deleted file mode 100644
index 5eb8ebc961718afd7dfe50ed1eaec7fb60c000d9..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/orqa/evaluate_utils.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import torch
-
-from megatron_ds import get_args, print_rank_0
-from megatron_ds.checkpointing import load_biencoder_checkpoint
-from megatron_ds.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset
-from megatron_ds.data.realm_index import OpenRetreivalDataStore, FaissMIPSIndex
-from megatron_ds.model.biencoder_model import get_model_provider
-from megatron_ds.training import get_model
-from deepspeed.accelerator import get_accelerator
-from tasks.orqa.unsupervised.nq import get_nq_dataset
-from tasks.orqa.unsupervised.nq import get_one_epoch_nq_dataloader
-from tasks.orqa.unsupervised.nq import process_nq_batch
-from tasks.orqa.unsupervised.qa_utils import calculate_matches
-
-
-class ORQAEvaluator(object):
-    def __init__(self):
-        args = get_args()
-        self.embedding_size = args.hidden_size
-        self.faiss_use_gpu = args.faiss_use_gpu
-        self.evidence_embedder_obj = None
-        self.evidence_dataset = None
-        self.mips_index = None
-        self.eval_dataset = None
-
-        # Get Evidence (Wikipedia) dataset
-        self.get_evidence_dataset()
-
-        # Load query encoder checkpoint
-        only_query_model = True
-        if args.biencoder_shared_query_context_model:
-            only_query_model = False
-
-        model = get_model(get_model_provider(only_query_model=only_query_model,
-            biencoder_shared_query_context_model=args.biencoder_shared_query_context_model))
-
-        self.model = load_biencoder_checkpoint(model,
-                only_query_model=only_query_model)
-
-        assert len(self.model) == 1
-        self.model[0].eval()
-
-        # Load faiss indexer
-        self.faiss_wrapper()
-
-    def get_evidence_embedding(self):
-        # This will load the embedding from the embedding path
-        self.evidence_embedder_obj = OpenRetreivalDataStore(load_from_path=True)
-
-    def get_evidence_dataset(self):
-        self.evidence_dataset = get_open_retrieval_wiki_dataset()
-
-    def faiss_wrapper(self):
-        # Initialize FAISS wrapper on local rank = 0 as the evidence embeddings
-        # is distributed over all the GPUs in a node and FAISS is not 
-        # thread-safe
-        args = get_args()
-        if args.local_rank == 0:
-            # Get evidence embeddings computed using context encoder
-            self.get_evidence_embedding()
-
-            assert self.evidence_embedder_obj is not None
-            self.mips_index = FaissMIPSIndex(embed_size=self.embedding_size,
-                                        embed_data=self.evidence_embedder_obj,
-                                        use_gpu=self.faiss_use_gpu)
-
-        # Wait for the FAISS index to be initialized in all the nodes
-        torch.distributed.barrier()
-
-    def generate_query_vectors(self, qa_data, split):
-
-        self.eval_dataset = get_nq_dataset(qa_data, split)
-        dataloader = get_one_epoch_nq_dataloader(self.eval_dataset)
-
-        query_vectors = []
-        reference_list = []
-
-        for batch in dataloader:
-            # batch also has query_tokens and query_pad_data
-            query_tokens, query_mask, query_types, \
-                query_len, reference = process_nq_batch(batch)
-
-            assert len(self.model) == 1
-            unwrapped_model = self.model[0]
-            while not hasattr(unwrapped_model, 'embed_text'):
-                unwrapped_model = unwrapped_model.module
-
-            with torch.no_grad():
-                query_logits = unwrapped_model.embed_text(
-                    unwrapped_model.query_model, query_tokens, 
-                    query_mask, query_types)
-
-            reference_list.extend(reference)
-            query_vectors.extend(query_logits.split(1, dim=0))
-            if len(query_vectors) % 100 == 0:
-                print_rank_0('Encoded queries {}'.format(len(query_vectors)))
-
-        query_tensor = torch.cat(query_vectors, dim=0)
-        print_rank_0('Total encoded queries tensor {}'.format(query_tensor.size()))
-
-        assert query_tensor.size(0) == len(self.eval_dataset)
-        return query_tensor, reference_list
-
-    def evaluate(self, qa_data, split):
-        args = get_args()
-        query_tensor, reference_list = self.generate_query_vectors(qa_data, \
-                                                                    split)
-        local_rank = args.local_rank
-        rank = torch.distributed.get_rank()
-        device_count = get_accelerator().device_count()
-        num_nodes = torch.distributed.get_world_size() // device_count
-        node_id = rank // device_count
-
-        for node in range(num_nodes):
-            start_rank = node * device_count
-            end_rank = (node + 1) * device_count
-            ranks_list = list(range(start_rank, end_rank))
-            node_group = torch.distributed.new_group(ranks=ranks_list)
-
-            if node_id == node:
-                device_start_rank = start_rank
-                group = node_group
-        
-        input_ = torch.empty_like(query_tensor).copy_(query_tensor).detach_()
-        tensor_list = [torch.empty_like(input_) for _ in range(device_count)]
-        torch.distributed.all_gather(tensor_list, query_tensor, group=group)
-
-        if local_rank == 0 and self.mips_index is not None:
-            all_query_tensor = torch.cat(tensor_list, dim=0).contiguous()
-
-            distance, topkindex = self.mips_index.search_mips_index(
-                all_query_tensor, top_k=args.faiss_topk_retrievals, 
-                reconstruct=False)
-            distance = torch.from_numpy(distance).to(get_accelerator().device_name())
-            topkindex = torch.LongTensor(topkindex).to(get_accelerator().device_name())
-
-        if local_rank != 0:
-            distance = torch.empty(device_count * len(query_tensor), \
-                args.faiss_topk_retrievals, dtype=torch.float32).to(get_accelerator().device_name())
-            topkindex = torch.empty(device_count * len(query_tensor), \
-                args.faiss_topk_retrievals, dtype=torch.int64).to(get_accelerator().device_name())
-
-        torch.distributed.broadcast(distance, src=device_start_rank, \
-            group=group)
-        torch.distributed.broadcast(topkindex, src=device_start_rank, \
-            group=group)
-
-        distance = torch.split(distance, len(query_tensor), dim=0)\
-            [local_rank]
-        topkindex = torch.split(topkindex, len(query_tensor), dim=0)\
-            [local_rank]
-
-        top_ids_and_scores = []
-        for darray, topkarray in zip(distance, topkindex):
-            top_ids_and_scores.append((topkarray.tolist(), darray.tolist()))
-
-        passages = self.evidence_dataset.id2text
-        match_stats = calculate_matches(passages,
-                                        reference_list,
-                                        top_ids_and_scores,
-                                        workers_num=args.num_workers,
-                                        match_type=args.faiss_match)
-        top_k_hits = match_stats.top_k_hits
-
-        print_rank_0("{} SET RESULTS".format(split))
-        print_rank_0("topk-{} documents hits {}".format(
-            args.faiss_topk_retrievals, top_k_hits))
-        top_k_hits = [v / len(top_ids_and_scores) for v in top_k_hits]
-        print_rank_0("top-k documents hits accuracy {}".format(top_k_hits))
-
-        for i in args.retriever_report_topk_accuracies:
-            print_rank_0("top-{}: {:.2f}".format(i, top_k_hits[i-1] * 100))
-
-        return
diff --git a/toolbox/Megatron-DeepSpeed/tasks/orqa/supervised/data.py b/toolbox/Megatron-DeepSpeed/tasks/orqa/supervised/data.py
deleted file mode 100644
index d96f0ef9d46c29bab98ad11e11e0ed9a52504d99..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/orqa/supervised/data.py
+++ /dev/null
@@ -1,287 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""ORQA dataset."""
-
-import json
-import random
-from abc import ABC
-from abc import abstractmethod
-
-import numpy as np
-from torch.utils.data import Dataset
-
-from megatron_ds import print_rank_0, get_args
-from megatron_ds.data.biencoder_dataset_utils import make_attention_mask
-
-def build_token_types_from_context_list(ctx_list, tokenizer, max_seq_length):
-    ctx_id_list, ctx_types_list = [], []
-    for context in ctx_list:
-        title_ids = tokenizer.tokenize(context['title'])
-        ctx_ids = tokenizer.tokenize(context['text'])
-        ctx_ids = title_ids + [tokenizer.sep_id] + ctx_ids
-
-        ctx_ids, ctx_types, _ = build_tokens_types_paddings_from_ids(ctx_ids,
-                                    max_seq_length, tokenizer.cls,
-                                    tokenizer.sep, tokenizer.pad)
-        ctx_id_list.append(ctx_ids)
-        ctx_types_list.append(ctx_types)
-
-    return ctx_id_list, ctx_types_list
-
-
-def build_tokens_types_paddings_from_text(query, context,
-                                          tokenizer, max_seq_length):
-    """Build token types and paddings, trim if needed, and pad if needed."""
-
-    query_ids = tokenizer.tokenize(query)
-    query_ids, query_types, query_pad_mask = \
-        build_tokens_types_paddings_from_ids(query_ids, max_seq_length, \
-            tokenizer.cls, tokenizer.sep, tokenizer.pad)
-
-    # Appending the title of the context at front
-    extended_ctx_ids = None
-    if context is not None:
-        title_ids = tokenizer.tokenize(context['title'])
-        ctx_ids = tokenizer.tokenize(context['text'])
-        extended_ctx_ids = title_ids + [tokenizer.sep] + ctx_ids
-
-    ctx_ids, ctx_types, ctx_pad_mask = \
-        build_tokens_types_paddings_from_ids(extended_ctx_ids,
-            max_seq_length, tokenizer.cls, tokenizer.sep, tokenizer.pad)
-
-    return query_ids, query_types, query_pad_mask, \
-           ctx_ids, ctx_types, ctx_pad_mask
-
-
-# Similar code tasks/data_utils with some changes
-def build_tokens_types_paddings_from_ids(text_ids, max_seq_length,
-                                         cls_id, sep_id, pad_id):
-    """Build token types and paddings, trim if needed, and pad if needed."""
-    enc_ids = []
-    tokentypes_enc = []
-
-    # [CLS].
-    enc_ids.append(cls_id)
-    tokentypes_enc.append(0)
-
-    # A.
-    len_src = len(text_ids)
-    enc_ids.extend(text_ids)
-    tokentypes_enc.extend([0] * len_src)
-
-    # Cap the size.
-    if len(enc_ids) > max_seq_length - 1:
-        enc_ids = enc_ids[0: max_seq_length - 1]
-        tokentypes_enc = tokentypes_enc[0: max_seq_length - 1]
-
-    # [SEP].
-    enc_ids.append(sep_id)
-    tokentypes_enc.append(0)
-
-    num_tokens_enc = len(enc_ids)
-    # Padding.
-    padding_length = max_seq_length - len(enc_ids)
-    if padding_length > 0:
-        enc_ids.extend([pad_id] * padding_length)
-        tokentypes_enc.extend([pad_id] * padding_length)
-
-    pad_mask = ([1] * num_tokens_enc) + ([0] * padding_length)
-    pad_mask = np.array(pad_mask, dtype=np.int64)
-
-    return enc_ids, tokentypes_enc, pad_mask
-
-
-def build_sample(query_ids, query_types, query_pad_mask,
-                ctx_ids, ctx_types, ctx_pad_mask, answers,
-                neg_ctx_id_list=None, neg_ctx_types_list=None,
-                include_neg=False):
-    """Convert to numpy and return a sample consumed by the batch producer."""
-
-    query_ids = np.array(query_ids, dtype=np.int64)
-    query_types = np.array(query_types, dtype=np.int64)
-    query_mask = make_attention_mask(query_ids, query_ids)
-
-    ctx_ids = np.array(ctx_ids, dtype=np.int64)
-    ctx_types = np.array(ctx_types, dtype=np.int64)
-    ctx_mask = make_attention_mask(ctx_ids, ctx_ids)
-
-    sample = ({
-        'query': query_ids,
-        'query_mask': query_mask,
-        'query_types': query_types,
-        'query_pad_mask': query_pad_mask,
-        'context': ctx_ids,
-        'context_mask': ctx_mask,
-        'context_types': ctx_types,
-        'context_pad_mask': ctx_pad_mask,
-        'reference': answers
-    })
-
-    if include_neg:
-        neg_ctx_ids = np.array(neg_ctx_id_list, dtype=np.int64)
-        neg_ctx_id_types = np.array(neg_ctx_types_list, dtype=np.int64)
-        neg_ctx_mask = np.array([make_attention_mask(ids, ids) \
-            for ids in neg_ctx_ids], dtype=np.int64)
-
-        sample['neg_context'] = neg_ctx_ids
-        sample['neg_context_types'] = neg_ctx_id_types
-        sample['neg_context_mask'] = neg_ctx_mask
-
-    return sample
-
-
-class OpenRetrievalAbstractDataset(ABC, Dataset):
-    """Open Retrieval base dataset class."""
-
-    def __init__(self, task_name, dataset_name, datapaths, tokenizer, \
-                max_seq_length, evaluate=False):
-        # Store inputs.
-        args = get_args()
-        self.evaluate = evaluate
-        self.val_av_rank_hard_neg = args.val_av_rank_hard_neg
-        self.val_av_rank_other_neg = args.val_av_rank_other_neg
-        self.train_with_neg = args.train_with_neg
-        self.train_hard_neg = args.train_hard_neg
-
-        self.task_name = task_name
-        self.dataset_name = dataset_name
-        self.tokenizer = tokenizer
-        self.max_seq_length = max_seq_length
-        print_rank_0(' > building {} dataset for {}:'.format(self.task_name,
-                                                             self.dataset_name))
-        # Process the files.
-        string = '  > paths:'
-        for path in datapaths:
-            string += ' ' + path
-        print_rank_0(string)
-        self.samples = []
-        for datapath in datapaths:
-            self.samples.extend(self.process_samples_from_single_path(datapath))
-
-        args = get_args()
-        if args.sample_rate < 1:  # subsample
-            k = int(len(self.samples) * args.sample_rate)
-            self.samples = random.sample(self.samples, k)
-
-        print_rank_0('  >> total number of samples: {}'.format(
-            len(self.samples)))
-
-    def __len__(self):
-        return len(self.samples)
-
-    def __getitem__(self, idx):
-        raw_sample = self.samples[idx]
-
-        query_ids, query_types, query_pad_mask, ctx_ids, ctx_types, \
-            ctx_pad_mask = build_tokens_types_paddings_from_text( \
-                raw_sample['question'], raw_sample['pos_context'], \
-                self.tokenizer, self.max_seq_length)
-
-        if self.evaluate:
-            neg_ctx_list = \
-                raw_sample['negative_context'][:self.val_av_rank_other_neg] + \
-                raw_sample['hard_negative_context'][:self.val_av_rank_hard_neg]
-            neg_ctx_id_list, neg_ctx_types_list = \
-                build_token_types_from_context_list(neg_ctx_list, \
-                    self.tokenizer, self.max_seq_length)
-
-        elif self.train_with_neg:
-            hard_negative_ctx = raw_sample['hard_negative_context']
-            negative_ctx = raw_sample['negative_context']
-            if True:  # TODO: fix this or remove this condition
-                random.shuffle(hard_negative_ctx)
-                random.shuffle(negative_ctx)
-
-            neg_ctx_list = hard_negative_ctx[:self.train_hard_neg]
-            # In the Google NQ dataset by DPR paper, there are around more than
-            # 50 missing hard negatives in training data.
-            # In those cases, substitute hard negatives by simple negatives.
-            if len(neg_ctx_list) < self.train_hard_neg:
-                neg_ctx_list += negative_ctx[:self.train_hard_neg - \
-                    len(neg_ctx_list)]
-
-            neg_ctx_id_list, neg_ctx_types_list = \
-                build_token_types_from_context_list(neg_ctx_list,
-                    self.tokenizer, self.max_seq_length)
-        else:
-            neg_ctx_id_list = None
-            neg_ctx_types_list = None
-
-        sample = build_sample(query_ids, query_types, query_pad_mask,
-                              ctx_ids, ctx_types, ctx_pad_mask,
-                              raw_sample['answers'],
-                              neg_ctx_id_list, neg_ctx_types_list,
-                              include_neg=self.evaluate or self.train_with_neg)
-
-        return sample
-
-    @staticmethod
-    @abstractmethod
-    def process_samples_from_single_path(filename):
-        """Abstract method that takes a filename and
-        returns a list of dataset samples, each sample being a dict of
-            {'text': string, 'text': string}
-        """
-        pass
-
-
-
-def normalize_question(question):
-    if question[-1] == '?':
-        question = question[:-1]
-    return question
-
-# The following class reads the datasets for training retriever as
-# prepared by the DPR codebase (https://github.com/facebookresearch/DPR)
-
-class NQSupervisedDataset(OpenRetrievalAbstractDataset):
-
-    def __init__(self, name, datapaths, tokenizer, max_seq_length, \
-                evaluate=False):
-        super().__init__('natural_questions_ret',
-                         name,
-                         datapaths,
-                         tokenizer,
-                         max_seq_length,
-                         evaluate=evaluate)
-
-    @staticmethod
-    def process_samples_from_single_path(filename):
-        """"Implement abstract method."""
-        print_rank_0(' > Processing {} ...'.format(filename))
-        samples = []
-        total = 0
-
-        with open(filename, 'r', encoding="utf-8") as f:
-            data = json.load(f)
-            for row in data:
-                question = normalize_question(row['question'])
-                pos_context = row['positive_ctxs'][0]
-
-                # Hard Negative Contexts
-                if len(row['hard_negative_ctxs']) > 0:
-                    hard_neg_context = row['hard_negative_ctxs']
-                else:
-                    hard_neg_context = []
-
-                # Negative Contexts
-                if len(row['negative_ctxs']) > 0:
-                    neg_context = row['negative_ctxs']
-                else:
-                    neg_context = []
-
-                answers = row['answers']
-                sample = {'question': question,
-                          'pos_context': pos_context,
-                          'hard_negative_context': hard_neg_context,
-                          'negative_context': neg_context,
-                          'answers': answers}
-                total += 1
-                samples.append(sample)
-
-                if total % 5000 == 0:
-                    print_rank_0('  > processed {} so far ...'.format(total))
-
-        print_rank_0(' >> processed {} samples.'.format(len(samples)))
-        return samples
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/orqa/supervised/eval_utils.py b/toolbox/Megatron-DeepSpeed/tasks/orqa/supervised/eval_utils.py
deleted file mode 100644
index bb718c320a1e53dc55cb7c2f402162a6eab1262a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/orqa/supervised/eval_utils.py
+++ /dev/null
@@ -1,193 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Evaluation utilities."""
-from collections import OrderedDict
-import math
-import numpy as np
-import time
-import torch
-import torch.nn.functional as F
-from torch.utils.data import DataLoader
-
-from megatron_ds import get_args, print_rank_0
-from megatron_ds.core import mpu
-from megatron_ds.utils import average_losses_across_data_parallel_group
-from tasks.finetune_utils import build_data_loader
-
-def task_collate_fn(batch_data):
-    # generate batch
-    batch_size = len(batch_data)
-    tensorized = OrderedDict()
-    for d in batch_data:
-        for k, v in d.items():
-            tensorized.setdefault(k, []).append(v)
-
-    tensorized['query'] = torch.LongTensor(tensorized['query'])
-    tensorized['query_mask'] = torch.LongTensor(tensorized['query_mask'])
-    tensorized['query_types'] = torch.LongTensor(tensorized['query_types'])
-    tensorized['query_pad_mask'] = \
-        torch.LongTensor(tensorized['query_pad_mask'])
-
-    tensorized['context'] = torch.LongTensor(tensorized['context'])
-    tensorized['context_mask'] = \
-        torch.LongTensor(tensorized['context_mask'])
-    tensorized['context_types'] = \
-        torch.LongTensor(tensorized['context_types'])
-    tensorized['context_pad_mask'] = \
-        torch.LongTensor(tensorized['context_pad_mask'])
-
-    if 'neg_context' in tensorized:
-        tensorized['neg_context'] = \
-            torch.LongTensor(np.concatenate(tensorized['neg_context']))
-        tensorized['neg_context_mask'] = \
-            torch.LongTensor(np.concatenate(tensorized['neg_context_mask']))
-        tensorized['neg_context_types'] = \
-            torch.LongTensor(np.concatenate(tensorized['neg_context_types']))
-
-    return tensorized
-
-
-
-def process_batch(batch):
-    """Process batch and produce inputs for the model."""
-    query_tokens = batch['query'].long().cuda()
-    query_mask = (batch['query_mask'] < 0.5).cuda()
-    query_types = batch['query_types'].long().cuda()
-    query_pad_mask = batch['query_pad_mask'].long().cuda()
-
-    context_tokens = batch['context'].long().cuda()
-    context_mask = (batch['context_mask'] < 0.5).cuda()
-    context_types = batch['context_types'].long().cuda()
-    context_pad_mask = batch['context_pad_mask'].long().cuda()
-
-    if 'neg_context' in batch:
-        neg_context_tokens = batch['neg_context'].long().cuda()
-        neg_context_mask = (batch['neg_context_mask'] < 0.5).cuda()
-        neg_context_types = batch['neg_context_types'].long().cuda()
-    else:
-        neg_context_tokens = None
-        neg_context_mask = None
-        neg_context_types = None
-
-    reference = batch['reference']
-
-    return query_tokens, query_mask, query_types, query_pad_mask, \
-           context_tokens, context_mask, context_types, context_pad_mask, \
-           neg_context_tokens, neg_context_mask, neg_context_types, reference
-
-def accuracy_func_provider(single_dataset_provider, rank0sampler=False):
-    """Provide function that calculates accuracies."""
-    args = get_args()
-
-    print_rank_0("accuracy_func_provider is CALLED")
-
-    # Build dataloaders
-    datapath = args.valid_data
-    dataset = single_dataset_provider(datapath)
-
-    drop_last = False
-    if mpu.get_data_parallel_world_size() > 1 and not rank0sampler:
-        drop_last = True
-
-    print_rank_0(datapath)
-    print_rank_0(rank0sampler)
-
-    dataloader = build_data_loader(dataset,
-                                   args.eval_micro_batch_size,
-                                   num_workers=args.num_workers,
-                                   drop_last=drop_last,
-                                   task_collate_fn=task_collate_fn)
-    dataloaders = (dataset.dataset_name, dataloader)
-
-    def metrics_func(model, epoch, output_predictions=False):
-        print_rank_0('calculating metrics by accuracy func in ORQA...')
-
-        if output_predictions:
-            assert rank0sampler
-            names = 'predictions'
-        name, dataloader = dataloaders
-        if args.task == "RET-FINETUNE-NQ":
-            start_time = time.time()
-            output = retrieval_loss(model, dataloader)
-            stats_dict, total = output
-            format_string = ""
-            for k, v in stats_dict.items():
-                format_string += "|{} = {:.2f}".format(k, v / total)
-            print_rank_0("epoch:{}{}".format(epoch, format_string))
-            print_rank_0("taken time to calcuate metrics {:.3f}".format(\
-                time.time() - start_time))
-        else:
-            raise AssertionError("{} Task not supported".format(args.task))
-
-    return metrics_func
-
-
-def retrieval_loss(model, dataloader):
-    args = get_args()
-    total = 0
-    topk_stats_dict = {'top{}_acc'.format(k): 0 for k in \
-        args.retriever_report_topk_accuracies}
-    stats_dict = dict(rank=0, **topk_stats_dict)
-
-    assert len(model) == 1
-    unwrapped_model = model[0]
-    unwrapped_model.eval()
-
-    with torch.no_grad():
-        # For all the batches in the dataset.
-        for batch in dataloader:
-            # Run the model forward.
-            query_tokens, query_mask, query_types, _, \
-            context_tokens, context_mask, context_types, _, \
-            neg_context_tokens, neg_context_mask, neg_context_types, \
-            reference = process_batch(batch)
-
-            query_logits, context_logits = unwrapped_model(query_tokens,
-                query_mask, query_types,
-                torch.cat([context_tokens, neg_context_tokens]),
-                torch.cat([context_mask, neg_context_mask]),
-                torch.cat([context_types, neg_context_types]))
-
-            retrieval_scores = torch.matmul(query_logits,
-                                    torch.transpose(context_logits, 0, 1))
-
-            if args.retriever_score_scaling:
-                retrieval_scores = retrieval_scores / \
-                    math.sqrt(args.hidden_size)
-
-            local_batch_size = query_logits.shape[0]
-            labels = torch.arange(local_batch_size).long().cuda()
-
-            softmax_scores = F.softmax(retrieval_scores, dim=1)
-            sorted_vals, sorted_indices = torch.topk(softmax_scores,
-                                                     k=softmax_scores.shape[1],
-                                                     sorted=True)
-
-            def topk_accuracy(k):
-                return torch.cuda.FloatTensor(
-                    [sum([int(labels[i] in sorted_indices[i, :k]) for i in \
-                        range(local_batch_size)])])
-
-            def get_rank():
-                return torch.cuda.FloatTensor(
-                    [sum([torch.nonzero(labels[i] == sorted_indices[i])[0][0] \
-                        for i in range(local_batch_size)])])
-
-            topk_accs = [topk_accuracy(k) for k in \
-                args.retriever_report_topk_accuracies]
-            rank = get_rank()
-            losses = average_losses_across_data_parallel_group([rank, \
-                *topk_accs])
-
-            # create stats_dict with retrieval loss and all specified
-            # top-k accuracies
-            topk_acc_dict = {'top{}_acc'.format(k): v * 100 for k, v in \
-                zip(args.retriever_report_topk_accuracies, losses[1:])}
-            temp_stats_dict = dict(rank=losses[0], **topk_acc_dict)
-            for k in stats_dict.keys():
-                stats_dict[k] += temp_stats_dict[k]
-            total += local_batch_size
-
-    unwrapped_model.train()
-
-    return stats_dict, total
diff --git a/toolbox/Megatron-DeepSpeed/tasks/orqa/supervised/finetune.py b/toolbox/Megatron-DeepSpeed/tasks/orqa/supervised/finetune.py
deleted file mode 100644
index f767a407c37bc7fd3862ad57c08479ffabf97302..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/orqa/supervised/finetune.py
+++ /dev/null
@@ -1,238 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""ORQA finetuning/evaluation."""
-
-from functools import partial
-import sys
-
-import math
-import torch
-import torch.nn.functional as F
-
-from megatron_ds import get_args, get_timers, get_tokenizer, print_rank_0
-from megatron_ds.core import mpu
-from megatron_ds.indexer import IndexBuilder
-from megatron_ds.model.biencoder_model import biencoder_model_provider
-from megatron_ds.utils import average_losses_across_data_parallel_group
-from pretrain_ict import get_group_world_size_rank
-from tasks.finetune_utils import finetune
-from tasks.orqa.supervised.eval_utils import accuracy_func_provider
-from tasks.orqa.supervised.eval_utils import process_batch, task_collate_fn
-from tasks.orqa.evaluate_utils import ORQAEvaluator
-
-# input_ is a 2D tensor
-def check_and_append_tensor_for_gather(group, rank, world_size, input_):
-
-    # gather the size of the first dimension of the tensor from all ranks
-    current_length = input_.size()[0]
-    first_dim = torch.tensor([[current_length]], 
-        device=torch.cuda.current_device())
-    input_list = [torch.empty_like(first_dim) for _ in range(world_size)]
-    input_list[rank].copy_(first_dim)
-    torch.distributed.all_gather(input_list, first_dim, group=group)
-    all_input_list = torch.cat(input_list, dim=0).contiguous()
-    max_length = torch.max(all_input_list)
-
-    # if the size are different than the max, extend the tensor
-    # accordingly
-    if max_length > current_length:
-        padding=tuple([0] * (input_.dim() * 2 - 1)) + \
-            tuple([max_length - current_length])
-        input_ = F.pad(input=input_, pad=padding)
-
-    return input_
-
-def orqa(Dataset):
-
-    def cross_entropy_forward_step(batch, model):
-        """Simple forward step with cross-entropy loss."""
-        timers = get_timers()
-        tokenizer = get_tokenizer()
-
-        # Get the batch.
-        timers('batch generator', log_level=2).start()
-        try:
-            batch_ = next(batch)
-        except BaseException:
-            batch_ = batch
-
-        group, rank, world_size = get_group_world_size_rank()
-
-        query_tokens, query_mask, query_types, query_pad_mask, \
-        context_tokens, context_mask, context_types, context_pad_mask, \
-        neg_context_tokens, neg_context_mask, neg_context_types, \
-        reference = process_batch(batch_)
-
-        timers('batch generator').stop()
-        local_batch_size = query_tokens.shape[0]
-
-        # Text representation of query and context
-        query_list, context_list = [], []
-        for i in range(local_batch_size):
-            query_list.append(tokenizer.decode(query_tokens[i].tolist()))
-            context_list.append(tokenizer.decode(context_tokens[i].tolist()))
-
-        if neg_context_tokens is not None:
-            neg_context_tokens = check_and_append_tensor_for_gather(group,
-                rank, world_size, neg_context_tokens)
-            neg_context_mask = check_and_append_tensor_for_gather(group,
-                rank, world_size, neg_context_mask)
-            neg_context_types = check_and_append_tensor_for_gather(group,
-                rank, world_size, neg_context_types)
-
-        if neg_context_tokens is not None:
-            context_tokens = torch.cat([context_tokens, neg_context_tokens])
-            context_mask = torch.cat([context_mask, neg_context_mask])
-            context_types = torch.cat([context_types, neg_context_types])
-
-        # Forward model.
-        output_tensor = model(query_tokens, query_mask,
-                                        query_types, context_tokens,
-                                        context_mask, context_types)
-        return output_tensor, partial(cross_entropy_loss_func, query_tokens, context_tokens)
-
-
-    def cross_entropy_loss_func(query_tokens, context_tokens, output_tensor):
-        args = get_args()
-
-        local_batch_size = query_tokens.shape[0]
-        group, rank, world_size = get_group_world_size_rank()
-        # recall we assert that model_parallel_size == 1
-        global_batch_size = world_size * local_batch_size
-
-        query_logits, context_logits = output_tensor
-
-        if world_size > 1:
-            input_ = torch.empty_like(context_logits).copy_(\
-                context_logits).detach_()
-            tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
-            tensor_list[rank].copy_(input_)
-            torch.distributed.all_gather(tensor_list, input_, group=group)
-
-            # Check if all-gather happens in order
-            assert tensor_list[rank].sum().item() == \
-                context_logits.sum().item()
-
-            # Preserves the gradient
-            tensor_list[rank] = context_logits
-            all_context_logits = torch.cat(tensor_list, dim=0).contiguous()
-
-            # Query tensors
-            input_ = torch.empty_like(query_logits).copy_(\
-                query_logits).detach_()
-            tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
-            tensor_list[rank].copy_(input_)
-            torch.distributed.all_gather(tensor_list, input_, group=group)
-
-            # Check if all-gather happens in order
-            assert tensor_list[rank].sum().item() == query_logits.sum().item()
-
-            # Preserves the gradient
-            tensor_list[rank] = query_logits
-            all_query_logits = torch.cat(tensor_list, dim=0).contiguous()
-        else:
-            all_query_logits = query_logits
-            all_context_logits = context_logits
-
-        retrieval_scores = torch.matmul(all_query_logits,
-                            torch.transpose(all_context_logits, 0, 1))
-        # Scaling the retrieval scores
-        if args.retriever_score_scaling:
-            retrieval_scores = retrieval_scores / math.sqrt(args.hidden_size)
-
-        if args.train_with_neg:
-            # if the world size is 3, local batch size is 4, and
-            # local context size is 8, what we want is
-            # labels = [0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19]
-            labels = []
-            local_context_size = context_tokens.shape[0]
-            for i in range(world_size):
-                j = i * local_context_size
-                labels.extend(list(range(j, j + local_batch_size)))
-            labels = torch.LongTensor(labels).cuda()
-            assert len(labels) == global_batch_size
-        else:
-            labels = torch.arange(global_batch_size).long().cuda()
-
-        # Cross-entropy loss.
-        softmax_scores = F.log_softmax(retrieval_scores, dim=1)
-
-        loss = F.nll_loss(softmax_scores, labels, reduction='mean')
-
-        max_score, max_idxs = torch.max(softmax_scores, 1)
-        correct_predictions_count = (max_idxs == labels).sum().float()
-
-        # Reduce loss for logging.
-        reduced_loss = average_losses_across_data_parallel_group([loss, \
-            correct_predictions_count])
-
-        # Loss scaling for correct losses in Supervised Retrieval
-        loss = loss * mpu.get_data_parallel_world_size()
-
-        return loss, {'lm loss': reduced_loss[0],
-                      'correct_prediction_count': reduced_loss[1]}
-
-
-    def train_valid_datasets_provider():
-        """Build train and validation dataset."""
-        args = get_args()
-        tokenizer = get_tokenizer()
-
-        train_dataset = Dataset('training',
-                                args.train_data,
-                                tokenizer,
-                                args.retriever_seq_length,
-                                evaluate=False)
-        valid_dataset = Dataset('validation',
-                                args.valid_data,
-                                tokenizer,
-                                args.retriever_seq_length,
-                                evaluate=True)
-        return train_dataset, valid_dataset
-
-    def model_provider(pre_process=True, post_process=True):
-        """Build the model."""
-        args = get_args()
-        print_rank_0('building retriever model for {} ...'.format(args.task))
-
-        model = biencoder_model_provider(only_context_model=False,
-                    only_query_model=False,
-                    biencoder_shared_query_context_model=\
-                    args.biencoder_shared_query_context_model,
-                    pre_process=pre_process, post_process=post_process)
-
-        return model
-
-    def single_dataset_provider(datapath):
-        args = get_args()
-        tokenizer = get_tokenizer()
-
-        name = datapath[0].split('/')[-1].split('.')[0]
-        return Dataset(name,
-                       datapath,
-                       tokenizer,
-                       args.retriever_seq_length,
-                       evaluate=True)
-
-    def metrics_func_provider():
-        """Provide metrics callback function."""
-        return accuracy_func_provider(single_dataset_provider)
-
-    """Finetune/evaluate."""
-    finetune(train_valid_datasets_provider,
-             model_provider,
-             forward_step=cross_entropy_forward_step,
-             end_of_epoch_callback_provider=metrics_func_provider,
-             task_collate_fn=task_collate_fn)
-
-def main():
-    args = get_args()
-
-    if args.task == 'RET-FINETUNE-NQ':
-        from tasks.orqa.supervised.data import NQSupervisedDataset as Dataset
-    else:
-        raise NotImplementedError('ORQA task {} is not implemented.'.format(
-            args.task))
-
-    orqa(Dataset)
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/orqa/unsupervised/nq.py b/toolbox/Megatron-DeepSpeed/tasks/orqa/unsupervised/nq.py
deleted file mode 100644
index ab0449d089e193b2e991a33471372ef7c27e1467..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/orqa/unsupervised/nq.py
+++ /dev/null
@@ -1,216 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""
- Data Loader for Google NQ dataset
-"""
-
-from abc import ABC
-import csv
-from collections import OrderedDict
-import numpy as np
-
-import torch
-from torch.utils.data import DataLoader
-from torch.utils.data import Dataset, BatchSampler
-
-from megatron_ds import print_rank_0, get_args, get_tokenizer
-from megatron_ds.data.biencoder_dataset_utils import make_attention_mask
-from deepspeed.accelerator import get_accelerator
-
-def get_nq_dataset(qa_data, split):
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    dataset = NQDataset('Google NQ {} Split'.format(split),
-                        'Google Natural Questions',
-                        qa_data,
-                        tokenizer,
-                        args.retriever_seq_length)
-    return dataset
-
-
-def process_nq_batch(batch):
-    query_tokens = batch['token_ids'].long().to(get_accelerator().device_name())
-    query_mask = (batch['token_mask'] < 0.5).to(get_accelerator().device_name())
-    query_types = batch['token_types'].long().to(get_accelerator().device_name())
-    query_len = batch['seq_len'].long().to(get_accelerator().device_name())
-    reference = batch['reference']
-
-    return query_tokens, query_mask, query_types, query_len, reference
-
-
-class CustomDataLoader(DataLoader):
-    def __init__(self, dataset, eval=False, **kwargs):
-        if kwargs.get('collate_fn', None) is None:
-            kwargs['collate_fn'] = self._collate_fn
-        self.eval = eval
-        super().__init__(dataset, **kwargs)
-
-    def _collate_fn(self, batch_data):
-        # generate batch
-        batch_size = len(batch_data)
-        tensorized = OrderedDict()
-        for d in batch_data:
-            for k, v in d.items():
-                tensorized.setdefault(k, []).append(v)
-        assert len(tensorized) == 5
-
-        tensorized['token_ids'] = torch.LongTensor(tensorized['token_ids'])
-        tensorized['token_mask'] = torch.LongTensor(tensorized['token_mask'])
-        tensorized['token_types'] = torch.LongTensor(tensorized['token_types'])
-        tensorized['seq_len'] = torch.LongTensor(tensorized['seq_len'])
-        return tensorized
-
-
-def get_one_epoch_nq_dataloader(dataset, micro_batch_size=None):
-    """Data loader. Note that batch-size is the local (per GPU) batch-size.
-       NOTE: This dataloader is not distributed !!!
-    """
-
-    args = get_args()
-    if micro_batch_size is None:
-        micro_batch_size = args.micro_batch_size
-    num_workers = args.num_workers
-
-    sampler = torch.utils.data.SequentialSampler(dataset)
-    # importantly, drop_last must be False to get all the data.
-    batch_sampler = BatchSampler(sampler,
-                                 batch_size=micro_batch_size,
-                                 drop_last=False)
-
-    # Data loader. Note that batch size is the per GPU batch size.
-    data_loader = CustomDataLoader(dataset,
-                                   batch_sampler=batch_sampler,
-                                   num_workers=num_workers,
-                                   pin_memory=True)
-    return data_loader
-
-
-def build_tokens_types_paddings_from_text(src_text, tokenizer, max_seq_length):
-    """Build token types and paddings, trim if needed, and pad if needed."""
-
-    src_text_ids = tokenizer.tokenize(src_text)
-
-    return build_tokens_types_paddings_from_ids(src_text_ids,
-                                                max_seq_length,
-                                                tokenizer.cls,
-                                                tokenizer.sep,
-                                                tokenizer.pad)
-
-
-def build_tokens_types_paddings_from_ids(src_ids, max_seq_length, cls_id, \
-    sep_id, pad_id):
-    """
-    Build token types and paddings, trim if needed, and pad if needed.
-
-    TODO: Design modular interface to reuse this function. This is getting
-    repeated multiple times in different tasks
-    """
-
-    enc_ids = []
-    tokentypes_enc = []
-
-    # [CLS].
-    enc_ids.append(cls_id)
-    tokentypes_enc.append(0)
-
-    # A.
-    len_src = len(src_ids)
-    enc_ids.extend(src_ids)
-    tokentypes_enc.extend([0] * len_src)
-
-    # Cap the size.
-    if len(enc_ids) > max_seq_length - 1:
-        enc_ids = enc_ids[0: max_seq_length - 1]
-        tokentypes_enc = tokentypes_enc[0: max_seq_length - 1]
-
-    # [SEP].
-    enc_ids.append(sep_id)
-    tokentypes_enc.append(0)
-
-    num_tokens_enc = len(enc_ids)
-    # Padding.
-    padding_length = max_seq_length - len(enc_ids)
-    if padding_length > 0:
-        enc_ids.extend([pad_id] * padding_length)
-        tokentypes_enc.extend([pad_id] * padding_length)
-
-    return enc_ids, tokentypes_enc, num_tokens_enc
-
-
-def build_sample(token_ids, token_types, num_tokens, reference):
-    """
-    Convert to numpy and return a sample consumed by the
-    batch producer.
-    """
-
-    token_ids = np.array(token_ids, dtype=np.int64)
-    token_types = np.array(token_types, dtype=np.int64)
-    token_mask = make_attention_mask(token_ids, token_ids)
-
-    sample = ({
-        'token_ids': token_ids,
-        'token_mask': token_mask,
-        'token_types': token_types,
-        'seq_len': num_tokens,
-        'reference': reference
-    })
-    return sample
-
-
-class NQDataset(ABC, Dataset):
-    """
-    Open Retrieval Question Answering evaluation using Google NQ dataset.
-    """
-
-    def __init__(self, task_name, dataset_name, datapath,
-                 tokenizer, max_seq_length):
-        # Store inputs.
-        self.task_name = task_name
-        self.dataset_name = dataset_name
-        self.tokenizer = tokenizer
-        self.max_seq_length = max_seq_length
-        print_rank_0(' > building {} dataset for {}:'.format(self.task_name,
-                                                             self.dataset_name))
-        print_rank_0(datapath)
-        self.samples = self.process_samples_from_single_path(datapath)
-        print_rank_0('  >> total number of samples: {}'.format(\
-                                                        len(self.samples)))
-
-    def __len__(self):
-        return len(self.samples)
-
-    def __getitem__(self, idx):
-        raw_sample = self.samples[idx]
-
-        ques_tokens, tokentypes_enc, num_tokens_ques = \
-            build_tokens_types_paddings_from_text(raw_sample['question'],
-                self.tokenizer, self.max_seq_length)
-
-        sample = build_sample(ques_tokens,
-                              tokentypes_enc,
-                              num_tokens_ques,
-                              raw_sample['answers'])
-        return sample
-
-    @staticmethod
-    def process_samples_from_single_path(filename):
-        print_rank_0(' > Processing {} ...'.format(filename))
-        samples = []
-        total = 0
-
-        with open(filename, 'r') as ifile:
-            reader = csv.reader(ifile, delimiter='\t')
-            for row in reader:
-                question = row[0]
-                answers = eval(row[1])
-
-                sample = {'question': question, 'answers': answers}
-                total += 1
-                samples.append(sample)
-
-                if total % 1000 == 0:
-                    print_rank_0('  > processed {} so far ...'.format(total))
-
-        print_rank_0(' >> processed {} samples.'.format(len(samples)))
-        return samples
diff --git a/toolbox/Megatron-DeepSpeed/tasks/orqa/unsupervised/qa_utils.py b/toolbox/Megatron-DeepSpeed/tasks/orqa/unsupervised/qa_utils.py
deleted file mode 100644
index 811a05834a47ce1e9f9cca9bae9e0f77f937b588..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/orqa/unsupervised/qa_utils.py
+++ /dev/null
@@ -1,177 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) Facebook, Inc. and its affiliates.
-# All rights reserved.
-#
-
-# The following code has been taken from
-# https://github.com/facebookresearch/DPR, which is CC-BY-NC 4.0
-# licensed as of now. More details on the license can be found
-# at https://github.com/facebookresearch/DPR/blob/master/LICENSE
-
-"""
- Set of utilities for Q&A results validation tasks - Retriver passage
- validation and Reader predicted answer validation
-"""
-
-import collections
-import logging
-import string
-import unicodedata
-from functools import partial
-from multiprocessing import Pool as ProcessPool
-from typing import Tuple, List, Dict
-
-import regex as re
-from tasks.orqa.unsupervised.tokenizers import SimpleTokenizer
-
-logger = logging.getLogger(__name__)
-
-QAMatchStats = collections.namedtuple('QAMatchStats', ['top_k_hits',\
-                                        'questions_doc_hits'])
-
-def calculate_matches(all_docs: Dict[object, Tuple[str, str]], 
-    answers: List[List[str]], closest_docs: List[Tuple[List[object], 
-    List[float]]], workers_num: int, match_type: str) -> QAMatchStats:
-    """
-    Evaluates answers presence in the set of documents. This function is 
-    supposed to be used with a large collection of documents and results. 
-    It internally forks multiple sub-processes for evaluation and then 
-    merges results
-    :param all_docs: dictionary of the entire documents database. 
-        doc_id -> (doc_text, title)
-    :param answers: list of answers's list. One list per question
-    :param closest_docs: document ids of the top results along with their
-        scores
-    :param workers_num: amount of parallel threads to process data
-    :param match_type: type of answer matching. Refer to has_answer code for
-        available options
-    :return: matching information tuple.
-    top_k_hits - a list where the index is the amount of top documents retrieved
-        and the value is the total amount of valid matches across an entire
-        dataset.
-    questions_doc_hits - more detailed info with answer matches for every
-        question and every retrieved document
-    """
-    global dpr_all_documents
-    dpr_all_documents = all_docs
-
-    tok_opts = {}
-    tokenizer = SimpleTokenizer(**tok_opts)
-
-    processes = ProcessPool(
-        processes=workers_num,
-    )
-
-    logger.info('Matching answers in top docs...')
-
-    get_score_partial = partial(check_answer, match_type=match_type,
-                                    tokenizer=tokenizer)
-
-    questions_answers_docs = zip(answers, closest_docs)
-
-    scores = processes.map(get_score_partial, questions_answers_docs)
-
-    logger.info('Per question validation results len=%d', len(scores))
-
-    n_docs = len(closest_docs[0][0])
-    top_k_hits = [0] * n_docs
-    for question_hits in scores:
-        best_hit = next((i for i, x in enumerate(question_hits) if x), None)
-        if best_hit is not None:
-            top_k_hits[best_hit:] = [v + 1 for v in top_k_hits[best_hit:]]
-
-    return QAMatchStats(top_k_hits, scores)
-
-
-def check_answer(questions_answers_docs, tokenizer, match_type) -> List[bool]:
-    """
-    Search through all the top docs to see if they have any of the answers.
-    """
-    answers, (doc_ids, doc_scores) = questions_answers_docs
-
-    global dpr_all_documents
-    hits = []
-
-    for i, doc_id in enumerate(doc_ids):
-        doc = dpr_all_documents[doc_id]
-        text = doc[0]
-
-        answer_found = False
-        if text is None:  # cannot find the document for some reason
-            logger.warning("no doc in db")
-            hits.append(False)
-            continue
-
-        if has_answer(answers, text, tokenizer, match_type):
-            answer_found = True
-        hits.append(answer_found)
-    return hits
-
-
-def has_answer(answers, text, tokenizer, match_type) -> bool:
-    """
-    Check if a document contains an answer string.
-    If `match_type` is string, token matching is done between the text 
-        and answer.
-    If `match_type` is regex, we search the whole text with the regex.
-    """
-    text = _normalize(text)
-
-    if match_type == 'string':
-        # Answer is a list of possible strings
-        text = tokenizer.tokenize(text).words(uncased=True)
-
-        for single_answer in answers:
-            single_answer = _normalize(single_answer)
-            single_answer = tokenizer.tokenize(single_answer)
-            single_answer = single_answer.words(uncased=True)
-
-            for i in range(0, len(text) - len(single_answer) + 1):
-                if single_answer == text[i: i + len(single_answer)]:
-                    return True
-
-    elif match_type == 'regex':
-        # Answer is a regex
-        for single_answer in answers:
-            single_answer = _normalize(single_answer)
-            if regex_match(text, single_answer):
-                return True
-    return False
-
-
-def regex_match(text, pattern):
-    """Test if a regex pattern is contained within a text."""
-    try:
-        pattern = re.compile(
-            pattern,
-            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE,
-        )
-    except BaseException:
-        return False
-    return pattern.search(text) is not None
-
-
-# function for the reader model answer validation
-def exact_match_score(prediction, ground_truth):
-    return _normalize_answer(prediction) == _normalize_answer(ground_truth)
-
-
-def _normalize_answer(s):
-    def remove_articles(text):
-        return re.sub(r'\b(a|an|the)\b', ' ', text)
-
-    def white_space_fix(text):
-        return ' '.join(text.split())
-
-    def remove_punc(text):
-        exclude = set(string.punctuation)
-        return ''.join(ch for ch in text if ch not in exclude)
-
-    def lower(text):
-        return text.lower()
-
-    return white_space_fix(remove_articles(remove_punc(lower(s))))
-
-
-def _normalize(text):
-    return unicodedata.normalize('NFD', text)
diff --git a/toolbox/Megatron-DeepSpeed/tasks/orqa/unsupervised/tokenizers.py b/toolbox/Megatron-DeepSpeed/tasks/orqa/unsupervised/tokenizers.py
deleted file mode 100644
index fb23887ebdd43ca83b2a6746ddc77b2a69fc1dd8..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/orqa/unsupervised/tokenizers.py
+++ /dev/null
@@ -1,243 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) Facebook, Inc. and its affiliates.
-# All rights reserved.
-#
-
-# The following code has been taken from
-# https://github.com/facebookresearch/DPR, which is CC-BY-NC 4.0
-# licensed as of now. More details on the license can be found
-# at https://github.com/facebookresearch/DPR/blob/master/LICENSE
-
-"""
-Most of the tokenizers code here is copied from DrQA codebase to avoid adding extra dependency
-"""
-
-import copy
-import logging
-
-import regex
-import spacy
-
-logger = logging.getLogger(__name__)
-
-
-class Tokens(object):
-    """A class to represent a list of tokenized text."""
-    TEXT = 0
-    TEXT_WS = 1
-    SPAN = 2
-    POS = 3
-    LEMMA = 4
-    NER = 5
-
-    def __init__(self, data, annotators, opts=None):
-        self.data = data
-        self.annotators = annotators
-        self.opts = opts or {}
-
-    def __len__(self):
-        """The number of tokens."""
-        return len(self.data)
-
-    def slice(self, i=None, j=None):
-        """Return a view of the list of tokens from [i, j)."""
-        new_tokens = copy.copy(self)
-        new_tokens.data = self.data[i: j]
-        return new_tokens
-
-    def untokenize(self):
-        """Returns the original text (with whitespace reinserted)."""
-        return ''.join([t[self.TEXT_WS] for t in self.data]).strip()
-
-    def words(self, uncased=False):
-        """Returns a list of the text of each token
-
-        Args:
-            uncased: lower cases text
-        """
-        if uncased:
-            return [t[self.TEXT].lower() for t in self.data]
-        else:
-            return [t[self.TEXT] for t in self.data]
-
-    def offsets(self):
-        """Returns a list of [start, end) character offsets of each token."""
-        return [t[self.SPAN] for t in self.data]
-
-    def pos(self):
-        """Returns a list of part-of-speech tags of each token.
-        Returns None if this annotation was not included.
-        """
-        if 'pos' not in self.annotators:
-            return None
-        return [t[self.POS] for t in self.data]
-
-    def lemmas(self):
-        """Returns a list of the lemmatized text of each token.
-        Returns None if this annotation was not included.
-        """
-        if 'lemma' not in self.annotators:
-            return None
-        return [t[self.LEMMA] for t in self.data]
-
-    def entities(self):
-        """Returns a list of named-entity-recognition tags of each token.
-        Returns None if this annotation was not included.
-        """
-        if 'ner' not in self.annotators:
-            return None
-        return [t[self.NER] for t in self.data]
-
-    def ngrams(self, n=1, uncased=False, filter_fn=None, as_strings=True):
-        """Returns a list of all ngrams from length 1 to n.
-
-        Args:
-            n: upper limit of ngram length
-            uncased: lower cases text
-            filter_fn: user function that takes in an ngram list and returns
-              True or False to keep or not keep the ngram
-            as_string: return the ngram as a string vs list
-        """
-
-        def _skip(gram):
-            if not filter_fn:
-                return False
-            return filter_fn(gram)
-
-        words = self.words(uncased)
-        ngrams = [(s, e + 1)
-                  for s in range(len(words))
-                  for e in range(s, min(s + n, len(words)))
-                  if not _skip(words[s:e + 1])]
-
-        # Concatenate into strings
-        if as_strings:
-            ngrams = ['{}'.format(' '.join(words[s:e])) for (s, e) in ngrams]
-
-        return ngrams
-
-    def entity_groups(self):
-        """Group consecutive entity tokens with the same NER tag."""
-        entities = self.entities()
-        if not entities:
-            return None
-        non_ent = self.opts.get('non_ent', 'O')
-        groups = []
-        idx = 0
-        while idx < len(entities):
-            ner_tag = entities[idx]
-            # Check for entity tag
-            if ner_tag != non_ent:
-                # Chomp the sequence
-                start = idx
-                while (idx < len(entities) and entities[idx] == ner_tag):
-                    idx += 1
-                groups.append((self.slice(start, idx).untokenize(), ner_tag))
-            else:
-                idx += 1
-        return groups
-
-
-class Tokenizer(object):
-    """Base tokenizer class.
-    Tokenizers implement tokenize, which should return a Tokens class.
-    """
-
-    def tokenize(self, text):
-        raise NotImplementedError
-
-    def shutdown(self):
-        pass
-
-    def __del__(self):
-        self.shutdown()
-
-
-class SimpleTokenizer(Tokenizer):
-    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
-    NON_WS = r'[^\p{Z}\p{C}]'
-
-    def __init__(self, **kwargs):
-        """
-        Args:
-            annotators: None or empty set (only tokenizes).
-        """
-        self._regexp = regex.compile(
-            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
-            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
-        )
-        if len(kwargs.get('annotators', {})) > 0:
-            logger.warning('%s only tokenizes! Skipping annotators: %s' %
-                           (type(self).__name__, kwargs.get('annotators')))
-        self.annotators = set()
-
-    def tokenize(self, text):
-        data = []
-        matches = [m for m in self._regexp.finditer(text)]
-        for i in range(len(matches)):
-            # Get text
-            token = matches[i].group()
-
-            # Get whitespace
-            span = matches[i].span()
-            start_ws = span[0]
-            if i + 1 < len(matches):
-                end_ws = matches[i + 1].span()[0]
-            else:
-                end_ws = span[1]
-
-            # Format data
-            data.append((
-                token,
-                text[start_ws: end_ws],
-                span,
-            ))
-        return Tokens(data, self.annotators)
-
-
-class SpacyTokenizer(Tokenizer):
-
-    def __init__(self, **kwargs):
-        """
-        Args:
-            annotators: set that can include pos, lemma, and ner.
-            model: spaCy model to use (either path, or keyword like 'en').
-        """
-        model = kwargs.get('model', 'en')
-        self.annotators = copy.deepcopy(kwargs.get('annotators', set()))
-        nlp_kwargs = {'parser': False}
-        if not any([p in self.annotators for p in ['lemma', 'pos', 'ner']]):
-            nlp_kwargs['tagger'] = False
-        if 'ner' not in self.annotators:
-            nlp_kwargs['entity'] = False
-        self.nlp = spacy.load(model, **nlp_kwargs)
-
-    def tokenize(self, text):
-        # We don't treat new lines as tokens.
-        clean_text = text.replace('\n', ' ')
-        tokens = self.nlp.tokenizer(clean_text)
-        if any([p in self.annotators for p in ['lemma', 'pos', 'ner']]):
-            self.nlp.tagger(tokens)
-        if 'ner' in self.annotators:
-            self.nlp.entity(tokens)
-
-        data = []
-        for i in range(len(tokens)):
-            # Get whitespace
-            start_ws = tokens[i].idx
-            if i + 1 < len(tokens):
-                end_ws = tokens[i + 1].idx
-            else:
-                end_ws = tokens[i].idx + len(tokens[i].text)
-
-            data.append((
-                tokens[i].text,
-                text[start_ws: end_ws],
-                (tokens[i].idx, tokens[i].idx + len(tokens[i].text)),
-                tokens[i].tag_,
-                tokens[i].lemma_,
-                tokens[i].ent_type_,
-            ))
-
-        # Set special option for non-entity tag: '' vs 'O' in spaCy
-        return Tokens(data, self.annotators, opts={'non_ent': ''})
diff --git a/toolbox/Megatron-DeepSpeed/tasks/race/data.py b/toolbox/Megatron-DeepSpeed/tasks/race/data.py
deleted file mode 100644
index fa44ae736825b7afcac41181eea622b54cc545b3..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/race/data.py
+++ /dev/null
@@ -1,135 +0,0 @@
-
-import glob
-import json
-import os
-import time
-
-from torch.utils.data import Dataset
-
-from megatron_ds import print_rank_0
-from tasks.data_utils import build_sample
-from tasks.data_utils import build_tokens_types_paddings_from_ids
-from tasks.data_utils import clean_text
-
-
-NUM_CHOICES = 4
-MAX_QA_LENGTH = 128
-
-
-class RaceDataset(Dataset):
-
-    def __init__(self, dataset_name, datapaths, tokenizer, max_seq_length,
-                 max_qa_length=MAX_QA_LENGTH):
-
-        self.dataset_name = dataset_name
-        print_rank_0(' > building RACE dataset for {}:'.format(
-            self.dataset_name))
-
-        string = '  > paths:'
-        for path in datapaths:
-            string += ' ' + path
-        print_rank_0(string)
-
-        self.samples = []
-        for datapath in datapaths:
-            self.samples.extend(process_single_datapath(datapath, tokenizer,
-                                                        max_qa_length,
-                                                        max_seq_length))
-
-        print_rank_0('  >> total number of samples: {}'.format(
-            len(self.samples)))
-
-        # This indicates that each "sample" has multiple samples that
-        # will collapse into batch dimension
-        self.sample_multiplier = NUM_CHOICES
-
-    def __len__(self):
-        return len(self.samples)
-
-    def __getitem__(self, idx):
-        return self.samples[idx]
-
-
-def process_single_datapath(datapath, tokenizer, max_qa_length, max_seq_length):
-    """Read in RACE files, combine, clean-up, tokenize, and convert to
-    samples."""
-
-    print_rank_0('   > working on {}'.format(datapath))
-    start_time = time.time()
-
-    # Get list of files.
-    filenames = glob.glob(os.path.join(datapath, '*.txt'))
-
-    samples = []
-    num_docs = 0
-    num_questions = 0
-    num_samples = 0
-    # Load all the files
-    for filename in filenames:
-        with open(filename, 'r') as f:
-            for line in f:
-                data = json.loads(line)
-                num_docs += 1
-
-                context = data["article"]
-                questions = data["questions"]
-                choices = data["options"]
-                answers = data["answers"]
-                # Check the length.
-                assert len(questions) == len(answers)
-                assert len(questions) == len(choices)
-
-                # Context: clean up and convert to ids.
-                context = clean_text(context)
-                context_ids = tokenizer.tokenize(context)
-
-                # Loop over questions.
-                for qi, question in enumerate(questions):
-                    num_questions += 1
-                    # Label.
-                    label = ord(answers[qi]) - ord("A")
-                    assert label >= 0
-                    assert label < NUM_CHOICES
-                    assert len(choices[qi]) == NUM_CHOICES
-
-                    # For each question, build num-choices samples.
-                    ids_list = []
-                    types_list = []
-                    paddings_list = []
-                    for ci in range(NUM_CHOICES):
-                        choice = choices[qi][ci]
-                        # Merge with choice.
-                        if "_" in question:
-                            qa = question.replace("_", choice)
-                        else:
-                            qa = " ".join([question, choice])
-                        # Clean QA.
-                        qa = clean_text(qa)
-                        # Tokenize.
-                        qa_ids = tokenizer.tokenize(qa)
-                        # Trim if needed.
-                        if len(qa_ids) > max_qa_length:
-                            qa_ids = qa_ids[0:max_qa_length]
-
-                        # Build the sample.
-                        ids, types, paddings \
-                            = build_tokens_types_paddings_from_ids(
-                                qa_ids, context_ids, max_seq_length,
-                                tokenizer.cls, tokenizer.sep, tokenizer.pad)
-
-                        ids_list.append(ids)
-                        types_list.append(types)
-                        paddings_list.append(paddings)
-
-                    # Convert to numpy and add to samples
-                    samples.append(build_sample(ids_list, types_list,
-                                                paddings_list, label,
-                                                num_samples))
-                    num_samples += 1
-
-    elapsed_time = time.time() - start_time
-    print_rank_0('    > processed {} document, {} questions, and {} samples'
-                 ' in {:.2f} seconds'.format(num_docs, num_questions,
-                                             num_samples, elapsed_time))
-
-    return samples
diff --git a/toolbox/Megatron-DeepSpeed/tasks/race/finetune.py b/toolbox/Megatron-DeepSpeed/tasks/race/finetune.py
deleted file mode 100644
index a23128adbc579acfd77b43f7a2162550ff85c89f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/race/finetune.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Race."""
-
-from megatron_ds import get_args
-from megatron_ds import print_rank_0
-from megatron_ds import get_tokenizer
-from megatron_ds.model.multiple_choice import MultipleChoice
-from tasks.eval_utils import accuracy_func_provider
-from tasks.finetune_utils import finetune
-from tasks.race.data import RaceDataset
-from megatron_ds.arguments import core_transformer_config_from_args
-
-
-def train_valid_datasets_provider():
-    """Provide train and validation datasets."""
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    train_dataset = RaceDataset('training', args.train_data,
-                                tokenizer, args.seq_length)
-    valid_dataset = RaceDataset('validation', args.valid_data,
-                                tokenizer, args.seq_length)
-
-    return train_dataset, valid_dataset
-
-
-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
-    config = core_transformer_config_from_args(get_args())
-    print_rank_0('building multichoice model for RACE ...')
-    model = MultipleChoice(config=config,
-                           num_tokentypes=2,
-                           pre_process=pre_process,
-                           post_process=post_process)
-
-    return model
-
-
-def metrics_func_provider():
-    """Privde metrics callback function."""
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    def single_dataset_provider(datapath):
-        name = datapath.split('RACE')[-1].strip('/').replace('/', '-')
-        return RaceDataset(name, [datapath], tokenizer, args.seq_length)
-
-    return accuracy_func_provider(single_dataset_provider)
-
-
-def main():
-
-    finetune(train_valid_datasets_provider, model_provider,
-             end_of_epoch_callback_provider=metrics_func_provider)
diff --git a/toolbox/Megatron-DeepSpeed/tasks/vision/classification/classification.py b/toolbox/Megatron-DeepSpeed/tasks/vision/classification/classification.py
deleted file mode 100644
index d25da0c5f2ccbf2af27d9ec463e05bab39346345..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/vision/classification/classification.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Vision-classification finetuning/evaluation."""
-
-import torch.nn.functional as F
-from functools import partial
-from megatron_ds import get_args, get_timers
-from megatron_ds import print_rank_0
-from megatron_ds.model.vision.classification import VitClassificationModel
-from megatron_ds.data.vit_dataset import build_train_valid_datasets
-from tasks.vision.classification.eval_utils import accuracy_func_provider
-from tasks.vision.finetune_utils import finetune
-from megatron_ds.utils import average_losses_across_data_parallel_group
-
-
-def classification():
-    def train_valid_datasets_provider():
-        """Build train and validation dataset."""
-        args = get_args()
-
-        train_ds, valid_ds = build_train_valid_datasets(
-            data_path=args.data_path,
-            image_size=(args.img_h, args.img_w),
-        )
-        return train_ds, valid_ds
-
-    def model_provider(pre_process=True, post_process=True):
-        """Build the model."""
-        args = get_args()
-
-        print_rank_0("building classification model for ImageNet ...")
-
-        return VitClassificationModel(num_classes=args.num_classes, finetune=True,
-                                      pre_process=pre_process, post_process=post_process)
-
-    def process_batch(batch):
-        """Process batch and produce inputs for the model."""
-        images = batch[0].cuda().contiguous()
-        labels = batch[1].cuda().contiguous()
-        return images, labels
-
-    def cross_entropy_loss_func(labels, output_tensor):
-        logits = output_tensor
-
-        # Cross-entropy loss.
-        loss = F.cross_entropy(logits.contiguous().float(), labels)
-
-        # Reduce loss for logging.
-        averaged_loss = average_losses_across_data_parallel_group([loss])
-
-        return loss, {'lm loss': averaged_loss[0]}
-
-    def _cross_entropy_forward_step(batch, model):
-        """Simple forward step with cross-entropy loss."""
-        timers = get_timers()
-
-        # Get the batch.
-        timers("batch generator", log_level=2).start()
-        try:
-            batch_ = next(batch)
-        except BaseException:
-            batch_ = batch
-        images, labels = process_batch(batch_)
-        timers("batch generator").stop()
-
-        # Forward model.
-        output_tensor = model(images)
-      
-        return output_tensor, partial(cross_entropy_loss_func, labels)
-
-    """Finetune/evaluate."""
-    finetune(
-        train_valid_datasets_provider,
-        model_provider,
-        forward_step=_cross_entropy_forward_step,
-        end_of_epoch_callback_provider=accuracy_func_provider,
-    )
-
-def main():
-    classification()
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/vision/classification/eval_utils.py b/toolbox/Megatron-DeepSpeed/tasks/vision/classification/eval_utils.py
deleted file mode 100644
index 2795bce3e535bdb4b612ad24ef3a4a827aa1e8e4..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/vision/classification/eval_utils.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Evaluation utilities."""
-
-import os
-from functools import partial
-
-import torch
-
-from megatron_ds import get_args
-from megatron_ds import print_rank_0, print_rank_last
-from megatron_ds.core import mpu
-from megatron_ds.schedules import get_forward_backward_func
-from tasks.vision.finetune_utils import build_data_loader
-from tasks.vision.finetune_utils import process_batch
-from torchvision import datasets, transforms
-
-
-def accuracy_func_provider():
-    """Provide function that calculates accuracies."""
-    args = get_args()
-    data_path = args.data_path
-    crop_size = (args.img_h, args.img_w)
-
-    # Build dataloaders.
-    val_data_path = data_path[1]
-    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
-    transform_val = transforms.Compose(
-        [
-            transforms.Resize(crop_size),
-            transforms.CenterCrop(crop_size),
-            transforms.ToTensor(),
-            normalize,
-        ]
-    )
-    dataset = datasets.ImageFolder(root=val_data_path, transform=transform_val)
-
-    dataloader = build_data_loader(
-        dataset,
-        args.micro_batch_size,
-        num_workers=args.num_workers,
-        drop_last=(mpu.get_data_parallel_world_size() > 1),
-        shuffle=False
-    )
-
-    def metrics_func(model, epoch):
-        print_rank_0("calculating metrics ...")
-        correct, total = calculate_correct_answers(model, dataloader, epoch)
-        percent = float(correct) * 100.0 / float(total)
-        print_rank_last(
-            " >> |epoch: {}| overall: correct / total = {} / {} = "
-            "{:.4f} %".format(epoch, correct, total, percent)
-        )
-
-    return metrics_func
-
-
-def calculate_correct_answers(model, dataloader, epoch):
-    """Calculate correct over total answers"""
-
-    forward_backward_func = get_forward_backward_func()
-    for m in model:
-        m.eval()
-
-    def loss_func(labels, output_tensor):
-        logits = output_tensor
-
-        loss_dict = {}
-        # Compute the correct answers.
-        predicted = torch.argmax(logits, dim=-1)
-        corrects = (predicted == labels).float()
-        # Add to the counters.
-        loss_dict['total'] = labels.size(0)
-        loss_dict['correct'] = corrects.sum().item()
-
-        return 0, loss_dict
-
-    #defined inside to capture output_predictions
-    def correct_answers_forward_step(batch, model):
-        try:
-            batch_ = next(batch)
-        except BaseException:
-            batch_ = batch
-        images, labels = process_batch(batch_)
-
-        # Forward model.
-        output_tensor = model(images)
-
-        return output_tensor, partial(loss_func, labels)
-
-    with torch.no_grad():
-        # For all the batches in the dataset.
-        total = 0
-        correct = 0
-        for _, batch in enumerate(dataloader):
-
-            loss_dicts = forward_backward_func(correct_answers_forward_step, batch, model,
-                                               optimizer=None, timers=None, forward_only=True)
-
-            for loss_dict in loss_dicts:
-                total += loss_dict['total']
-                correct += loss_dict['correct']
-
-    for m in model:
-        m.train()
-
-    # Reduce.
-    if mpu.is_pipeline_last_stage():
-        unreduced = torch.cuda.LongTensor([correct, total])
-        torch.distributed.all_reduce(unreduced,
-                                     group=mpu.get_data_parallel_group())
-
-        # Print on screen.
-        correct_ans = unreduced[0].item()
-        total_count = unreduced[1].item()
-        return correct_ans, total_count
diff --git a/toolbox/Megatron-DeepSpeed/tasks/vision/finetune_utils.py b/toolbox/Megatron-DeepSpeed/tasks/vision/finetune_utils.py
deleted file mode 100644
index 7e74aca01d13916c35f66af7477376c95b6c37f9..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/vision/finetune_utils.py
+++ /dev/null
@@ -1,301 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Finetune utilities."""
-
-import torch
-import torch.nn.functional as F
-from megatron_ds import get_args
-from megatron_ds import print_rank_0
-from megatron_ds import get_timers
-from megatron_ds import utils
-from megatron_ds.core import mpu
-from megatron_ds.checkpointing import load_checkpoint
-from megatron_ds.checkpointing import save_checkpoint
-from megatron_ds.training import evaluate_and_print_results
-from megatron_ds.training import setup_model_and_optimizer
-from megatron_ds.training import train_step
-from megatron_ds.training import training_log
-from megatron_ds.utils import check_adlr_autoresume_termination
-from megatron_ds.utils import average_losses_across_data_parallel_group, print_params_min_max_norm
-from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
-from megatron_ds.model import DistributedDataParallel as LocalDDP
-from megatron_ds.model import Float16Module
-from megatron_ds.core.enums import ModelType
-from deepspeed.accelerator import get_accelerator
-
-def process_batch(batch):
-    """Process batch and produce inputs for the model."""
-    images = batch[0].to(get_accelerator().device_name()).contiguous()
-    labels = batch[1].to(get_accelerator().device_name()).contiguous()
-    return images, labels
-
-
-def build_data_loader(dataset, micro_batch_size,
-                      num_workers, drop_last, shuffle):
-    """Data loader. Note that batch-size is the local (per GPU) batch-size."""
-
-    # Sampler.
-    world_size = mpu.get_data_parallel_world_size()
-    rank = mpu.get_data_parallel_rank()
-    sampler = torch.utils.data.distributed.DistributedSampler(
-        dataset, num_replicas=world_size, rank=rank,
-        drop_last=drop_last, shuffle=shuffle
-    )
-
-    # Data loader. Note that batch size is the per GPU batch size.
-    data_loader = torch.utils.data.DataLoader(
-        dataset,
-        batch_size=micro_batch_size,
-        sampler=sampler,
-        shuffle=False,
-        num_workers=num_workers,
-        drop_last=drop_last,
-        pin_memory=True,
-    )
-
-    return data_loader
-
-
-def _build_infinite_size_dataloader(dataloader):
-    """Build a looped dataloader with infinite size."""
-
-    iterator = dataloader.__iter__()
-    while True:
-        try:
-            yield iterator.__next__()
-        except StopIteration:
-            iterator = dataloader.__iter__()
-
-
-def _build_train_valid_dataloaders(train_dataset, valid_dataset):
-    """Traing and validation dataloaders."""
-    args = get_args()
-
-    print_rank_0('building train and validation dataloaders ...')
-    # Training dataset.
-    train_dataloader = build_data_loader(train_dataset, args.micro_batch_size,
-                                         args.num_workers, False, True)
-    # Set the training iterations.
-    args.train_iters_per_epoch = len(train_dataloader)
-    args.train_iters = args.epochs * args.train_iters_per_epoch
-    # Validation dataset. For this dataset, we do not need to set up
-    # shuffling so we can just use a simple infinite loop.
-    valid_dataloader_ = build_data_loader(valid_dataset, args.micro_batch_size,
-                                          args.num_workers, True,  False)
-    valid_dataloader = _build_infinite_size_dataloader(valid_dataloader_)
-
-    # Now that we've built the data loaders, set batch_size arguments
-    # to the actual batch size the model will see for this dataset.
-    # This is necessary so pipeline transfers know what size they are
-    # and the LR schedule, which is based on samples seen, gets set
-    # correctly.
-    args.orig_micro_batch_size = args.micro_batch_size
-    args.orig_global_batch_size = args.global_batch_size
-
-    return train_dataloader, valid_dataloader
-
-
-def _train(
-    model,
-    optimizer,
-    opt_param_scheduler,
-    forward_step,
-    train_dataloader,
-    valid_dataloader,
-    end_of_epoch_callback,
-    process_non_loss_data_func=None
-):
-    """Train the model."""
-    args = get_args()
-    timers = get_timers()
-
-    # Turn on training mode which enables dropout.
-    for m in model:
-        m.train()
-
-    # Tracking loss.
-    losses_dict_sum = {}
-
-    # Starting epoch and iteration
-    start_epoch = args.iteration // args.train_iters_per_epoch
-    start_iteration = args.iteration % args.train_iters_per_epoch
-    iteration = args.iteration
-
-    # Memory reporting flag.
-    report_memory_flag = True
-
-    # For each remaining epoch
-    timers("interval-time", log_level=0).start(barrier=True)
-    for epoch in range(start_epoch, args.epochs):
-        print_rank_0("working on epoch {} ...".format(epoch + 1))
-
-        # Set the data loader epoch to shuffle the index iterator.
-        train_dataloader.sampler.set_epoch(args.seed + epoch)
-        train_dataloader.dataset.set_epoch(epoch)
-
-        # For all the batches in the dataset.
-        for iteration_, batch in enumerate(train_dataloader):
-
-            # Ignore the iterations before starting value
-            if iteration_ < start_iteration:
-                continue
-            # Set to zero so the next epoch does not skip any batches.
-            start_iteration = 0
-
-            # Train for one step.
-            losses_dict, skipped_iter, grad_norm, num_zeros_in_grad = train_step(
-                forward_step, batch, model, optimizer, opt_param_scheduler
-            )
-            iteration += 1
-
-            # Logging.
-            params_norm = None
-
-            report_memory_flag = training_log(
-                losses_dict,
-                losses_dict_sum,
-                optimizer.param_groups[0]["lr"],
-                iteration,
-                optimizer.get_loss_scale().item(),
-                report_memory_flag,
-                skipped_iter,
-                grad_norm,
-                params_norm,
-                num_zeros_in_grad
-            )
-
-            # Autoresume
-            if args.adlr_autoresume and \
-                    iteration % args.adlr_autoresume_interval == 0:
-                check_adlr_autoresume_termination(iteration, model, optimizer,
-                                                  opt_param_scheduler)
-
-            # Checkpointing
-            if args.save and args.save_interval and \
-                    iteration % args.save_interval == 0:
-                save_checkpoint(iteration, model, optimizer,
-                                opt_param_scheduler)
-
-            # Evaluation
-            if args.eval_interval and iteration % args.eval_interval == 0:
-                prefix = "iteration {}".format(iteration)
-                evaluate_and_print_results(
-                    prefix,
-                    forward_step,
-                    valid_dataloader,
-                    model,
-                    iteration,
-                    process_non_loss_data_func,
-                    False,
-                )
-
-        # Callback at the end of each epoch.
-        if end_of_epoch_callback is not None:
-            end_of_epoch_callback(model, epoch)
-
-
-def finetune(
-    train_valid_datasets_provider,
-    model_provider,
-    forward_step,
-    model_type=ModelType.encoder_or_decoder,
-    process_non_loss_data_func=None,
-    end_of_epoch_callback_provider=None,
-):
-    """Main finetune function used across all tasks."""
-    args = get_args()
-    timers = get_timers()
-
-    # Train and validation data loaders.
-    timers("train/valid/test dataset/dataloder", log_level=0).start()
-    if args.epochs > 0:
-        train_dataset, valid_dataset = train_valid_datasets_provider()
-        train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
-            train_dataset, valid_dataset
-        )
-    timers("train/valid/test dataset/dataloder").stop()
-
-    # Build calback function.
-    timers("callback function", log_level=0).start()
-    end_of_epoch_callback = None
-    if end_of_epoch_callback_provider is not None:
-        end_of_epoch_callback = end_of_epoch_callback_provider()
-    timers("callback function").stop()
-
-    # Build model, optimizer and learning rate scheduler.
-    timers("model and optimizer", log_level=0).start()
-    model, optimizer, opt_param_scheduler = \
-        setup_model_and_optimizer(
-            model_provider,
-            model_type,
-            scale_lr_cond=lambda name, param: ".head." in name,
-            lr_mult=args.head_lr_mult)
-    timers("model and optimizer").stop()
-
-    # If pretrained checkpoint is provided and we have not trained for
-    # any iteration (i.e., iteration is zero), then load the pretrained
-    # checkpoint.
-    timers("pretrained checkpoint", log_level=0).start(barrier=True)
-    if args.iteration == 0 and args.pretrained_checkpoint is not None:
-        if args.pretrained_checkpoint_type == 'default':
-            original_load = args.load
-            args.load = args.pretrained_checkpoint
-            _ = load_checkpoint(model, None, None, strict=False)
-            args.load = original_load
-        elif args.pretrained_checkpoint_type == 'external':
-            unwrap_model = utils.unwrap_model(model)
-            state_dict = torch.load(args.pretrained_checkpoint,
-                                    map_location="cpu")
-            unwrap_model[0].module.backbone.load_state_dict(state_dict,
-                                                            strict=False)
-        elif args.pretrained_checkpoint_type == 'constrastive':
-            unwrap_model = utils.unwrap_model(model)
-            state_dict = torch.load(args.pretrained_checkpoint,
-                                    map_location="cpu")
-            state_dict = state_dict["model"]
-            state_dict = {k.replace("teacher.backbone.", ""): v
-                          for k, v in state_dict.items()
-                          if k.startswith("teacher.backbone.")}
-            unwrap_model[0].module.backbone.load_state_dict(state_dict,
-                                                            strict=False)
-        else:
-            raise Exception("pretrained checkpoint type {} not supported".format(args.pretrained_checkpoint_type))
-
-        # This is critical when only model is loaded. We should make sure
-        # master parameters are also updated.
-        optimizer.reload_model_params()
-
-    timers("pretrained checkpoint").stop()
-
-    # Print setup timing.
-    print_rank_0("done with setups ...")
-    timers.log(
-        [
-            "train/valid/test dataset/dataloder",
-            "callback function",
-            "model and optimizer",
-            "pretrained checkpoint",
-        ]
-    )
-    print_rank_0("training ...")
-
-    # Finetune the model.
-    if args.epochs > 0:
-        _train(
-            model,
-            optimizer,
-            opt_param_scheduler,
-            forward_step,
-            train_dataloader,
-            valid_dataloader,
-            end_of_epoch_callback,
-            process_non_loss_data_func,
-        )
-    # Or just evaluate.
-    else:
-        if end_of_epoch_callback is not None:
-            print_rank_0("evaluation only mode, setting epoch to -1")
-            end_of_epoch_callback(model, epoch=-1)
-
-    print_rank_0("done :-)")
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/vision/main.py b/toolbox/Megatron-DeepSpeed/tasks/vision/main.py
deleted file mode 100644
index 3075d410ff406c45c08f4677c4044f3bafeb6508..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/vision/main.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Main tasks functionality."""
-
-import os
-import sys
-
-sys.path.append(
-    os.path.abspath(
-        os.path.join(
-            os.path.join(os.path.dirname(__file__), os.path.pardir),
-            os.path.pardir,
-        )
-    )
-)
-from megatron_ds import get_args
-from megatron_ds.initialize import initialize_megatron
-
-def get_tasks_args(parser):
-    """Provide extra arguments required for tasks."""
-    group = parser.add_argument_group(title="tasks")
-
-    group.add_argument('--task', type=str, default='segment',
-                       choices=['classify', 'segment_setr', 'segment_segformer'],
-                       help='task name.')
-    group.add_argument("--epochs", type=int, default=None,
-                       help="Number of finetunning epochs. Zero results in "
-                       "evaluation only.")
-    group.add_argument('--pretrained-checkpoint-type', type=str, default='default',
-                       choices=['default', 'external', 'constrastive'],
-                       help='Type of pretrained checkpoint')
-    group.add_argument("--pretrained-checkpoint", type=str, default=None,
-                       help="Pretrained checkpoint used for finetunning.")
-    group.add_argument('--seg-stride', type=int, default=None,
-                       help='sliding window stride during evaluation')
-    return parser
-
-
-if __name__ == "__main__":
-
-    initialize_megatron(extra_args_provider=get_tasks_args)
-    args = get_args()
-
-    if args.task == 'classify':
-        from tasks.vision.classification.classification import main
-        main()
-    elif args.task == 'segment_setr':
-        from tasks.vision.segmentation.finetune_setr import main
-        main()
-    elif args.task == 'segment_segformer':
-        from tasks.vision.segmentation.finetune_segformer import main
-        main()
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/cityscapes.py b/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/cityscapes.py
deleted file mode 100644
index 4baf09eee17810874de2de797657bde24631139e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/cityscapes.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# BSD 3-Clause License
-#
-# Copyright (c) Soumith Chintala 2016, 
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of the copyright holder nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-# code taken from 
-# https://github.com/pytorch/vision/blob/main/torchvision/datasets/cityscapes.py
-# modified it to change max label index from 255 to 19 (num_classes)
-
-import torch
-import json
-import os
-from collections import namedtuple
-from typing import Any, Callable, Dict, List, Optional, Union, Tuple
-import numpy as np
-from torchvision.datasets.utils import extract_archive, verify_str_arg, iterable_to_str
-from torchvision.datasets import VisionDataset
-from PIL import Image
-from megatron_ds import print_rank_0
-
-
-class Cityscapes(VisionDataset):
-    """`Cityscapes <http://www.cityscapes-dataset.com/>`_ Dataset.
-    Args:
-        root (string): Root directory of dataset where directory ``leftImg8bit``
-            and ``gtFine`` or ``gtCoarse`` are located.
-        split (string, optional): The image split to use, ``train``, ``test`` or ``val`` if mode="fine"
-            otherwise ``train``, ``train_extra`` or ``val``
-        mode (string, optional): The quality mode to use, ``fine`` or ``coarse``
-        target_type (string or list, optional): Type of target to use, ``instance``, ``semantic``, ``polygon``
-            or ``color``. Can also be a list to output a tuple with all specified target types.
-        transform (callable, optional): A function/transform that takes in a PIL image
-            and returns a transformed version. E.g, ``transforms.RandomCrop``
-        target_transform (callable, optional): A function/transform that takes in the
-            target and transforms it.
-        transforms (callable, optional): A function/transform that takes input sample and its target as entry
-            and returns a transformed version.
-    Examples:
-        Get semantic segmentation target
-        .. code-block:: python
-            dataset = Cityscapes('./data/cityscapes', split='train', mode='fine',
-                                 target_type='semantic')
-            img, smnt = dataset[0]
-        Get multiple targets
-        .. code-block:: python
-            dataset = Cityscapes('./data/cityscapes', split='train', mode='fine',
-                                 target_type=['instance', 'color', 'polygon'])
-            img, (inst, col, poly) = dataset[0]
-        Validate on the "coarse" set
-        .. code-block:: python
-            dataset = Cityscapes('./data/cityscapes', split='val', mode='coarse',
-                                 target_type='semantic')
-            img, smnt = dataset[0]
-    """
-    num_classes = 19
-    ignore_index = 19
-    color_table = torch.tensor(
-        [[128, 64, 128],
-         [244, 35, 232],
-         [70, 70, 70],
-         [102, 102, 156],
-         [190, 153, 153],
-         [153, 153, 153],
-         [250, 170, 30],
-         [220, 220, 0],
-         [107, 142, 35],
-         [152, 251, 152],
-         [70, 130, 180],
-         [220, 20, 60],
-         [255, 0, 0],
-         [0, 0, 142],
-         [0, 0, 70],
-         [0, 60, 100],
-         [0, 80, 100],
-         [0, 0, 230],
-         [119, 11, 32],
-         [0, 0, 0]], dtype=torch.float, device='cuda')
-
-
-    # Based on https://github.com/mcordts/cityscapesScripts
-    CityscapesClass = namedtuple('CityscapesClass', ['name', 'id', 'train_id', 
-        'category', 'category_id', 'has_instances', 'ignore_in_eval', 'color'])
-
-    classes = [
-        CityscapesClass('unlabeled', 0, 19, 'void', 0, False, True, (0, 0, 0)),
-        CityscapesClass('ego vehicle', 1, 19, 'void', 0, False, True, (0, 0, 0)),
-        CityscapesClass('rectification border', 2, 19, 'void', 0, False, True, (0, 0, 0)),
-        CityscapesClass('out of roi', 3, 19, 'void', 0, False, True, (0, 0, 0)),
-        CityscapesClass('static', 4, 19, 'void', 0, False, True, (0, 0, 0)),
-        CityscapesClass('dynamic', 5, 19, 'void', 0, False, True, (111, 74, 0)),
-        CityscapesClass('ground', 6, 19, 'void', 0, False, True, (81, 0, 81)),
-        CityscapesClass('road', 7, 0, 'flat', 1, False, False, (128, 64, 128)),
-        CityscapesClass('sidewalk', 8, 1, 'flat', 1, False, False, (244, 35, 232)),
-        CityscapesClass('parking', 9, 19, 'flat', 1, False, True, (250, 170, 160)),
-        CityscapesClass('rail track', 10, 19, 'flat', 1, False, True, (230, 150, 140)),
-        CityscapesClass('building', 11, 2, 'construction', 2, False, False, (70, 70, 70)),
-        CityscapesClass('wall', 12, 3, 'construction', 2, False, False, (102, 102, 156)),
-        CityscapesClass('fence', 13, 4, 'construction', 2, False, False, (190, 153, 153)),
-        CityscapesClass('guard rail', 14, 19, 'construction', 2, False, True, (180, 165, 180)),
-        CityscapesClass('bridge', 15, 19, 'construction', 2, False, True, (150, 100, 100)),
-        CityscapesClass('tunnel', 16, 19, 'construction', 2, False, True, (150, 120, 90)),
-        CityscapesClass('pole', 17, 5, 'object', 3, False, False, (153, 153, 153)),
-        CityscapesClass('polegroup', 18, 19, 'object', 3, False, True, (153, 153, 153)),
-        CityscapesClass('traffic light', 19, 6, 'object', 3, False, False, (250, 170, 30)),
-        CityscapesClass('traffic sign', 20, 7, 'object', 3, False, False, (220, 220, 0)),
-        CityscapesClass('vegetation', 21, 8, 'nature', 4, False, False, (107, 142, 35)),
-        CityscapesClass('terrain', 22, 9, 'nature', 4, False, False, (152, 251, 152)),
-        CityscapesClass('sky', 23, 10, 'sky', 5, False, False, (70, 130, 180)),
-        CityscapesClass('person', 24, 11, 'human', 6, True, False, (220, 20, 60)),
-        CityscapesClass('rider', 25, 12, 'human', 6, True, False, (255, 0, 0)),
-        CityscapesClass('car', 26, 13, 'vehicle', 7, True, False, (0, 0, 142)),
-        CityscapesClass('truck', 27, 14, 'vehicle', 7, True, False, (0, 0, 70)),
-        CityscapesClass('bus', 28, 15, 'vehicle', 7, True, False, (0, 60, 100)),
-        CityscapesClass('caravan', 29, 19, 'vehicle', 7, True, True, (0, 0, 90)),
-        CityscapesClass('trailer', 30, 19, 'vehicle', 7, True, True, (0, 0, 110)),
-        CityscapesClass('train', 31, 16, 'vehicle', 7, True, False, (0, 80, 100)),
-        CityscapesClass('motorcycle', 32, 17, 'vehicle', 7, True, False, (0, 0, 230)),
-        CityscapesClass('bicycle', 33, 18, 'vehicle', 7, True, False, (119, 11, 32)),
-        CityscapesClass('license plate', -1, -1, 'vehicle', 7, False, True, (0, 0, 142)),
-    ]
-
-    # label2trainid
-    label2trainid   = { label.id  : label.train_id for label in classes}
-
-    def __init__(
-            self,
-            root: str,
-            split: str = "train",
-            mode: str = "fine",
-            resolution: int = 1024,
-            transform: Optional[Callable] = None,
-            target_transform: Optional[Callable] = None,
-            transforms: Optional[Callable] = None,
-    ) -> None:
-        super(Cityscapes, self).__init__(root, transforms, transform, target_transform)
-        self.mode = 'gtFine' if mode == 'fine' else 'gtCoarse'
-        self.images_dir = os.path.join(self.root, 'leftImg8bit_trainvaltest/leftImg8bit', split)
-        self.targets_dir = os.path.join(self.root, 'gtFine_trainvaltest/gtFine', split)
-        self.split = split
-        self.resolution = resolution
-        self.images = []
-        self.targets = []
-
-        for city in sorted(os.listdir(self.images_dir)):
-            img_dir = os.path.join(self.images_dir, city)
-            target_dir = os.path.join(self.targets_dir, city)
-            for file_name in os.listdir(img_dir):
-                target_name = '{}_{}_labelIds.png'.format(file_name.split('_leftImg8bit')[0], self.mode)
-                self.images.append(os.path.join(img_dir, file_name))
-                self.targets.append(os.path.join(target_dir, target_name))
-
-
-    def __getitem__(self, index: int) -> Tuple[Any, Any]:
-        """
-        Args:
-            index (int): Index
-        Returns:
-            tuple: (image, target) where target is a tuple of all target types if target_type is a list with more
-            than one item. Otherwise target is a json object if target_type="polygon", else the image segmentation.
-        """
-        image = Image.open(self.images[index]).convert('RGB')
-        
-        target = Image.open(self.targets[index]) 
-        target = np.array(target)
-
-        target_copy = target.copy()
-        for k, v in Cityscapes.label2trainid.items():
-            binary_target = (target == k)
-            target_copy[binary_target] = v
-        target = target_copy
-
-        target = Image.fromarray(target.astype(np.uint8))
-
-        if self.transforms is not None:
-            image, target = self.transforms(image, target)
-
-        return image, target
-
-    def __len__(self) -> int:
-        # len(self.images)
-        return len(self.images)
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/data.py b/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/data.py
deleted file mode 100644
index 6a6bd288fff176c00ad439afea69ba0d65dc178d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/data.py
+++ /dev/null
@@ -1,154 +0,0 @@
-import random
-import os
-import math
-import mmcv
-import torch
-import numpy as np
-import torchvision.transforms as T
-from torchvision import datasets
-from torch.utils.data import Dataset
-from megatron_ds.data.autoaugment import ImageNetPolicy
-from tasks.vision.segmentation.cityscapes import Cityscapes
-import tasks.vision.segmentation.transforms as ET
-from megatron_ds.data.autoaugment import ImageNetPolicy
-from megatron_ds import get_args
-from PIL import Image, ImageOps
-
-
-class VitSegmentationJointTransform():
-    def __init__(self, train=True, resolution=None):
-        self.train = train
-        if self.train:
-            self.transform0 = ET.RandomSizeAndCrop(resolution)
-            self.transform1 = ET.RandomHorizontallyFlip()
-
-    def __call__(self, img, mask):
-        if self.train:
-            img, mask = self.transform0(img, mask)
-            img, mask = self.transform1(img, mask)
-        return img, mask
-
-
-class VitSegmentationImageTransform():
-    def __init__(self, train=True, resolution=None):
-        args = get_args()
-        self.train = train
-        assert args.fp16 or args.bf16
-        self.data_type = torch.half if args.fp16 else torch.bfloat16
-        self.mean_std = args.mean_std
-        if self.train:
-            assert resolution is not None
-            self.transform = T.Compose([
-                ET.PhotoMetricDistortion(),
-                T.ToTensor(),
-                T.Normalize(*self.mean_std),
-                T.ConvertImageDtype(self.data_type)
-            ])
-        else:
-            self.transform = T.Compose([
-                T.ToTensor(),
-                T.Normalize(*self.mean_std),
-                T.ConvertImageDtype(self.data_type)
-            ])
-
-    def __call__(self, input):
-        output = self.transform(input)
-        return output
-
-
-class VitSegmentationTargetTransform():
-    def __init__(self, train=True, resolution=None):
-        self.train = train
-
-    def __call__(self, input):
-        output = torch.from_numpy(np.array(input, dtype=np.int32)).long()
-        return output
-
-
-class RandomSeedSegmentationDataset(Dataset):
-    def __init__(self,
-                 dataset,
-                 joint_transform,
-                 image_transform,
-                 target_transform):
-
-        args = get_args()
-        self.base_seed = args.seed
-        self.curr_seed = self.base_seed
-        self.dataset = dataset
-        self.joint_transform = joint_transform
-        self.image_transform = image_transform
-        self.target_transform = target_transform
-
-    def __len__(self):
-        return len(self.dataset)
-
-    def set_epoch(self, epoch):
-        self.curr_seed = self.base_seed + 100 * epoch
-
-    def __getitem__(self, idx):
-        seed = idx + self.curr_seed
-        img, mask = self.dataset[idx]
-
-        torch.manual_seed(seed)
-        random.seed(seed)
-        np.random.seed(seed)
-        img, mask = self.joint_transform(img, mask)
-        img = self.image_transform(img)
-        mask = self.target_transform(mask)
-
-        return img, mask
-
-
-def build_cityscapes_train_valid_datasets(data_path, image_size):
-    args = get_args()
-    args.num_classes = Cityscapes.num_classes
-    args.ignore_index = Cityscapes.ignore_index
-    args.color_table = Cityscapes.color_table
-    args.mean_std = ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
-
-    train_joint_transform = \
-        VitSegmentationJointTransform(train=True, resolution=image_size)
-    val_joint_transform = \
-        VitSegmentationJointTransform(train=False, resolution=image_size)
-    train_image_transform = \
-        VitSegmentationImageTransform(train=True, resolution=image_size)
-    val_image_transform = \
-        VitSegmentationImageTransform(train=False, resolution=image_size)
-    train_target_transform = \
-        VitSegmentationTargetTransform(train=True, resolution=image_size)
-    val_target_transform = \
-        VitSegmentationTargetTransform(train=False, resolution=image_size)
-
-    # training dataset
-    train_data = Cityscapes(
-        root=data_path[0],
-        split='train',
-        mode='fine',
-        resolution=image_size
-    )
-    train_data = RandomSeedSegmentationDataset(
-        train_data,
-        joint_transform=train_joint_transform,
-        image_transform=train_image_transform,
-        target_transform=train_target_transform)
-
-    # validation dataset
-    val_data = Cityscapes(
-        root=data_path[0],
-        split='val',
-        mode='fine',
-        resolution=image_size
-    )
-
-    val_data = RandomSeedSegmentationDataset(
-        val_data,
-        joint_transform=val_joint_transform,
-        image_transform=val_image_transform,
-        target_transform=val_target_transform)
-
-    return train_data, val_data
-
-
-def build_train_valid_datasets(data_path, image_size):
-    return build_cityscapes_train_valid_datasets(data_path, image_size)
diff --git a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/finetune_segformer.py b/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/finetune_segformer.py
deleted file mode 100644
index 52be1df00c7687150ee5eb5cd17eec1ccaec5d8b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/finetune_segformer.py
+++ /dev/null
@@ -1,239 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Vision-classification finetuning/evaluation."""
-
-import numpy as np
-import torch
-import torch.nn.functional as F
-from functools import partial
-from megatron_ds import get_args, get_timers
-from megatron_ds import print_rank_0, print_rank_last
-from megatron_ds.core import mpu
-from tasks.vision.finetune_utils import finetune
-from tasks.vision.finetune_utils import build_data_loader
-from megatron_ds.utils import average_losses_across_data_parallel_group
-from megatron_ds.schedules import get_forward_backward_func
-from tasks.vision.segmentation.data import build_train_valid_datasets
-from tasks.vision.segmentation.seg_models import SegformerSegmentationModel
-from megatron_ds.model.vision.utils import resize
-
-
-def calculate_iou(hist_data):
-    acc = np.diag(hist_data).sum() / hist_data.sum()
-    acc_cls = np.diag(hist_data) / hist_data.sum(axis=1)
-    acc_cls = np.nanmean(acc_cls)
-    divisor = hist_data.sum(axis=1) + hist_data.sum(axis=0) - \
-        np.diag(hist_data)
-    iu = np.diag(hist_data) / divisor
-    return iu, acc, acc_cls
-
-
-def fast_hist(pred, gtruth, num_classes):
-    # mask indicates pixels we care about
-    mask = (gtruth >= 0) & (gtruth < num_classes)
-
-    # stretch ground truth labels by num_classes
-    #   class 0  -> 0
-    #   class 1  -> 19
-    #   class 18 -> 342
-    #
-    # TP at 0 + 0, 1 + 1, 2 + 2 ...
-    #
-    # TP exist where value == num_classes*class_id + class_id
-    # FP = row[class].sum() - TP
-    # FN = col[class].sum() - TP
-    hist = np.bincount(num_classes * gtruth[mask].astype(int) + pred[mask],
-                       minlength=num_classes ** 2)
-    hist = hist.reshape(num_classes, num_classes)
-    return hist
-
-
-def segmentation():
-
-    def train_valid_datasets_provider():
-        """Build train and validation dataset."""
-        args = get_args()
-
-        train_ds, valid_ds = build_train_valid_datasets(
-            data_path=args.data_path,
-            image_size=(args.img_h, args.img_w)
-
-        )
-        return train_ds, valid_ds
-
-    def model_provider(pre_process=True, post_process=True):
-        """Build the model."""
-        args = get_args()
-
-        model = SegformerSegmentationModel(num_classes=args.num_classes,
-                                           pre_process=pre_process,
-                                           post_process=post_process)
-        print_rank_0("model = {}".format(model))
-        return model
-
-    def process_batch(batch):
-        """Process batch and produce inputs for the model."""
-        images = batch[0].cuda().contiguous()
-        masks = batch[1].cuda().contiguous()
-        return images, masks
-
-    def calculate_weight(masks, num_classes):
-        bins = torch.histc(masks, bins=num_classes, min=0.0, max=num_classes)
-        hist_norm = bins.float()/bins.sum()
-        hist = ((bins != 0).float() * (1. - hist_norm)) + 1.0
-        return hist
-
-    def cross_entropy_loss_func(images, masks, output_tensor,
-                                non_loss_data=False):
-        args = get_args()
-        ignore_index = args.ignore_index
-        color_table = args.color_table
-        logits = output_tensor.contiguous().float()
-        logits = resize(logits, size=masks.shape[1:],
-                        mode='bilinear', align_corners=False)
-      
-        # Cross-entropy loss.
-        # weight = calculate_weight(masks, num_classes)
-        loss = F.cross_entropy(logits, masks, ignore_index=ignore_index)
-
-        if not non_loss_data:
-            # Reduce loss for logging.
-            averaged_loss = average_losses_across_data_parallel_group([loss])
-            return loss, {'lm loss': averaged_loss[0]}
-        else:
-            seg_mask = logits.argmax(dim=1)
-            output_mask = F.embedding(seg_mask, color_table).permute(0, 3, 1, 2)
-            gt_mask = F.embedding(masks, color_table).permute(0, 3, 1, 2)
-            return torch.cat((images, output_mask, gt_mask), dim=2), loss
-
-    def _cross_entropy_forward_step(batch, model):
-        """Simple forward step with cross-entropy loss."""
-        timers = get_timers()
-
-        # Get the batch.
-        timers("batch generator", log_level=2).start()
-        import types
-        if isinstance(batch, types.GeneratorType):
-            batch_ = next(batch)
-        else:
-            batch_ = batch
-        images, masks = process_batch(batch_)
-        timers("batch generator").stop()
-
-        # Forward model.
-        output_tensor = model(images)
-
-        return output_tensor, partial(cross_entropy_loss_func, images, masks)
-
-    def calculate_correct_answers(model, dataloader, epoch):
-        """Calculate correct over total answers"""
-
-        forward_backward_func = get_forward_backward_func()
-        for m in model:
-            m.eval()
-
-        def loss_func(labels, output_tensor):
-            args = get_args()
-            logits = output_tensor
-            logits = resize(logits, size=labels.shape[1:],
-                            mode='bilinear', align_corners=False)
-
-            loss_dict = {}
-            # Compute the correct answers.
-            probs = logits.contiguous().float().softmax(dim=1)
-            max_probs, preds = torch.max(probs, 1)
-
-            preds = preds.cpu().numpy()
-            performs = fast_hist(preds.flatten(),
-                                 labels.cpu().numpy().flatten(),
-                                 args.ignore_index)
-            loss_dict['performs'] = performs
-            return 0, loss_dict
-
-        # defined inside to capture output_predictions
-        def correct_answers_forward_step(batch, model):
-            try:
-                batch_ = next(batch)
-            except BaseException:
-                batch_ = batch
-            images, labels = process_batch(batch_)
-
-            # Forward model.
-            output_tensor = model(images)
-
-            return output_tensor, partial(loss_func, labels)
-
-        with torch.no_grad():
-            # For all the batches in the dataset.
-            performs = None
-            for _, batch in enumerate(dataloader):
-                loss_dicts = forward_backward_func(correct_answers_forward_step,
-                                                   batch, model,
-                                                   optimizer=None,
-                                                   timers=None,
-                                                   forward_only=True)
-                for loss_dict in loss_dicts:
-                    if performs is None:
-                        performs = loss_dict['performs']
-                    else:
-                        performs += loss_dict['performs']
-
-        for m in model:
-            m.train()
-        # Reduce.
-        if mpu.is_pipeline_last_stage():
-            performs_tensor = torch.cuda.FloatTensor(performs)
-            torch.distributed.all_reduce(performs_tensor,
-                                         group=mpu.get_data_parallel_group())
-            hist = performs_tensor.cpu().numpy()
-            iu, acc, acc_cls = calculate_iou(hist)
-            miou = np.nanmean(iu)
-
-            return iu, miou
-
-    def accuracy_func_provider():
-        """Provide function that calculates accuracies."""
-        args = get_args()
-
-        train_ds, valid_ds = build_train_valid_datasets(
-            data_path=args.data_path,
-            image_size=(args.img_h, args.img_w)
-        )
-        dataloader = build_data_loader(
-            valid_ds,
-            args.micro_batch_size,
-            num_workers=args.num_workers,
-            drop_last=(mpu.get_data_parallel_world_size() > 1),
-            shuffle=False
-        )
-
-        def metrics_func(model, epoch):
-            print_rank_0("calculating metrics ...")
-            iou, miou = calculate_correct_answers(model, dataloader, epoch)
-            print_rank_last(
-                " >> |epoch: {}| overall: iou = {},"
-                "miou = {:.4f} %".format(epoch, iou, miou*100.0)
-            )
-        return metrics_func
-
-    def dump_output_data(data, iteration, writer):
-        for (output_tb, loss) in data:
-            # output_tb[output_tb < 0] = 0
-            # output_tb[output_tb > 1] = 1
-            writer.add_images("image-outputseg-realseg", output_tb,
-                              global_step=None, walltime=None,
-                              dataformats='NCHW')
-
-    """Finetune/evaluate."""
-    finetune(
-        train_valid_datasets_provider,
-        model_provider,
-        forward_step=_cross_entropy_forward_step,
-        process_non_loss_data_func=dump_output_data,
-        end_of_epoch_callback_provider=accuracy_func_provider,
-    )
-
-
-def main():
-    segmentation()
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/finetune_setr.py b/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/finetune_setr.py
deleted file mode 100644
index 868d4fb758dbfc8cc06bec56786919dc0f3dccf8..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/finetune_setr.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Vision-classification finetuning/evaluation."""
-
-import torch
-import torch.nn.functional as F
-from functools import partial
-from megatron_ds import get_args, get_timers
-from megatron_ds import print_rank_0, print_rank_last
-from megatron_ds.core import mpu
-from tasks.vision.finetune_utils import finetune
-from tasks.vision.finetune_utils import build_data_loader
-from megatron_ds.utils import average_losses_across_data_parallel_group
-from megatron_ds.schedules import get_forward_backward_func
-from tasks.vision.segmentation.metrics import CFMatrix
-from tasks.vision.segmentation.data import build_train_valid_datasets
-from tasks.vision.segmentation.seg_models import SetrSegmentationModel
-from tasks.vision.segmentation.utils import slidingcrops, slidingjoins
-
-def segmentation():
-    def train_valid_datasets_provider():
-        """Build train and validation dataset."""
-        args = get_args()
-
-        train_ds, valid_ds = build_train_valid_datasets(
-            data_path=args.data_path,
-            image_size=(args.img_h, args.img_w)
-
-        )
-        return train_ds, valid_ds
-
-    def model_provider(pre_process=True, post_process=True):
-        """Build the model."""
-        args = get_args()
-
-        return SetrSegmentationModel(num_classes=args.num_classes,
-                                     pre_process=pre_process,
-                                     post_process=post_process)
-
-    def process_batch(batch):
-        """Process batch and produce inputs for the model."""
-        images = batch[0].cuda().contiguous()
-        masks = batch[1].cuda().contiguous()
-        return images, masks
-
-    def calculate_weight(masks, num_classes):
-        bins = torch.histc(masks, bins=num_classes, min=0.0, max=num_classes)
-        hist_norm = bins.float()/bins.sum()
-        hist = ((bins != 0).float() * (1. - hist_norm)) + 1.0
-        return hist
-
-    def cross_entropy_loss_func(images, masks, output_tensor, non_loss_data=False):
-        args = get_args()
-        ignore_index = args.ignore_index
-        color_table = args.color_table
-        weight = calculate_weight(masks, args.num_classes)
-        logits = output_tensor.contiguous().float()
-        loss = F.cross_entropy(logits, masks, weight=weight, ignore_index=ignore_index)
-
-        if not non_loss_data:
-            # Reduce loss for logging.
-            averaged_loss = average_losses_across_data_parallel_group([loss])
-
-            return loss, {'lm loss': averaged_loss[0]}
-        else:
-            seg_mask = logits.argmax(dim=1)
-            output_mask = F.embedding(seg_mask, color_table).permute(0, 3, 1, 2)
-            gt_mask = F.embedding(masks, color_table).permute(0, 3, 1, 2)
-            return torch.cat((images, output_mask, gt_mask), dim=2), loss
-
-    def _cross_entropy_forward_step(batch, model):
-        """Simple forward step with cross-entropy loss."""
-        args = get_args()
-        timers = get_timers()
-
-        # Get the batch.
-        timers("batch generator", log_level=2).start()
-        import types
-        if isinstance(batch, types.GeneratorType):
-            batch_ = next(batch)
-        else:
-            batch_ = batch
-        images, masks = process_batch(batch_)
-        timers("batch generator").stop()
-
-        # Forward model.
-        if not model.training:
-            images, masks, _, _ = slidingcrops(images, masks)
-        #print_rank_0("images size = {}".format(images.size()))
-       
-        if not model.training:
-            output_tensor = torch.cat([model(image) for image in torch.split(images, args.micro_batch_size)])
-        else:
-            output_tensor = model(images)
-
-        return output_tensor, partial(cross_entropy_loss_func, images, masks)
-
-    def calculate_correct_answers(model, dataloader, epoch):
-        """Calculate correct over total answers"""
-
-        forward_backward_func = get_forward_backward_func()
-        for m in model:
-            m.eval()
-
-        def loss_func(labels, slices_info, img_size, output_tensor):
-            args = get_args()
-            logits = output_tensor
-
-            loss_dict = {}
-            # Compute the correct answers.
-            probs = logits.contiguous().float().softmax(dim=1)
-            max_probs, preds = torch.max(probs, 1)
-            preds = preds.int()
-            preds, labels = slidingjoins(preds, max_probs, labels, slices_info, img_size)
-            _, performs = CFMatrix()(preds, labels, args.ignore_index)
-
-            loss_dict['performs'] = performs
-            return 0, loss_dict
-
-        # defined inside to capture output_predictions
-        def correct_answers_forward_step(batch, model):
-            args = get_args()
-            try:
-                batch_ = next(batch)
-            except BaseException:
-                batch_ = batch
-            images, labels = process_batch(batch_)
-
-            assert not model.training
-            images, labels, slices_info, img_size = slidingcrops(images, labels)
-            # Forward model.
-            output_tensor = torch.cat([model(image) for image in torch.split(images, args.micro_batch_size)])
-
-            return output_tensor, partial(loss_func, labels, slices_info, img_size)
-
-        with torch.no_grad():
-            # For all the batches in the dataset.
-            performs = None
-            for _, batch in enumerate(dataloader):
-                loss_dicts = forward_backward_func(correct_answers_forward_step,
-                                                   batch, model,
-                                                   optimizer=None,
-                                                   timers=None,
-                                                   forward_only=True)
-                for loss_dict in loss_dicts:
-                    if performs is None:
-                        performs = loss_dict['performs']
-                    else:
-                        performs += loss_dict['performs']
-
-        for m in model:
-            m.train()
-        # Reduce.
-        if mpu.is_pipeline_last_stage():
-            torch.distributed.all_reduce(performs,
-                                         group=mpu.get_data_parallel_group())
-            # Print on screen.
-            # performs[int(ch), :] = [nb_tp, nb_fp, nb_tn, nb_fn]
-            true_positive = performs[:, 0]
-            false_positive = performs[:, 1]
-            false_negative = performs[:, 3]
-
-            iou = true_positive / (true_positive + false_positive + false_negative)
-            miou = iou[~torch.isnan(iou)].mean()
-
-            return iou.tolist(), miou.item()
-
-    def accuracy_func_provider():
-        """Provide function that calculates accuracies."""
-        args = get_args()
-
-        train_ds, valid_ds = build_train_valid_datasets(
-            data_path=args.data_path,
-            image_size=(args.img_h, args.img_w)
-        )
-        dataloader = build_data_loader(
-            valid_ds,
-            args.micro_batch_size,
-            num_workers=args.num_workers,
-            drop_last=(mpu.get_data_parallel_world_size() > 1),
-            shuffle=False
-        )
-
-        def metrics_func(model, epoch):
-            print_rank_0("calculating metrics ...")
-            iou, miou = calculate_correct_answers(model, dataloader, epoch)
-            print_rank_last(
-                " >> |epoch: {}| overall: iou = {},"
-                "miou = {:.4f} %".format(epoch, iou, miou*100.0)
-            )
-        return metrics_func
-
-    def dump_output_data(data, iteration, writer):
-        for (output_tb, loss) in data:
-            # output_tb[output_tb < 0] = 0
-            # output_tb[output_tb > 1] = 1
-            writer.add_images("image-outputseg-realseg", output_tb,
-                              global_step=None, walltime=None,
-                              dataformats='NCHW')
-
-    """Finetune/evaluate."""
-    finetune(
-        train_valid_datasets_provider,
-        model_provider,
-        forward_step=_cross_entropy_forward_step,
-        process_non_loss_data_func=dump_output_data,
-        end_of_epoch_callback_provider=accuracy_func_provider,
-    )
-
-
-def main():
-    segmentation()
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/metrics.py b/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/metrics.py
deleted file mode 100644
index 750c10a90da5dd41c7d28b7f19041cf5e2d333b2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/metrics.py
+++ /dev/null
@@ -1,594 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
-#copyright (c) go-hiroaki & Chokurei
-#email: guangmingwu2010@gmail.com 
-#       guozhilingty@gmail.com
-#
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-eps = 1e-6
-
-def _binarize(y_data, threshold):
-    """
-    args:
-        y_data : [float] 4-d tensor in [batch_size, channels, img_rows, img_cols]
-        threshold : [float] [0.0, 1.0]
-    return 4-d binarized y_data
-    """
-    y_data[y_data < threshold] = 0.0
-    y_data[y_data >= threshold] = 1.0
-    return y_data
-
-def _argmax(y_data, dim):
-    """
-    args:
-        y_data : 4-d tensor in [batch_size, chs, img_rows, img_cols]
-        dim : int
-    return 3-d [int] y_data
-    """
-    return torch.argmax(y_data, dim).int()
-
-
-def _get_tp(y_pred, y_true):
-    """
-    args:
-        y_true : [int] 3-d in [batch_size, img_rows, img_cols]
-        y_pred : [int] 3-d in [batch_size, img_rows, img_cols]
-    return [float] true_positive
-    """
-    return torch.sum(y_true * y_pred).float()
-
-
-def _get_fp(y_pred, y_true):
-    """
-    args:
-        y_true : 3-d ndarray in [batch_size, img_rows, img_cols]
-        y_pred : 3-d ndarray in [batch_size, img_rows, img_cols]
-    return [float] false_positive
-    """
-    return torch.sum((1 - y_true) * y_pred).float()
-
-
-def _get_tn(y_pred, y_true):
-    """
-    args:
-        y_true : 3-d ndarray in [batch_size, img_rows, img_cols]
-        y_pred : 3-d ndarray in [batch_size, img_rows, img_cols]
-    return [float] true_negative
-    """
-    return torch.sum((1 - y_true) * (1 - y_pred)).float()
-
-
-def _get_fn(y_pred, y_true):
-    """
-    args:
-        y_true : 3-d ndarray in [batch_size, img_rows, img_cols]
-        y_pred : 3-d ndarray in [batch_size, img_rows, img_cols]
-    return [float] false_negative
-    """
-    return torch.sum(y_true * (1 - y_pred)).float()
-
-
-def _get_weights(y_true, nb_ch):
-    """
-    args:
-        y_true : 3-d ndarray in [batch_size, img_rows, img_cols]
-        nb_ch : int 
-    return [float] weights
-    """
-    batch_size, img_rows, img_cols = y_true.shape
-    pixels = batch_size * img_rows * img_cols
-    weights = [torch.sum(y_true==ch).item() / pixels for ch in range(nb_ch)]
-    return weights
-
-
-class CFMatrix(object):
-    def __init__(self, des=None):
-        self.des = des
-
-    def __repr__(self):
-        return "ConfusionMatrix"
-
-    def __call__(self, y_pred, y_true, ignore_index, threshold=0.5):
-
-        """
-        args:
-            y_true : 3-d ndarray in [batch_size, img_rows, img_cols]
-            y_pred : 4-d ndarray in [batch_size, chs, img_rows, img_cols]
-            threshold : [0.0, 1.0]
-        return confusion matrix
-        """
-        batch_size, img_rows, img_cols = y_pred.shape
-        chs = ignore_index
-        device = y_true.device
-        if chs == 1:
-            y_pred = _binarize(y_pred, threshold)
-            y_true = _binarize(y_true, threshold)
-            nb_tp = _get_tp(y_pred, y_true)
-            nb_fp = _get_fp(y_pred, y_true)
-            nb_tn = _get_tn(y_pred, y_true)
-            nb_fn = _get_fn(y_pred, y_true)
-            mperforms = [nb_tp, nb_fp, nb_tn, nb_fn]
-            performs = None
-        else:
-            performs = torch.zeros(chs, 4).to(device)
-            weights = _get_weights(y_true, chs)
-            for ch in range(chs):
-                y_true_ch = torch.zeros(batch_size, img_rows, img_cols)
-                y_false_ch = torch.zeros(batch_size, img_rows, img_cols)
-                y_pred_ch = torch.zeros(batch_size, img_rows, img_cols)
-                y_true_ch[y_true == ch] = 1
-                y_false_ch[torch.logical_and((y_true != ch), (y_true != ignore_index))] = 1
-                y_pred_ch[y_pred == ch] = 1
-                nb_tp = _get_tp(y_pred_ch, y_true_ch)
-                nb_fp = torch.sum(y_false_ch * y_pred_ch).float()
-                nb_tn = torch.sum(y_false_ch * (1 - y_pred_ch)).float()
-                nb_fn = _get_fn(y_pred_ch, y_true_ch)
-                performs[int(ch), :] = torch.FloatTensor([nb_tp, nb_fp, nb_tn, nb_fn])
-            mperforms = sum([i*j for (i, j) in zip(performs, weights)])
-        return mperforms, performs
-
-
-class OAAcc(object):
-    def __init__(self, des="Overall Accuracy"):
-        self.des = des
-
-    def __repr__(self):
-        return "OAcc"
-
-    def __call__(self, y_pred, y_true, threshold=0.5):
-        """
-        args:
-            y_true : 4-d ndarray in [batch_size, chs, img_rows, img_cols]
-            y_pred : 4-d ndarray in [batch_size, chs, img_rows, img_cols]
-            threshold : [0.0, 1.0]
-        return (tp+tn)/total
-        """
-        batch_size, chs, img_rows, img_cols = y_true.shape
-        device = y_true.device
-        if chs == 1:
-            y_pred = _binarize(y_pred, threshold)
-            y_true = _binarize(y_true, threshold)
-        else:
-            y_pred = _argmax(y_pred, 1)
-            y_true = _argmax(y_true, 1)
-
-        nb_tp_tn = torch.sum(y_true == y_pred).float()
-        mperforms = nb_tp_tn / (batch_size * img_rows * img_cols)
-        performs = None
-        return mperforms, performs
-
-
-class Precision(object):
-    def __init__(self, des="Precision"):
-        self.des = des
-
-    def __repr__(self):
-        return "Prec"
-
-    def __call__(self, y_pred, y_true, threshold=0.5):
-        """
-        args:
-            y_true : 4-d ndarray in [batch_size, chs, img_rows, img_cols]
-            y_pred : 4-d ndarray in [batch_size, chs, img_rows, img_cols]
-            threshold : [0.0, 1.0]
-        return tp/(tp+fp)
-        """
-        batch_size, chs, img_rows, img_cols = y_true.shape
-        device = y_true.device
-        if chs == 1:
-            y_pred = _binarize(y_pred, threshold)
-            y_true = _binarize(y_true, threshold)
-            nb_tp = _get_tp(y_pred, y_true)
-            nb_fp = _get_fp(y_pred, y_true)
-            mperforms = nb_tp / (nb_tp + nb_fp + esp)
-            performs = None
-        else:
-            y_pred = _argmax(y_pred, 1)
-            y_true = _argmax(y_true, 1)
-            performs = torch.zeros(chs, 1).to(device)
-            weights = _get_weights(y_true, chs)
-            for ch in range(chs):
-                y_true_ch = torch.zeros(batch_size, img_rows, img_cols)
-                y_pred_ch = torch.zeros(batch_size, img_rows, img_cols)
-                y_true_ch[y_true == ch] = 1
-                y_pred_ch[y_pred == ch] = 1
-                nb_tp = _get_tp(y_pred_ch, y_true_ch)
-                nb_fp = _get_fp(y_pred_ch, y_true_ch)
-                performs[int(ch)] = nb_tp / (nb_tp + nb_fp + esp)
-            mperforms = sum([i*j for (i, j) in zip(performs, weights)])
-        return mperforms, performs
-
-
-class Recall(object):
-    def __init__(self, des="Recall"):
-        self.des = des
-
-    def __repr__(self):
-        return "Reca"
-
-    def __call__(self, y_pred, y_true, threshold=0.5):
-        """
-        args:
-            y_true : 4-d ndarray in [batch_size, chs, img_rows, img_cols]
-            y_pred : 4-d ndarray in [batch_size, chs, img_rows, img_cols]
-            threshold : [0.0, 1.0]
-        return tp/(tp+fn)
-        """
-        batch_size, chs, img_rows, img_cols = y_true.shape
-        device = y_true.device
-        if chs == 1:
-            y_pred = _binarize(y_pred, threshold)
-            y_true = _binarize(y_true, threshold)
-            nb_tp = _get_tp(y_pred, y_true)
-            nb_fn = _get_fn(y_pred, y_true)
-            mperforms = nb_tp / (nb_tp + nb_fn + esp)
-            performs = None
-        else:
-            y_pred = _argmax(y_pred, 1)
-            y_true = _argmax(y_true, 1)
-            performs = torch.zeros(chs, 1).to(device)
-            weights = _get_weights(y_true, chs)
-            for ch in range(chs):
-                y_true_ch = torch.zeros(batch_size, img_rows, img_cols)
-                y_pred_ch = torch.zeros(batch_size, img_rows, img_cols)
-                y_true_ch[y_true == ch] = 1
-                y_pred_ch[y_pred == ch] = 1
-                nb_tp = _get_tp(y_pred_ch, y_true_ch)
-                nb_fn = _get_fn(y_pred_ch, y_true_ch)
-                performs[int(ch)] = nb_tp / (nb_tp + nb_fn + esp)
-            mperforms = sum([i*j for (i, j) in zip(performs, weights)])
-        return mperforms, performs
-
-
-class F1Score(object):
-    def __init__(self, des="F1Score"):
-        self.des = des
-
-    def __repr__(self):
-        return "F1Sc"
-
-    def __call__(self, y_pred, y_true, threshold=0.5):
-
-        """
-        args:
-            y_true : 4-d ndarray in [batch_size, chs, img_rows, img_cols]
-            y_pred : 4-d ndarray in [batch_size, chs, img_rows, img_cols]
-            threshold : [0.0, 1.0]
-        return 2*precision*recall/(precision+recall)
-        """
-        batch_size, chs, img_rows, img_cols = y_true.shape
-        device = y_true.device
-        if chs == 1:
-            y_pred = _binarize(y_pred, threshold)
-            y_true = _binarize(y_true, threshold)
-            nb_tp = _get_tp(y_pred, y_true)
-            nb_fp = _get_fp(y_pred, y_true)
-            nb_fn = _get_fn(y_pred, y_true)
-            _precision = nb_tp / (nb_tp + nb_fp + esp)
-            _recall = nb_tp / (nb_tp + nb_fn + esp)
-            mperforms = 2 * _precision * _recall / (_precision + _recall + esp)
-            performs = None
-        else:
-            y_pred = _argmax(y_pred, 1)
-            y_true = _argmax(y_true, 1)
-            performs = torch.zeros(chs, 1).to(device)
-            weights = _get_weights(y_true, chs)
-            for ch in range(chs):
-                y_true_ch = torch.zeros(batch_size, img_rows, img_cols)
-                y_pred_ch = torch.zeros(batch_size, img_rows, img_cols)
-                y_true_ch[y_true == ch] = 1
-                y_pred_ch[y_pred == ch] = 1
-                nb_tp = _get_tp(y_pred_ch, y_true_ch)
-                nb_fp = _get_fp(y_pred_ch, y_true_ch)
-                nb_fn = _get_fn(y_pred_ch, y_true_ch)
-                _precision = nb_tp / (nb_tp + nb_fp + esp)
-                _recall = nb_tp / (nb_tp + nb_fn + esp)
-                performs[int(ch)] = 2 * _precision * \
-                    _recall / (_precision + _recall + esp)
-            mperforms = sum([i*j for (i, j) in zip(performs, weights)])
-        return mperforms, performs
-
-
-class Kappa(object):
-    def __init__(self, des="Kappa"):
-        self.des = des
-
-    def __repr__(self):
-        return "Kapp"
-
-    def __call__(self, y_pred, y_true, threshold=0.5):
-
-        """
-        args:
-            y_true : 4-d ndarray in [batch_size, chs, img_rows, img_cols]
-            y_pred : 4-d ndarray in [batch_size, chs, img_rows, img_cols]
-            threshold : [0.0, 1.0]
-        return (Po-Pe)/(1-Pe)
-        """
-        batch_size, chs, img_rows, img_cols = y_true.shape
-        device = y_true.device
-        if chs == 1:
-            y_pred = _binarize(y_pred, threshold)
-            y_true = _binarize(y_true, threshold)
-            nb_tp = _get_tp(y_pred, y_true)
-            nb_fp = _get_fp(y_pred, y_true)
-            nb_tn = _get_tn(y_pred, y_true)
-            nb_fn = _get_fn(y_pred, y_true)
-            nb_total = nb_tp + nb_fp + nb_tn + nb_fn
-            Po = (nb_tp + nb_tn) / nb_total
-            Pe = ((nb_tp + nb_fp) * (nb_tp + nb_fn) +
-                  (nb_fn + nb_tn) * (nb_fp + nb_tn)) / (nb_total**2)
-            mperforms = (Po - Pe) / (1 - Pe + esp)
-            performs = None
-        else:
-            y_pred = _argmax(y_pred, 1)
-            y_true = _argmax(y_true, 1)
-            performs = torch.zeros(chs, 1).to(device)
-            weights = _get_weights(y_true, chs)
-            for ch in range(chs):
-                y_true_ch = torch.zeros(batch_size, img_rows, img_cols)
-                y_pred_ch = torch.zeros(batch_size, img_rows, img_cols)
-                y_true_ch[y_true == ch] = 1
-                y_pred_ch[y_pred == ch] = 1
-                nb_tp = _get_tp(y_pred_ch, y_true_ch)
-                nb_fp = _get_fp(y_pred_ch, y_true_ch)
-                nb_tn = _get_tn(y_pred_ch, y_true_ch)
-                nb_fn = _get_fn(y_pred_ch, y_true_ch)
-                nb_total = nb_tp + nb_fp + nb_tn + nb_fn
-                Po = (nb_tp + nb_tn) / nb_total
-                Pe = ((nb_tp + nb_fp) * (nb_tp + nb_fn)
-                      + (nb_fn + nb_tn) * (nb_fp + nb_tn)) / (nb_total**2)
-                performs[int(ch)] = (Po - Pe) / (1 - Pe + esp)
-            mperforms = sum([i*j for (i, j) in zip(performs, weights)])
-        return mperforms, performs
-
-
-class Jaccard(object):
-    def __init__(self, des="Jaccard"):
-        self.des = des
-
-    def __repr__(self):
-        return "Jacc"
-
-    def __call__(self, y_pred, y_true, threshold=0.5):
-        """
-        args:
-            y_true : 4-d ndarray in [batch_size, chs, img_rows, img_cols]
-            y_pred : 4-d ndarray in [batch_size, chs, img_rows, img_cols]
-            threshold : [0.0, 1.0]
-        return intersection / (sum-intersection)
-        """
-        batch_size, chs, img_rows, img_cols = y_true.shape
-        device = y_true.device
-        if chs == 1:
-            y_pred = _binarize(y_pred, threshold)
-            y_true = _binarize(y_true, threshold)
-            _intersec = torch.sum(y_true * y_pred).float()
-            _sum = torch.sum(y_true + y_pred).float()
-            mperforms = _intersec / (_sum - _intersec + esp)
-            performs = None
-        else:
-            y_pred = _argmax(y_pred, 1)
-            y_true = _argmax(y_true, 1)
-            performs = torch.zeros(chs, 1).to(device)
-            weights = _get_weights(y_true, chs)
-            for ch in range(chs):
-                y_true_ch = torch.zeros(batch_size, img_rows, img_cols)
-                y_pred_ch = torch.zeros(batch_size, img_rows, img_cols)
-                y_true_ch[y_true == ch] = 1
-                y_pred_ch[y_pred == ch] = 1
-                _intersec = torch.sum(y_true_ch * y_pred_ch).float()
-                _sum = torch.sum(y_true_ch + y_pred_ch).float()
-                performs[int(ch)] = _intersec / (_sum - _intersec + esp)
-            mperforms = sum([i*j for (i, j) in zip(performs, weights)])
-        return mperforms, performs
-
-
-class MSE(object):
-    def __init__(self, des="Mean Square Error"):
-        self.des = des
-
-    def __repr__(self):
-        return "MSE"
-
-    def __call__(self, y_pred, y_true, dim=1, threshold=None):
-        """
-        args:
-            y_true : 4-d ndarray in [batch_size, channels, img_rows, img_cols]
-            y_pred : 4-d ndarray in [batch_size, channels, img_rows, img_cols]
-            threshold : [0.0, 1.0]
-        return mean_squared_error, smaller the better
-        """
-        if threshold:
-            y_pred = _binarize(y_pred, threshold)
-        return torch.mean((y_pred - y_true) ** 2)
-
-
-class PSNR(object):
-    def __init__(self, des="Peak Signal to Noise Ratio"):
-        self.des = des
-
-    def __repr__(self):
-        return "PSNR"
-
-    def __call__(self, y_pred, y_true, dim=1, threshold=None):
-        """
-        args:
-            y_true : 4-d ndarray in [batch_size, channels, img_rows, img_cols]
-            y_pred : 4-d ndarray in [batch_size, channels, img_rows, img_cols]
-            threshold : [0.0, 1.0]
-        return PSNR, larger the better
-        """
-        if threshold:
-            y_pred = _binarize(y_pred, threshold)
-        mse = torch.mean((y_pred - y_true) ** 2)
-        return 10 * torch.log10(1 / mse)
-
-
-class SSIM(object):
-    '''
-    modified from https://github.com/jorge-pessoa/pytorch-msssim
-    '''
-    def __init__(self, des="structural similarity index"):
-        self.des = des
-
-    def __repr__(self):
-        return "SSIM"
-
-    def gaussian(self, w_size, sigma):
-        gauss = torch.Tensor([math.exp(-(x - w_size//2)**2/float(2*sigma**2)) for x in range(w_size)])
-        return gauss/gauss.sum()
-
-    def create_window(self, w_size, channel=1):
-        _1D_window = self.gaussian(w_size, 1.5).unsqueeze(1)
-        _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
-        window = _2D_window.expand(channel, 1, w_size, w_size).contiguous()
-        return window
-
-    def __call__(self, y_pred, y_true, w_size=11, size_average=True, full=False):
-        """
-        args:
-            y_true : 4-d ndarray in [batch_size, channels, img_rows, img_cols]
-            y_pred : 4-d ndarray in [batch_size, channels, img_rows, img_cols]
-            w_size : int, default 11
-            size_average : boolean, default True
-            full : boolean, default False
-        return ssim, larger the better
-        """
-        # Value range can be different from 255. Other common ranges are 1 (sigmoid) and 2 (tanh).
-        if torch.max(y_pred) > 128:
-            max_val = 255
-        else:
-            max_val = 1
-
-        if torch.min(y_pred) < -0.5:
-            min_val = -1
-        else:
-            min_val = 0
-        L = max_val - min_val
-
-        padd = 0
-        (_, channel, height, width) = y_pred.size()
-        window = self.create_window(w_size, channel=channel).to(y_pred.device)
-
-        mu1 = F.conv2d(y_pred, window, padding=padd, groups=channel)
-        mu2 = F.conv2d(y_true, window, padding=padd, groups=channel)
-
-        mu1_sq = mu1.pow(2)
-        mu2_sq = mu2.pow(2)
-        mu1_mu2 = mu1 * mu2
-
-        sigma1_sq = F.conv2d(y_pred * y_pred, window, padding=padd, groups=channel) - mu1_sq
-        sigma2_sq = F.conv2d(y_true * y_true, window, padding=padd, groups=channel) - mu2_sq
-        sigma12 = F.conv2d(y_pred * y_true, window, padding=padd, groups=channel) - mu1_mu2
-
-        C1 = (0.01 * L) ** 2
-        C2 = (0.03 * L) ** 2
-
-        v1 = 2.0 * sigma12 + C2
-        v2 = sigma1_sq + sigma2_sq + C2
-        cs = torch.mean(v1 / v2)  # contrast sensitivity
-
-        ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2)
-
-        if size_average:
-            ret = ssim_map.mean()
-        else:
-            ret = ssim_map.mean(1).mean(1).mean(1)
-
-        if full:
-            return ret, cs
-        return ret
-
-
-class AE(object):
-    """
-    Modified from matlab : colorangle.m, MATLAB V2019b
-    angle = acos(RGB1' * RGB2 / (norm(RGB1) * norm(RGB2)));
-    angle = 180 / pi * angle;
-    """
-    def __init__(self, des='average Angular Error'):
-        self.des = des
-
-    def __repr__(self):
-        return "AE"
-    
-    def __call__(self, y_pred, y_true):
-        """
-        args:
-            y_true : 4-d ndarray in [batch_size, channels, img_rows, img_cols]
-            y_pred : 4-d ndarray in [batch_size, channels, img_rows, img_cols]
-        return average AE, smaller the better
-        """
-        dotP = torch.sum(y_pred * y_true, dim=1)
-        Norm_pred = torch.sqrt(torch.sum(y_pred * y_pred, dim=1))
-        Norm_true = torch.sqrt(torch.sum(y_true * y_true, dim=1))
-        ae = 180 / math.pi * torch.acos(dotP / (Norm_pred * Norm_true + eps))
-        return ae.mean(1).mean(1)
-
-
-if __name__ == "__main__":
-    for ch in [3, 1]:
-        batch_size, img_row, img_col = 1, 224, 224
-        y_true = torch.rand(batch_size, ch, img_row, img_col)
-        noise = torch.zeros(y_true.size()).data.normal_(0, std=0.1)
-        y_pred = y_true + noise
-        for cuda in [False, True]:
-            if cuda:
-                y_pred = y_pred.cuda()
-                y_true = y_true.cuda()
-
-            print('#'*20, 'Cuda : {} ; size : {}'.format(cuda, y_true.size()))
-            ########### similarity metrics
-            metric = MSE()
-            acc = metric(y_pred, y_true).item()
-            print("{} ==> {}".format(repr(metric), acc))
-
-            metric = PSNR()
-            acc = metric(y_pred, y_true).item()
-            print("{} ==> {}".format(repr(metric), acc))
-
-            metric = SSIM()
-            acc = metric(y_pred, y_true).item()
-            print("{} ==> {}".format(repr(metric), acc))
-                  
-            metric = LPIPS(cuda)
-            acc = metric(y_pred, y_true).item()
-            print("{} ==> {}".format(repr(metric), acc))
-            
-            metric = AE()
-            acc = metric(y_pred, y_true).item()
-            print("{} ==> {}".format(repr(metric), acc))
-            
-            ########### accuracy metrics
-            metric = OAAcc()
-            maccu, accu = metric(y_pred, y_true)
-            print('mAccu:', maccu, 'Accu', accu)
-
-            metric = Precision()
-            mprec, prec = metric(y_pred, y_true)
-            print('mPrec:', mprec, 'Prec', prec)
-
-            metric = Recall()
-            mreca, reca = metric(y_pred, y_true)
-            print('mReca:', mreca, 'Reca', reca)
-
-            metric = F1Score()
-            mf1sc, f1sc = metric(y_pred, y_true)
-            print('mF1sc:', mf1sc, 'F1sc', f1sc)
-
-            metric = Kappa()
-            mkapp, kapp = metric(y_pred, y_true)
-            print('mKapp:', mkapp, 'Kapp', kapp)
-
-            metric = Jaccard()
-            mjacc, jacc = metric(y_pred, y_true)
-            print('mJacc:', mjacc, 'Jacc', jacc)
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/seg_heads.py b/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/seg_heads.py
deleted file mode 100644
index 0f4caef659de946323cbeba58669fefeb715f291..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/seg_heads.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-import math
-import einops
-import torch
-import apex
-import torch.nn.functional as F
-from megatron_ds import get_args
-from megatron_ds.model import LayerNorm
-from megatron_ds.model.module import MegatronModule
-from megatron_ds.model.vision.utils import resize
-
-
-class SetrSegmentationHead(MegatronModule):
-    def __init__(self, hidden_size, num_classes):
-        super(SetrSegmentationHead, self).__init__()
-        args = get_args()
-        self.hidden_size = hidden_size
-        self.num_classes = num_classes
-        self.img_h = args.img_h
-        self.img_w = args.img_w
-        self.patch_dim = args.patch_dim
-
-        self.layernorm = LayerNorm(hidden_size, eps=args.layernorm_epsilon)
-        self.conv_0 = torch.nn.Conv2d(hidden_size, hidden_size,
-                                      1, 1, bias=False)
-        self.norm_0 = apex.parallel.SyncBatchNorm(hidden_size)
-        self.conv_1 = torch.nn.Conv2d(hidden_size, num_classes, 1, 1)
-
-    def to_2D(self, x):
-        n, hw, c = x.shape
-        h = self.img_h // self.patch_dim
-        w = self.img_w // self.patch_dim
-        assert(hw == h * w)
-        x = x.transpose(1, 2).reshape(n, c, h, w)
-        return x
-
-    def forward(self, hidden_states):
-        # [b c h w]
-        hidden_states = self.layernorm(hidden_states)
-        hidden_states = self.to_2D(hidden_states)
-
-        hidden_states = self.conv_0(hidden_states)
-        hidden_states = self.norm_0(hidden_states)
-        hidden_states = torch.tanh(hidden_states)
-        hidden_states = self.conv_1(hidden_states)
-
-        # [b c h w]
-        result = F.interpolate(hidden_states,
-                               size=(self.img_h, self.img_w),
-                               mode='bilinear')
-
-        return result
-
-
-class MLP(torch.nn.Module):
-    """
-    Linear Embedding
-    """
-    def __init__(self, input_dim=2048, embed_dim=768):
-        super().__init__()
-        self.proj = torch.nn.Linear(input_dim, embed_dim)
-
-    def forward(self, x):
-        x = x.flatten(2).transpose(1, 2)
-        x = self.proj(x)
-        return x
-
-
-class SegformerSegmentationHead(MegatronModule):
-    def __init__(self, feature_strides, in_channels,
-                 embedding_dim, dropout_ratio):
-        super(SegformerSegmentationHead, self).__init__()
-        assert len(feature_strides) == len(in_channels)
-        assert min(feature_strides) == feature_strides[0]
-        args = get_args()
-        self.feature_strides = feature_strides
-        self.in_channels = in_channels
-        self.embedding_dim = embedding_dim
-        self.num_classes = args.num_classes
-        self.dropout_ratio = dropout_ratio
-
-        c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = \
-            self.in_channels
-
-        self.linear_c4 = MLP(input_dim=c4_in_channels,
-                             embed_dim=self.embedding_dim)
-        self.linear_c3 = MLP(input_dim=c3_in_channels,
-                             embed_dim=self.embedding_dim)
-        self.linear_c2 = MLP(input_dim=c2_in_channels,
-                             embed_dim=self.embedding_dim)
-        self.linear_c1 = MLP(input_dim=c1_in_channels,
-                             embed_dim=self.embedding_dim)
-
-        self.conv_fuse = torch.nn.Conv2d(self.embedding_dim*4,
-                                         self.embedding_dim, 1, 1)
-        self.norm = apex.parallel.SyncBatchNorm(self.embedding_dim)
-
-        self.dropout = torch.nn.Dropout2d(self.dropout_ratio)
-        self.linear_pred = torch.nn.Conv2d(self.embedding_dim,
-                                           self.num_classes,
-                                           kernel_size=1)
-
-    def forward(self, inputs):
-        c1, c2, c3, c4 = inputs
-
-        ############## MLP decoder on C1-C4 ###########
-        n, _, h, w = c4.shape
-
-        _c4 = self.linear_c4(c4).permute(0, 2, 1).reshape(n, -1, c4.shape[2], c4.shape[3])
-        _c4 = resize(_c4, size=c1.size()[2:], mode='bilinear', align_corners=False)
-
-        _c3 = self.linear_c3(c3).permute(0, 2, 1).reshape(n, -1, c3.shape[2], c3.shape[3])
-        _c3 = resize(_c3, size=c1.size()[2:], mode='bilinear', align_corners=False)
-
-        _c2 = self.linear_c2(c2).permute(0, 2, 1).reshape(n, -1, c2.shape[2], c2.shape[3])
-        _c2 = resize(_c2, size=c1.size()[2:], mode='bilinear', align_corners=False)
-
-        _c1 = self.linear_c1(c1).permute(0, 2, 1).reshape(n, -1, c1.shape[2], c1.shape[3])
-
-        _c = self.conv_fuse(torch.cat([_c4, _c3, _c2, _c1], dim=1))
-        x = self.norm(_c)
-        x = F.relu(x, inplace=True)
-        x = self.dropout(x)
-        x = self.linear_pred(x)
-
-        return x
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/seg_models.py b/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/seg_models.py
deleted file mode 100644
index d8589bc785b7b6c2bef900579d1638fd7e0346c2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/seg_models.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-import math
-import einops
-import torch
-import apex
-import torch.nn.functional as F
-from megatron_ds import get_args
-from megatron_ds.model.module import MegatronModule
-from megatron_ds.model.vision.vit_backbone import VitBackbone, VitMlpHead
-from megatron_ds.model.vision.mit_backbone import mit_b3, mit_b5
-from tasks.vision.segmentation.seg_heads import SetrSegmentationHead, SegformerSegmentationHead
-
-
-class SetrSegmentationModel(MegatronModule):
-
-    def __init__(self,
-                 num_classes,
-                 pre_process=True,
-                 post_process=True):
-        super(SetrSegmentationModel, self).__init__()
-        args = get_args()
-        assert post_process & pre_process
-        self.hidden_size = args.hidden_size
-        self.num_classes = num_classes
-        self.backbone = VitBackbone(
-            pre_process=pre_process,
-            post_process=post_process,
-            class_token=False,
-            post_layer_norm=False,
-            drop_path_rate=0.1
-        )
-
-        self.head = SetrSegmentationHead(
-            self.hidden_size,
-            self.num_classes
-        )
-
-    def set_input_tensor(self, input_tensor):
-        """See megatron_ds.model.transformer.set_input_tensor()"""
-        pass
-
-    def forward(self, input):
-        # [b hw c]
-        hidden_states = self.backbone(input)
-        result_final = self.head(hidden_states)
-        return result_final
-
-
-class SegformerSegmentationModel(MegatronModule):
-
-    def __init__(self,
-                 num_classes,
-                 pre_process=True,
-                 post_process=True):
-        super(SegformerSegmentationModel, self).__init__()
-        args = get_args()
-        self.hidden_size = args.hidden_size
-        self.num_classes = num_classes
-        self.pre_process = pre_process
-        self.post_process = post_process
-
-        self.backbone = mit_b5()
-        self.head = SegformerSegmentationHead(
-            feature_strides=[4, 8, 16, 32],
-            in_channels=[64, 128, 320, 512],
-            embedding_dim=768,
-            dropout_ratio=0.1
-        )
-
-    def set_input_tensor(self, input_tensor):
-        """See megatron_ds.model.transformer.set_input_tensor()"""
-        pass
-
-    def forward(self, input):
-        # [b hw c]
-        hidden_states = self.backbone(input)
-        hidden_states = self.head(hidden_states)
-        return hidden_states
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/transforms.py b/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/transforms.py
deleted file mode 100644
index 83290b47497e1f86f711cc18b9d4274907d9c13d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/transforms.py
+++ /dev/null
@@ -1,433 +0,0 @@
-# Copyright (c) 2020 The MMSegmenation Authors.
-#
-# This source code is licensed under the Apache license found in the
-# LICENSE file in the root directory of this source tree.
-
-import random
-import os
-import math
-import mmcv
-import torch
-import numpy as np
-import torchvision.transforms as T
-from torchvision import datasets
-from torch.utils.data import Dataset
-from megatron_ds import print_rank_0
-from megatron_ds import get_args
-from PIL import Image, ImageOps, ImageEnhance
-import torchvision.transforms as torch_tr
-
-def _is_pil_image(img):
-    return isinstance(img, Image.Image)
-
-class PhotoMetricDistortion(object):
-    """Apply photometric distortion to image sequentially, every transformation
-    is applied with a probability of 0.5. The position of random contrast is in
-    second or second to last.
-    1. random brightness
-    2. random contrast (mode 0)
-    3. convert color from BGR to HSV
-    4. random saturation
-    5. random hue
-    6. convert color from HSV to BGR
-    7. random contrast (mode 1)
-    8. randomly swap channels
-    Args:
-        brightness_delta (int): delta of brightness.
-        contrast_range (tuple): range of contrast.
-        saturation_range (tuple): range of saturation.
-        hue_delta (int): delta of hue.
-    """
-
-    def __init__(self,
-                 brightness_delta=32,
-                 contrast_range=(0.5, 1.5),
-                 saturation_range=(0.5, 1.5),
-                 hue_delta=18):
-        self.brightness_delta = brightness_delta
-        self.contrast_lower, self.contrast_upper = contrast_range
-        self.saturation_lower, self.saturation_upper = saturation_range
-        self.hue_delta = hue_delta
-
-    def convert(self, img, alpha=1, beta=0):
-        """Multiple with alpha and add beat with clip."""
-        img = img.astype(np.float32) * alpha + beta
-        img = np.clip(img, 0, 255)
-        return img.astype(np.uint8)
-
-    def brightness(self, img):
-        """Brightness distortion."""
-        if random.randint(0, 1):
-            return self.convert(
-                img,
-                beta=random.uniform(-self.brightness_delta,
-                                    self.brightness_delta))
-        return img
-
-    def contrast(self, img):
-        """Contrast distortion."""
-        if random.randint(0, 1):
-            return self.convert(
-                img,
-                alpha=random.uniform(self.contrast_lower, self.contrast_upper))
-        return img
-
-    def saturation(self, img):
-        """Saturation distortion."""
-        if random.randint(0, 1):
-            img = mmcv.bgr2hsv(img)
-            img[:, :, 1] = self.convert(
-                img[:, :, 1],
-                alpha=random.uniform(self.saturation_lower,
-                                     self.saturation_upper))
-            img = mmcv.hsv2bgr(img)
-        return img
-
-    def hue(self, img):
-        """Hue distortion."""
-        if random.randint(0, 1):
-            img = mmcv.bgr2hsv(img)
-            img[:, :,
-                0] = (img[:, :, 0].astype(int) +
-                      random.randint(-self.hue_delta, self.hue_delta)) % 180
-            img = mmcv.hsv2bgr(img)
-        return img
-
-    def __call__(self, img):
-        """Call function to perform photometric distortion on images.
-        Args:
-            results (dict): Result dict from loading pipeline.
-        Returns:
-            dict: Result dict with images distorted.
-        """
-        img = np.array(img)
-
-        # random brightness
-        img = self.brightness(img)
-
-        # mode == 0 --> do random contrast first
-        # mode == 1 --> do random contrast last
-        mode = random.randint(0, 1)
-        if mode == 1:
-            img = self.contrast(img)
-
-        # random saturation
-        img = self.saturation(img)
-
-        # random hue
-        img = self.hue(img)
-
-        # random contrast
-        if mode == 0:
-            img = self.contrast(img)
-
-        img = Image.fromarray(img.astype(np.uint8)).convert('RGB')
-        return img
-
-
-class RandomCrop(object):
-    """
-    Take a random crop from the image.
-
-    First the image or crop size may need to be adjusted if the incoming image
-    is too small...
-
-    If the image is smaller than the crop, then:
-         the image is padded up to the size of the crop
-         unless 'nopad', in which case the crop size is shrunk to fit the image
-
-    A random crop is taken such that the crop fits within the image.
-
-
-    if cfg.DATASET.TRANSLATION_AUG_FIX is set, we insure that there's always
-    translation randomness of at least that value around the image.
-
-    if image < crop_size:
-        # slide crop within image, random offset
-    else:
-        # slide image within crop
-    """
-    def __init__(self, crop_size):
-        args = get_args()
-        self.size = crop_size
-        self.cat_max_ratio = 0.75
-        self.ignore_index = args.ignore_index
-        self.pad_color = (0, 0, 0)
-
-    def get_crop_bbox(self, img):
-        """Randomly get a crop bounding box."""
-        img_w, img_h = img.size
-        target_h, target_w = self.size  #[H W]
-        margin_h = max(img_h - target_h, 0)
-        margin_w = max(img_w - target_w, 0)
-        offset_h = random.randint(0, margin_h)
-        offset_w = random.randint(0, margin_w)
-        crop_y1, crop_y2 = offset_h, offset_h + target_h
-        crop_x1, crop_x2 = offset_w, offset_w + target_w
-
-        return crop_y1, crop_y2, crop_x1, crop_x2
-
-    def crop(self, img, crop_bbox):
-        """Crop from ``img``"""
-        crop_y1, crop_y2, crop_x1, crop_x2 = crop_bbox
-        img = img.crop((crop_x1, crop_y1, crop_x2, crop_y2))
-        return img
-
-    @staticmethod
-    def crop_in_image(target_w, target_h, w, h, img, mask):
-        if w == target_w:
-            x1 = 0
-        else:
-            x1 = random.randint(0, w - target_w)
-        if h == target_h:
-            y1 = 0
-        else:
-            y1 = random.randint(0, h - target_h)
-
-        return [img.crop((x1, y1, x1 + target_w, y1 + target_h)),
-                mask.crop((x1, y1, x1 + target_w, y1 + target_h))]
-
-
-    def __call__(self, img, mask):
-        w, h = img.size
-        target_h, target_w = self.size   # ASSUME H, W
-
-        if w == target_w and h == target_h:
-            return img, mask
-
-        # Pad image if image < crop
-        if target_h > h:
-            pad_h = (target_h - h) // 2 + 1
-        else:
-            pad_h = 0
-        if target_w > w:
-            pad_w = (target_w - w) // 2 + 1
-        else:
-            pad_w = 0
-        border = (pad_w, pad_h, pad_w, pad_h)
-        if pad_h or pad_w:
-            img = ImageOps.expand(img, border=border, fill=(0, 0, 0))
-            mask = ImageOps.expand(mask, border=border, fill=self.ignore_index)
-            w, h = img.size
-
-        crop_bbox = self.get_crop_bbox(img)
-        if self.cat_max_ratio < 1.:
-            # Repeat 10 times
-            for _ in range(10):
-                seg_temp = self.crop(mask, crop_bbox)
-                labels, cnt = np.unique(seg_temp, return_counts=True)
-                cnt = cnt[labels != self.ignore_index]
-                if len(cnt) > 1 and np.max(cnt) / np.sum(
-                        cnt) < self.cat_max_ratio:
-                    break
-                crop_bbox = self.get_crop_bbox(img)
-
-        # crop the image
-        img = self.crop(img, crop_bbox)
-
-        # crop semantic seg
-        mask = self.crop(mask, crop_bbox)
-        assert(img.size[0] == self.size[1] and img.size[1] == self.size[0])
-          
-        return img, mask
-
-
-class RandomSizeAndCrop(object):
-    def __init__(self,
-                 crop_size,
-                 scale_min=0.5,
-                 scale_max=2.0):
-        self.crop = RandomCrop(crop_size)
-        self.scale_min = scale_min
-        self.scale_max = scale_max
-
-    def __call__(self, img, mask):
-
-        scale_amt = random.uniform(self.scale_min, self.scale_max)
-        w, h = [int(i * scale_amt) for i in img.size]
-
-        resized_img = img.resize((w, h), Image.BICUBIC)
-        resized_mask = mask.resize((w, h), Image.NEAREST)
-        img, mask = self.crop(resized_img, resized_mask)
-        return img, mask
-
-class RandomHorizontallyFlip(object):
-    def __call__(self, img, mask):
-        if random.random() < 0.5:
-            return img.transpose(Image.FLIP_LEFT_RIGHT), mask.transpose(
-                Image.FLIP_LEFT_RIGHT)
-        return img, mask
-
-
-def adjust_brightness(img, brightness_factor):
-    """Adjust brightness of an Image.
-
-    Args:
-        img (PIL Image): PIL Image to be adjusted.
-        brightness_factor (float):  How much to adjust the brightness. Can be
-            any non negative number. 0 gives a black image, 1 gives the
-            original image while 2 increases the brightness by a factor of 2.
-
-    Returns:
-        PIL Image: Brightness adjusted image.
-    """
-    if not _is_pil_image(img):
-        raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
-
-    enhancer = ImageEnhance.Brightness(img)
-    img = enhancer.enhance(brightness_factor)
-    return img
-
-
-def adjust_contrast(img, contrast_factor):
-    """Adjust contrast of an Image.
-
-    Args:
-        img (PIL Image): PIL Image to be adjusted.
-        contrast_factor (float): How much to adjust the contrast. Can be any
-            non negative number. 0 gives a solid gray image, 1 gives the
-            original image while 2 increases the contrast by a factor of 2.
-
-    Returns:
-        PIL Image: Contrast adjusted image.
-    """
-    if not _is_pil_image(img):
-        raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
-
-    enhancer = ImageEnhance.Contrast(img)
-    img = enhancer.enhance(contrast_factor)
-    return img
-
-
-def adjust_saturation(img, saturation_factor):
-    """Adjust color saturation of an image.
-
-    Args:
-        img (PIL Image): PIL Image to be adjusted.
-        saturation_factor (float):  How much to adjust the saturation. 0 will
-            give a black and white image, 1 will give the original image while
-            2 will enhance the saturation by a factor of 2.
-
-    Returns:
-        PIL Image: Saturation adjusted image.
-    """
-    if not _is_pil_image(img):
-        raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
-
-    enhancer = ImageEnhance.Color(img)
-    img = enhancer.enhance(saturation_factor)
-    return img
-
-
-def adjust_hue(img, hue_factor):
-    """Adjust hue of an image.
-
-    The image hue is adjusted by converting the image to HSV and
-    cyclically shifting the intensities in the hue channel (H).
-    The image is then converted back to original image mode.
-
-    `hue_factor` is the amount of shift in H channel and must be in the
-    interval `[-0.5, 0.5]`.
-
-    See https://en.wikipedia.org/wiki/Hue for more details on Hue.
-
-    Args:
-        img (PIL Image): PIL Image to be adjusted.
-        hue_factor (float):  How much to shift the hue channel. Should be in
-            [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in
-            HSV space in positive and negative direction respectively.
-            0 means no shift. Therefore, both -0.5 and 0.5 will give an image
-            with complementary colors while 0 gives the original image.
-
-    Returns:
-        PIL Image: Hue adjusted image.
-    """
-    if not(-0.5 <= hue_factor <= 0.5):
-        raise ValueError('hue_factor is not in [-0.5, 0.5].'.format(hue_factor))
-
-    if not _is_pil_image(img):
-        raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
-
-    input_mode = img.mode
-    if input_mode in {'L', '1', 'I', 'F'}:
-        return img
-
-    h, s, v = img.convert('HSV').split()
-
-    np_h = np.array(h, dtype=np.uint8)
-    # uint8 addition take cares of rotation across boundaries
-    with np.errstate(over='ignore'):
-        np_h += np.uint8(hue_factor * 255)
-    h = Image.fromarray(np_h, 'L')
-
-    img = Image.merge('HSV', (h, s, v)).convert(input_mode)
-    return img
-
-
-class ColorJitter(object):
-    """Randomly change the brightness, contrast and saturation of an image.
-
-    Args:
-        brightness (float): How much to jitter brightness. brightness_factor
-            is chosen uniformly from [max(0, 1 - brightness), 1 + brightness].
-        contrast (float): How much to jitter contrast. contrast_factor
-            is chosen uniformly from [max(0, 1 - contrast), 1 + contrast].
-        saturation (float): How much to jitter saturation. saturation_factor
-            is chosen uniformly from [max(0, 1 - saturation), 1 + saturation].
-        hue(float): How much to jitter hue. hue_factor is chosen uniformly from
-            [-hue, hue]. Should be >=0 and <= 0.5.
-    """
-    def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
-        self.brightness = brightness
-        self.contrast = contrast
-        self.saturation = saturation
-        self.hue = hue
-
-    @staticmethod
-    def get_params(brightness, contrast, saturation, hue):
-        """Get a randomized transform to be applied on image.
-
-        Arguments are same as that of __init__.
-
-        Returns:
-            Transform which randomly adjusts brightness, contrast and
-            saturation in a random order.
-        """
-        transforms = []
-        if brightness > 0:
-            brightness_factor = np.random.uniform(max(0, 1 - brightness), 1 + brightness)
-            transforms.append(
-                torch_tr.Lambda(lambda img: adjust_brightness(img, brightness_factor)))
-
-        if contrast > 0:
-            contrast_factor = np.random.uniform(max(0, 1 - contrast), 1 + contrast)
-            transforms.append(
-                torch_tr.Lambda(lambda img: adjust_contrast(img, contrast_factor)))
-
-        if saturation > 0:
-            saturation_factor = np.random.uniform(max(0, 1 - saturation), 1 + saturation)
-            transforms.append(
-                torch_tr.Lambda(lambda img: adjust_saturation(img, saturation_factor)))
-
-        if hue > 0:
-            hue_factor = np.random.uniform(-hue, hue)
-            transforms.append(
-                torch_tr.Lambda(lambda img: adjust_hue(img, hue_factor)))
-
-        np.random.shuffle(transforms)
-        transform = torch_tr.Compose(transforms)
-
-        return transform
-
-    def __call__(self, img):
-        """
-        Args:
-            img (PIL Image): Input image.
-
-        Returns:
-            PIL Image: Color jittered image.
-        """
-        transform = self.get_params(self.brightness, self.contrast,
-                                    self.saturation, self.hue)
-        return transform(img)
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/utils.py b/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/utils.py
deleted file mode 100644
index 9b9486629e309983c64f602cce0a4d5f2f22a861..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/vision/segmentation/utils.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import math
-import torch
-import numpy as np
-from megatron_ds import get_args
-
-def slidingcrops(img, mask):
-    # img: [b c h w]
-    # mask: [b h w]
-    args = get_args()
-    assert args.img_h == args.img_w
-    crop_size = args.img_h
-    stride = args.seg_stride
-    ignore_index = args.ignore_index
-    n, c, h, w = img.shape
-    assert h >= crop_size
-    assert w >= crop_size
-    long_size = max(h, w)
-
-    img_slices, mask_slices, slices_info = [], [], []
-    if long_size > crop_size:
-        assert stride <= crop_size
-        h_step_num = int(math.ceil((h - crop_size) / float(stride))) + 1
-        w_step_num = int(math.ceil((w - crop_size) / float(stride))) + 1
-        for yy in range(h_step_num):
-            for xx in range(w_step_num):
-                sy, sx = yy * stride, xx * stride
-                ey, ex = sy + crop_size, sx + crop_size
-                img_sub = img[:, :, sy: ey, sx: ex]
-                mask_sub = mask[:, sy: ey, sx: ex]
-
-                # padding
-                sub_h, sub_w = img_sub.shape[2:]
-                pad_h = max(crop_size - sub_h, 0)
-                pad_w = max(crop_size - sub_w, 0)
-                img_sub = torch.nn.functional.pad(img_sub, pad=(0, pad_w, 0, pad_h), value=ignore_index)
-                mask_sub = torch.nn.functional.pad(mask_sub, pad=(0, pad_w, 0, pad_h))
-
-                img_slices.append(img_sub)
-                mask_slices.append(mask_sub)
-                slices_info.append([sy, ey, sx, ex, sub_h, sub_w])
-
-        return torch.cat(img_slices), torch.cat(mask_slices), slices_info, (h, w)
-    else:
-        return img, mask, [[0, h, 0, w, h, w]], (h, w)
-
-
-def slidingjoins(preds, probs, labels, slices_info, img_size):
-    args = get_args()
-    num_slices = len(slices_info)
-
-    if num_slices == 1:
-        return preds, labels
-
-    h, w = img_size
-    split_size = args.micro_batch_size
-
-    preds_split = torch.split(preds, split_size)
-    probs_split = torch.split(probs, split_size)
-    labels_split = torch.split(labels, split_size)
-
-    assert(len(preds_split) == num_slices)
-
-    total_max_probs = torch.zeros((split_size, h, w), dtype=torch.float, device='cuda')
-    total_preds = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda')
-    total_labels = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda')
-
-    for i in range(num_slices):
-        sy, ey, sx, ex, sub_h, sub_w = slices_info[i]
-        assert sy + sub_h <= h
-        assert sx + sub_w <= w
-        curr_max_probs = total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w]
-        curr_preds = total_preds[:, sy:sy + sub_h, sx:sx + sub_w]
-
-        local_max_probs = probs_split[i][:, :sub_h, : sub_w]
-        local_preds = preds_split[i][:, :sub_h, :sub_w]
-
-        result_max_probs = torch.maximum(curr_max_probs, local_max_probs)
-        result_preds = torch.where(curr_max_probs >= local_max_probs, curr_preds, local_preds)
-
-        total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w] = result_max_probs
-        total_preds[:, sy:sy + sub_h, sx:sx + sub_w] = result_preds
-        total_labels[:, sy:sy + sub_h, sx:sx + sub_w] = labels_split[i][0, :sub_h, :sub_w]
-
-    return total_preds, total_labels
-
diff --git a/toolbox/Megatron-DeepSpeed/tasks/zeroshot_gpt/datasets.py b/toolbox/Megatron-DeepSpeed/tasks/zeroshot_gpt/datasets.py
deleted file mode 100644
index 584ff29af00e3ce187dae1f93ae754789b101bc1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/zeroshot_gpt/datasets.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Zero-shot datasets."""
-
-import json
-import math
-
-import numpy as np
-import torch
-
-from megatron_ds import get_args
-from megatron_ds import print_rank_0
-from megatron_ds import get_tokenizer
-from .detokenizer import get_detokenizer
-
-
-def build_dataset(task):
-    """Helper function to select and build dataset."""
-
-    if task == 'LAMBADA':
-        return _build_lambada_dataset()
-    if task == 'WIKITEXT103':
-        return _build_wikitext103_dataset()
-
-    raise NotImplementedError('dataset for {} task is not '
-                              'implemented.'.format(task))
-
-
-class _LMDataset(torch.utils.data.Dataset):
-
-    def __init__(self, tokens, seq_len, pad_idx, num_original_tokens,
-                 num_tokenized_tokens, overalapping_eval=None):
-        self.tokens = tokens
-        self.seq_len = seq_len
-        self.pad_idx = pad_idx
-        self.overalapping_eval = overalapping_eval
-        if self.overalapping_eval is None:
-            self.overalapping_eval = self.seq_len
-        self.overalapping_eval = max(1, self.overalapping_eval)
-        self.num_original_tokens = num_original_tokens
-        self.num_tokenized_tokens = num_tokenized_tokens
-        self.total_targets = len(self.tokens) - 1
-        # remove first sequence tokens
-        targets = max(self.total_targets - self.overalapping_eval, 0)
-        self.total_sequences = max(
-            math.ceil(targets / self.overalapping_eval) + 1, 1)
-
-    def __len__(self):
-        return self.total_sequences
-
-    def __getitem__(self, idx):
-        start_idx = idx * self.overalapping_eval
-        end_idx = start_idx + self.seq_len
-        tokens = self.tokens[start_idx:end_idx + 1]
-        num_tokens = len(tokens)
-        pad_mask = [1] * num_tokens
-        if num_tokens < self.seq_len + 1:
-            num_pad = (self.seq_len + 1 - num_tokens)
-            pad_mask += [0] * (num_pad)
-            tokens += [self.pad_idx] * num_pad
-        pad_mask = np.array(pad_mask[1:])
-        if self.overalapping_eval != self.seq_len and idx != 0:
-            pad_mask[:-self.overalapping_eval] *= 0
-
-        return {'text': np.array(tokens), 'pad_mask': pad_mask}
-
-
-class _LambadaDataset(torch.utils.data.Dataset):
-
-    def __init__(self, path, pad_idx, tokenizer, seq_len, strict=False):
-        print_rank_0('> building lambada dataset from {} ...'.format(path))
-        self.seq_len = seq_len
-        self.pad_idx = pad_idx
-        self.tokenizer = tokenizer
-        self.strict = strict
-
-        self.tokens = []
-        self.labels = []
-        with open(path, 'r') as f:
-            for line in f.readlines():
-                text = json.loads(line)['text']
-                tokens, labels = self.get_tokens(text)
-                self.tokens.append(tokens)
-                self.labels.append(labels)
-
-    def get_tokens(self, text):
-        if not self.strict:
-            tokens = self.tokenizer.tokenize(text)
-            return tokens[:-1], [tokens[-1]]
-        last_token = text.split()[-1]
-        start_idx = text.rfind(last_token)
-        beginning_tokens = self.tokenizer.tokenize(text[:start_idx].strip())
-        last_token = self.tokenizer.tokenize(' ' + last_token)
-        return beginning_tokens, last_token
-
-    def __len__(self):
-        return len(self.tokens)
-
-    def __getitem__(self, idx):
-        tokens = self.tokens[idx]
-        num_tokens = len(tokens)
-        pad_mask = [0] * num_tokens
-        labels = self.labels[idx]
-        pad_mask += [1] * len(labels)
-        tokens = tokens + labels
-        num_tokens = len(tokens)
-        if num_tokens < self.seq_len + 1:
-            num_pad = (self.seq_len + 1 - num_tokens)
-            pad_mask += [0] * (num_pad)
-            tokens += [self.pad_idx] * num_pad
-        pad_mask = np.array(pad_mask[1:])
-
-        return {'text': np.array(tokens), 'pad_mask': pad_mask}
-
-
-def _build_lambada_dataset():
-    """Build lambada dataset."""
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    assert len(args.valid_data) == 1
-    val_dataset = _LambadaDataset(args.valid_data[0], tokenizer.eod, tokenizer,
-                                  args.seq_length, args.strict_lambada)
-    print_rank_0(' > found {} samples.'.format(len(val_dataset)))
-
-    return val_dataset
-
-
-def _build_wikitext103_dataset():
-    """"""
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    assert len(args.valid_data) == 1
-    with open(args.valid_data[0], "rb") as reader:
-        entire_data = reader.read().decode('utf-8')
-    num_original_tokens = len(entire_data.strip().split(" "))
-    entire_data = get_detokenizer(args.valid_data[0])(entire_data)
-    tokenized_data = tokenizer.tokenize(entire_data)
-    num_tokenized_tokens = len(tokenized_data)
-
-    val_dataset = _LMDataset(tokenized_data, args.seq_length, tokenizer.eod,
-                             num_original_tokens, num_tokenized_tokens,
-                             args.overlapping_eval)
-    print_rank_0(' > number of original tokens: {}, number of detokenized '
-                 'tokens: {}'.format(num_original_tokens, num_tokenized_tokens))
-
-    return val_dataset
diff --git a/toolbox/Megatron-DeepSpeed/tasks/zeroshot_gpt/detokenizer.py b/toolbox/Megatron-DeepSpeed/tasks/zeroshot_gpt/detokenizer.py
deleted file mode 100644
index f7dfe4b775e0363b89ea4930317492a3cb1731b0..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/zeroshot_gpt/detokenizer.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Detokenization."""
-
-import re
-
-
-def ptb_detokenizer(string):
-    string = string.replace(" '", "'")
-    string = string.replace(" \n", "\n")
-    string = string.replace("\n ", "\n")
-    string = string.replace(" n't", "n't")
-    string = string.replace(" N ", "1 ")
-    string = string.replace("$ 1", "$1")
-    string = string.replace("# 1", "#1")
-    return string
-
-
-def wikitext_detokenizer(string):
-    # contractions
-    string = string.replace("s '", "s'")
-    string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
-    # number separators
-    string = string.replace(" @-@ ", "-")
-    string = string.replace(" @,@ ", ",")
-    string = string.replace(" @.@ ", ".")
-    # punctuation
-    string = string.replace(" : ", ": ")
-    string = string.replace(" ; ", "; ")
-    string = string.replace(" . ", ". ")
-    string = string.replace(" ! ", "! ")
-    string = string.replace(" ? ", "? ")
-    string = string.replace(" , ", ", ")
-    # double brackets
-    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
-    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
-    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
-    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
-    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
-    # miscellaneous
-    string = string.replace("= = = =", "====")
-    string = string.replace("= = =", "===")
-    string = string.replace("= =", "==")
-    string = string.replace(" " + chr(176) + " ", chr(176))
-    string = string.replace(" \n", "\n")
-    string = string.replace("\n ", "\n")
-    string = string.replace(" N ", " 1 ")
-    string = string.replace(" 's", "'s")
-
-    return string
-
-
-def lambada_detokenizer(string):
-    return string
-
-
-_DETOKENIZERS = {
-    'ptb': ptb_detokenizer,
-    'wiki': wikitext_detokenizer,
-    'lambada': lambada_detokenizer,
-}
-
-
-def get_detokenizer(path):
-    for key in _DETOKENIZERS.keys():
-        if key in path:
-            return _DETOKENIZERS[key]
diff --git a/toolbox/Megatron-DeepSpeed/tasks/zeroshot_gpt/evaluate.py b/toolbox/Megatron-DeepSpeed/tasks/zeroshot_gpt/evaluate.py
deleted file mode 100644
index 1564568583db733f836a3e2d41858330b2862a62..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tasks/zeroshot_gpt/evaluate.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""GPT zero-shot evaluation."""
-
-import math
-
-import torch
-
-from megatron_ds import get_args
-from megatron_ds import print_rank_0, is_last_rank
-from megatron_ds import get_tokenizer
-from megatron_ds.core import parallel_state, tensor_parallel
-from megatron_ds.checkpointing import load_checkpoint
-from megatron_ds.model import GPTModel
-from megatron_ds.training import get_model
-from megatron_ds.arguments import core_transformer_config_from_args
-from megatron_ds.utils import get_ltor_masks_and_position_ids, unwrap_model
-from megatron_ds.p2p_communication import recv_forward, send_forward
-from tasks.finetune_utils import build_data_loader
-from deepspeed.accelerator import get_accelerator
-from .datasets import build_dataset
-
-# These are needed to unwrap the model, would be nice to put these in megatron_ds.utils if possible?
-from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
-from megatron_ds.model import DistributedDataParallel as LocalDDP
-from megatron_ds.model import Float16Module
-
-def get_model_provider(eval_metric):
-    """Based on evaluation metric set the parallel-output flag and
-    return the model provider."""
-
-    def model_provider(pre_process=True, post_process=True):
-        """Build the model."""
-
-        config = core_transformer_config_from_args(get_args())
-
-        if eval_metric == 'loss':
-            parallel_output = True
-        elif eval_metric == 'accuracy':
-            parallel_output = False
-        else:
-            raise NotImplementedError('output type for {} evaluation metric '
-                                      'is not supported.'.format(eval_metric))
-
-        print_rank_0('building GPT model ...')
-        model = GPTModel(config=config, num_tokentypes=0, parallel_output=parallel_output,
-                         pre_process=pre_process, post_process=post_process)
-
-        return model
-
-    return model_provider
-
-
-def process_batch(batch):
-    """Process batch and produce inputs for the model."""
-    args = get_args()
-    tokenizer = get_tokenizer()
-
-    loss_mask = batch['pad_mask'].long().to(get_accelerator().device_name()).contiguous().byte()
-    tokens_ = batch['text'].long().to(get_accelerator().device_name()).contiguous()
-    labels = tokens_[:, 1:].contiguous()
-    tokens = tokens_[:, :-1].contiguous()
-
-    # Get the masks and postition ids.
-    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
-        tokens,
-        tokenizer.eod,
-        args.reset_position_ids,
-        args.reset_attention_mask,
-        args.eod_mask_loss)
-
-    return tokens, labels, attention_mask, position_ids, loss_mask
-
-
-def forward_step(batch, model, eval_metric):
-    """Forward step."""
-
-    # Get the batch.
-    tokens, labels, attention_mask, position_ids, loss_mask = process_batch(
-        batch)
-
-    # Tell the model what our actual batch size will be
-    args = get_args()
-    args.micro_batch_size = len(labels)
-
-    input_tensor = recv_forward()
-
-    # Forward pass through the model.
-    unwrapped_model = unwrap_model(
-        model, (torchDDP, LocalDDP, Float16Module))
-    unwrapped_model.set_input_tensor(input_tensor)
-    output = model(tokens, position_ids, attention_mask)
-
-    send_forward(output)
-
-    if parallel_state.is_pipeline_last_stage():
-        # For loss, return the unreduced loss.
-        if eval_metric == 'loss':
-            losses = tensor_parallel.vocab_parallel_cross_entropy(
-                output.contiguous().float(), labels.contiguous())
-            loss = torch.sum(
-                losses.view(-1) * loss_mask.contiguous().view(-1).float())
-            return loss
-
-        # For accuracy, return the number of correctly predicted samples.
-        if eval_metric == 'accuracy':
-            outputs = torch.argmax(output, -1)
-            correct = (outputs == labels).float()
-            correct[(1 - loss_mask).bool()] = 1
-            correct = correct.prod(-1)
-            return correct.sum()
-
-        raise NotImplementedError('forward method for evaluation metric {} '
-                                  'is not implemented.'.format(eval_metric))
-    return None
-
-
-def evaluate(data_loader, model, eval_metric):
-    """Evaluation."""
-    args = get_args()
-
-    # Turn on evaluation mode which disables dropout.
-    model.eval()
-
-    total_output = 0.0
-    with torch.no_grad():
-        # For all the batches in the dataset.
-        for iteration, batch in enumerate(data_loader):
-            if iteration % args.log_interval == 0:
-                print_rank_0('> working on iteration: {}'.format(iteration))
-            # Forward evaluation.
-            output = forward_step(batch, model, eval_metric)
-
-            # Reduce across processes.
-            if parallel_state.is_pipeline_last_stage():
-                torch.distributed.all_reduce(output,
-                                             group=parallel_state.get_data_parallel_group())
-
-                total_output += output
-
-    return total_output
-
-
-def evaluate_and_print_results(task, data_loader, model, eval_metric):
-    """Evaluate and print results on screen."""
-
-    # Evaluate and get results.
-    output = evaluate(data_loader, model, eval_metric)
-
-    string = ' validation results on {} | '.format(task)
-    if is_last_rank():
-        if eval_metric == 'loss':
-            num_tokenized_tokens = data_loader.dataset.num_tokenized_tokens
-            num_original_tokens = data_loader.dataset.num_original_tokens
-            val_loss = output / (num_tokenized_tokens - 1)
-            ppl = math.exp(min(20, val_loss))
-            token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1)
-            adjusted_ppl = math.exp(min(20, val_loss * token_ratio))
-            string += 'avg loss: {:.4E} | '.format(val_loss)
-            string += 'ppl: {:.4E} | '.format(ppl)
-            string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl)
-            string += 'token ratio: {} |'.format(token_ratio)
-
-        elif eval_metric == 'accuracy':
-            num_examples = len(data_loader.dataset)
-            acc = output / num_examples
-            string += 'number correct: {:.4E} | '.format(output)
-            string += 'total examples: {:.4E} | '.format(num_examples)
-            string += 'avg accuracy: {:.4E}'.format(acc)
-
-        else:
-            raise NotImplementedError('evaluation method for {} metric is not '
-                                      'implemented yet.'.format(eval_metric))
-
-        length = len(string) + 1
-        print('-' * length)
-        print(string)
-        print('-' * length)
-
-
-def main():
-    """Main program."""
-    args = get_args()
-
-    if args.num_layers_per_virtual_pipeline_stage is not None:
-        print("Interleaved pipeline schedule is not yet supported for text generation.")
-        exit()
-
-    if args.task == 'LAMBADA':
-        eval_metric = 'accuracy'
-    elif args.task == 'WIKITEXT103':
-        eval_metric = 'loss'
-    else:
-        raise NotImplementedError('{} task is not implemented.'.format(
-            args.task))
-
-    # Set up model and load checkpoint.
-    model = get_model(get_model_provider(eval_metric), wrap_with_ddp=False)
-    if args.load is not None:
-        _ = load_checkpoint(model, None, None)
-
-    assert len(model) == 1, "Above condition should have caught this"
-    model = model[0]
-
-    # Data stuff.
-    dataset = build_dataset(args.task)
-    dataloader = build_data_loader(dataset, args.micro_batch_size,
-                                   args.num_workers, drop_last=False)
-
-    # Run evaluation.
-    evaluate_and_print_results(args.task, dataloader, model, eval_metric)
-
-    print_rank_0('done :-)')
diff --git a/toolbox/Megatron-DeepSpeed/tests/__init__.py b/toolbox/Megatron-DeepSpeed/tests/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/Megatron-DeepSpeed/tests/conftest.py b/toolbox/Megatron-DeepSpeed/tests/conftest.py
deleted file mode 100644
index bc5f1ce649065d59d469a8ad128c2483b714255c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/conftest.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import pytest
-
-from megatron_ds.core import parallel_state
-from megatron_ds.core.tensor_parallel.random import model_parallel_cuda_manual_seed
-
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-
-# initialize model parallel for tests
-parallel_state.set_tensor_model_parallel_world_size(1)
-parallel_state.set_tensor_model_parallel_rank(0)
-parallel_state._set_global_memory_buffer()
-parallel_state.set_pipeline_model_parallel_rank(0)
-parallel_state.set_pipeline_model_parallel_world_size(1)
-
-# model_parallel_cuda_manual_seed(123)
-
-
-@pytest.fixture
-def transformer_config():
-    return TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True)
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/__init__.py b/toolbox/Megatron-DeepSpeed/tests/functional_tests/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/python_test_utils/__init__.py b/toolbox/Megatron-DeepSpeed/tests/functional_tests/python_test_utils/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/python_test_utils/check_slurm_job_completion.py b/toolbox/Megatron-DeepSpeed/tests/functional_tests/python_test_utils/check_slurm_job_completion.py
deleted file mode 100644
index acd179a4eaf1b3821ed33a5d46b36fe97bdbc383..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/python_test_utils/check_slurm_job_completion.py
+++ /dev/null
@@ -1,19 +0,0 @@
-"""Check if a given slurm job id completed successfully
-   Usage:
-       python3 check_slurm_job_completion.py <JOB_ID>
-"""
-
-import sys
-import subprocess
-
-
-cmd = f"sacct -j {sys.argv[1]}"
-result = subprocess.check_output(cmd, shell=True).decode().split()
-assert len(result) > 14, "JOB state not available."
-
-status = result[19]
-exit_code = result[20]
-
-assert status == "COMPLETED", f"Job {sys.argv[1]} not completed."
-assert exit_code == "0:0", f"Job {sys.argv[1]} did not exit successfully."
-
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py b/toolbox/Megatron-DeepSpeed/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py
deleted file mode 100644
index 362dabab780e634b1017ce75f474d64f129e508e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import os
-import sys
-import json
-import shutil
-import glob
-from tensorboard.backend.event_processing import event_accumulator
-
-
-def read_tb_logs_as_list(path, summary_name):
-    """Reads a TensorBoard Events file from the input path, and returns the
-    summary specified as input as a list.
-
-    Arguments:
-    path: str, path to the dir where the events file is located.
-    summary_name: str, name of the summary to read from the TB logs.
-    Output:
-    summary_list: list, the values in the read summary list, formatted as a list.
-    """
-    files = glob.glob(f"{path}/events*tfevents*")
-    files += glob.glob(f"{path}/results/events*tfevents*")
-    files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x)))
-    if files:
-        event_file = files[0]
-        ea = event_accumulator.EventAccumulator(event_file)
-        ea.Reload()
-        summary = ea.Scalars(summary_name)
-        summary_list = [round(x.value, 5) for x in summary]
-        print(f'\nObtained the following list for {summary_name} ------------------')
-        print(summary_list)
-        return summary_list
-    raise FileNotFoundError(f"File not found matching: {path}/events*")    
-
-def collect_train_test_metrics(logs_dir, run_name):
-    # TODO: Fetch current baseline
-
-    # train loss
-    train_loss_list = read_tb_logs_as_list(logs_dir, "lm loss")
-
-    # num zeros
-    num_zeros = read_tb_logs_as_list(logs_dir, "num-zeros")
-
-    iteration_time = read_tb_logs_as_list(logs_dir, "iteration-time")
-
-    # First few iterations might take a little longer. So we take the last 70 percent of the timings
-    idx = len(iteration_time)//3   
-    iteration_time_avg = sum(iteration_time[idx:])/len(iteration_time[idx:])
-
-    train_metrics = {
-        "lm loss": {
-            "start_step": 0,
-            "end_step": len(train_loss_list),
-            "step_interval": 5,
-            "values": train_loss_list[0:len(train_loss_list):5],
-        },
-        "num-zeros": {
-            "start_step": 0,
-            "end_step": len(num_zeros),
-            "step_interval": 5,
-            "values": num_zeros[0:len(num_zeros):5],
-        },
-        "iteration_timing_avg": iteration_time_avg,
-    }
-    str_train_metrics = str(train_metrics).replace("'", "\"")
-    print(f"\n ----------- Store the following metrics in {run_name}.json ----------")
-    print(f"\n {str_train_metrics}", flush=True)
-
-if __name__ == '__main__':
-    args = sys.argv[1:]
-    logs_dir = args[0] # eg /lustre/fsw/joc/shanmugamr/megatron/logs/
-    run_name = args[1]
-    collect_train_test_metrics(logs_dir, run_name)
-
-
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/python_test_utils/test_ci_pipeline.py b/toolbox/Megatron-DeepSpeed/tests/functional_tests/python_test_utils/test_ci_pipeline.py
deleted file mode 100644
index 829ebeec4127974cd21a1e04b87c9115bab2279d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/python_test_utils/test_ci_pipeline.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import os
-import json
-import pytest
-import sys
-import glob
-from tensorboard.backend.event_processing import event_accumulator
-
-LOGS_DIR = os.getenv('LOGS_DIR')
-EXPECTED_METRICS_FILE = os.getenv('EXPECTED_METRICS_FILE')
-
-import enum
-
-class TypeOfTest(enum.Enum):
-    APPROX = 1
-    DETERMINISTIC = 2
-
-
-def read_tb_logs_as_list(path, summary_name):
-    """Reads a TensorBoard Events file from the input path, and returns the
-    summary specified as input as a list.
-
-    Arguments:
-    path: str, path to the dir where the events file is located.
-    summary_name: str, name of the summary to read from the TB logs.
-    Output:
-    summary_list: list, the values in the read summary list, formatted as a list.
-    """
-    files = glob.glob(f"{path}/events*tfevents*")
-    files += glob.glob(f"{path}/results/events*tfevents*")
-    files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x)))
-    if files:
-        event_file = files[0]
-        ea = event_accumulator.EventAccumulator(event_file)
-        ea.Reload()
-        summary = ea.Scalars(summary_name)
-        summary_list = [round(x.value, 5) for x in summary]
-        print(f'\nObtained the following list for {summary_name} ------------------')
-        print(summary_list)
-        return summary_list
-    raise FileNotFoundError(f"File not found matching: {path}/events*")
-
-
-# If we require a variation of tests for any of the other pipelines we can just inherit this class.
-class TestCIPipeline:
-
-    margin_loss, margin_time = 0.05, 0.1
-    expected = None
-    if os.path.exists(EXPECTED_METRICS_FILE):
-        with open(EXPECTED_METRICS_FILE) as f:
-            expected = json.load(f)
-
-    def _test_helper(self, loss_type, test_type):
-        if self.expected is None:
-            raise FileNotFoundError("Expected data is none")
-        expected = self.expected[loss_type]
-        expected_list = expected["values"]
-        print(expected_list)
-        actual_list = read_tb_logs_as_list(LOGS_DIR, loss_type)
-        assert actual_list is not None, f"No TensorBoard events file was found in the logs for {loss_type}."
-        actual_list_sliced = actual_list[expected["start_step"]:expected["end_step"]:expected["step_interval"]]
-        for i, (expected_val, actual_val) in enumerate(zip(expected_list, actual_list_sliced)):
-            step = i * expected["step_interval"]
-            print(f"Checking step {step} against expected {i}")
-            if test_type == TypeOfTest.APPROX:
-                assert actual_val == pytest.approx(expected=expected_val, rel=self.margin_loss), f"{self.job_name} : The loss at step {step} should be approximately {expected_val} but it is {actual_val}."
-            else:
-                assert actual_val == expected_val, f"The value at step {step} should be {expected_val} but it is {actual_val}."
-
-    @pytest.mark.xfail
-    def test_lm_loss_deterministic(self):
-        # Expected training loss curve at different global steps.
-        self._test_helper("lm loss", TypeOfTest.DETERMINISTIC)
-
-    def test_lm_loss_approx(self):
-        # Expected training loss curve at different global steps.
-        self._test_helper("lm loss", TypeOfTest.APPROX)
-
-    def test_num_zeros_deterministic(self):
-        # Expected validation loss curve at different global steps.
-        self._test_helper("num-zeros", TypeOfTest.DETERMINISTIC)
-    
-    def iteration_timing_node(self):
-        expected_iteration_timing_avg = self.expected["train_step_timing_avg"]
-        iteration_time = read_tb_logs_as_list(LOGS_DIR, "iteration-time")
-        idx = len(iteration_time)//3   
-        iteration_time_avg = sum(iteration_time[idx:])/len(iteration_time[idx:])
-        assert expected_iteration_timing_avg == pytest.approx(expected=iteration_time_avg, rel=self.margin_time), f"The time per global step must be approximately {expected_iteration_timing_avg} but it is {iteration_time_avg}."
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py b/toolbox/Megatron-DeepSpeed/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py
deleted file mode 100644
index 5d3e69d1233473d661d552902be0d3bb4b5241f3..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import os
-import sys
-import json
-import shutil
-import glob
-from tensorboard.backend.event_processing import event_accumulator
-
-LOGS_DIR = os.getenv('LOGS_DIR')
-
-def read_tb_logs_as_list(path, summary_name, index):
-    files = glob.glob(f"{path}/events*tfevents*")
-    files += glob.glob(f"{path}/results/events*tfevents*")
-    files.sort(key=lambda x: os.path.getmtime(os.path.join(path, x)))
-    if files:
-        event_file = files[index]
-        ea = event_accumulator.EventAccumulator(event_file)
-        ea.Reload()
-        summary = ea.Scalars(summary_name)
-        summary_list = [round(x.value, 5) for x in summary]
-        print(summary_list)
-        return summary_list
-    raise FileNotFoundError(f"File not found matching: {path}/events*")    
-
-def collect_train_test_metrics(logs_dir, index):
-    train_loss_list = read_tb_logs_as_list(logs_dir, "lm loss", index)
-    train_loss_list = [round(elem,3) for elem in train_loss_list]
-    train_metrics = {
-        "lm loss": train_loss_list[0:len(train_loss_list):5],
-    } 
-    str_train_metrics = str(train_metrics).replace("'", "\"")
-    print(f"\n ----------- The following are the metrics for ----------")
-    print(f"\n {str_train_metrics}", flush=True)
-    return train_metrics
-
-class TestCIPipeline:
-
-    train_metrics_100 = collect_train_test_metrics(LOGS_DIR, 0)
-    train_metrics_50_to_100 = collect_train_test_metrics(LOGS_DIR, 1)
-
-    def _test_helper(self, loss_type):
-        expected = self.train_metrics_100[loss_type]
-        print('expected : '  + str(expected))
-        actual = self.train_metrics_50_to_100[loss_type]
-        print('actual : '  + str(actual))
-        # NOTE : Doing this way because in gpt3 model when I run from 0 - 100 directly, it produces 1 extra element
-        # i.e expected is [10.84266, 10.89696, 10.90542, 10.87498, 10.86265, 10.83608, 10.64368, 10.62319, 10.53908, 10.25005, 10.20907, 9.96542, 9.96802, 9.92436, 9.79086, 9.26718, 9.61784, 9.19018, 9.45986, 9.62168, 9.73772, 8.85732, 9.43185, 9.27912, 9.6832, 9.5127, 9.5419, 9.02549, 8.55077, 8.91355, 8.83375, 9.17722, 9.22436, 9.19436, 9.11323, 9.09711, 9.04421, 9.36795]
-        # actual is : [9.73772, 8.85732, 9.43185, 9.27912, 9.6832, 9.5127, 9.5419, 9.02549, 8.55077, 8.91355, 8.83375, 9.17722, 9.22435, 9.19435, 9.11322, 9.09711, 9.04422]
-        # That extra element in expected is causing some issues. So doing it this way. Need to figure out whats happening
-        start_idx_expected = expected.index(actual[0]) # First element of actual
-        # Here we will just be comparing values of actual and second half (50-100) of expected
-        for i in range(len(actual)):
-            assert actual[i] == expected[start_idx_expected + i], f"The value at step {i} should be {expected[start_idx_expected + i]} but it is {actual[i]}."
-
-    def test_lm_loss_deterministic(self):
-        self._test_helper("lm loss")
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/shell_test_utils/jobwait.sh b/toolbox/Megatron-DeepSpeed/tests/functional_tests/shell_test_utils/jobwait.sh
deleted file mode 100644
index dd49fd8cd6aa67ca488a0666a9cdb0b4d7a0a681..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/shell_test_utils/jobwait.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#! /bin/bash
-
-JOBID=$1
-echo "Job id : $JOBID"
-
-if [[ $JOBID -eq "" ]]; then
-  exit 1
-fi
-
-sleep 10s
-
-while true; do
-    export STATE=`sacct -j $JOBID --format State --parsable2 --noheader |& head -n 1`
-    case "${STATE}" in
-        PENDING|RUNNING|REQUEUED)
-            echo "Job is still in $STATE"
-            sleep 15s
-            ;;
-        *)
-            sleep 30s
-            echo "Exiting with SLURM job status '${STATE}'"
-            exit 0
-            ;;
-    esac
-done
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/bert/bert_tp1_pp2_1nodes_50steps.json b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/bert/bert_tp1_pp2_1nodes_50steps.json
deleted file mode 100644
index 760aa31f4c3d7ddeb95389971d18681867c2a27f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/bert/bert_tp1_pp2_1nodes_50steps.json
+++ /dev/null
@@ -1 +0,0 @@
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50444, 10.49325, 10.4863, 10.48386, 10.49892, 10.46644, 10.41921, 10.30106, 10.16285, 9.97939]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [17438.0, 18815.0, 22912.0, 18568.0, 19900.0, 23810.0, 22918.0]}, "iteration_timing_avg": 0.35970588235294115}
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/bert/bert_tp1_pp4_1nodes_50steps.json b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/bert/bert_tp1_pp4_1nodes_50steps.json
deleted file mode 100644
index 2b5a223e7d87abff31bd6a2e4afef625e0b646a4..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/bert/bert_tp1_pp4_1nodes_50steps.json
+++ /dev/null
@@ -1 +0,0 @@
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54369, 10.5383, 10.55953, 10.54011, 10.51908, 10.49118, 10.46612, 10.31901, 10.15649, 9.96702]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [21736.0, 20433.0, 27243.0, 23240.0, 22459.0, 20724.0, 23451.0]}, "iteration_timing_avg": 0.8657461764705884}
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/bert/bert_tp2_pp2_1nodes_50steps.json b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/bert/bert_tp2_pp2_1nodes_50steps.json
deleted file mode 100644
index e90891762f585edca152a12f985aa6d35756440f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/bert/bert_tp2_pp2_1nodes_50steps.json
+++ /dev/null
@@ -1 +0,0 @@
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44729, 10.44093, 10.45375, 10.44445, 10.44305, 10.44595, 10.39163, 10.25898, 10.13498, 9.95692]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27334.0, 20551.0, 28114.0, 24328.0, 24070.0, 20653.0, 21346.0]}, "iteration_timing_avg": 0.6318655882352939}
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/bert/bert_tp4_pp1_1nodes_50steps.json b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/bert/bert_tp4_pp1_1nodes_50steps.json
deleted file mode 100644
index 2c4bafd5f279f2fe3bdfa932f15ba94e2ff36072..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/bert/bert_tp4_pp1_1nodes_50steps.json
+++ /dev/null
@@ -1 +0,0 @@
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.4978, 10.49775, 10.48021, 10.50638, 10.49624, 10.47018, 10.34494, 10.25536, 10.10244, 9.91938]}, "num-zeros": {"start_step": 0, "end_step": 35, "step_interval": 5, "values": [26168.0, 19042.0, 28718.0, 22408.0, 26377.0, 34320.0, 21873.0]}, "iteration_timing_avg": 1.1249785294117647}
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp2_1nodes_50steps.json b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp2_1nodes_50steps.json
deleted file mode 100644
index cb07592a1b010f90e012fb3631e89a19ddd141c0..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp2_1nodes_50steps.json
+++ /dev/null
@@ -1 +0,0 @@
-{"lm loss": {"start_step": 0, "end_step": 37, "step_interval": 5, "values": [10.84266, 10.89696, 10.90542, 10.87498, 10.86279, 10.83628, 10.64437, 10.62386]}, "num-zeros": {"start_step": 0, "end_step": 20, "step_interval": 5, "values": [2093.0, 2474.0, 2327.0, 2213.0]}, "iteration_timing_avg": 0.080846}
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps.json b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps.json
deleted file mode 100644
index 0cf9359fb98098d27ec137daa78e70dda15ddad8..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps.json
+++ /dev/null
@@ -1 +0,0 @@
-{"lm loss": {"start_step": 0, "end_step": 49, "step_interval": 5, "values": [10.7947, 10.85294, 10.87058, 10.83388, 10.83025, 10.78755, 10.56419, 10.57339, 10.48735, 10.19553]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2452.0, 2744.0, 2176.0, 2722.0, 2636.0, 2535.0, 2996.0]}, "iteration_timing_avg": 0.1158709090909091}
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps.json b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps.json
deleted file mode 100644
index 2347dfdf9c5ca5384d93a3deb98b45a45b6de612..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps.json
+++ /dev/null
@@ -1 +0,0 @@
-{"lm loss": {"start_step": 0, "end_step": 48, "step_interval": 5, "values": [10.85716, 10.88973, 10.879, 10.87014, 10.87978, 10.84463, 10.67266, 10.62932, 10.52767, 10.25362]}, "num-zeros": {"start_step": 0, "end_step": 31, "step_interval": 5, "values": [2450.0, 2396.0, 2523.0, 2242.0, 2225.0, 2478.0, 2536.0]}, "iteration_timing_avg": 0.11416968750000002}
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps.json b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps.json
deleted file mode 100644
index 5adc692b5d0a567c591814c8e55c329fb34df824..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps.json
+++ /dev/null
@@ -1 +0,0 @@
-{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86276, 10.88058, 10.87527, 10.88402, 10.89173, 10.84724, 10.6886, 10.62864, 10.53925, 10.26646]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2199.0, 2306.0, 2412.0, 2032.0, 2077.0, 2475.0, 2347.0]}, "iteration_timing_avg": 0.15481029411764707}
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh
deleted file mode 100644
index d5c2f83e06bc407b17f02e0913cca46275369225..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh
+++ /dev/null
@@ -1,100 +0,0 @@
-#! /bin/bash
-
-DATA_PATH=$1
-CHECKPOINT_PATH=$2
-TENSORBOARD_DIR=$3
-TP_SIZE=$4
-PP_SIZE=$5
-NNODES=$6
-
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-
-# Runs the "345M" parameter model
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
-
-# Run for 100 iterations
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
-       pretrain_bert.py \
-       --use-checkpoint-args \
-       --use-checkpoint-opt_param-scheduler \
-       --num-layers 24 \
-       --hidden-size 1024 \
-       --num-attention-heads 16 \
-       --log-params-norm \
-       --log-num-zeros-in-grad \
-       --log-validation-ppl-to-tensorboard \
-       --log-timers-to-tensorboard \
-       --tensorboard-dir ${TENSORBOARD_DIR} \
-       --micro-batch-size 4 \
-       --global-batch-size 128 \
-       --seq-length 512 \
-       --max-position-embeddings 512 \
-       --train-iters 100 \
-       --timing-log-level 2 \
-       --lr-decay-iters 990000 \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
-       --vocab-file /workspace/data/bert_data/vocab.txt \
-       --data-impl mmap \
-       --split 949,50,1 \
-       --distributed-backend nccl \
-       --lr 0.0001 \
-       --min-lr 0.00001 \
-       --lr-warmup-fraction 0.01 \
-       --log-interval 1 \
-       --save-interval 50 \
-       --eval-interval 1000 \
-       --eval-iters 10 \
-       --tensor-model-parallel-size $TP_SIZE \
-       --pipeline-model-parallel-size $PP_SIZE \
-       --no-gradient-accumulation-fusion \
-       --fp16
-
-echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt
-
-# Resume from 50th iteration ckpt and continue to 100 iterations
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
-       pretrain_bert.py \
-       --use-checkpoint-args \
-       --use-checkpoint-opt_param-scheduler \
-       --num-layers 24 \
-       --hidden-size 1024 \
-       --num-attention-heads 16 \
-       --log-params-norm \
-       --log-num-zeros-in-grad \
-       --log-validation-ppl-to-tensorboard \
-       --log-timers-to-tensorboard \
-       --tensorboard-dir ${TENSORBOARD_DIR} \
-       --micro-batch-size 4 \
-       --global-batch-size 128 \
-       --seq-length 512 \
-       --max-position-embeddings 512 \
-       --train-iters 100 \
-       --timing-log-level 2 \
-       --lr-decay-iters 990000 \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
-       --vocab-file /workspace/data/bert_data/vocab.txt \
-       --data-impl mmap \
-       --split 949,50,1 \
-       --distributed-backend nccl \
-       --lr 0.0001 \
-       --min-lr 0.00001 \
-       --lr-warmup-fraction 0.01 \
-       --log-interval 1 \
-       --save-interval 10000 \
-       --eval-interval 1000 \
-       --eval-iters 10 \
-       --tensor-model-parallel-size $TP_SIZE \
-       --pipeline-model-parallel-size $PP_SIZE \
-       --no-gradient-accumulation-fusion \
-       --fp16
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh
deleted file mode 100644
index af24b473da61fa171d52cea0e6fdeefc01bfca35..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh
+++ /dev/null
@@ -1,59 +0,0 @@
-#! /bin/bash
-set -o xtrace
-
-DATA_PATH=$1
-CHECKPOINT_PATH=$2
-TENSORBOARD_DIR=$3
-TP_SIZE=$4
-PP_SIZE=$5
-NNODES=$6
-MAX_STEPS=$7
-VP_SIZE=$8
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-
-# Runs the "345M" parameter model
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
-       pretrain_bert.py \
-       --num-layers 24 \
-       --hidden-size 1024 \
-       --num-attention-heads 16 \
-       --log-params-norm \
-       --log-num-zeros-in-grad \
-       --log-validation-ppl-to-tensorboard \
-       --log-timers-to-tensorboard \
-       --tensorboard-dir ${TENSORBOARD_DIR} \
-       --micro-batch-size 4 \
-       --global-batch-size 128 \
-       --seq-length 512 \
-       --max-position-embeddings 512 \
-       --train-iters $MAX_STEPS \
-       --timing-log-level 2 \
-       --lr-decay-iters 990000 \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
-       --vocab-file /workspace/data/bert_data/vocab.txt \
-       --data-impl mmap \
-       --split 949,50,1 \
-       --distributed-backend nccl \
-       --lr 0.0001 \
-       --min-lr 0.00001 \
-       --lr-warmup-fraction 0.01 \
-       --log-interval 1 \
-       --save-interval 10000 \
-       --eval-interval 1000 \
-       --eval-iters 10 \
-       --tensor-model-parallel-size $TP_SIZE \
-       --pipeline-model-parallel-size $PP_SIZE \
-       ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \
-       --no-gradient-accumulation-fusion \
-       --fp16 
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_resume_checkpoint_test.sh b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_resume_checkpoint_test.sh
deleted file mode 100644
index 31b3ff993737e897927de2a8ed061f803b61a4f1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_resume_checkpoint_test.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-# Parameters
-#SBATCH --account=adlr
-#SBATCH --job-name=adlr-ci:megatron-job
-#SBATCH --nodes=1
-#SBATCH --partition=luna
-
-DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence
-CHECKPOINT_PATH=/workspace/checkpoints
-TENSORBOARD_DIR=/workspace/logs
-
-srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
-  ls 
-  cd /workspace/megatron-lm
-  ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES"
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh
deleted file mode 100644
index 45a441b27ecd945e96e2b7387319d2c62ec88ef6..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-# Parameters
-#SBATCH --account=adlr
-#SBATCH --job-name=adlr-ci:megatron-job
-#SBATCH --nodes=1
-#SBATCH --partition=luna
-
-DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence
-CHECKPOINT_PATH=/workspace/checkpoints
-TENSORBOARD_DIR=/workspace/logs
-
-srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
-  ls 
-  cd /workspace/megatron-lm
-  ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE"
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh
deleted file mode 100644
index 7a91a13c5460b80c30a801b39bb02d3e523db81e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh
+++ /dev/null
@@ -1,108 +0,0 @@
-#! /bin/bash
-
-DATA_PATH=$1
-CHECKPOINT_PATH=$2
-TENSORBOARD_DIR=$3
-TP_SIZE=$4
-PP_SIZE=$5
-NNODES=$6
-
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-
-# Runs the "345M" parameter model
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
-
-# Run for 100 iterations and save checkpoint at 50
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
-       pretrain_gpt.py \
-       --use-checkpoint-args \
-       --use-checkpoint-opt_param-scheduler \
-       --num-layers 12 \
-       --hidden-size 512 \
-       --num-attention-heads 8 \
-       --log-params-norm \
-       --log-num-zeros-in-grad \
-       --log-validation-ppl-to-tensorboard \
-       --log-timers-to-tensorboard \
-       --tensorboard-dir ${TENSORBOARD_DIR} \
-       --micro-batch-size 4 \
-       --global-batch-size 32 \
-       --seq-length 1024 \
-       --max-position-embeddings 1024 \
-       --train-iters 100 \
-       --timing-log-level 2 \
-       --lr-decay-iters 320000 \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
-       --vocab-file /workspace/data/gpt3_data/gpt2-vocab.json \
-       --merge-file /workspace/data/gpt3_data/gpt2-merges.txt \
-       --data-impl mmap \
-       --split 949,50,1 \
-       --distributed-backend nccl \
-       --lr 0.00015 \
-       --lr-decay-style cosine \
-       --min-lr 1.0e-5 \
-       --weight-decay 1e-2 \
-       --clip-grad 1.0 \
-       --lr-warmup-fraction .01 \
-       --log-interval 1 \
-       --save-interval 50 \
-       --eval-interval 1000 \
-       --eval-iters 10 \
-       --tensor-model-parallel-size $TP_SIZE \
-       --pipeline-model-parallel-size $PP_SIZE \
-       --no-gradient-accumulation-fusion \
-       --fp16
-
-echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt
-
-# Resume from 50th iteration ckpt and continue to 100 iterations
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
-       pretrain_gpt.py \
-       --use-checkpoint-args \
-       --use-checkpoint-opt_param-scheduler \
-       --num-layers 12 \
-       --hidden-size 512 \
-       --num-attention-heads 8 \
-       --log-params-norm \
-       --log-num-zeros-in-grad \
-       --log-validation-ppl-to-tensorboard \
-       --log-timers-to-tensorboard \
-       --tensorboard-dir ${TENSORBOARD_DIR} \
-       --micro-batch-size 4 \
-       --global-batch-size 32 \
-       --seq-length 1024 \
-       --max-position-embeddings 1024 \
-       --train-iters 100 \
-       --timing-log-level 2 \
-       --lr-decay-iters 320000 \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
-       --vocab-file /workspace/data/gpt3_data/gpt2-vocab.json \
-       --merge-file /workspace/data/gpt3_data/gpt2-merges.txt \
-       --data-impl mmap \
-       --split 949,50,1 \
-       --distributed-backend nccl \
-       --lr 0.00015 \
-       --lr-decay-style cosine \
-       --min-lr 1.0e-5 \
-       --weight-decay 1e-2 \
-       --clip-grad 1.0 \
-       --lr-warmup-fraction .01 \
-       --log-interval 1 \
-       --save-interval 10000 \
-       --eval-interval 1000 \
-       --eval-iters 10 \
-       --tensor-model-parallel-size $TP_SIZE \
-       --pipeline-model-parallel-size $PP_SIZE \
-       --no-gradient-accumulation-fusion \
-       --fp16
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh
deleted file mode 100644
index 5ab3b76c420eab911636b9d4959bbd5997011cc0..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh
+++ /dev/null
@@ -1,76 +0,0 @@
-#! /bin/bash
-
-DATA_PATH=$1
-CHECKPOINT_PATH=$2
-TENSORBOARD_DIR=$3
-USE_TE=$4
-TP_SIZE=$5
-PP_SIZE=$6
-NNODES=$7
-MAX_STEPS=$8
-VP_SIZE=$9
-MBS=${10}
-GBS=${11}
-GPUS_PER_NODE=8
-# Change for multinode config
-MASTER_ADDR=localhost
-MASTER_PORT=6000
-NODE_RANK=0
-WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-TRANSFORMER_IMPL=local
-TRAINING_DTYPE=fp16
-
-if [[ $USE_TE -eq 1 ]]; then
-       echo "Running with TransformerEngine ..."
-       TRANSFORMER_IMPL=transformer_engine
-       TRAINING_DTYPE=bf16
-else
-       echo "Running with local transformer implementation ..."
-fi
-
-# Runs the "345M" parameter model
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES"
-
-torchrun $DISTRIBUTED_ARGS \
-       pretrain_gpt.py \
-       --num-layers 12 \
-       --hidden-size 512 \
-       --num-attention-heads 8 \
-       --log-params-norm \
-       --log-num-zeros-in-grad \
-       --log-validation-ppl-to-tensorboard \
-       --log-timers-to-tensorboard \
-       --tensorboard-dir ${TENSORBOARD_DIR} \
-       --micro-batch-size ${MBS:-4} \
-       --global-batch-size ${GBS:-32} \
-       --seq-length 1024 \
-       --max-position-embeddings 1024 \
-       --train-iters $MAX_STEPS \
-       --timing-log-level 2 \
-       --lr-decay-iters 320000 \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
-       --vocab-file /workspace/data/gpt3_data/gpt2-vocab.json \
-       --merge-file /workspace/data/gpt3_data/gpt2-merges.txt \
-       --data-impl mmap \
-       --split 949,50,1 \
-       --distributed-backend nccl \
-       --lr 0.00015 \
-       --lr-decay-style cosine \
-       --min-lr 1.0e-5 \
-       --weight-decay 1e-2 \
-       --clip-grad 1.0 \
-       --lr-warmup-fraction .01 \
-       --log-interval 1 \
-       --save-interval 10000 \
-       --eval-interval 1000 \
-       --eval-iters 10 \
-       --transformer-impl $TRANSFORMER_IMPL \
-       --tensor-model-parallel-size $TP_SIZE \
-       --pipeline-model-parallel-size $PP_SIZE \
-       ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \
-       --no-gradient-accumulation-fusion \
-       --${TRAINING_DTYPE}
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_resume_checkpoint_test.sh b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_resume_checkpoint_test.sh
deleted file mode 100644
index f9761a134681424d3125404d5d99c84f3b7c3595..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_resume_checkpoint_test.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-# Parameters
-#SBATCH --account=adlr
-#SBATCH --job-name=adlr-ci:megatron-job
-#SBATCH --nodes=1
-#SBATCH --partition=luna
-
-DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document
-CHECKPOINT_PATH=/workspace/checkpoints
-TENSORBOARD_DIR=/workspace/logs
-
-srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
-  ls 
-  cd /workspace/megatron-lm
-  ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES"
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh b/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh
deleted file mode 100644
index cab43bc15658d587e853251e15947b2e13a60fa3..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-
-# Parameters
-#SBATCH --account=adlr
-#SBATCH --job-name=adlr-ci:megatron-job
-#SBATCH --nodes=1
-#SBATCH --partition=luna
-
-DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document
-CHECKPOINT_PATH=/workspace/checkpoints
-TENSORBOARD_DIR=/workspace/logs
-IMAGE=gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel
-
-if [[ $USE_TE -eq 1 ]]; then
-  echo "Using container nvcr.io/nvidia/pytorch:23.04-py3 for running with TE ..."
-  IMAGE=nvcr.io/nvidia/pytorch:23.04-py3
-fi
-
-srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image $IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
-  ls 
-  cd /workspace/megatron-lm
-  ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $USE_TE $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE $MBS $GBS"
diff --git a/toolbox/Megatron-DeepSpeed/tests/models/__init__.py b/toolbox/Megatron-DeepSpeed/tests/models/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/Megatron-DeepSpeed/tests/models/test_gpt_embedding.py b/toolbox/Megatron-DeepSpeed/tests/models/test_gpt_embedding.py
deleted file mode 100644
index 38d4bd767e37915aabca4de6bc9467f892873d1f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/models/test_gpt_embedding.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import pytest
-
-import torch
-
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.models.gpt.gpt_embedding import GPTEmbedding
-
-
-@pytest.fixture
-def gpt_embedding(transformer_config):
-    embedding = GPTEmbedding(config=transformer_config, vocab_size=100, max_sequence_length=4)
-    return embedding
-
-
-class TestGPTEmbedding:
-    def test_constructor(self, gpt_embedding: GPTEmbedding):
-        assert isinstance(gpt_embedding, GPTEmbedding)
-        num_weights = sum([p.numel() for p in gpt_embedding.parameters()])
-        assert num_weights == 1248
-
-    def test_zero_parameters(self, gpt_embedding: GPTEmbedding):
-        sum_weights = sum([p.sum() for p in gpt_embedding.parameters()])
-        assert sum_weights != 0
-        gpt_embedding.zero_parameters()
-        sum_weights = sum([p.sum() for p in gpt_embedding.parameters()])
-        assert sum_weights == 0
-
-    def test_cpu_forward(self, gpt_embedding: GPTEmbedding):
-        input_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1))
-        position_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1))
-        embeddings = gpt_embedding(input_ids, position_ids)
-        assert embeddings.device.type == 'cpu'
-        assert embeddings.shape[0] == gpt_embedding.max_sequence_length
-        assert embeddings.shape[1] == input_ids.shape[0]
-        assert embeddings.shape[2] == gpt_embedding.config.hidden_size
-
-    def test_gpu_forward(self, gpt_embedding: GPTEmbedding):
-        gpt_embedding.cuda()
-        input_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).cuda()
-        position_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).cuda()
-        embeddings = gpt_embedding(input_ids, position_ids)
-        assert embeddings.device.type == 'cuda'
-        assert embeddings.shape[0] == gpt_embedding.max_sequence_length
-        assert embeddings.shape[1] == input_ids.shape[0]
-        assert embeddings.shape[2] == gpt_embedding.config.hidden_size
diff --git a/toolbox/Megatron-DeepSpeed/tests/models/test_gpt_model.py b/toolbox/Megatron-DeepSpeed/tests/models/test_gpt_model.py
deleted file mode 100644
index 119a0a1ff88e81b4b2888197f712bc2636f3dcde..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/models/test_gpt_model.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import pytest
-
-import torch
-
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.models.gpt.gpt_model import GPTModel
-
-
-@pytest.fixture
-def gpt_model(transformer_config):
-    language_model = GPTModel(config=transformer_config, vocab_size=100, max_sequence_length=4)
-    return language_model
-
-
-class TestGPTModel:
-    def test_constructor(self, gpt_model: GPTModel):
-        assert isinstance(gpt_model, GPTModel)
-
-        assert gpt_model.max_sequence_length == 4
-
-        num_weights = sum([p.numel() for p in gpt_model.parameters()])
-        assert num_weights == 5040
-
-    def test_set_input_tensor(self, gpt_model: GPTModel):
-        config: TransformerConfig = gpt_model.config
-        sequence_length = gpt_model.max_sequence_length
-        micro_batch_size = 2
-
-        # [sequence length, batch size, hidden size]
-        input_tensor = torch.ones((sequence_length, micro_batch_size, config.hidden_size))
-
-        gpt_model.set_input_tensor(input_tensor)
-
-        assert gpt_model.decoder.input_tensor.shape[0] == sequence_length
-        assert gpt_model.decoder.input_tensor.shape[1] == micro_batch_size
-        assert gpt_model.decoder.input_tensor.shape[2] == config.hidden_size
-
-    def test_post_process_forward(self, gpt_model: GPTModel):
-        config: TransformerConfig = gpt_model.config
-        sequence_length = gpt_model.max_sequence_length
-        micro_batch_size = 2
-
-        gpt_model.cuda()
-
-        data = list(range(sequence_length))
-        input_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
-        position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
-        attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda()
-
-        logits = gpt_model.forward(input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask)
-
-        assert logits.shape[0] == micro_batch_size
-        assert logits.shape[1] == sequence_length
-        assert logits.shape[2] == gpt_model.vocab_size
-
-    def test_no_post_process_forward(self, gpt_model: GPTModel):
-        pass
-
-    def test_no_preprocess_forward(self, gpt_model: GPTModel):
-        pass
-
-    def test_state_dict_for_save_checkpoint(self, gpt_model: GPTModel):
-        pass
-
-    def test_load_state_dict(self, gpt_model: GPTModel):
-        pass
-
diff --git a/toolbox/Megatron-DeepSpeed/tests/pipeline_parallel/__init__.py b/toolbox/Megatron-DeepSpeed/tests/pipeline_parallel/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/Megatron-DeepSpeed/tests/pipeline_parallel/test_schedules.py b/toolbox/Megatron-DeepSpeed/tests/pipeline_parallel/test_schedules.py
deleted file mode 100644
index 46fb70525c9c80eb97157af321439c7008fdd337..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/pipeline_parallel/test_schedules.py
+++ /dev/null
@@ -1,201 +0,0 @@
-import torch
-from tests.test_utilities import Utils
-from megatron_ds.core import ModelParallelConfig
-import megatron_ds.core.pipeline_parallel.schedules as schedule
-from pytest_mock import mocker 
-import pytest
-
-rank = Utils.rank
- 
-def test_get_forward_backward_func():
-    Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
-    assert(schedule.get_forward_backward_func() == schedule.forward_backward_no_pipelining)
-    Utils.destroy_model_parallel()
-    Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4)
-    assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_without_interleaving)
-    Utils.destroy_model_parallel()
-    Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4, virtual_pipeline_model_parallel_size=2)
-    assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_with_interleaving)
-    Utils.destroy_model_parallel()
-
-def test_deallocate_output_tensor():
-    out = torch.tensor([[1, 2, 3], [4, 5, 6]])
-    schedule.deallocate_output_tensor(out)
-    assert(out.nelement() == 1) 
-
-def test_forward_backward_func_without_pipeline_parallel(mocker):
-    from megatron_ds.core.pipeline_parallel import get_forward_backward_func
-
-    Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
-
-    def forward_step_func(data_iterator, model, config=None):
-        import os
-        rank = int(os.environ['LOCAL_RANK'])
-        dummy_data = torch.ones(1,4)
-        def loss_func(output_tensor):
-            return rank, {'loss_reduced':rank}
-        return model(dummy_data), loss_func
-
-    model = torch.nn.Linear(4,1)
-    model.model_type = 'unit-test'
-    def set_input_tensor(input_tensor):
-        return None
-    model.set_input_tensor = set_input_tensor
-
-    forward_backward_func = get_forward_backward_func()
-    assert(schedule.get_forward_backward_func() == schedule.forward_backward_no_pipelining)
-
-    mocker.patch("megatron_ds.core.pipeline_parallel.schedules.custom_backward", return_value=2)
-    config = ModelParallelConfig(
-        pipeline_model_parallel_size = 1
-    )
-    model.config = config
-
-    losses_reduced = forward_backward_func(
-        forward_step_func=forward_step_func,
-        data_iterator=None,
-        model=[model],
-        num_microbatches=4,
-        seq_length=None,
-        micro_batch_size=None,
-        forward_only=False) 
-    
-    loss_reduced_expected = [{'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}]
-    for i,j in zip(losses_reduced, loss_reduced_expected):
-        print(losses_reduced)
-        assert(i['loss_reduced'] == j['loss_reduced'])
-    Utils.destroy_model_parallel() 
-
-def test_forward_backward_func_with_pipeline_parallel(mocker):
-    from megatron_ds.core.pipeline_parallel import get_forward_backward_func
-
-    Utils.initialize_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=4)
-
-    def forward_step_func(data_iterator, model, config=None):
-        import os
-        rank = int(os.environ['LOCAL_RANK'])
-        def loss_func(output_tensor):
-            return rank, {'loss_reduced':rank}
-        return torch.rand(512,8,256).cuda(), loss_func
-
-    model = torch.nn.Linear(4,1)
-    model.model_type = 'unit-test'
-    def set_input_tensor(input_tensor):
-        return None
-    model.set_input_tensor = set_input_tensor
-
-    forward_backward_func = get_forward_backward_func()
-    assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_without_interleaving)
-
-    sequence_length = 512
-    micro_batch_size = 8
-    hidden_size = 256
-
-    config = ModelParallelConfig(
-        pipeline_model_parallel_size = 4,
-        sequence_parallel = False
-    )
-    model.config = config
-    
-    losses_reduced = forward_backward_func(
-        forward_step_func=forward_step_func,
-        data_iterator=None,
-        dtype=torch.float32,
-        model=[model],
-        num_microbatches= micro_batch_size,
-        seq_length=sequence_length,
-        micro_batch_size=micro_batch_size,
-        forward_only=True) 
-    
-    loss_reduced_expected = [{'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}]
-    for i,j in zip(losses_reduced, loss_reduced_expected):
-        print(losses_reduced)
-        assert(i['loss_reduced'] == j['loss_reduced'])
-    Utils.destroy_model_parallel()  
-
-""" 
-def test_forward_backward_func_with_interleaving(mocker):
-    from megatron_ds.core.pipeline_parallel import get_forward_backward_func
-    from megatron_ds.core.enums import ModelType
-
-    Utils.initialize_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=4, virtual_pipeline_model_parallel_size=2)
-
-    def forward_step_func(data_iterator, model, config=None):
-        import os
-        rank = int(os.environ['LOCAL_RANK'])
-        def loss_func(output_tensor):
-            return rank, {'loss_reduced':rank}
-        return torch.rand(512,8,256).cuda(), loss_func
-
-    model = torch.nn.Linear(4,1)
-    def set_input_tensor(input_tensor):
-        return None
-    model.set_input_tensor = set_input_tensor
-
-    forward_backward_func = get_forward_backward_func()
-    assert(schedule.get_forward_backward_func() == schedule.forward_backward_pipelining_with_interleaving)
-
-    sequence_length = 512
-    micro_batch_size = 8
-    hidden_size = 256
-
-    mocker.patch("megatron_ds.core.pipeline_parallel.schedules.custom_backward", return_value=2)
-
-    with pytest.raises(RuntimeError):
-        model.model_type = ModelType.encoder_and_decoder
-        forward_backward_func(
-            forward_step_func=forward_step_func,
-            data_iterator=range(0,100),
-            dtype=torch.float32,
-            model=[model, model],
-            num_microbatches= micro_batch_size,
-            tensor_shape=[sequence_length, micro_batch_size, hidden_size],
-            decoder_seq_length=sequence_length,
-            sequence_parallel=False,
-            forward_only=True)
-        
-    with pytest.raises(RuntimeError):
-        model.model_type = ModelType.encoder_or_decoder
-        forward_backward_func(
-            forward_step_func=forward_step_func,
-            data_iterator=range(0,100),
-            dtype=torch.float32,
-            model=[model, model],
-            num_microbatches= micro_batch_size,
-            tensor_shape=[sequence_length, micro_batch_size, hidden_size],
-            decoder_seq_length=256,
-            sequence_parallel=False,
-            forward_only=True)
-
-    with pytest.raises(RuntimeError):
-        model.model_type = ModelType.encoder_or_decoder
-        forward_backward_func(
-            forward_step_func=forward_step_func,
-            data_iterator=range(0,100),
-            dtype=torch.float32,
-            model=[model, model],
-            num_microbatches= 7,
-            tensor_shape=[sequence_length, micro_batch_size, hidden_size],
-            decoder_seq_length=512,
-            sequence_parallel=False,
-            forward_only=True)    
-
-    model.model_type = ModelType.encoder_or_decoder
-    losses_reduced = forward_backward_func(
-        forward_step_func=forward_step_func,
-        data_iterator=range(0,100),
-        dtype=torch.float32,
-        model=[model, model],
-        num_microbatches= micro_batch_size,
-        tensor_shape=[sequence_length, micro_batch_size, hidden_size],
-        decoder_seq_length=sequence_length,
-        sequence_parallel=True,
-        forward_only=True) 
-    
-    loss_reduced_expected = [{'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}, {'loss_reduced': rank}]
-    for i,j in zip(losses_reduced, loss_reduced_expected):
-        print(losses_reduced)
-        assert(i['loss_reduced'] == j['loss_reduced'])
-
-    Utils.destroy_model_parallel()  
-"""
diff --git a/toolbox/Megatron-DeepSpeed/tests/requirements.txt b/toolbox/Megatron-DeepSpeed/tests/requirements.txt
deleted file mode 100644
index 92c2518edd905e1c6e69021cf4bc47e37416cb92..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-pytest_mock
-pybind11
-regex
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/run_megatron.py b/toolbox/Megatron-DeepSpeed/tests/run_megatron.py
deleted file mode 100644
index ff40bd8a26e6a7c8748905f989bc2d85f3e9a465..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/run_megatron.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import torch
-import deepspeed
-import megatronspeed.megatron_adaptor
-from megatron import get_args
-from megatron.core import mpu
-from megatron.training.checkpointing import load_checkpoint
-from megatron.training.initialize import initialize_megatron
-from megatron.legacy.model import GPTModel
-from megatron.training.training import get_model
-from megatron.training.arguments import core_transformer_config_from_args
-from megatronspeed.text_generation_utils import generate_samples_eval
-
-
-def model_provider(pre_process=True, post_process=True):
-
-    config = core_transformer_config_from_args(get_args())
-
-    model = GPTModel(
-        config=config,
-        num_tokentypes=0,
-        parallel_output=False,
-        pre_process=pre_process,
-        post_process=post_process,
-        return_moe_loss=False,
-    )
-    return model
-
-
-def add_text_generate_args(parser):
-    """Text generation arguments."""
-    group = parser.add_argument_group(title="text generation")
-
-    group.add_argument(
-        "--temperature", type=float, default=1.0, help="Sampling temperature."
-    )
-    group.add_argument(
-        "--greedy", action="store_true", default=False, help="Use greedy sampling."
-    )
-    group.add_argument("--top_p", type=float, default=0.0, help="Top p sampling.")
-    group.add_argument("--top_k", type=int, default=0, help="Top k sampling.")
-    group.add_argument(
-        "--out-seq-length",
-        type=int,
-        default=1024,
-        help="Size of the output generated text.",
-    )
-    group.add_argument(
-        "--sample-input-file",
-        type=str,
-        default=None,
-        help="Get input from file instead of interactive mode, "
-        "each line is an input.",
-    )
-    group.add_argument(
-        "--sample-output-file",
-        type=str,
-        default=None,
-        help="Output file got from --sample-input-file",
-    )
-    group.add_argument(
-        "--num-samples",
-        type=int,
-        default=0,
-        help="Number of samples to generate unconditionally, "
-        "defaults to 0 and interactive conditional sampling",
-    )
-    group.add_argument(
-        "--genfile", type=str, help="Output file when generating unconditionally"
-    )
-    group.add_argument(
-        "--recompute",
-        action="store_true",
-        help="During generation recompute all attention "
-        "instead of using previously computed keys/values.",
-    )
-    group.add_argument(
-        "--context-tokens", type=str, default="DeepSpeed is the greatest"
-    )
-    group.add_argument("--max-tokens", type=int, default=50)
-
-    return parser
-
-
-if __name__ == "__main__":
-    # initialize megatron
-    initialize_megatron(
-        extra_args_provider=add_text_generate_args,
-        args_defaults={
-            "tokenizer_type": "GPT2BPETokenizer",
-            "no_load_rng": True,
-            "no_load_optim": True,
-        },
-    )
-    args = get_args()
-
-    # setup model
-    model = get_model(model_provider)
-    _ = load_checkpoint(model, None, None)
-    model = model[0]
-    if args.ds_inference:
-        engine = deepspeed.init_inference(
-            model=model,
-            mp_size=args.tensor_model_parallel_size,
-            tensor_parallel={"mpu": mpu},
-            dtype=torch.half,
-            replace_with_kernel_inject=True,
-            moe_experts=args.num_experts,
-            moe_type=args.mlp_type,
-        )
-        model = engine.module
-
-    # generate output
-    generate_samples_eval(
-        model, args.context_tokens, 1, 0
-    )  # Just so we don't get log output from DeepSpeed (this should be removed once we improve logging in DeepSpeed)
-    print("===START OUTPUT===")
-    print(generate_samples_eval(model, args.context_tokens, args.max_tokens, 0))
-    print("===END OUTPUT===")
diff --git a/toolbox/Megatron-DeepSpeed/tests/run_test_multi_node.sh b/toolbox/Megatron-DeepSpeed/tests/run_test_multi_node.sh
deleted file mode 100644
index ffd2504fa51a11f7a0a489154cf8993fc97b8390..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/run_test_multi_node.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-# python3 tests.py \
-# --timeout_per_case 120 \
-# --ignore_timeout \
-# --files  'unit_tests/test_utils.py \
-# unit_tests/test_basic.py \
-# unit_tests/test_parallel_state.py ' \
-# --excludes 'unit_tests/tensor_parallel/test_tensor_parallel_utils.py'
-# exit $?
-
-## 使用sh脚本将每个ci测试的文件在不同节点上执行
-host_name=$HOST_NAME
-addr_array=$ADDR_ARRAY
-container_name=$CONTAINER_NAME
-
-addr_array=(${ADDR_ARRAY//,/ }) ## get ip array
-# addr_array=("10.113.2.1" "10.113.2.2")
-
-HOST_IP=$(hostname -I)
-CURRENT_DIR=`pwd`
-CUR_SCR=$0
-MASTER_PORT=8294
-PROJECT_DIR=$(dirname "$PWD")
-
-function exec_ssh_by_master
-{
-	# only at master host, start all other non master hosts run
-	if [[ "$HOST_IP" =~ "${addr_array[0]}" ]]
-	then
-		for i in "${!addr_array[@]}"
-		do
-			if [ "$i" != "0" ]
-			then	
-				
-				scp -r ${PROJECT_DIR} ${host_name}@${addr_array[$i]}:$(dirname "$PROJECT_DIR") ## scp whole megatron-deepspeed dir
-				ssh ${host_name}@${addr_array[$i]} "docker exec ${container_name} bash -c \"cd ${CURRENT_DIR}; export ADDR_ARRAY=$ADDR_ARRAY; bash ${CUR_SCR} \"" &
-			fi
-		done
-	fi
-}
-
-function run_ddp_mm()
-{
-    for i in "${!addr_array[@]}"
-    do
-	    if [[ "$HOST_IP" =~ "${addr_array[$i]}" ]]
-	    then
-		    echo "nodes: ${#addr_array[@]}, rank: $i, IP: $HOST_IP, MASTER_IP: ${addr_array[0]}"
-		    python3 tests.py \
-            --master_addr ${addr_array[0]} \
-            --master_port $MASTER_PORT \
-            --nnodes ${#addr_array[@]} \
-            --node_rank $i \
-            --timeout_per_case 120 \
-            --files  'unit_tests/test_utils.py \
-			unit_tests/test_basic.py \
-			unit_tests/test_parallel_state.py \
-			unit_tests/tensor_parallel/test_tensor_parallel_utils.py'
-			status=$?
-	    fi
-    done
-}
-
-exec_ssh_by_master
-run_ddp_mm
-## 保存退出码，回传给父shell
-echo $status > exit_code.txt
-
-exit 0
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/run_test_one_node.sh b/toolbox/Megatron-DeepSpeed/tests/run_test_one_node.sh
deleted file mode 100644
index b56c832461384ef3170e2a5b6d1bf68a67d873ee..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/run_test_one_node.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-python3 tests.py \
---timeout_per_case 120 \
---files  'unit_tests/test_utils.py \
-unit_tests/test_basic.py \
-unit_tests/test_parallel_state.py \
-unit_tests/tensor_parallel/test_tensor_parallel_utils.py' \
---master_addr localhost \
---master_port 5673 \
---nnodes 1 \
---node_rank 0
-status=$?
-if [ $status == 255 ]; then
-    exit -1
-else
-    exit $status
-fi
diff --git a/toolbox/Megatron-DeepSpeed/tests/tensor_parallel/__int__.py b/toolbox/Megatron-DeepSpeed/tests/tensor_parallel/__int__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/Megatron-DeepSpeed/tests/test_megatron.py b/toolbox/Megatron-DeepSpeed/tests/test_megatron.py
deleted file mode 100644
index d3ef821a3a9c778e3808bccd907e4848f6bebd88..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/test_megatron.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import pytest
-import os
-import re
-import subprocess
-
-
-@pytest.fixture(params=[1])
-def moe_num_experts(request):
-    return str(request.param)
-
-
-@pytest.fixture(params=[1])
-def mp_size(request):
-    return str(request.param)
-
-
-@pytest.fixture
-def params(moe_num_experts, mp_size):
-    base_dir = os.getenv("MEGATRON_CKPT_DIR")
-    assert base_dir, "Please set MEGATRON_CKPT_DIR in your environment"
-
-    vocab_file = os.path.join(base_dir, "gpt2-vocab.json")
-    merge_file = os.path.join(base_dir, "gpt2-merges.txt")
-    ckpt_path = os.path.join(base_dir, "checkpoints/gpt2_345m")
-
-    return [
-        "--micro-batch-size", "1",
-        "--num-layers", "24",
-        "--hidden-size", "1024",
-        "--num-attention-heads", "16",
-        "--max-position-embeddings", "1024",
-        "--vocab-file", vocab_file,
-        "--merge-file", merge_file,
-        "--load", ckpt_path,
-        "--seq-length", "1024",
-        "--out-seq-length", "1024",
-        "--tensor-model-parallel-size", mp_size,
-        "--tokenizer-type", "GPT2BPETokenizer",
-        "--num-experts", moe_num_experts,
-        "--mlp-type", "standard",
-        "--num-samples", "0",
-        "--fp16",
-    ]
-
-
-def test_moe_megatron(params, mp_size):
-    output_re = r"===START OUTPUT===([\S\s]*)===END OUTPUT==="
-
-    # Run the baseline
-    baseline_cmd = ["deepspeed", "--num_gpus", mp_size, "./run_megatron_ds.py"] + params
-    result = subprocess.run(baseline_cmd, stdout=subprocess.PIPE)
-    baseline_output = re.search(output_re, result.stdout.decode("utf-8")).group(1)
-
-    # Run with DeepSpeed
-    deepspeed_cmd = baseline_cmd + ["--ds-inference"]
-    result = subprocess.run(deepspeed_cmd, stdout=subprocess.PIPE)
-    deepspeed_output = re.search(output_re, result.stdout.decode("utf-8")).group(1)
-
-    assert (
-        baseline_output == deepspeed_output
-    ), f"outputs do not match: {baseline_output}\n{deepspeed_output}"
diff --git a/toolbox/Megatron-DeepSpeed/tests/test_megatron_adapter.py b/toolbox/Megatron-DeepSpeed/tests/test_megatron_adapter.py
deleted file mode 100644
index f79b0fffc384783822f84aa576025270bc44113c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/test_megatron_adapter.py
+++ /dev/null
@@ -1,6 +0,0 @@
-import os
-import sys
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-import megatronspeed.megatron_adaptor
diff --git a/toolbox/Megatron-DeepSpeed/tests/tests.py b/toolbox/Megatron-DeepSpeed/tests/tests.py
deleted file mode 100644
index 24b52718ea79ecdb3b9aa8abf01c8cdf31f31fcd..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/tests.py
+++ /dev/null
@@ -1,288 +0,0 @@
-import copy
-import dataclasses
-import enum
-import glob
-import os
-import subprocess
-import sys
-from argparse import ArgumentParser
-from typing import List, Union, Optional
-
-REQUIREMENTS_PY = ["tabulate"]
-DEFAULT_LOG_DIR = "./test_logs"
-
-
-def parse_args():
-    parser = ArgumentParser("Test Application")
-    parser.add_argument("--files", nargs='+', type=str,
-                        help="test files or directions.")
-    parser.add_argument("--log_dir", type=str, default=DEFAULT_LOG_DIR,
-                        help="log dir")
-    parser.add_argument("--timeout_per_case", type=int, default=None,
-                        help="timeout for per case")
-    parser.add_argument("--ignore_timeout", action="store_true",
-                        help="ignore timeoue case when detect return code")
-    parser.add_argument("--excludes", type=str, default=None,
-                        help="excludes file or dir, using comma to split")
-    parser.add_argument("--master_addr", type=str, default=None,
-                        help="master node address")
-    parser.add_argument("--master_port", type=str, default=None,
-                        help="master node port")
-    parser.add_argument("--nnodes", type=int, default=None,
-                        help="total nodes")
-    parser.add_argument("--node_rank", type=int, default=None,
-                        help="this node`s rank in nodes")
-
-    args = parser.parse_args()
-
-    if args.files is None:
-        raise RuntimeError(f"Got invalid files {args.files}.")
-
-    if isinstance(args.files,str):
-        args.files = args.files.splitlines()
-    if isinstance(args.excludes,str):
-        args.excludes = args.excludes.splitlines()
-
-
-    print(args)
-
-    return args
-
-
-def current_dir():
-    return os.path.abspath(os.path.join(__file__, ".."))
-
-
-def setup():
-    with open(os.path.join(current_dir(), "requirements.txt")) as f:
-        deps = f.readlines()
-
-    REQUIREMENTS_PY.extend(deps)
-
-    for dep in REQUIREMENTS_PY:
-        retcode = os.system(f"pip3 install {dep}")
-        if retcode != 0:
-            raise RuntimeError(f"Install {dep} fail.")
-
-
-def get_file_name(file_path):
-    if not isinstance(file_path, str):
-        raise RuntimeError(f"Invalid file path {file_path}")
-
-    return file_path.rsplit(".", maxsplit=1)[0]
-
-
-def get_file_ext(file: str) -> Optional[str]:
-    if "." not in file:
-        return None
-
-    return file.rsplit(".", maxsplit=1)[1]
-
-
-def is_python_file(file: str):
-    return file.endswith(".py")
-
-
-def rename_file_ext(file: str, new_ext: str):
-    if not new_ext.startswith("."):
-        new_ext = f".{new_ext}"
-
-    return f"{get_file_name(file)}{new_ext}"
-
-
-def find_files(dir: str, file_pattern: str) -> List[str]:
-    return glob.glob(os.path.join(dir, "**", file_pattern), recursive=True)
-
-
-def find_python_test_files(dir: str) -> List[str]:
-    if dir.endswith(".py"):
-        return [dir]
-
-    return find_files(dir, "test_*.py")
-
-
-class LogType(enum.Enum):
-    kContent = 0
-    kFile = 1
-
-
-@dataclasses.dataclass
-class Result:
-    command: str
-    retcode: int
-    test_file: str = None
-    log: Optional[str] = None
-    log_type: LogType = LogType.kFile
-    exception: Optional[Exception] = None
-
-    @property
-    def success(self):
-        return self.retcode == 0
-
-    @property
-    def is_timeout(self):
-        return isinstance(self.exception, subprocess.TimeoutExpired)
-
-
-def exec_command(command: Union[str, List], log_path, *args, **kwargs):
-    if not isinstance(command, (list, tuple)):
-        command = [command]
-    stdout = None
-    command.extend(['>', log_path, "2>&1"])
-    command = " ".join(command)
-
-    if "env" not in kwargs:
-        kwargs["env"] = copy.copy(os.environ)
-
-        kwargs["env"]["MEGATRON_TEST"] = "1"
-
-    res = subprocess.run(command, stdout=stdout, stderr=subprocess.STDOUT, shell=True, start_new_session=True, *args, **kwargs)
-
-    return res
-
-
-def run_py_case(args, py_file, test_args: List[str] = None, log_dir: str = None, timeout=None) -> Result:
-    if test_args is None:
-        test_args = []
-
-    if "test_utils.py" in py_file:
-        command = f"torchrun --nproc_per_node=1 -m pytest -s {py_file} {' '.join(test_args)} --junitxml={args.log_dir}/___{py_file.split('/')[-1][:-3]}.xml -o junit_suite_name={py_file.split('/')[-1][:-3]}"
-    else:
-        command = f"torchrun --nproc_per_node=8 --nnodes {args.nnodes} --node_rank {args.node_rank} \
-        --master_addr {args.master_addr} --master_port {args.master_port} -m pytest -s {py_file} {' '.join(test_args)} --junitxml={args.log_dir}/___{py_file.split('/')[-1][:-3]}.xml -o junit_suite_name={py_file.split('/')[-1][:-3]}"
-
-    if log_dir is None:
-        log_dir = DEFAULT_LOG_DIR
-
-    log_path = os.path.join(log_dir, rename_file_ext(os.path.basename(py_file), ".log"))
-
-    new_log_dir = os.path.dirname(log_path)
-    if not os.path.exists(new_log_dir):
-        os.makedirs(new_log_dir, exist_ok=True)
-
-    try:
-        res = exec_command(command, log_path, timeout=timeout)
-        result = Result(command=command, retcode=res.returncode, log=log_path, log_type=LogType.kFile)
-    except Exception as ex:
-        result = Result(command=command, retcode=1, log=log_path, log_type=LogType.kFile, exception=ex)
-
-    os.system(f"cat {log_path}")
-
-    return result
-
-
-def run_py_cases(args, files, log_dir = None, timeout_per_case = None, excludes: List[str] = None) -> List[Result]:
-    if log_dir is None:
-        log_dir = DEFAULT_LOG_DIR
-
-    if excludes is None:
-        excludes = []
-
-    def is_valid_test_case(file: str):
-
-        for exc in excludes:
-            if file.startswith(exc):
-                return False
-
-        return True
-    files = files[0].split(' ')
-    if isinstance(files, str):
-        files = [files]
-
-    if not isinstance(files, List):
-        files = list(files)
-
-    test_files = []
-    for i, path in enumerate(files):
-        if os.path.isfile(path) and not is_python_file(path):
-            raise RuntimeError(f"Got invalid python file {path}.")
-
-        if not os.path.isdir(path):
-            test_files.append(path)
-            continue
-
-        # 处理 目录
-        py_files = find_python_test_files(path)
-        print(py_files)
-        py_files.sort()
-        test_files.extend(py_files)
-
-    test_results = []
-    for i, file in enumerate(test_files):
-        print(f"Progress: {i+1} / {len(test_files)}, Case: {file}")
-        sys.stdout.flush()
-        if not is_valid_test_case(file):
-            print(f"Skip {file}")
-            continue
-
-        result = run_py_case(args=args, py_file=file, log_dir=log_dir, timeout=timeout_per_case)
-        result.test_file = file
-        test_results.append(result)
-
-    return test_results
-
-
-def format_execption(exception: Optional[Exception]):
-    if exception is None:
-        return "-"
-
-    if isinstance(exception, subprocess.TimeoutExpired):
-        return f"timed out after {round(exception.timeout, 2)} seconds"
-
-    return str(exception)
-
-
-def summary(results: List[Result]):
-    from tabulate import tabulate
-
-    header = ["Index", "file", "log path", "exception"]
-    success_cases = []
-    failed_cases = []
-    for i, result in enumerate(results):
-        if result.success:
-            success_cases.append([i, result.test_file, result.log, "-"])
-        else:
-            failed_cases.append(
-                [i, result.test_file, result.log, format_execption(result.exception)]
-            )
-
-    if len(success_cases) > 0:
-        print("=" * 80)
-        print("= Success Cases ")
-        print("=" * 80)
-        print(tabulate(success_cases, headers=header, tablefmt="simple"))
-
-    if len(failed_cases) > 0:
-        print("=" * 80)
-        print("= Failed Cases ")
-        print("=" * 80)
-        print(tabulate(failed_cases, headers=header, tablefmt="simple"))
-
-
-def check_status(results: List[Result], ignore_timeout: bool):
-    for result in results:
-        if ignore_timeout and result.is_timeout:
-            continue
-        # print(result)
-        if not result.success:
-            print("-" * 80)
-            print(f"Not all cases passed!")
-            exit(-1)
-
-    print("-" * 80)
-    print("Pass")
-
-
-if __name__ == '__main__':
-    setup()
-
-    args = parse_args()
-    results = run_py_cases(args,
-        args.files,
-        log_dir=args.log_dir,
-        excludes=args.excludes,
-        timeout_per_case=args.timeout_per_case
-    )
-    summary(results)
-    check_status(results, args.ignore_timeout)
-
diff --git a/toolbox/Megatron-DeepSpeed/tests/transformer/__init__.py b/toolbox/Megatron-DeepSpeed/tests/transformer/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/Megatron-DeepSpeed/tests/transformer/test_core_attention.py b/toolbox/Megatron-DeepSpeed/tests/transformer/test_core_attention.py
deleted file mode 100644
index 2456168030fdfdcd9b88ebfce15f69d8e3f61fdf..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/transformer/test_core_attention.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-
-import pytest
-
-import torch
-
-from megatron_ds.core.transformer.core_attention import CoreAttention
-
-
-@pytest.fixture
-def core_attention(transformer_config):
-    return CoreAttention(transformer_config)
-
-
-class TestCoreAttention:
-    def test_constructor(self, core_attention):
-        assert isinstance(core_attention, CoreAttention)
-        assert core_attention.layer_number == 1
-
-        num_weights = sum([p.numel() for p in core_attention.parameters()])
-        assert num_weights == 0
-
-    def test_cpu_forward(self, core_attention):
-        # we can't currently do this because the global memory buffer is on GPU
-        pass
-
-    def test_gpu_forward(self, core_attention):
-
-        # destroy_global_memory_buffer()
-        # _set_global_memory_buffer()
-        # model_parallel_cuda_manual_seed(123)
-
-        core_attention.cuda()
-        config = core_attention.config
-        sequence_length = 32
-        micro_batch_size = 2
-        # query_layer (float): [sequence_length, micro_batch_size, num_attention_heads, hidden_size / num_attention_heads]
-        query_layer = torch.ones(
-            (
-                sequence_length,
-                micro_batch_size,
-                config.num_attention_heads,
-                config.hidden_size // config.num_attention_heads,
-            )
-        ).cuda()
-
-        key_layer = torch.ones_like(query_layer).cuda()
-
-        value_layer = torch.ones_like(query_layer).cuda()
-
-        attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda()
-
-        context_layer = core_attention(
-            query_layer=query_layer, key_layer=key_layer, value_layer=value_layer, attention_mask=attention_mask
-        )
-
-        assert context_layer.shape[0] == sequence_length
-        assert context_layer.shape[1] == micro_batch_size
-        assert context_layer.shape[2] == config.hidden_size
-        assert context_layer.device.type == 'cuda'
-        assert context_layer.dtype == torch.float32
-
diff --git a/toolbox/Megatron-DeepSpeed/tests/transformer/test_module.py b/toolbox/Megatron-DeepSpeed/tests/transformer/test_module.py
deleted file mode 100644
index fea44d2bb18b4502e2425726284be3b99380c00d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/transformer/test_module.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import pytest
-
-import torch
-
-from megatron_ds.core.transformer.module import Float16Module, MegatronModule
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-
-DEVICE_CAPABILITY = None
-if torch.cuda.is_available():
-    DEVICE_CAPABILITY = torch.cuda.get_device_capability()
-
-
-class DummyModule(MegatronModule):
-    # def __init__(self, config: TransformerConfig, share_embeddings_and_output_weights=True):
-    def __init__(self, config: TransformerConfig):
-        super().__init__(config)
-
-        self.linear = torch.nn.modules.Linear(in_features=2, out_features=1)
-
-    def forward(self, x):
-        return self.linear(x)
-
-
-@pytest.fixture
-def megatron_module(transformer_config):
-    return DummyModule(config=transformer_config).cuda()
-
-
-class TestMegatronModule:
-    def test_megatron_module(self, megatron_module):
-        assert megatron_module
-        assert megatron_module.config.hidden_size == 12
-        assert megatron_module.config.ffn_hidden_size == 48
-        assert megatron_module.linear.weight.dtype == torch.float32
-
-        x = torch.ones((2, 2)).cuda()
-        assert megatron_module(x).dtype == torch.float32
-
-        # TODO: test bad configs actually fail
-        # failed_module = megatron_module
-        # failed_module.fp16 = True
-        # failed_module.bf16 = True
-
-
-class TestFloat16Module:
-    def test_fp16_module(self, transformer_config, megatron_module):
-        transformer_config.fp16 = True
-        fp16_module = Float16Module(config=transformer_config, module=megatron_module)
-
-        assert fp16_module
-        assert fp16_module.config.hidden_size == 12
-        assert fp16_module.config.ffn_hidden_size == 48
-        assert fp16_module.module.linear.weight.dtype == torch.float16
-
-        x = torch.ones((2, 2)).cuda()
-        # inputs are converted to fp16 then outputs are converted to fp32
-        assert fp16_module(x).dtype == torch.float32
-
-    pytest.mark.skipif(
-        not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8, reason='bfloat16 is not supported on this device'
-    )
-
-    def test_bf16_module(self, transformer_config, megatron_module):
-        transformer_config.bf16 = True
-        bf16_module = Float16Module(config=transformer_config, module=megatron_module)
-
-        assert bf16_module
-        assert bf16_module.config.hidden_size == 12
-        assert bf16_module.config.ffn_hidden_size == 48
-        assert bf16_module.module.linear.weight.dtype == torch.bfloat16
-
-        x = torch.ones((2, 2)).cuda()
-        # inputs are converted to bf16 then outputs are converted to fp32
-        assert bf16_module(x).dtype == torch.float32
-
diff --git a/toolbox/Megatron-DeepSpeed/tests/transformer/test_parallel_attention.py b/toolbox/Megatron-DeepSpeed/tests/transformer/test_parallel_attention.py
deleted file mode 100644
index 85bd71a765822109cd3bd2947c57f54efda7cfd0..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/transformer/test_parallel_attention.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import pytest
-
-import torch
-
-from megatron_ds.core.transformer.parallel_attention import ParallelAttention
-
-
-@pytest.fixture
-def parallel_attention(transformer_config):
-    return ParallelAttention(transformer_config)
-
-
-@pytest.fixture
-def checkpointed_parallel_attention(transformer_config):
-    transformer_config.recompute_granularity = 'selective'
-    return ParallelAttention(transformer_config)
-
-
-class TestParallelAttention:
-    def test_constructor(self, parallel_attention):
-        assert isinstance(parallel_attention, ParallelAttention)
-        assert parallel_attention.layer_number == 1
-
-        num_weights = sum([p.numel() for p in parallel_attention.parameters()])
-        assert num_weights == 624
-
-    def test_cpu_forward(self, parallel_attention):
-        # we can't currently do this because the global memory buffer is on GPU
-        pass
-
-    def test_gpu_forward(self, parallel_attention):
-
-        config = parallel_attention.config
-        sequence_length = 32
-        micro_batch_size = 2
-
-        parallel_attention.cuda()
-
-        # [sequence length, batch size, hidden size]
-        hidden_states = torch.ones((sequence_length, micro_batch_size, parallel_attention.config.hidden_size))
-        hidden_states = hidden_states.cuda()
-
-        attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda()
-
-        output, bias = parallel_attention(hidden_states, attention_mask)
-
-        assert config.recompute_granularity is None
-        assert output.shape[0] == sequence_length
-        assert output.shape[1] == micro_batch_size
-        assert output.shape[2] == config.hidden_size
-        assert bias.shape[0] == config.hidden_size
-
-    def test_checkpointed_gpu_forward(self, checkpointed_parallel_attention):
-
-        config = checkpointed_parallel_attention.config
-
-        sequence_length = 32
-        micro_batch_size = 2
-
-        checkpointed_parallel_attention.cuda()
-
-        # [sequence length, batch size, hidden size]
-        hidden_states = torch.ones(
-            (sequence_length, micro_batch_size, checkpointed_parallel_attention.config.hidden_size)
-        )
-        hidden_states = hidden_states.cuda()
-
-        attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda()
-
-        output, bias = checkpointed_parallel_attention(hidden_states, attention_mask)
-
-        assert config.recompute_granularity == 'selective'
-        assert output.shape[0] == sequence_length
-        assert output.shape[1] == micro_batch_size
-        assert output.shape[2] == config.hidden_size
-        assert bias.shape[0] == config.hidden_size
diff --git a/toolbox/Megatron-DeepSpeed/tests/transformer/test_parallel_mlp.py b/toolbox/Megatron-DeepSpeed/tests/transformer/test_parallel_mlp.py
deleted file mode 100644
index 4acf683f6a5a6f4c2dc1b7879940889a4cc6164c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/transformer/test_parallel_mlp.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import pytest
-
-import torch
-
-from megatron_ds.core.transformer.parallel_mlp import ParallelMLP
-
-
-@pytest.fixture
-def mlp(transformer_config):
-    return ParallelMLP(transformer_config)
-
-
-class TestParallelMLP:
-    def test_constructor(self, mlp):
-        assert isinstance(mlp, ParallelMLP)
-
-        num_weights = sum([p.numel() for p in mlp.parameters()])
-        assert num_weights == 1212
-
-    def test_cpu_forward(self, mlp):
-        # [sequence length, micro batch size, hidden size]
-        hidden_states = torch.ones((32, 2, mlp.config.hidden_size))
-        output, output_bias = mlp(hidden_states)
-        assert output.shape[0] == 32
-        assert output.shape[1] == 2
-        assert output.shape[2] == mlp.config.hidden_size
-        assert output_bias.shape[0] == mlp.config.hidden_size
-        assert output.dtype == torch.float32
-
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-    def test_gpu_forward(self, mlp):
-        mlp.cuda()
-        # [sequence length, batch size, hidden size]
-        hidden_states = torch.ones((32, 2, mlp.config.hidden_size))
-        hidden_states = hidden_states.cuda()
-        output, output_bias = mlp(hidden_states)
-        assert output.shape[0] == 32
-        assert output.shape[1] == 2
-        assert output.shape[2] == mlp.config.hidden_size
-        assert output_bias.shape[0] == mlp.config.hidden_size
-        assert output.dtype == torch.float32
-        assert output.device.type == 'cuda'
-        assert output_bias.device.type == 'cuda'
-
diff --git a/toolbox/Megatron-DeepSpeed/tests/transformer/test_parallel_transformer_block.py b/toolbox/Megatron-DeepSpeed/tests/transformer/test_parallel_transformer_block.py
deleted file mode 100644
index 77f239c938981a68569365eb4df849581d2c84fb..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/transformer/test_parallel_transformer_block.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-import pytest
-
-import torch
-
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.transformer.parallel_transformer_layer import ParallelTransformerLayer
-from megatron_ds.core.transformer.parallel_transformer_block import ParallelTransformerBlock
-
-
-@pytest.fixture
-def parallel_transformer_block(transformer_config):
-    return ParallelTransformerBlock(transformer_config)
-
-
-class TestParallelTransformerBlock:
-    def test_constructor(self, parallel_transformer_block: ParallelTransformerBlock):
-        assert isinstance(parallel_transformer_block, ParallelTransformerBlock)
-        num_weights = sum([p.numel() for p in parallel_transformer_block.parameters()])
-        assert num_weights == 3792
-        assert parallel_transformer_block.num_layers_per_pipeline_rank == 2
-        assert len(parallel_transformer_block.layers) == 2
-        layer_0: ParallelTransformerLayer = parallel_transformer_block._get_layer(0)
-        assert layer_0.layer_number == 1
-        layer_1: ParallelTransformerLayer = parallel_transformer_block._get_layer(1)
-        assert layer_1.layer_number == 2
-
-    def test_gpu_forward(self, parallel_transformer_block: ParallelTransformerBlock):
-        config: TransformerConfig = parallel_transformer_block.config
-
-        sequence_length = 32
-        micro_batch_size = 2
-        parallel_transformer_block.cuda()
-
-        # [sequence length, batch size, hidden size]
-        hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size))
-        hidden_states = hidden_states.cuda()
-
-        attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda()
-
-        hidden_states = parallel_transformer_block(hidden_states=hidden_states, attention_mask=attention_mask)
-        assert hidden_states.shape[0] == sequence_length
-        assert hidden_states.shape[1] == micro_batch_size
-        assert hidden_states.shape[2] == config.hidden_size
-
-    def test_gpu_forward_full_checkpoint(self, transformer_config: TransformerConfig):
-        config = transformer_config
-        config.recompute_granularity = 'full'
-        config.recompute_method = 'block'
-        config.recompute_num_layers = config.num_layers
-        full_transformer_block = ParallelTransformerBlock(config)
-        assert full_transformer_block.config.recompute_granularity == 'full'
-        assert full_transformer_block.config.recompute_method == 'block'
-
-        sequence_length = 32
-        micro_batch_size = 2
-        full_transformer_block.cuda()
-
-        # [sequence length, batch size, hidden size]
-        hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size))
-        hidden_states = hidden_states.cuda()
-
-        attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda()
-
-        hidden_states = full_transformer_block(hidden_states=hidden_states, attention_mask=attention_mask)
-        assert hidden_states.shape[0] == sequence_length
-        assert hidden_states.shape[1] == micro_batch_size
-        assert hidden_states.shape[2] == config.hidden_size
-
-    def test_gpu_forward_selective_checkpoint(self, transformer_config: TransformerConfig):
-        config = transformer_config
-        config.recompute_granularity = 'selective'
-        selective_transformer_block = ParallelTransformerBlock(config)
-        assert selective_transformer_block.config.recompute_granularity == 'selective'
-        assert selective_transformer_block.checkpoint_core_attention
-
-        sequence_length = 32
-        micro_batch_size = 2
-        selective_transformer_block.cuda()
-
-        # [sequence length, batch size, hidden size]
-        hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size))
-        hidden_states = hidden_states.cuda()
-
-        attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda()
-
-        hidden_states = selective_transformer_block(hidden_states=hidden_states, attention_mask=attention_mask)
-        assert hidden_states.shape[0] == sequence_length
-        assert hidden_states.shape[1] == micro_batch_size
-        assert hidden_states.shape[2] == config.hidden_size
diff --git a/toolbox/Megatron-DeepSpeed/tests/transformer/test_parallel_transformer_layer.py b/toolbox/Megatron-DeepSpeed/tests/transformer/test_parallel_transformer_layer.py
deleted file mode 100644
index 0b5f3889d0f3d5daf5c33382f16177d208b25048..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/transformer/test_parallel_transformer_layer.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-
-import pytest
-
-import torch
-
-from megatron_ds.core.transformer.transformer_config import TransformerConfig
-from megatron_ds.core.transformer.parallel_transformer_layer import ParallelTransformerLayer
-
-
-@pytest.fixture
-def parallel_transformer_layer(transformer_config):
-    return ParallelTransformerLayer(transformer_config)
-
-
-class TestParallelTransformerLayer:
-    def test_constructor(self, parallel_transformer_layer):
-        assert isinstance(parallel_transformer_layer, ParallelTransformerLayer)
-        assert parallel_transformer_layer.layer_number == 1
-
-        num_weights = sum([p.numel() for p in parallel_transformer_layer.parameters()])
-        assert num_weights == 1884
-
-    def test_gpu_forward(self, parallel_transformer_layer):
-        config: TransformerConfig = parallel_transformer_layer.config
-        sequence_length = 32
-        micro_batch_size = 2
-        parallel_transformer_layer.cuda()
-
-        # [sequence length, batch size, hidden size]
-        hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size))
-        hidden_states = hidden_states.cuda()
-
-        attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda()
-
-        hidden_states = parallel_transformer_layer(hidden_states=hidden_states, attention_mask=attention_mask)
-        assert hidden_states.shape[0] == sequence_length
-        assert hidden_states.shape[1] == micro_batch_size
-        assert hidden_states.shape[2] == config.hidden_size
diff --git a/toolbox/Megatron-DeepSpeed/tests/transformer/test_transformer_config.py b/toolbox/Megatron-DeepSpeed/tests/transformer/test_transformer_config.py
deleted file mode 100644
index 7c38c0e84a4ee63ed5d4fdf1d6c6eda25b227baa..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/transformer/test_transformer_config.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-
-
-class TestTransformerConfig:
-    def test_transformer_config(self, transformer_config):
-
-        assert transformer_config.hidden_size == 12
-        assert transformer_config.ffn_hidden_size == 48
-        assert transformer_config.num_attention_heads == 4
-        assert transformer_config.kv_channels == 3
diff --git a/toolbox/Megatron-DeepSpeed/tests/unit_tests/__init__.py b/toolbox/Megatron-DeepSpeed/tests/unit_tests/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/__init__.py b/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_cross_entropy.py b/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_cross_entropy.py
deleted file mode 100644
index 9a8458a4ec0ea05ae35824931a0b98b379c29348..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_cross_entropy.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import megatronspeed.megatron_adaptor
-from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy
-import torch
-from unit_tests.test_utilities import Utils
-import numpy as np
-
-def test_vocab_parallel_cross_entropy():
-    Utils.initialize_model_parallel(4,2)
-    vocab_parallel_logits = torch.range(0,7).repeat(16,4).cuda()
-    target = torch.arange(0,32,2).cuda()
-    output = vocab_parallel_cross_entropy(vocab_parallel_logits, target)
-    expected_output = torch.tensor([10.2309,  8.2309,  6.2309,  4.2309, 10.2309,  8.2309,  6.2309,  4.2309,
-        10.2309,  8.2309,  6.2309,  4.2309, 10.2309,  8.2309,  6.2309,  4.2309]).cuda()
-    assert(torch.equal(torch.round(expected_output), torch.round(output)))
-    Utils.destroy_model_parallel()
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_data.py b/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_data.py
deleted file mode 100644
index 8cbb998995a94d9c995eb3cd800ce4ee3d232cfd..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_data.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import megatronspeed.megatron_adaptor
-from megatron.core.tensor_parallel.data import broadcast_data
-import torch
-from unit_tests.test_utilities import Utils
-
-def test_broadcast_data():
-    Utils.initialize_model_parallel(2,4)
-    input_data = {
-        0 : torch.ones((8,8)).cuda() * 0.0,
-        1 : torch.ones((8,8)).cuda() * 1.0,
-        2 : torch.ones((8,8)).cuda() * 2.0,
-        3 : torch.ones((8,8)).cuda() * 3.0,
-        4 : torch.ones((8,8)).cuda() * 4.0,
-        5 : torch.ones((8,8)).cuda() * 5.0,
-        6 : torch.ones((8,8)).cuda() * 6.0,
-        7 : torch.ones((8,8)).cuda() * 7.0
-        }
-    dtype = torch.float32
-    actual_output = broadcast_data([0,1],input_data, dtype)
-    assert(torch.equal(actual_output[0], input_data[0]))
-    assert(torch.equal(actual_output[1], input_data[1]))
-    Utils.destroy_model_parallel()
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_mappings.py b/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_mappings.py
deleted file mode 100644
index 4063e5340ab73fb5cee0e43ce4df2ed941c4d8a0..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_mappings.py
+++ /dev/null
@@ -1,136 +0,0 @@
-import megatronspeed.megatron_adaptor
-from megatron.core.tensor_parallel import mappings
-from unit_tests.test_utilities import Utils
-import torch
-
-def test_CopyToModelParallelRegion():
-    Utils.initialize_model_parallel(4,2)
-    input_data = torch.ones((1)).cuda()*Utils.rank
-    output_data = mappings._CopyToModelParallelRegion.backward(None, input_data)
-    result = torch.ones(1).cuda()
-    result = result * 22 if Utils.rank >= 4 else result * 6
-    assert(torch.equal(output_data, result))
-    assert(torch.equal(input_data, mappings.copy_to_tensor_model_parallel_region(input_data)))
-    assert(torch.equal(input_data, mappings._CopyToModelParallelRegion.symbolic(None, input_data)))
-    Utils.destroy_model_parallel()
-
-def test_ReduceFromModelParallelRegion():
-    Utils.initialize_model_parallel(4,2)
-    input_data = torch.ones((1)).cuda()*Utils.rank
-    output_data = mappings._ReduceFromModelParallelRegion.symbolic(None, input_data)
-    result = torch.ones(1).cuda()
-    result = result * 22 if Utils.rank >= 4 else result * 6
-    assert(torch.equal(output_data, result))
-    input_data = torch.ones((1)).cuda()*Utils.rank
-    assert(torch.equal(mappings.reduce_from_tensor_model_parallel_region(input_data), result))
-    assert(torch.equal(input_data, mappings._ReduceFromModelParallelRegion.backward(None, input_data)))
-    Utils.destroy_model_parallel()
-
-def test_ScatterToModelParallelRegion():
-    Utils.initialize_model_parallel(4,2)
-    input_data = torch.rand((8,4)).cuda()
-    output_data = mappings.scatter_to_tensor_model_parallel_region(input_data)
-    req_dim = int(Utils.rank%(Utils.world_size/2))
-    assert(torch.equal(output_data, input_data[:,req_dim].reshape((8,1))))
-    output_data = mappings._ScatterToModelParallelRegion.symbolic(None, input_data)
-    assert(torch.equal(output_data, input_data[:, req_dim].reshape((8,1))))
-
-    input_data = torch.ones(8).cuda() * Utils.rank
-    actual_output_data = mappings._ScatterToModelParallelRegion.backward(None, input_data)
-    expected_output = torch.cat((
-        torch.ones(8)*0,
-        torch.ones(8)*1,
-        torch.ones(8)*2,
-        torch.ones(8)*3)).cuda()
-    if (Utils.rank >= 4):
-        expected_output = expected_output + 4
-    assert(torch.equal(actual_output_data, expected_output))
-    Utils.destroy_model_parallel()
-
-def test_GatherFromModelParallelRegion():
-    Utils.initialize_model_parallel(4,2)
-    input_data = torch.rand((8,4)).cuda()
-    req_dim = int(Utils.rank%(Utils.world_size/2))
-    output_data = mappings._GatherFromModelParallelRegion.backward(None, input_data)
-    assert(torch.equal(output_data, input_data[:, req_dim].reshape((8,1))))
-    input_data = torch.ones(8).cuda() * Utils.rank
-    actual_output_data = mappings.gather_from_tensor_model_parallel_region(input_data)
-    expected_output = torch.cat((
-        torch.ones(8)*0,
-        torch.ones(8)*1,
-        torch.ones(8)*2,
-        torch.ones(8)*3)).cuda()
-    if (Utils.rank >= 4):
-        expected_output = expected_output + 4
-    assert(torch.equal(actual_output_data, expected_output))
-    assert(torch.equal(mappings._GatherFromModelParallelRegion.symbolic(None, input_data), expected_output))
-    Utils.destroy_model_parallel()
- 
-def test_ScatterToSequenceParallelRegion():
-    Utils.initialize_model_parallel(4,2)
-    input_data = torch.rand((8,4)).cuda()
-    req_dim = int(Utils.rank%(Utils.world_size/2))*2
-    output_data = mappings._ScatterToSequenceParallelRegion.symbolic(None, input_data)
-    assert(torch.equal(output_data, input_data[req_dim:req_dim+2, :]))
-    output_data = mappings.scatter_to_sequence_parallel_region(input_data)
-    assert(torch.equal(output_data, input_data[req_dim:req_dim+2, :]))
-    input_data = torch.ones(4).cuda() * Utils.rank
-    output_data = mappings._ScatterToModelParallelRegion.backward(None, input_data)
-    expected_output = torch.concat((
-        torch.ones(4)*0,
-        torch.ones(4)*1,
-        torch.ones(4)*2,
-        torch.ones(4)*3)).cuda()
-    if (Utils.rank >= 4):
-        expected_output = expected_output + 4
-    assert(torch.equal(output_data, expected_output))
-    Utils.destroy_model_parallel()
-
-def test_GatherFromSequenceParallelRegion():
-    Utils.initialize_model_parallel(4,2)
-    input_data = torch.ones(4).cuda() * Utils.rank
-    output_data = mappings.gather_from_sequence_parallel_region(input_data)
-    expected_output = torch.concat((
-        torch.ones(4)*0,
-        torch.ones(4)*1,
-        torch.ones(4)*2,
-        torch.ones(4)*3)).cuda()
-    if (Utils.rank >= 4):
-        expected_output = expected_output + 4
-    assert(torch.equal(output_data, expected_output))
-    assert(torch.equal(mappings._GatherFromSequenceParallelRegion.symbolic(None, input_data), expected_output))
-    input_data = torch.vstack((
-        torch.ones(4)*0,
-        torch.ones(4)*1,
-        torch.ones(4)*2,
-        torch.ones(4)*3)).cuda()
-    class Ctx:
-        tensor_parallel_output_grad = True
-    output_data = mappings._GatherFromSequenceParallelRegion.backward(Ctx(), input_data)
-    expected_output = torch.ones((1,4)).cuda() * 4 * int(Utils.rank % 4)
-    assert(torch.equal(output_data[0], expected_output))
-    Utils.destroy_model_parallel()
-
-def test_ReduceScatterToSequenceParallelRegion():
-    Utils.initialize_model_parallel(4,2)
-    input_data = torch.vstack((
-        torch.ones(4)*0,
-        torch.ones(4)*1,
-        torch.ones(4)*2,
-        torch.ones(4)*3)).cuda()
-    output_data = mappings.reduce_scatter_to_sequence_parallel_region(input_data)
-    expected_output = torch.ones(4).cuda() * 4 * int(Utils.rank % 4)
-    assert(torch.equal(output_data[0], expected_output))
-    assert(torch.equal(mappings._ReduceScatterToSequenceParallelRegion.symbolic(None, input_data) , expected_output.reshape((1,4))))
-    input_data = torch.ones(4).cuda() * Utils.rank
-    output_data = mappings._ReduceScatterToSequenceParallelRegion.backward(None,input_data)
-    expected_output = torch.concat((
-        torch.ones(4)*0,
-        torch.ones(4)*1,
-        torch.ones(4)*2,
-        torch.ones(4)*3)).cuda()
-    if (Utils.rank >= 4):
-        expected_output = expected_output + 4
-    assert(torch.equal(output_data, expected_output))
-    Utils.destroy_model_parallel()
-
diff --git a/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_random.py b/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_random.py
deleted file mode 100644
index c17b93fdbe6460bd415ba3e00b4504d0c3020d12..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_random.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import megatronspeed.megatron_adaptor
-from megatron.core.tensor_parallel.random import CudaRNGStatesTracker
-from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
-from megatron.core.tensor_parallel.random import _CUDA_RNG_STATE_TRACKER
-from megatron.core.tensor_parallel.random import checkpoint
-from unit_tests.test_utilities import Utils
-import pytest
-import torch
-
-def test_cuda_rng_states_tracker():
-    rng_tracker = CudaRNGStatesTracker()
-    rng_tracker.set_states({"state1":1234})
-    assert(rng_tracker.get_states()["state1"] == 1234)
-    rng_tracker.reset()
-    assert(rng_tracker.get_states() == {})
-    seed = 1111
-    rng_tracker.add("state2",seed)
-    with pytest.raises(Exception):
-        assert(rng_tracker.add("state3",seed))
-    with pytest.raises(Exception):
-        assert(rng_tracker.add("state2",111))
-    assert(rng_tracker.get_states()['state2'] is not None)
-    with pytest.raises(Exception):
-        assert()
-    
-    rng_tracker.fork("state2")
-    torch.cuda.manual_seed(seed)
-    rng_state = torch.cuda.get_rng_state()
-    assert torch.equal(rng_tracker.get_states()['state2'], rng_state)
-
-def test_model_parallel_cuda_manual_seed():
-    Utils.initialize_model_parallel(4,2)
-    model_parallel_cuda_manual_seed(0)
-    assert(_CUDA_RNG_STATE_TRACKER.get_states()['model-parallel-rng'] is not None)
-    Utils.destroy_model_parallel()
-
-def test_checkpoint():
-    def test_forward(*input):
-        return input[0]+input[1]
-    assert(torch.equal(torch.ones(16)*3,checkpoint(test_forward, None, torch.ones(16), torch.ones(16)*2)))
-    Utils.initialize_model_parallel()
-    input1 = torch.ones((4,4))
-    checkpoint(test_forward, True, input1, torch.ones((4,4))*2)
-    assert(torch.equal(torch.ones(input1.numel()).cuda(), input1))
-    Utils.destroy_model_parallel()
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py b/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py
deleted file mode 100644
index be304a932c51394ee624ca71ef58f0b8830ed921..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import torch
-import megatronspeed.megatron_adaptor
-import megatron.core.tensor_parallel.utils as util
-import megatron.core.parallel_state as ps
-from unit_tests.test_utilities import Utils
-
-rank = Utils.rank
-
-def test_split_tensor_along_last_dim():
-    input_tensor = torch.rand((3,4))
-    torch.equal(input_tensor[0:2,0:2], util.split_tensor_along_last_dim(input_tensor,2)[0])
-    torch.equal(input_tensor[2:,2:], util.split_tensor_along_last_dim(input_tensor,2)[1])
-
-def test_split_tensor_into_1d_equal_chunks():
-    Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4)
-    input_tensor = torch.rand((3,4))
-    output_tensor = util.split_tensor_into_1d_equal_chunks(input_tensor)
-    if rank % 2 == 0 :
-        start = 0
-        end = int(input_tensor.numel()/2)
-    else :
-        start = int(input_tensor.numel()/2)
-        end = input_tensor.numel()
-        
-    assert torch.equal(output_tensor, input_tensor.flatten()[start:end])
-    Utils.destroy_model_parallel()
-
-def test_gather_split_1d_tensor():
-    Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4)
-    input_tensor = torch.ones((2,4)).cuda() * rank
-    actual_output_tensor = util.gather_split_1d_tensor(input_tensor)
-    if rank %2 == 0:
-        expected_output_tensor = torch.concat((input_tensor.flatten(), input_tensor.flatten() + 1))
-    else : 
-        expected_output_tensor = torch.concat((input_tensor.flatten() - 1, input_tensor.flatten()))
-    assert(torch.equal(actual_output_tensor, expected_output_tensor))
-    Utils.destroy_model_parallel()
-
-def test_vocab():
-    global_vocab_size = 1600
-    per_partition_vocab_size = 1600 / Utils.world_size
-    assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_per_partition_vocab_size(global_vocab_size // Utils.world_size, rank, Utils.world_size)))
-    assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_global_vocab_size(global_vocab_size, rank, Utils.world_size)))
-    
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/unit_tests/test_basic.py b/toolbox/Megatron-DeepSpeed/tests/unit_tests/test_basic.py
deleted file mode 100644
index 4f8c32ebbcaf1430f9a6eb2a7718f7283b261622..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/unit_tests/test_basic.py
+++ /dev/null
@@ -1,4 +0,0 @@
-def test_import():
-    import megatronspeed
-    import megatronspeed.megatron_adaptor
-
diff --git a/toolbox/Megatron-DeepSpeed/tests/unit_tests/test_parallel_state.py b/toolbox/Megatron-DeepSpeed/tests/unit_tests/test_parallel_state.py
deleted file mode 100644
index d554dc4832f20a939ff4c5ca07e461efd1c4c516..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/unit_tests/test_parallel_state.py
+++ /dev/null
@@ -1,109 +0,0 @@
-import torch
-import megatronspeed.megatron_adaptor
-import megatron.core.parallel_state as ps
-import pytest
-import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__), '../'))
-from unit_tests.test_utilities import Utils
-import os 
-
-rank = Utils.rank
-world_size = Utils.world_size
-
-def test_initialize__and_destroy_model_parallel():
-    with pytest.raises(AssertionError):
-        assert(ps.initialize_model_parallel())
-    Utils.initialize_distributed()
-    with pytest.raises(RuntimeError):
-        assert(ps.initialize_model_parallel(tensor_model_parallel_size=2*world_size))
-    with pytest.raises(RuntimeError):
-        assert(ps.initialize_model_parallel(pipeline_model_parallel_size=2*world_size))
-    with pytest.raises(RuntimeError):
-        assert(ps.initialize_model_parallel(pipeline_model_parallel_size=world_size, tensor_model_parallel_size=world_size))
-    with pytest.raises(RuntimeError):
-        assert(ps.initialize_model_parallel(virtual_pipeline_model_parallel_size=2))
-    Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4)
-
-    assert(ps.model_parallel_is_initialized())
-    assert(ps.get_model_parallel_group() is not None)
-    assert(ps.get_tensor_model_parallel_group() is not None)
-    assert(ps.get_pipeline_model_parallel_group() is not None)
-    assert(ps.get_data_parallel_group() is not None)  
-    Utils.destroy_model_parallel()
-    assert(ps._MODEL_PARALLEL_GROUP is None)
-
-def test_pipeline_parallel_initializations():
-    Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4)
-    num_pipeline_parallel_groups = world_size / ps.get_pipeline_model_parallel_world_size()
-    assert(ps.get_pipeline_model_parallel_first_rank() == rank % num_pipeline_parallel_groups )
-    ## In a data parallel group, subtracting the first gpu rank from any gpu rank must be a multiple of tensor parallel size or sequence parallel size
-    assert((rank - ps.get_data_parallel_src_rank()) % ps.get_tensor_model_parallel_world_size() == 0)
-    assert(ps.get_pipeline_model_parallel_next_rank() == ((rank + num_pipeline_parallel_groups) % world_size))
-    assert(ps.get_pipeline_model_parallel_prev_rank() == ((rank - num_pipeline_parallel_groups) % world_size))
-    Utils.destroy_model_parallel()
-
-def test_data_parallel_initializations():
-    Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size)
-    assert(ps.get_data_parallel_src_rank() == rank)
-    assert(ps.get_data_parallel_world_size() == 1)
-    assert(ps.get_data_parallel_rank() == 0)
-    Utils.destroy_model_parallel()
-    
-
-def test_tensor_model_parellel_world_size():
-    Utils.initialize_model_parallel(tensor_model_parallel_size=world_size)
-    assert(ps.get_tensor_model_parallel_world_size() == world_size)
-    ps.set_tensor_model_parallel_world_size(None)
-    assert(ps.get_tensor_model_parallel_world_size() == world_size)
-    Utils.destroy_model_parallel()
-    
-
-def test_pipeline_model_parallel_world_size():
-    Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size)
-    assert(ps.get_pipeline_model_parallel_world_size() == world_size)
-    ps.set_pipeline_model_parallel_world_size(None)
-    assert(ps.get_pipeline_model_parallel_world_size() == world_size)
-    Utils.destroy_model_parallel()    
-    
-
-def test_tensor_model_parallel_rank():
-    Utils.initialize_model_parallel(tensor_model_parallel_size=world_size)
-    assert(ps.get_tensor_model_parallel_rank() == rank)
-    ps.set_tensor_model_parallel_rank(None)
-    assert(ps.get_tensor_model_parallel_rank() == rank)    
-    Utils.destroy_model_parallel()    
-    
-
-def test_pipeline_model_parallel_rank():
-    Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size)
-    assert(ps.get_pipeline_model_parallel_rank() == rank)
-    ps.set_pipeline_model_parallel_rank(None)
-    assert(ps.get_pipeline_model_parallel_rank() == rank)
-    Utils.destroy_model_parallel()
-    
-
-def test_is_pipeline_first_stage():
-    Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size)
-    assert(ps.is_pipeline_first_stage(ignore_virtual=True) == (rank == 0))
-    assert(ps.is_pipeline_first_stage() == (rank == 0))
-    Utils.destroy_model_parallel()
-    
-
-def test_is_pipeline_last_stage():
-    Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size)
-    assert(ps.is_pipeline_last_stage(ignore_virtual=True) == (rank == world_size-1))
-    assert(ps.is_pipeline_last_stage() == (rank == world_size-1))
-    Utils.destroy_model_parallel()
-    
-
-def test_virtual_pipeline_model_parallel_rank():
-    Utils.initialize_model_parallel(pipeline_model_parallel_size=world_size)
-    ps.set_virtual_pipeline_model_parallel_rank(rank)
-    assert(ps.get_virtual_pipeline_model_parallel_rank() == rank)
-    Utils.destroy_model_parallel()
-    
-
-def test_get_tensor_model_parallel_src_rank():
-    Utils.initialize_model_parallel(tensor_model_parallel_size=world_size)
-    assert(ps.get_tensor_model_parallel_src_rank() == ((rank // world_size) * world_size))
-    Utils.destroy_model_parallel() 
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/unit_tests/test_utilities.py b/toolbox/Megatron-DeepSpeed/tests/unit_tests/test_utilities.py
deleted file mode 100644
index e3c2e17dfbc69d302f67e81bcdb718bf1b1c5159..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/unit_tests/test_utilities.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import os
-import torch
-import megatron.core.parallel_state as ps
-
-class Utils:
-    rank = int(os.environ['RANK'])
-    world_size = int(os.environ['WORLD_SIZE'])
-
-    def __init__():
-        pass
-
-    @staticmethod
-    def initialize_distributed():
-        rank = int(os.environ['RANK'])
-        world_size = int(os.environ['WORLD_SIZE'])
-        print(f'Initializing torch.distributed with rank: {rank}, world_size: {world_size}')
-        torch.cuda.set_device(rank % torch.cuda.device_count())
-        # init_method = 'tcp://'
-        # master_ip = os.getenv('MASTER_ADDR', 'localhost')
-        # master_port = os.getenv('MASTER_PORT', '6000')
-        # init_method += master_ip + ':' + master_port
-        # torch.distributed.init_process_group(backend='nccl', world_size=world_size, rank=rank, init_method=init_method)
-        torch.distributed.init_process_group(backend='nccl')
-        # local_rank = torch.distributed.get_rank()
-        # torch.cuda.set_device(local_rank)
-        
-    @staticmethod
-    def destroy_model_parallel():
-        ps.destroy_model_parallel()
-        # torch.distributed.barrier()
-
-    @staticmethod
-    def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None):
-        ps.destroy_model_parallel()
-        if not torch.distributed.is_initialized():
-            Utils.initialize_distributed()
-        ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank = pipeline_model_parallel_split_rank)
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tests/unit_tests/test_utils.py b/toolbox/Megatron-DeepSpeed/tests/unit_tests/test_utils.py
deleted file mode 100644
index 97480013eaa731e916c53fc4bdff51fd77c5e5bb..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tests/unit_tests/test_utils.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import pytest
-import torch
-import megatronspeed.megatron_adaptor
-import megatron.core.utils as util
-import numpy as np
-
-def test_divide_properly():
-    assert util.divide(4,2) == 2
-
-def test_divide_improperly():
-    with pytest.raises(AssertionError):
-        util.divide(4,5)
-
-def test_global_memory_buffer():
-    global_memory_buffer = util.GlobalMemoryBuffer()
-    obtained_tensor = global_memory_buffer.get_tensor((3,2), torch.float32, "test_tensor")
-    expected_tensor = torch.empty((3,2), dtype=torch.float32, device=torch.cuda.current_device())
-    assert torch.equal(torch.ones_like(obtained_tensor), torch.ones_like(expected_tensor))
-
-def test_make_viewless_tensor():
-    inp = torch.rand((3,4))
-    assert(torch.equal(inp, util.make_viewless_tensor(inp, True, True)))
-    assert(torch.equal(inp, util.make_viewless_tensor(inp, True, False)))
-
-def test_safely_set_viewless_tensor_data():
-    tensor = torch.zeros((3,4))
-    new_data_tensor = torch.tensor(np.random.rand(3,4))
-    util.safely_set_viewless_tensor_data(tensor, new_data_tensor)
-    assert(torch.equal(tensor, new_data_tensor))
-
-def test_assert_viewless_tensor():
-    tensor = torch.rand((3,4))
-    assert(torch.equal(util.assert_viewless_tensor(tensor), tensor))
-    input_tensor_list=[tensor,tensor,tensor]
-    output_tensor_list = util.assert_viewless_tensor(input_tensor_list)
-    for inp,out in zip(input_tensor_list, output_tensor_list):
-        assert(torch.equal(inp,out))
diff --git a/toolbox/Megatron-DeepSpeed/tools/bert_embedding/__init__.py b/toolbox/Megatron-DeepSpeed/tools/bert_embedding/__init__.py
deleted file mode 100644
index 766a66ba2151c9f910a1b0fdc465ca70bc7e5f70..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/bert_embedding/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-from .embed import BertEmbedder, DiskDataParallelBertEmbedder
diff --git a/toolbox/Megatron-DeepSpeed/tools/bert_embedding/dataset.py b/toolbox/Megatron-DeepSpeed/tools/bert_embedding/dataset.py
deleted file mode 100644
index 02c4fc9392f38d4853089242dc749a59a5fa1c76..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/bert_embedding/dataset.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import numpy as np
-import torch
-
-from megatron_ds import get_args, get_tokenizer
-from megatron_ds.data.bert_dataset import build_training_sample
-
-
-class BertEmbeddingDataset(torch.utils.data.Dataset):
-    '''Dataset to convert a text dataset to Bert tokens.'''
-
-    def __init__(self, text_dataset, max_seq_length):
-
-        super().__init__()
-
-        args = get_args()
-
-        # Dataset, tokenizer.
-        self.text_dataset = text_dataset
-        self.bert_tokenizer = get_tokenizer()
-
-        # Params to store.
-        self.max_seq_length = max_seq_length
-        self.seed = args.seed
-        self.masked_lm_prob = args.mask_prob
-
-        # Vocab stuff.
-        self.vocab_id_list = list(self.bert_tokenizer.inv_vocab.keys())
-        self.vocab_id_to_token_dict = self.bert_tokenizer.inv_vocab
-        self.cls_id = self.bert_tokenizer.cls
-        self.sep_id = self.bert_tokenizer.sep
-        self.mask_id = self.bert_tokenizer.mask
-        self.pad_id = self.bert_tokenizer.pad
-
-    def __len__(self):
-        return len(self.text_dataset)
-
-    def __getitem__(self, idx):
-
-        # Text.
-        text_sample = self.text_dataset[idx]
-        text = text_sample["text"]
-        text = text.replace("<|endoftext|>", "")
-
-        # Bert/Wordpiece tokens (+truncate).
-        bert_token_ids = self.bert_tokenizer.tokenize(text)
-        bert_token_ids = bert_token_ids[:self.max_seq_length - 2] # cls+sep.
-        if not bert_token_ids:
-            bert_token_ids = [ self.bert_tokenizer.pad_id ] # hack when empty seq
-
-        # Note that this rng state should be numpy and not python since
-        # python randint is inclusive whereas the numpy one is exclusive.
-        # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1
-        np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32))
-
-        # Build sample.
-        sample = build_training_sample([bert_token_ids],
-                                       len(bert_token_ids),
-                                       len(bert_token_ids) + 2, # for cls+sep
-                                       self.vocab_id_list,
-                                       self.vocab_id_to_token_dict,
-                                       self.cls_id, self.sep_id,
-                                       self.mask_id, self.pad_id,
-                                       self.masked_lm_prob, np_rng,
-                                       binary_head=False)
-        sample["seq_length"] = len(sample["text"])
-        return sample
diff --git a/toolbox/Megatron-DeepSpeed/tools/bert_embedding/embed.py b/toolbox/Megatron-DeepSpeed/tools/bert_embedding/embed.py
deleted file mode 100644
index ba2769769d6b6d0e10be1468f170ef0a1e733601..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/bert_embedding/embed.py
+++ /dev/null
@@ -1,321 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-from functools import partial
-import numpy as np
-import os
-import time
-import torch
-from torch.utils.data import BatchSampler, DataLoader, SequentialSampler, Subset
-from torch.utils.data._utils.collate import default_collate
-from tqdm import tqdm
-
-from megatron_ds import get_args, get_tokenizer, print_rank_0
-from megatron_ds import core
-from megatron_ds.core.enums import ModelType
-from megatron_ds.core.pipeline_parallel import get_forward_backward_func
-from megatron_ds.model import BertModel
-from megatron_ds.training import setup_model_and_optimizer
-
-from .dataset import BertEmbeddingDataset
-from .external_libs import h5py
-from .huggingface import HuggingfaceEmbedder
-from .utils import get_missing_blocks_by_rank
-
-
-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
-
-    print_rank_0(" > build Bert model.")
-
-    args = get_args()
-    num_tokentypes = 2 if args.bert_binary_head else 0
-    model = BertModel(
-        num_tokentypes=num_tokentypes,
-        add_binary_head=args.bert_binary_head,
-        parallel_output=True,
-        pre_process=pre_process,
-        post_process=post_process)
-
-    return model
-
-
-def get_batch(data_iterator):
-    """Build the batch."""
-
-    # Items and their type.
-    keys = ['text', 'types', 'labels', 'is_random', 'loss_mask', 'padding_mask',
-            'seq_length']
-    datatype = torch.int64
-
-    # Broadcast data.
-    if data_iterator is not None:
-        data = next(data_iterator)
-    else:
-        data = None
-    data_b = core.tensor_parallel.broadcast_data(keys, data, datatype)
-
-    # Unpack.
-    tokens = data_b['text'].long()
-    types = data_b['types'].long()
-    sentence_order = data_b['is_random'].long()
-    loss_mask = data_b['loss_mask'].float()
-    lm_labels = data_b['labels'].long()
-    padding_mask = data_b['padding_mask'].long()
-    seq_lengths = data_b['seq_length'].long()
-
-    return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask, \
-        seq_lengths
-
-
-def loss_func(loss_mask, sentence_order, seq_lengths,
-              output_tensor, non_loss_data):
-    """Loss function. Sequence lengths returned here for progress print-outs."""
-    assert non_loss_data
-    return seq_lengths, output_tensor
-
-
-def forward_step(data_iterator, model):
-    """Forward step."""
-
-    args = get_args()
-
-    # Get the batch.
-    tokens, types, sentence_order, loss_mask, lm_labels, padding_mask, \
-        seq_lengths = get_batch(data_iterator)
-
-    if not args.bert_binary_head:
-        types = None
-
-    # Forward pass through the model.
-    output_tensor = model(tokens, padding_mask, tokentype_ids=types,
-                          lm_labels=lm_labels)
-
-    return output_tensor, partial(loss_func, loss_mask, sentence_order,
-                                  seq_lengths)
-
-
-def collate_batch(samples):
-    """Collate samples of various lengths.
-
-    This collate function handles samples with various sequence lengths, by
-    padding 'text' arrays with pad_id, and other arrays with 0.
-    """
-
-    n_samples = len(samples)
-    keys = list(samples[0].keys())
-    tokenizer = get_tokenizer()
-
-    # Max sample length across all samples.
-    max_length_map = { key:0 for key in keys }
-    for sample in samples:
-        for key in keys:
-            value_length = \
-                len(sample[key]) if isinstance(sample[key], np.ndarray) else None
-            max_length_map[key] = None \
-                if value_length is None else \
-                   max(max_length_map[key], value_length)
-
-    # Pad samples.
-    padded_samples = []
-    for sample in samples:
-        padded_sample = {}
-        for key in keys:
-            padded_sample[key] = \
-                np.pad(
-                    sample[key],
-                    (0, max_length_map[key] - len(sample[key])),
-                    mode="constant",
-                    constant_values=tokenizer.pad_id if key == "text" else 0,
-                ) \
-                if isinstance(sample[key], np.ndarray) else \
-                   sample[key]
-        padded_samples.append(padded_sample)
-
-    # Build batch with padded samples.
-    batch = default_collate(padded_samples)
-
-    return batch
-
-
-def get_data_loader(dataset, batch_size):
-    """Build data loader over data subset.
-
-    Get a subset of the dataset (from start_idx -> end_idx), and wrap it in
-    a sequential sampler and data loader.
-    """
-
-    args = get_args()
-
-    # Sequential & batch samplers.
-    batch_sampler = BatchSampler(
-        sampler=SequentialSampler(dataset),
-        batch_size=batch_size,
-        drop_last=False,
-    )
-
-    # Data loader.
-    data_loader = DataLoader(dataset,
-                             batch_sampler=batch_sampler,
-                             num_workers=args.num_workers,
-                             pin_memory=True,
-                             collate_fn=collate_batch)
-
-    return data_loader
-
-
-def embed_data_loader(models, data_loader):
-    '''Iterate data loader and compute embeddings.'''
-
-    # Verify no model parallelism.
-    args = get_args()
-    assert args.tensor_model_parallel_size == 1 and \
-        args.pipeline_model_parallel_size == 1, \
-        "since we call forward_step directly, only tp == pp == 1 allowed."
-
-    # Data iterator.
-    data_iterator = iter(data_loader)
-
-    # Eval mode.
-    for m in models:
-        m.eval()
-
-    # Embed.
-    embeddings = []
-    for _ in tqdm(range(len(data_loader)), "mt embed"):
-        with torch.no_grad():
-            result = forward_step(data_iterator, models[0])
-            embeddings.append(result[0].detach().cpu().numpy())
-
-    # Concatenate embeddings.
-    embeddings = np.concatenate(embeddings, axis=0)
-
-    return embeddings
-
-
-class BertEmbedder:
-    '''Compute Bert embeddings, from a text dataset.'''
-
-    def __init__(self, batch_size, max_bert_seq_length, embedder_type):
-
-        args = get_args()
-
-        assert args.output_bert_embeddings
-
-        self.models, optimizer, opt_param_scheduler = \
-            setup_model_and_optimizer(model_provider,
-                                      ModelType.encoder_or_decoder)
-        self.batch_size = batch_size
-        self.max_bert_seq_length = max_bert_seq_length
-
-        # Init Huggingface, if in use.
-        if embedder_type == "megatron":
-            self.huggingface_embedder = None
-        elif embedder_type == "huggingface":
-            self.huggingface_embedder = HuggingfaceEmbedder(batch_size,
-                                                            max_bert_seq_length)
-        else:
-            raise Exception("specialize for embedder type '%s'." % embedder_type)
-
-    def embed_text_dataset(self, text_dataset):
-        '''Embed a text dataset.'''
-
-        # Huggingface.
-        if self.huggingface_embedder:
-            return self.huggingface_embedder.embed_text_dataset(text_dataset)
-
-        # Wrap in a BertEmbeddingDataset to tokenize samples.
-        bert_dataset = BertEmbeddingDataset(text_dataset,
-                                            self.max_bert_seq_length)
-
-        # Embed.
-        data_loader = get_data_loader(bert_dataset, self.batch_size)
-        embeddings = embed_data_loader(self.models, data_loader)
-
-        return embeddings
-
-    def embed_text(self, text):
-        '''Embed a single text string.
-
-        Primarily used for on-the-fly embeddings, particularly during
-        analysis or debugging. For large scale, use 'embed_text_dataset()'.
-        '''
-
-        class SingleTextDataset(torch.utils.data.Dataset):
-            '''Dataset that holds single string.'''
-            def __init__(self, text):
-                assert isinstance(text, str)
-                self.text = text
-            def __len__(self):
-                return 1
-            def __getitem__(self, i):
-                return {"text": self.text}
-
-        # Embed text.
-        text_ds = SingleTextDataset(text)
-        embed = self.embed_text_dataset(text_ds)[0]
-
-        return embed
-
-
-class DiskDataParallelBertEmbedder:
-    '''Process embeddings in blocks & save to disk.'''
-
-    def __init__(self, batch_size, max_bert_seq_length, block_size,
-                 embedder_type):
-        self.embedder = BertEmbedder(batch_size, max_bert_seq_length,
-                                     embedder_type)
-        self.block_size = block_size
-
-    def embed_text_blocks(self, name, workdir, text_dataset,
-                          missing_embedding_blocks):
-        '''Process a text dataset in blocks.'''
-
-        # Iterate blocks.
-        for block_index, block_info in enumerate(missing_embedding_blocks):
-
-            # Missing block lists are extended with None to have equal-length
-            # lists. Skip the Nones.
-            if block_info is not None:
-
-                # Progress. (*note*: move world progress to here.)
-                print_rank_0("embed '%s' block %d / %d ... %s." % (
-                    name,
-                    block_index,
-                    len(missing_embedding_blocks),
-                    block_info["path"],
-                ))
-
-                # Embed block.
-                sub_dataset = Subset(text_dataset, range(*block_info["range"]))
-                embeddings = self.embedder.embed_text_dataset(sub_dataset)
-
-                # Save embeddings.
-                f = h5py.File(block_info["path"], "w")
-                f.create_dataset("data", data=embeddings)
-                f.close()
-
-            # Synchronize progress across all ranks. (for easier observation)
-            print_rank_0(" > waiting for other ranks to finish block.")
-            torch.distributed.barrier()
-
-    def embed_text_dataset(self, name, workdir, text_dataset):
-        '''Embed a text dataset.'''
-
-        # Dataset workdir.
-        os.makedirs(workdir, exist_ok=True)
-
-        # Missing embedding blocks (stored on disk).
-        def validate(f):
-            assert f["data"].shape[1] == 1024
-        n_missing_world, missing_embedding_blocks = get_missing_blocks_by_rank(
-            workdir,
-            len(text_dataset),
-            self.block_size,
-            validate=validate)
-
-        # Prevent missing file race condition.
-        torch.distributed.barrier()
-
-        # Embed batches.
-        self.embed_text_blocks(name, workdir, text_dataset,
-                               missing_embedding_blocks)
diff --git a/toolbox/Megatron-DeepSpeed/tools/bert_embedding/external_libs.py b/toolbox/Megatron-DeepSpeed/tools/bert_embedding/external_libs.py
deleted file mode 100644
index fb8e69f5cb0a9fb49d98d135f9ef2a7a99b73013..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/bert_embedding/external_libs.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import importlib
-
-required_libs = [
-    "h5py",
-    "transformers", # for huggingface bert
-]
-
-for lib in required_libs:
-    try:
-        globals()[lib] = importlib.import_module(lib)
-    except ImportError as e:
-        raise Exception(f"Missing one or more packages required for Bert embedding: {required_libs}. Tried importing '{lib}'.")
diff --git a/toolbox/Megatron-DeepSpeed/tools/bert_embedding/huggingface.py b/toolbox/Megatron-DeepSpeed/tools/bert_embedding/huggingface.py
deleted file mode 100644
index 1a08a803bba44575a305967ce9cd7e0d2307b0bb..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/bert_embedding/huggingface.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import numpy as np
-import torch
-from tqdm import tqdm
-
-from .external_libs import transformers
-
-
-class IterableTextDataset(torch.utils.data.IterableDataset):
-    '''Iterable over a text dataset.'''
-
-    def __init__(self, text_dataset):
-        self.text_dataset = text_dataset
-
-    def __iter__(self):
-        '''Remove 'endoftext' string.'''
-        for sample_idx in range(len(self.text_dataset)):
-            sample = self.text_dataset[sample_idx]
-            text = sample["text"].replace("<|endoftext|>", "")
-            yield text
-
-
-class MyFeatureExtractionPipeline(transformers.FeatureExtractionPipeline):
-    def _forward(self, model_inputs):
-
-        # Embed inputs.
-        model_outputs = self.model(**model_inputs)
-
-        # Attention mask.
-        embeddings = model_outputs[0]
-        masks = torch.sum(model_inputs['attention_mask'], dim=1)
-
-        # Collect embeddings & check for nan.
-        outputs = []
-        for embedding, mask in zip(embeddings, masks):
-            output = torch.mean(embedding[1: mask - 1], dim=0)
-
-            # Nans due to empty input sequences; so only check first element.
-            if torch.isnan(output.view(-1)[0]).any():
-                output.zero_()
-
-            outputs.append(output)
-
-        # Sample.
-        data = {
-            "input" : model_inputs["input_ids"],
-            "output" : outputs,
-        }
-
-        return data
-
-    def postprocess(self, model_outputs):
-        # Return input for analysis.
-        return {
-            "input" : model_outputs["input"].numpy(),
-            "output" : model_outputs["output"].numpy(),
-        }
-
-
-class HuggingfaceEmbedder:
-
-    def __init__(self, batch_size, max_seq_length):
-
-        # Model, tokenizer.
-        self.model = transformers.BertModel.from_pretrained("bert-large-cased")
-        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-            "bert-large-cased", model_max_length=max_seq_length)
-
-        # Feature extraction pipeline.
-        self.pipe = MyFeatureExtractionPipeline(
-            model=self.model,
-            tokenizer=self.tokenizer,
-            device=torch.cuda.current_device(),
-            truncation=True,
-            max_length=max_seq_length,
-        )
-
-        self.batch_size = batch_size
-
-    def embed_text_dataset(self, text_dataset, verbose=True):
-
-        # Wrap dataset in iterable.
-        dataset = IterableTextDataset(text_dataset)
-
-        # Allocate output array.
-        n_samples = len(text_dataset)
-        embeddings = np.zeros((n_samples, 1024), dtype="f4")
-        start_idx = 0
-
-        # Wrap iterator in tqdm for verbose output.
-        _iter = self.pipe(dataset, batch_size=self.batch_size)
-        if verbose:
-            _iter = tqdm(_iter, "hf embed", total=n_samples)
-
-        # Embed dataset.
-        for idx, out_dict in enumerate(_iter):
-            inp = out_dict["input"]
-            out = out_dict["output"]
-            embeddings[start_idx] = out
-            start_idx += 1
-
-        return embeddings
-
-    def embed_text(self, text):
-        '''Embed a single text string.
-
-        Primarily used for on-the-fly embeddings, particularly during
-        analysis or debugging. For large scale, use 'embed_text_dataset()'.
-        '''
-
-        class SingleTextDataset(torch.utils.data.Dataset):
-            '''Dataset that holds single string.'''
-            def __init__(self, text):
-                assert isinstance(text, str)
-                self.text = text
-            def __len__(self):
-                return 1
-            def __getitem__(self, i):
-                return {"text": self.text}
-
-        # Embed text.
-        text_ds = SingleTextDataset(text)
-        embed = self.embed_text_dataset(text_ds, verbose=False)[0]
-
-        return embed
diff --git a/toolbox/Megatron-DeepSpeed/tools/bert_embedding/utils.py b/toolbox/Megatron-DeepSpeed/tools/bert_embedding/utils.py
deleted file mode 100644
index a080cd75d270c54e387e11890e9a79b07c95306b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/bert_embedding/utils.py
+++ /dev/null
@@ -1,193 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-from collections import defaultdict
-import glob
-import numpy as np
-import os
-import torch
-from tqdm import tqdm
-
-from megatron_ds import print_rank_0
-from megatron_ds.core import parallel_state
-
-from .external_libs import h5py
-
-
-def save_data(data_map, *args):
-    '''Save map of numpy arrays to hdf5 file.'''
-
-    # Parse args.
-    if len(args) == 1:
-        path = args[0]
-    elif len(args) == 2:
-        dir_path, file_name = args
-        path = os.path.join(dir_path, file_name)
-    else:
-        raise Exception("specialize for len(args) == %d." % len(args))
-
-    # Save data.
-    if not os.path.isfile(path):
-        f = h5py.File(path, "w")
-        for k, v in data_map.items():
-            f.create_dataset(k, data=v)
-        f.close()
-
-    return path
-
-
-def load_data(paths):
-    '''Load multiple hdf5 files to single numpy array.'''
-
-    # Read data shapes.
-    shape_map = defaultdict(lambda : (0, None))
-    for p in paths:
-        f = h5py.File(p, "r")
-        for k in f.keys():
-            shape = tuple(f[k].shape)
-            shape_map[k] = (shape_map[k][0] + shape[0], shape[1])
-        f.close()
-
-    # Allocate output array.
-    data_map = { k : np.empty(s, dtype="f4") for k, s in shape_map.items() }
-    start_map = { k : 0 for k in shape_map }
-
-    # Load files.
-    for pi, p in enumerate(tqdm(paths, "load data")):
-        f = h5py.File(p, "r")
-        for k in f.keys():
-            i0 = start_map[k]
-            i1 = i0 + len(f[k])
-            data_map[k][i0:i1] = f[k]
-            start_map[k] += len(f[k])
-        f.close()
-
-    return data_map
-
-
-def get_missing_blocks(workdir, n_samples, block_size,
-                       validate=lambda f : None):
-    '''Divide range [0, num_samples) to sequence of block ranges.
-
-    This is a core method within the concept of block processing. The idea
-    is to divide a range (size n_samples) into a sequence of blocks. Each
-    block corresponds to a file within 'workdir' with name
-    '{start_idx}-{end_idx}.hdf5'. This method checks for the existence of
-    these files, and returns a list of the ones that are missing.
-    '''
-
-    # Block ranges.
-    block_start_idxs = list(range(0, n_samples, block_size))
-    block_end_idxs = [ min(n_samples, i + block_size) for i in block_start_idxs ]
-    block_ranges = list(zip(block_start_idxs, block_end_idxs))
-
-    # All block files (existing + missing).
-    n_digits = int(np.ceil(np.log(n_samples) / np.log(10)) + 1)
-    all_blocks = [{
-        "range" : r,
-        "path" : os.path.join(
-            workdir,
-            "%s-%s.hdf5" % tuple([ str(i).zfill(n_digits) for i in r ]),
-        )
-    } for r in block_ranges]
-    all_block_path_set = set(block["path"] for block in all_blocks)
-
-    # Delete corrupt files.
-    if torch.distributed.get_rank() == 0:
-        existing_block_paths = [block["path"]
-                                for block in all_blocks
-                                if os.path.exists(block["path"])]
-        for index, path in enumerate(
-                tqdm(existing_block_paths, "validating block.")):
-
-            assert path in all_block_path_set, "unexpected filename, '%s'." % path
-
-            try:
-                f = h5py.File(path, "r")
-            except:
-                # raise Exception("unable to open/validate '%s'." % path)
-                os.remove(path)
-                continue
-
-            try:
-                validate(f)
-            except:
-                # raise Exception("delete block file '%s'." % path)
-                os.remove(path)
-            finally:
-                f.close()
-
-    # Wait for files to be deleted.
-    torch.distributed.barrier()
-
-    # Filter missing files.
-    missing_blocks = [block
-                      for block in all_blocks
-                      if not os.path.exists(block["path"])]
-
-    return missing_blocks
-
-
-def get_missing_blocks_by_rank(workdir, n_samples, block_size,
-                               validate=lambda f : None):
-    '''Divide missing blocks evenly across all ranks.
-
-    See 'get_missing_blocks()' above for description. The returned list of
-    missing blocks is split evenly across ranks via interleaving. This way,
-    each rank has a roughly equal number of blocks to process for a
-    downstream operation.
-    '''
-
-    missing_blocks = get_missing_blocks(workdir, n_samples, block_size,
-                                        validate)
-
-    # This rank's missing files.
-    data_parallel_rank = parallel_state.get_data_parallel_rank()
-    data_parallel_world_size = parallel_state.get_data_parallel_world_size()
-    rank_missing_blocks = missing_blocks[data_parallel_rank:len(missing_blocks):data_parallel_world_size]
-
-    # Extend rank's missing blocks (with None) such that all ranks have equal
-    # length lists. This allows for easier tracking of global progress.
-    n_missing_tensor = torch.cuda.LongTensor([len(rank_missing_blocks)])
-    torch.distributed.all_reduce(n_missing_tensor,
-                                 op=torch.distributed.ReduceOp.MAX)
-    max_n_missing = n_missing_tensor.item()
-    rank_missing_blocks += [None] * (max_n_missing - len(rank_missing_blocks))
-
-    return len(missing_blocks), rank_missing_blocks
-
-
-class BlockPathMap:
-    '''Map an index to its containing block path.
-
-    The common use for this class is to have a directory of files containing
-    blocks of processed data, of uniform block size (e.g., 100k samples per
-    file). Each file must follow a naming convention of 'startIdx-endIdx.[ext]',
-    where 'endIdx' minus 'startIdx' must equal the block size, with the possible
-    exception of the final block. Given an input index, this class maps the
-    index to the containing block file.
-    '''
-
-    @classmethod
-    def from_dir(cls, _dir, block_size, ext="hdf5"):
-        '''Get list of block files, and create map.'''
-        assert os.path.isdir(_dir), f"directory not found, '{_dir}'."
-        return cls(sorted(glob.glob(_dir + f"/*.{ext}")), block_size)
-
-    def __init__(self, block_paths, block_size):
-        self.max_idx = 0
-        self.block_path_map = {}
-        for block_path in block_paths:
-            name = os.path.splitext(os.path.basename(block_path))[0]
-            start_idx, end_idx = [ int(i) for i in name.split("-") ]
-            self.block_path_map[start_idx] = block_path
-            self.max_idx = max(self.max_idx, end_idx)
-        self.block_size = block_size
-
-    def __str__(self):
-        return "%d paths" % len(self.block_path_map)
-
-    def __getitem__(self, idx):
-        '''Get block path from index.'''
-        block_start_idx = self.block_size * (idx // self.block_size)
-        block_path = self.block_path_map[block_start_idx]
-        return block_path
diff --git a/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/README.md b/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/README.md
deleted file mode 100644
index 06b92279ee75fe8e38ce4c2d6fe4dfd3a0bd8344..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/README.md
+++ /dev/null
@@ -1,78 +0,0 @@
-# Introduction
-
-This folder is a collection of scripts for converting checkpoints of one training framework (e.g., DeepSpeed) into that of a different framework (e.g., Megatron-LM, HF Transformers).
-
-The folder also contains scripts for inspecting checkpoint files and folders, which could be useful when developing checkpoint conversion logic. At the time of creation, this folder contains scripts to convert DeepSpeed checkpoints to Megatron-LM and HF Transformers checkpoints (this motivated this effort as part of the BigScience project).
-
-Here are the list and details of checkpoint conversions provided by the available scripts:
-
-1. [Megatron-DeepSpeed to Megatron-LM](#Megatron-DeepSpeed-to-Megatron)
-1. [Megatron-DeepSpeed to HF Transformers](#Megatron-DeepSpeed-to-HF-Transformers)
-
-
-## Megatron-DeepSpeed to Megatron
-
-The (current implementation of the) converter extracts args and model parameters from a DeepSpeed checkpoint (i.e., excludes other training states such as optimizer, scheduler, etc) and convert into a Megatron-LM checkpoint similarly containing only model parameters. The converter also provides a best-effort attempt to reshape the tensor-parallelism and pipeline parallelism degrees for the checkpoint. The resulting Megatron-LM checkpoint could be loaded into Megatron-LM framework for finetuning or inference. Tensor parallelism (TP) and pipeline parallelism (PP) are supported in the sense that the generated Megatron-LM checkpoint (folders and files) will be of the same TP and PP of the training that created the input DeepSpeed checkpoint. The entry point of the converter is `deepspeed_to_megatron_ds.py`, which as the following usage:
-```bash
-python tools/convert_checkpoint/deepspeed_to_megatron_ds.py -h
-Convert DeepSpeed Checkpoint to Megatron Checkpoint
-usage: deepspeed_to_megatron_ds.py [-h] [--input_folder INPUT_FOLDER]
-                                [--output_folder OUTPUT_FOLDER]
-                                [--target_tp TARGET_TP]
-                                [--target_pp TARGET_PP] [--for_release]
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --input_folder INPUT_FOLDER
-                        Input DeepSpeed Checkpoint folder
-  --output_folder OUTPUT_FOLDER
-                        Output Megatron checkpoint folder
-  --target_tp TARGET_TP
-                        Target TP degree
-  --target_pp TARGET_PP
-                        Target PP degree
-  --for_release         Convert for release purpose, reset some (progress)
-                        counters.
-```
-
-The following scripts which proved useful for debugging are also included:
-1. `inspect_deepspeed_checkpoint.py`: view the contents of a DeepSpeed checkpoint folder.
-2. `inspect_checkpoint.py`: view the contents of a PyTorch checkpoint file.
-
-## Megatron-DeepSpeed to HF Transformers
-
-In order to convert from Megatron-DeepSpeed to HF Transformers, you can do this directly using:
-
-```bash
-python tools/convert_checkpoint/deepspeed_to_transformers.py  \
---input_folder /path/to/Megatron-Deepspeed/checkpoint/global_step97500 \
---output_folder /path/to/transformers/checkpoint
-```
-since `transformers` currently only works with PP=1/TP=1 we use the defaults `--target_tp 1 --target_pp 1`.
-
-The script taps into `transformers` and as of this writing requires `transformers@master` (or `transformers==4.11` if you read this later and a new version is released).
-
-Note that you may run into problems with not having `megatron_ds.enums` defined since `Megatron-Deepspeed` in the `bigscience-workshop` tree diverged from the `microsoft` tree. In such cases you can fix this on the fly by ensuring the former appears first in the `sys.path`. For example:
-
-
-```bash
-PYTHONPATH=/hf/Megatron-DeepSpeed-bigscience:/hf/Megatron-DeepSpeed-microsoft \
-python tools/convert_checkpoint/deepspeed_to_transformers.py  \
---input_folder /path/to/Megatron-Deepspeed/checkpoint/global_step97500 \
---output_folder /path/to/transformers/checkpoint
-```
-
-Alternatively, you can convert first from Megatron-DeepSpeed to Megatron and then to HF Transformers:
-
-```bash
-# 1. Megatron-DeepSpeed to Megatron
-cd /hf/Megatron-DeepSpeed-bigscience
-python tools/convert_checkpoint/deepspeed_to_megatron_ds.py --target_tp 1 --target_pp 1 \
---input_folder /path/to/Megatron-Deepspeed/checkpoint/global_step97500 \
---output_folder /path/to/Megatron/checkpoint
-
-# 2. Megatron to HF Transformers
-cd /hf/transformers
-python src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py \
-/path/to/Megatron/checkpoint/iter_0097500/mp_rank_00/model_optim_rng.pt
-```
diff --git a/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/deepspeed_checkpoint.py b/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/deepspeed_checkpoint.py
deleted file mode 100644
index decd98c359097ea8d84fa4c56fc1c54282469858..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/deepspeed_checkpoint.py
+++ /dev/null
@@ -1,196 +0,0 @@
-import os
-from typing import Dict
-import torch 
-
-ZERO_FILE_PREFIX = 'zero_pp_rank_'
-LAYER_FILE_PREFIX = 'layer_'
-MP_RANK_FILE_PREFIX = 'mp_rank_'
-EMBEDDING_LAYER_INDEX = 0
-FINAL_LAYER_NORM_INDEX = -1
-ARGS_KEY = 'args'
-ITERATION_KEY = 'iteration'
-SEQUENTIAL_LAYERS = [
-    'input_layernorm.weight', 'input_layernorm.bias',
-    'self_attention.dense.bias',
-    'post_attention_layernorm.weight', 'post_attention_layernorm.bias',
-    'mlp.dense_4h_to_h.bias',
-    'position_embeddings.weight'
-]
-
-LAYER_CONCAT_DIM = {
-    'self_attention.dense.weight': 1,
-    'mlp.dense_4h_to_h.weight': 1
-}
-
-class DeepSpeedCheckpoint(object):
-    def __init__(self, dir, tp_degree=None, pp_degree=None, no_pp=False):
-        self.dir = dir
-        self.no_pp = no_pp
-        self.file_list = self._get_files(dir)
-        self.zero_files = self._get_files_with_prefix(self.file_list, ZERO_FILE_PREFIX)
-        self.layer_files = self._get_files_with_prefix(self.file_list, LAYER_FILE_PREFIX)
-        self.mp_rank_files = self._get_files_with_prefix(self.file_list, MP_RANK_FILE_PREFIX)
-        self.layer_keys = self._get_layer_keys()
-        self.layer_count = len(self.layer_keys)
-        if not self.no_pp:
-            self.original_tp_degree = len(self._get_files_with_prefix(self.layer_files, f'{LAYER_FILE_PREFIX}01'))
-            self.original_pp_degree = len(self.mp_rank_files) // self.original_tp_degree
-        else:
-            self.original_tp_degree = len(self.mp_rank_files)
-            self.original_pp_degree = 1
-        self.dp_degree = len(self.zero_files) // (self.original_pp_degree * self.original_tp_degree)
-        self.tp_degree = self.original_tp_degree if tp_degree is None else tp_degree
-        self.pp_degree = self.original_pp_degree if pp_degree is None else pp_degree
-        self.global_state = {}
-    
-        self._sanity_check()
-        self.pp_to_transformer_map = self._build_pp_transformer_map()
-        self.transformer_file_map = self._build_transformer_file_map()
-        if not self.no_pp:
-            self.tp_to_embedding_map = self._build_tp_other_layer_map(EMBEDDING_LAYER_INDEX)
-            self.tp_to_final_norm_map = self._build_tp_other_layer_map(FINAL_LAYER_NORM_INDEX)
-        self._build_global_state()
-
-
-
-    def show_tp_embedding_map(self):
-        self._dump_mapping(self.tp_to_embedding_map, 'tp_to_embedding_layers')
-
-    def show_tp_final_norm_map(self):
-        self._dump_mapping(self.tp_to_final_norm_map, 'tp_to_final_norm_layers')
-
-    def show_pp_tranformer_map(self):
-        self._dump_mapping(self.pp_to_transformer_map, 'pp_to_tranformer_layers')
-
-    def show_transformer_file_map(self):
-        self._dump_mapping(self.transformer_file_map, 'rank_to_tranformer_files')
-
-    def _build_global_state(self):
-        sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'))
-        self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0)
-        self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None)
-
-    def get_iteration(self):
-        if not ITERATION_KEY in self.global_state:
-            sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'))
-            self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0)
-
-        return self.global_state[ITERATION_KEY]
-
-    def get_embedding_state(self, tp_index: int) -> Dict:
-        assert tp_index in self.tp_to_embedding_map.keys()
-        sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in self.tp_to_embedding_map[tp_index]]
-        sd = self._merge_state_dicts(sd_list)
-        return sd
-
-    def get_args(self):
-        if not ARGS_KEY in self.global_state:
-            sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'))
-            self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None)
-
-        return self.global_state[ARGS_KEY]
-    
-
-    def get_transformer_state(self, tp_index: int, pp_index: int) -> list:
-        assert tp_index < self.tp_degree
-        assert pp_index < self.pp_degree
-        t_list = []
-        for fname_list in self.transformer_file_map[(tp_index, pp_index)]:
-            sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list]
-            sd = self._merge_state_dicts(sd_list)
-            t_list.append(sd)
-        return t_list   
-
-    def get_final_norm_state(self, tp_index:int) -> Dict:
-        assert tp_index in self.tp_to_final_norm_map.keys()
-        sd = torch.load(self.tp_to_final_norm_map[tp_index][0], map_location=torch.device('cpu'))
-        return sd
-
-    def _build_tp_other_layer_map(self, layer_index:int):
-        assert layer_index < len(self.layer_files)
-        layer_files = self._get_files_with_prefix(self.layer_files, self.layer_keys[layer_index])
-        layer_file_partitions = self._partition_data(layer_files, self.tp_degree)
-        data_map = {i:flist for i, flist in enumerate(layer_file_partitions)}
-        return data_map
-
-    def _build_pp_transformer_map(self):
-        data_map = {}
-        transformer_layers = self.layer_keys[1:-1]
-        layers_per_pp = len(transformer_layers) // self.pp_degree
-        data_map = {i:transformer_layers[i*layers_per_pp:(i+1)*layers_per_pp] for i in range(0, self.pp_degree)}
-        return data_map
-
-    def _dump_mapping(self, data_map, map_tag = None):
-        if map_tag is not None:
-            print(f'Dump mapping: {map_tag}')
-        for k, v in data_map.items():
-            print(f'{k} = {v}')
-
-    def _build_transformer_file_map(self):
-        transformer_layer_keys = self.layer_keys[1:-1]
-        file_map = {}
-        layers_per_pp = len(transformer_layer_keys) // self.pp_degree
-        for key_index, layer_key in enumerate(transformer_layer_keys):
-            pp_index = key_index // layers_per_pp
-            layer_files = self._get_files_with_prefix(self.layer_files, layer_key)
-            layer_file_partitions = self._partition_data(layer_files, self.tp_degree)
-            for tp_index in range(self.tp_degree):
-                map_key = (tp_index, pp_index)
-                if not map_key in file_map.keys():
-                    file_map[map_key] = []
-                file_map[map_key].append(layer_file_partitions[tp_index])
-        
-        return file_map
-        
-    def _sanity_check(self):
-        assert len(self.mp_rank_files) % self.tp_degree == 0
-        assert len(self.zero_files) % (self.pp_degree * self.tp_degree) == 0
-        if not self.no_pp:
-            assert len(self.layer_keys) > 2
-            assert (len(self.layer_keys) - 2) % self.pp_degree == 0
-     
-    def _get_files_with_prefix(self, all_files, prefix):
-        file_list = []
-        for file_path in all_files:
-            _, fname = os.path.split(file_path)
-            if fname.startswith(prefix):
-                file_list.append(file_path)
-        
-        return sorted(file_list)
-
-    def validate_files(self):
-        for file in self.file_list:
-            if not os.path.isfile(file):
-                print(f'Error: {file} is not existent')
-        
-    def _get_files(self, dir):
-        file_list = []
-        for root, dirs, files in os.walk(dir):
-            for file in files:
-                file_list.append(os.path.join(root, file))
-        return file_list
-
-    def _get_layer_keys(self):
-        key_set = set()
-        key_len = len(LAYER_FILE_PREFIX) + 2 
-        for file_path in self.layer_files:
-            _, fname = os.path.split(file_path)
-            key_set.add(fname[:key_len])
-        return sorted(list(key_set))
-
-    def _partition_data(self, data_list, num_partitions):
-        num_elems = len(data_list)
-        assert num_elems % num_partitions == 0
-        partition_size = num_elems // num_partitions
-        partitions_list = [data_list[i:i+partition_size] for i in range(0, num_elems, partition_size)]
-        return partitions_list
-
-    def _merge_state_dicts(self, sd_list):
-        merged_sd = {}
-        for key in sd_list[0].keys():
-            if not key in SEQUENTIAL_LAYERS:
-                cat_dim = LAYER_CONCAT_DIM.get(key, 0)
-                merged_sd[key] = torch.cat([sd[key] for sd in sd_list], dim=cat_dim)
-            else:
-                merged_sd[key] = sd_list[0][key]
-        return merged_sd
diff --git a/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/deepspeed_to_megatron.py b/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/deepspeed_to_megatron.py
deleted file mode 100644
index ef1c77e546e2f2e269acfa41a069ae2f9fd98c0f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/deepspeed_to_megatron.py
+++ /dev/null
@@ -1,150 +0,0 @@
-#!/usr/bin/env python
-
-import argparse
-import os
-import torch
-from collections import OrderedDict
-from .deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint
-
-MODEL_KEY = 'model'
-ARGS_KEY = 'args'
-LANGUGAGE_MODEL_KEY = 'language_model'
-EMBEDDING_KEY = 'embedding'
-ENCODER_KEY = 'encoder'
-WORD_EMBEDDINGS_FOR_HEAD_KEY = 'word_embeddings_for_head'
-WORD_EMBEDDINGS_KEY = 'word_embeddings'
-FINAL_LAYER_NORM_KEY ='final_layernorm'
-CHECKPOINT_VERSION_KEY = 'checkpoint_version'
-CHECKPOINT_VERSION_VALUE = 3.0
-ITERATION_KEY = 'iteration'
-
-def parse_arguments():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input_folder', default=None, type=str, help='Input DeepSpeed Checkpoint folder')
-    parser.add_argument('--output_folder', default=None, type=str, help='Output Megatron checkpoint folder')
-    parser.add_argument('--target_tp', default=1, type=int, help='Target TP degree')
-    parser.add_argument('--target_pp', default=1, type=int, help='Target PP degree')
-    parser.add_argument('--for_release', action='store_true', help='Convert for release purpose, reset some (progress) counters.')
-    args = parser.parse_args()
-    print(f'args = {args}')
-    return args
-
-
-def _convert_ds_transformer_state(sd_list):
-    new_sd = OrderedDict()
-    for i, sd in enumerate(sd_list):
-        for key, value in sd.items():
-            new_key = f'layers.{i}.{key}'
-            new_sd[new_key] = value
-
-    return new_sd
-
-def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree):
-    path_list = []
-    iter_folder = f'iter_{iteration:07d}'
-    for i in range(0, tp_degree):
-        path_list.append([])
-        for j in range(0, pp_degree):
-            rank_folder = f'mp_rank_{i:02d}' if pp_degree == 1 else f'mp_rank_{i:02d}_{j:03d}'
-            ckpt_path = os.path.join(rank_folder, 'model_optim_rng.pt')
-            path_list[i].append(os.path.join(base_folder, iter_folder, ckpt_path))
-
-    return path_list
-
-
-def _create_megatron_dict():
-    language_model_dict = {
-        EMBEDDING_KEY: {},
-        ENCODER_KEY: {}
-    }
-    megatron_dict = {
-        MODEL_KEY: {LANGUGAGE_MODEL_KEY: language_model_dict},
-        CHECKPOINT_VERSION_KEY: CHECKPOINT_VERSION_VALUE
-    }
-    return megatron_dict
-
-
-def _save_checkpoint(file_path, chkpt_sd):
-    dir, _ = os.path.split(file_path)
-    os.makedirs(dir, exist_ok=True)
-    torch.save(chkpt_sd, file_path)
-
-
-def _renest_sd(sd):
-    new_sd = OrderedDict()
-    for key, value in sd.items():
-        a, b = key.split('.')
-        new_sd[a] = {b: value}
-    return new_sd
-
-
-def _create_rank_checkpoint(ds_checkpoint, tp_index, pp_index, for_release=False):
-    meg_encoder_sd = OrderedDict()
-    meg_embedding_sd = OrderedDict()
-    meg_embedding_for_head_sd = OrderedDict()
-
-    transformer_sd = ds_checkpoint.get_transformer_state(tp_index, pp_index)
-    meg_encoder_sd.update(_convert_ds_transformer_state(transformer_sd))
-
-    if pp_index in [0, ds_checkpoint.pp_degree - 1]:
-        embedding_sd = ds_checkpoint.get_embedding_state(tp_index)
-        nested_embedding_sd = _renest_sd(embedding_sd)
-        if pp_index == 0:
-            meg_embedding_sd.update(nested_embedding_sd)
-
-        if pp_index == ds_checkpoint.pp_degree -1:
-            for key, value in embedding_sd.items():
-                if key.startswith(WORD_EMBEDDINGS_KEY):
-                    fields = key.split('.')
-                    new_fields = fields[1:]
-                    new_key = '.'.join(new_fields)
-                    meg_embedding_for_head_sd[new_key] = value
-
-            final_norm_sd = ds_checkpoint.get_final_norm_state(tp_index)
-            new_final_norm_sd = {f'{FINAL_LAYER_NORM_KEY}.{key}': value for key, value in final_norm_sd.items()}
-            meg_encoder_sd.update(new_final_norm_sd)
-
-    checkpoint_sd = _create_megatron_dict()
-
-    iteration = ds_checkpoint.get_iteration()
-    checkpoint_sd[ITERATION_KEY] = iteration
-    if pp_index == 0:
-        checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][EMBEDDING_KEY] = meg_embedding_sd
-    checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][ENCODER_KEY] = meg_encoder_sd
-    if pp_index == ds_checkpoint.pp_degree -1:
-        checkpoint_sd[MODEL_KEY][WORD_EMBEDDINGS_FOR_HEAD_KEY] = meg_embedding_for_head_sd
-
-    checkpoint_sd[ARGS_KEY] = ds_checkpoint.get_args()
-    # Adjust specific fields
-    checkpoint_sd[ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree
-    checkpoint_sd[ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree
-    if for_release:
-        checkpoint_sd[ARGS_KEY].consumed_train_samples = 0
-        checkpoint_sd[ARGS_KEY].consumed_valid_samples = 0
-
-    return checkpoint_sd
-
-
-def _create_latest_file(base_folder, iteration):
-    file_path = os.path.join(base_folder, 'latest_checkpointed_iteration.txt')
-    os.makedirs(base_folder, exist_ok=True)
-    with open(file_path, 'w') as f:
-        f.write(str(iteration))
-
-def main():
-    print(f'Convert DeepSpeed Checkpoint to Megatron Checkpoint')
-
-    args = parse_arguments()
-    print(f'Converting DeepSpeed checkpoint in {args.input_folder} to Megatron checkpoint in {args.output_folder}')
-
-    ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, args.target_pp)
-    iteration = ds_checkpoint.get_iteration()
-    _create_latest_file(args.output_folder, iteration)
-    checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, ds_checkpoint.tp_degree, ds_checkpoint.pp_degree)
-    for i in range(0, ds_checkpoint.tp_degree):
-        for j in range(0, ds_checkpoint.pp_degree):
-            sd = _create_rank_checkpoint(ds_checkpoint, i, j, args.for_release)
-            _save_checkpoint(checkpoint_paths[i][j], sd)
-
-if __name__ == "__main__":
-    main()
diff --git a/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/deepspeed_to_transformers.py b/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/deepspeed_to_transformers.py
deleted file mode 100644
index 18c664ea60a77efc6f1aec2ebf90047ea3017735..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/deepspeed_to_transformers.py
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env python
-
-import os
-import torch
-import json
-
-from deepspeed_checkpoint import DeepSpeedCheckpoint
-from deepspeed_to_megatron import _create_rank_checkpoint, parse_arguments
-
-# the import was tested to work with this version
-# https://github.com/huggingface/transformers/commit/0af901e83 if it diverges we may consider
-# copying that version here instead
-from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import convert_megatron_checkpoint
-from transformers import GPT2Config
-
-def main():
-
-    # this first part comes mainly from deepspeed_to_megatron_ds.main
-    args = parse_arguments()
-    print(f'Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}')
-
-    ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, args.target_pp)
-    iteration = ds_checkpoint.get_iteration()
-    input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0, args.for_release)
-
-    # the 2nd part comes from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint.main
-    # Spell out all parameters in case the defaults change.
-    config = GPT2Config(
-        vocab_size=50257,
-        n_positions=1024,
-        n_ctx=1024,
-        n_embd=1024,
-        n_layer=24,
-        n_head=16,
-        n_inner=4096,
-        activation_function="gelu",  # used to be "gelu_new" in earlier versions
-        resid_pdrop=0.1,
-        embd_pdrop=0.1,
-        attn_pdrop=0.1,
-        layer_norm_epsilon=1e-5,
-        initializer_range=0.02,
-        summary_type="cls_index",
-        summary_use_proj=True,
-        summary_activation=None,
-        summary_proj_to_labels=True,
-        summary_first_dropout=0.1,
-        scale_attn_weights=True,
-        gradient_checkpointing=False,
-        use_cache=True,
-        bos_token_id=50256,
-        eos_token_id=50256,
-    )
-
-    # Convert.
-    print("Converting to HF Checkpoint")
-    output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config)
-
-    basename = args.output_folder
-    os.makedirs(basename, exist_ok=True)
-
-    # Print the structure of converted state dict.
-    #if args.print_checkpoint_structure:
-    #    recursive_print(None, output_state_dict)
-
-    # Store the config to file.
-    output_config_file = os.path.join(basename, "config.json")
-    output_config = config.to_dict()
-    output_config["architectures"] = ["GPT2LMHeadModel"]
-    output_config["model_type"] = "gpt2"
-    print(f'Saving config to "{output_config_file}"')
-    with open(output_config_file, "w") as f:
-        json.dump(output_config, f)
-
-    # Store the state_dict to file.
-    output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
-    print(f'Saving checkpoint to "{output_checkpoint_file}"')
-    torch.save(output_state_dict, output_checkpoint_file)
-
-    print("Now add tokenizer files and upload to the hub")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/inspect_checkpoint.py b/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/inspect_checkpoint.py
deleted file mode 100644
index 5ee955bb480012932fb6d7446e561d72852a7372..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/inspect_checkpoint.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import torch
-import sys
-import os
-from collections import OrderedDict
-
-
-def dump_data(datum, name_list=[]):
-    if type(datum) in (dict, OrderedDict):
-        for k, v in datum.items():
-            dump_data(v, name_list+[str(k)])
-    elif type(datum) in (list, tuple):
-        for v in datum:
-            dump_data(v, name_list)
-    elif torch.is_tensor(datum):
-        prefix = '.'.join(name_list)
-        print(f'[tensor] {prefix} = {datum.shape}')
-    else:
-        #pass 
-        prefix = '.'.join(name_list)
-        print(f'[other] {prefix} = {datum}')
-
-def main():
-    if len(sys.argv) < 2:
-        print(f'Usage: {sys.argv[0]} <checkpoint file>')
-        exit(1)
-
-    ckpt_file = sys.argv[1]
-    if not os.path.isfile(ckpt_file):
-        print(f'{ckpt_file} is not a valid file')
-        exit(1)
-
-    print(f'loading checkpoint file: {ckpt_file}')
-    sd = torch.load(ckpt_file)
-    dump_data(sd)
-
-    quit()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py b/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py
deleted file mode 100644
index 3125f7d9a78eb3e3ff54d8d324e358d2d556eb57..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import argparse
-from deepspeed_checkpoint import DeepSpeedCheckpoint
-
-def list_files(file_list, tag):
-    print(f'Listing files: {tag}')
-    for i, file in enumerate(file_list):
-        print(f'{i+1}: {file}')
-
-def parse_arguments():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--folder', default=None, type=str, help='DeepSpeed Checkpoint folder')
-    parser.add_argument('--target_tp', default=None, type=int, help='Target TP degree')
-    parser.add_argument('--target_pp', default=None, type=int, help='Target PP degree')
-    args = parser.parse_args()
-    print(f'args = {args}')
-    return args 
-
-
-def show_input_files(ds_checkpoint):
-    list_files(ds_checkpoint.file_list, 'all')
-    list_files(ds_checkpoint.zero_files, 'zero')
-    list_files(ds_checkpoint.layer_files, 'layer')
-    list_files(ds_checkpoint.mp_rank_files, 'mp rank')
-
-def show_simple_state(ds_checkpoint):
-    print(f'layer keys = {ds_checkpoint.layer_keys}')
-    print(f'layer count = {ds_checkpoint.layer_count}')
-
-    print(f'tp_degree_count = {ds_checkpoint.tp_degree}')
-    print(f'pp_degree_count = {ds_checkpoint.pp_degree}')
-    print(f'dp_degree_count = {ds_checkpoint.dp_degree}')
-
-def show_mappings(ds_checkpoint):
-    ds_checkpoint.show_pp_tranformer_map()
-    ds_checkpoint.show_transformer_file_map()
-    ds_checkpoint.show_tp_embedding_map()
-    ds_checkpoint.show_tp_final_norm_map()
-
-def show_state_summary(tag, sd):
-    summary = {k:v.shape for k,v in sd.items()}
-    print(f'{tag} = {summary}')
-
-def show_embedding_states(ds_checkpoint):
-    for i in range(0, ds_checkpoint.tp_degree):
-        sd = ds_checkpoint.get_embedding_state(i)
-        show_state_summary(f'embedding[{i}]', sd)
-
-def show_final_norm_states(ds_checkpoint):
-    for i in range(0, ds_checkpoint.tp_degree):
-        sd = ds_checkpoint.get_final_norm_state(i)
-        show_state_summary(f'final_norm[{i}]', sd)
-
-def show_transformer_states(ds_checkpoint):
-    for i in range(0, ds_checkpoint.tp_degree):
-        for j in range(0, ds_checkpoint.pp_degree):
-            state_list = ds_checkpoint.get_transformer_state(tp_index=i, pp_index=j)
-            print(f'tp_pp_rank[{i},{j}] = ')
-            for k, sd in enumerate(state_list):
-                show_state_summary(f'      block[{k}]', sd)
-                print("")
-
-
-def main():
-    print(f'Inspecting DeepSpeed Checkpoint')
-    args = parse_arguments()
-
-    ds_checkpoint = DeepSpeedCheckpoint(args.folder, args.target_tp, args.target_pp)
-    ds_checkpoint.validate_files()
-    
-    show_input_files(ds_checkpoint)
-    show_simple_state(ds_checkpoint)
-    show_mappings(ds_checkpoint)
-    show_embedding_states(ds_checkpoint)
-    show_final_norm_states(ds_checkpoint)
-    show_transformer_states(ds_checkpoint)
-    checkpoint_args = ds_checkpoint.get_args()
-    print(f'checkpoint args = {checkpoint_args}')
-
-if __name__ == "__main__":
-    main()
diff --git a/toolbox/Megatron-DeepSpeed/tools/convert_mg2hf.sh b/toolbox/Megatron-DeepSpeed/tools/convert_mg2hf.sh
deleted file mode 100644
index e7ba87a2f80d2f5ab40e816adfa961caedeae45b..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/convert_mg2hf.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-TP=1
-PP=8
-
-PROJ_HOME=$(dirname "$PWD")
-python3 $PROJ_HOME/tools/checkpoint_util.py \
-     --model-type GPT \
-     --loader megatron \
-     --saver megatron \
-     --save-model-type save_huggingface_llama \
-     --target-tensor-parallel-size ${TP} \
-     --target-pipeline-parallel-size ${PP} \
-     --load-dir XXX \
-     --save-dir XXX \
-     --custom-partition 4 4 4 4 4 4 5 3
diff --git a/toolbox/Megatron-DeepSpeed/tools/generate_samples_gpt.py b/toolbox/Megatron-DeepSpeed/tools/generate_samples_gpt.py
deleted file mode 100644
index 345a5d4bd041a66fc95c95c43d7375e09d63e6a5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/generate_samples_gpt.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Sample Generate GPT"""
-
-import deepspeed
-
-import os
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir)))
-
-from megatron_ds import get_args
-from megatron_ds import print_rank_0
-from megatron_ds import get_tokenizer
-from megatron_ds.core import mpu
-from megatron_ds.checkpointing import load_checkpoint
-from megatron_ds.initialize import initialize_megatron
-from megatron_ds.model import GPTModel
-from megatron_ds.training import get_model
-from megatron_ds.text_generation_utils import generate_and_write_samples_unconditional
-from megatron_ds.text_generation_utils import generate_samples_input_from_file
-from megatron_ds.text_generation_utils import generate_samples_interactive
-import deepspeed
-import torch
-
-from megatron_ds.arguments import core_transformer_config_from_args
-from megatron_ds import get_args
-
-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
-
-    args = get_args()
-    config = core_transformer_config_from_args(args)
-
-    print_rank_0('building GPT model ...')
-    model = GPTModel(config=config, num_tokentypes=0, parallel_output=False,
-                     pre_process=pre_process, post_process=post_process,
-                     return_moe_loss=False) # we need to set "return_moe_loss" for the inference_mode
-    return model
-
-
-def add_text_generate_args(parser):
-    """Text generation arguments."""
-    group = parser.add_argument_group(title='text generation')
-
-    group.add_argument("--temperature", type=float, default=1.0,
-                       help='Sampling temperature.')
-    group.add_argument("--greedy", action='store_true', default=False,
-                       help='Use greedy sampling.')
-    group.add_argument("--top_p", type=float, default=0.0,
-                       help='Top p sampling.')
-    group.add_argument("--top_k", type=int, default=0,
-                       help='Top k sampling.')
-    group.add_argument("--out-seq-length", type=int, default=1024,
-                       help='Size of the output generated text.')
-    group.add_argument("--sample-input-file", type=str, default=None,
-                       help='Get input from file instead of interactive mode, '
-                       'each line is an input.')
-    group.add_argument("--sample-output-file", type=str, default=None,
-                       help='Output file got from --sample-input-file')
-    group.add_argument("--num-samples", type=int, default=0,
-                       help='Number of samples to generate unconditionally, '
-                       'defaults to 0 and interactive conditional sampling')
-    group.add_argument("--genfile", type=str,
-                       help='Output file when generating unconditionally')
-    group.add_argument("--recompute", action='store_true',
-                       help='During generation recompute all attention '
-                       'instead of using previously computed keys/values.')
-    group.add_argument("--local_rank", type=int, default=0,
-                       help='local_rank')
-
-    return parser
-
-def print_latency(latency_set, title=""):
-    # 10 warmup queries
-    latency_set = latency_set[10:]
-    count = len(latency_set)
-    if count > 0:
-        latency_set.sort()
-        n50 = (count - 1) * 0.5 + 1
-        n90 = (count - 1) * 0.9 + 1
-        n95 = (count - 1) * 0.95 + 1
-        n99 = (count - 1) * 0.99 + 1
-        n999 = (count - 1) * 0.999 + 1
-
-        avg = sum(latency_set) / count
-        p50 = latency_set[int(n50) - 1]
-        p90 = latency_set[int(n90) - 1]
-        p95 = latency_set[int(n95) - 1]
-        p99 = latency_set[int(n99) - 1]
-        p999 = latency_set[int(n999) - 1]
-
-        print("====== latency stats {0} ======", title)
-        print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000))
-        print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000))
-        print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000))
-        print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000))
-        print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000))
-        print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000))
-
-def main():
-    """Main program."""
-    latencies = []
-    model_latencies = []
-    single_token_latency = []
-
-    initialize_megatron(extra_args_provider=add_text_generate_args,
-                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer',
-                                       'no_load_rng': True,
-                                       'no_load_optim': True})
-
-    args = get_args()
-
-    if args.num_layers_per_virtual_pipeline_stage is not None:
-        print("Interleaved pipeline schedule is not yet supported for text generation.")
-        exit()
-
-    # Set up model and load checkpoint.
-    model = get_model(model_provider)
-
-    if args.load is not None:
-        _ = load_checkpoint(model, None, None)
-
-    assert len(model) == 1, "Above condition should have caught this"
-    model = model[0]
-
-    if args.ds_inference:
-        model = ds_inference(model, args)
-        print('> DeepSpeed Inference engine initialized')
-
-    # Generate samples.
-    if args.num_samples == 0:
-        args.micro_batch_size = 1
-        if args.sample_input_file != None:
-            generate_samples_input_from_file(model)
-        else:
-            generate_samples_interactive(model)
-    else:
-        generate_and_write_samples_unconditional(model, latencies, single_token_latency, model_latencies)
-
-
-    #if torch.cuda.current_device() == 0:
-    if torch.distributed.get_rank() == 0:
-        print_latency(latencies)
-        print_latency(model_latencies, "model_latencies")
-        print_latency(single_token_latency, "single_token_latency")
-
-
-def ds_inference(model, args):
-    import megatron_ds.model as mm
-    engine = deepspeed.init_inference(model=model,
-                                      mp_size=args.tensor_model_parallel_size,
-                                      tensor_parallel={"mpu": mpu},
-                                      dtype=torch.half,
-                                      replace_with_kernel_inject=True,
-                                      moe_experts=args.num_experts,
-                                      moe_type=args.mlp_type)
-
-    return engine.module
-
-if __name__ == "__main__":
-
-    main()
diff --git a/toolbox/Megatron-DeepSpeed/tools/hf2megads_weight_converter.py b/toolbox/Megatron-DeepSpeed/tools/hf2megads_weight_converter.py
deleted file mode 100644
index bfbde1fd05583112b25068d5c76a28a027868939..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/hf2megads_weight_converter.py
+++ /dev/null
@@ -1,334 +0,0 @@
-import torch
-import re
-import sys
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
-from megatron import print_rank_0, get_tokenizer, get_args
-from megatron.core import mpu
-from megatron.core.utils import divide
-from megatron.model import GPTModelPipe, Float16Module
-from megatron.utils import unwrap_model
-from megatron.model import DistributedDataParallel as LocalDDP
-from megatron.arguments import core_transformer_config_from_args
-from megatron.initialize import initialize_megatron
-from megatron.optimizer import get_megatron_optimizer
-from megatron.checkpointing import save_checkpoint
-from megatron.training import get_optimizer_param_scheduler
-from deepspeed.runtime.utils import see_memory_usage
-import deepspeed
-
-
-def add_extra_args(parser):
-    """Text generation arguments."""
-    group = parser.add_argument_group(title='hf2mega')
-    group.add_argument("--hf-ckpt-num-shards", type=int, help='num of llama ckpt.')
-    group.add_argument("--origin-hf-ckpt-dir",
-                       type=str,
-                       default="",
-                       help="the original path of the llama-hf ckpt")
-    return parser
-
-
-def compute_partition_range(hidden_size, local_rank, tp_size):
-    partition_size = divide(hidden_size, tp_size)
-    start_index = local_rank * partition_size
-    end_index = start_index + partition_size
-    return partition_size, start_index, end_index
-
-
-def load_and_print_hf_weight(hf_ckpt_dir, hf_ckpt_num_of_shards):
-    # Optimization point: We can selectively load specific 'shared' data to reduce CPU memory usage.
-    loaded = {}
-    print_rank_0(
-        f"----------------------------hf weight list----------------------------")
-
-    for wid in range(1, hf_ckpt_num_of_shards + 1):
-        d = torch.load(
-            f"{hf_ckpt_dir}/pytorch_model-{wid:05d}-of-{hf_ckpt_num_of_shards:05d}.bin",
-            map_location=torch.device('cpu'))
-        for k in d:
-            print_rank_0(k)
-            assert k not in loaded
-            loaded[k] = d[k].clone()
-    del d
-    return loaded
-
-
-def print_distinct_weights(model):
-    print_rank_0(
-        f"----------------------------mega-ds weight list----------------------------")
-    for pipe_rank in range(mpu.get_pipeline_model_parallel_world_size()):
-        if mpu.get_pipeline_model_parallel_rank() == pipe_rank:
-            if mpu.get_data_parallel_rank() == 0 and mpu.get_tensor_model_parallel_rank(
-            ) == 0:
-                for pname, p in model.named_parameters():
-                    print(pname)
-            torch.distributed.barrier()
-        else:
-            torch.distributed.barrier()
-
-
-class refactor:
-    def __init__(self, model, loaded, args, config):
-        tokenizer = get_tokenizer()
-        # align layer number
-        self.model = model
-        self.loaded = loaded
-        self.config = config
-
-        self.offset_num = 2
-        self.mega_emb_wnum = 1
-        self.mega_norm_wnum = args.num_layers + 2
-        self.mega_lm_head_wnum = self.mega_norm_wnum + 1
-        self.token_vocab = tokenizer.vocab_size
-        self.padded_vocab_size = args.padded_vocab_size
-        self.more_padded = self.padded_vocab_size - self.token_vocab
-        self.tp_size = mpu.get_tensor_model_parallel_world_size()
-        self.tp_rank = mpu.get_tensor_model_parallel_rank()
-        self.decoder_pat = re.compile("(\d+)\.(.+)")
-        self.refactor_weight_list = []
-        self.is_refactored = False
-
-    def _embedding_refactor(self, pname, p):
-        if pname == f"{self.mega_lm_head_wnum}.lm_head.weight":
-            hf_name = "lm_head.weight"
-        elif pname == f"{self.mega_emb_wnum}.word_embeddings.weight":
-            hf_name = "model.embed_tokens.weight"
-        hf_w = self.loaded[hf_name]
-        assert hf_w.shape[0] == self.token_vocab
-        per_partition_vocab_size, start_index, end_index = compute_partition_range(
-            self.padded_vocab_size, self.tp_rank, self.tp_size)
-        end_index = min(end_index, self.token_vocab)
-        real_partition_vocab_size = end_index - start_index
-
-        new_w = torch.zeros((per_partition_vocab_size, hf_w.shape[1]), dtype=hf_w.dtype)
-        new_w[:real_partition_vocab_size, :] = hf_w[start_index:end_index, :]
-        if self.tp_rank == self.tp_size - 1 and self.more_padded > 0:
-            new_w[-self.more_padded:] = hf_w[:self.token_vocab].mean(dim=0, keepdim=True)
-
-        self.record_mapping_info(
-            f"mega-ds: {pname,p.data.shape}<--hf: {hf_name,}  [{start_index}:{end_index},:]  of {hf_w.shape}"
-        )
-        return new_w
-
-    def _direct_refactor(self, pname, p, hf_layer=None, subname=None):
-        if pname == f"{self.mega_norm_wnum}.weight":
-            hf_name = "model.norm.weight"
-        elif subname in ["input_layernorm.weight", "post_attention_layernorm.weight"]:
-            hf_name = f"model.layers.{hf_layer}.{subname}"
-
-        new_w = hf_w = self.loaded[hf_name]
-        self.record_mapping_info(
-            f"mega-ds:{pname,p.data.shape}<--hf{hf_name,}  {hf_w.shape}")
-        return new_w
-
-    def _qkv_refactor(self, pname, p, hf_layer):
-        hf_wq_name = f"model.layers.{hf_layer}.self_attn.q_proj.weight"
-        hf_wk_name = f"model.layers.{hf_layer}.self_attn.k_proj.weight"
-        hf_wv_name = f"model.layers.{hf_layer}.self_attn.v_proj.weight"
-        wq = self.loaded[hf_wq_name]
-        wk = self.loaded[hf_wk_name]
-        wv = self.loaded[hf_wv_name]
-
-        hidden_size = wq.shape[0]
-        per_partition_size, start_index, end_index = compute_partition_range(
-            hidden_size, self.tp_rank, self.tp_size)
-        hidden_size_per_attention_head = divide(hidden_size,
-                                                self.config.num_attention_heads)
-        num_attention_heads_per_partition = divide(self.config.num_attention_heads,
-                                                   self.tp_size)
-
-        new_w = torch.zeros((per_partition_size * 3, wq.shape[1]), dtype=wq.dtype)
-
-        for i in range(num_attention_heads_per_partition):
-            current_index = start_index + i * hidden_size_per_attention_head
-            next_index = current_index + hidden_size_per_attention_head
-            new_w_index = i * (3 * hidden_size_per_attention_head)
-            new_w[new_w_index: new_w_index + (3 * hidden_size_per_attention_head), :] = \
-                torch.cat([
-                    wq[current_index: next_index, :],
-                    wk[current_index: next_index, :],
-                    wv[current_index: next_index, :]
-                ], dim=0)
-        self.record_mapping_info(
-            f"mega-ds:{pname,p.data.shape}<--hf{hf_wq_name,hf_wk_name,hf_wv_name,}  cat q,k,v [{current_index}:{next_index},:]  of q,k,v{wq.shape}"
-        )
-        return new_w
-
-    def _mlphto4h_dense_refactor(self, pname, p, hf_layer):
-        hf_w_gate_name = f"model.layers.{hf_layer}.mlp.gate_proj.weight"
-        hf_w_up_name = f"model.layers.{hf_layer}.mlp.up_proj.weight"
-        w_gate = self.loaded[hf_w_gate_name]
-        w_up = self.loaded[hf_w_up_name]
-
-        hidden_size = w_gate.shape[0]
-        per_partition_size, start_index, end_index = compute_partition_range(
-            hidden_size, self.tp_rank, self.tp_size)
-        new_w = torch.zeros((per_partition_size * 2,
-                             w_gate.shape[1]),
-                            dtype=w_gate.dtype)
-        new_w[:per_partition_size * 2, :] = \
-                torch.cat([
-                    w_gate[start_index:end_index, :],
-                    w_up[start_index:end_index, :]
-                ], dim=0)
-        self.record_mapping_info(
-            f"mega-ds:{pname,p.data.shape}<--hf{hf_w_gate_name,hf_w_up_name}  cat gate,up [{start_index}:{end_index},:]  of gate,up{w_gate.shape}"
-        )
-        return new_w
-
-    def _attn_dense_refactor(self, pname, p, hf_layer, subname):
-        if subname == "self_attention.dense.weight":
-            hf_name = f"model.layers.{hf_layer}.self_attn.o_proj.weight"
-        else:
-            hf_name = f"model.layers.{hf_layer}.mlp.down_proj.weight"
-
-        hf_w = self.loaded[hf_name]
-        hidden_size = hf_w.shape[1]
-        per_partition_size, start_index, end_index = compute_partition_range(
-            hidden_size, self.tp_rank, self.tp_size)
-        new_w = torch.zeros((hf_w.shape[0], per_partition_size), dtype=hf_w.dtype)
-        new_w[:, :per_partition_size] = hf_w[:, start_index:end_index]
-        self.record_mapping_info(
-            f"mega-ds:{pname,p.data.shape}<--hf{hf_name,}  [:,{start_index}:{end_index}]  of {hf_w.shape}"
-        )
-        return new_w
-
-    def _mlphto4h1_refactor(self, pname, p, hf_layer, subname):
-        if subname == "mlp.dense_h_to_4h1.weight":
-            hf_name = f"model.layers.{hf_layer}.mlp.gate_proj.weight"
-        else:
-            hf_name = f"model.layers.{hf_layer}.mlp.up_proj.weight"
-        hf_w = self.loaded[hf_name]
-        hidden_size = hf_w.shape[0]
-        per_partition_size, start_index, end_index = compute_partition_range(
-            hidden_size, self.tp_rank, self.tp_size)
-        new_w = torch.zeros((per_partition_size, hf_w.shape[1]), dtype=hf_w.dtype)
-
-        new_w[:per_partition_size, :] = hf_w[start_index:end_index, :]
-        self.record_mapping_info(
-            f"mega-ds:{pname,p.data.shape}<--hf{hf_name,}  [{start_index}:{end_index},:]  of {hf_w.shape}"
-        )
-        return new_w
-
-    def refactor(self):
-        assert self.is_refactored == False
-        new_w = None
-        for pname, p in self.model.named_parameters():
-            if pname in [
-                    f"{self.mega_emb_wnum}.word_embeddings.weight",
-                    f"{self.mega_lm_head_wnum}.lm_head.weight"
-            ]:
-                new_w = self._embedding_refactor(pname, p)
-            elif pname == f"{self.mega_norm_wnum}.weight":
-                new_w = self._direct_refactor(pname, p)
-            else:
-                mobj = self.decoder_pat.match(pname)
-                layer_num = int(mobj.group(1))
-                subname = mobj.group(2)
-                hf_layer = layer_num - self.offset_num
-                if subname in ["self_attention.query_key_value.weight"]:
-                    new_w = self._qkv_refactor(pname, p, hf_layer)
-                elif subname in ["mlp.dense_h_to_4h.weight"]:
-                    new_w = self._mlphto4h_dense_refactor(pname, p, hf_layer)
-                elif subname in [
-                        "self_attention.dense.weight",
-                        "mlp.dense_4h_to_h.weight"
-                ]:
-                    new_w = self._attn_dense_refactor(pname, p, hf_layer, subname)
-                elif subname in [
-                        "mlp.dense_h_to_4h1.weight",
-                        "mlp.dense_h_to_4h2.weight"
-                ]:
-                    new_w = self._mlphto4h1_refactor()
-                elif subname in [
-                        "input_layernorm.weight",
-                        "post_attention_layernorm.weight"
-                ]:
-                    new_w = self._direct_refactor(pname, p, hf_layer, subname)
-                else:
-                    raise ValueError("Unrecognized weight type")
-            p.data.copy_(new_w)
-            new_w = None
-        self.is_refactored = True
-
-    def record_mapping_info(self, record_msg):
-        self.refactor_weight_list.append(record_msg)
-
-    def inorder_show_record(self):
-        assert self.is_refactored
-        print_rank_0(
-            f"----------------------------mapping list----------------------------")
-        # print dp rank0 tp rank0  records.
-        for pipe_rank in range(mpu.get_pipeline_model_parallel_world_size()):
-            if mpu.get_pipeline_model_parallel_rank() == pipe_rank:
-                if mpu.get_data_parallel_rank(
-                ) == 0 and mpu.get_tensor_model_parallel_rank() == 0:
-                    for record in self.refactor_weight_list:
-                        print(record)
-                torch.distributed.barrier()
-            else:
-                torch.distributed.barrier()
-
-
-def convert_hf_to_mega_ds():
-    """Build the model."""
-    args = get_args()
-    print_rank_0(f'building model ...')
-    see_memory_usage(f"Before Building Model", force=True)
-
-    config = core_transformer_config_from_args(args)
-    with deepspeed.zero.Init(
-            data_parallel_group=mpu.get_data_parallel_group(),
-            remote_device=None if args.remote_device == 'none' else args.remote_device,
-            config_dict_or_path=args.deepspeed_config,
-            enabled=args.zero_stage == 3,
-            mpu=mpu):
-        if args.deepspeed and not args.no_pipeline_parallel:
-            model = GPTModelPipe(config, num_tokentypes=0, parallel_output=True)
-        else:
-            raise NotImplementedError("Not implemented")
-
-    see_memory_usage(f"After Building Model", force=True)
-    if torch.distributed.get_rank() < 2:
-        print(f"{torch.distributed.get_rank()} {model}")
-
-    # load and initialize HF weight dict
-    # print hf weights list & mega-ds weights list
-    hf_ckpt_dir = args.origin_hf_ckpt_dir
-    hf_ckpt_num_of_shards = args.hf_ckpt_num_shards
-    loaded = load_and_print_hf_weight(hf_ckpt_dir, hf_ckpt_num_of_shards)
-    print_distinct_weights(model)
-
-    # refactor weight from hf to mega-ds
-
-    cur_refactor = refactor(model, loaded, args, config)
-    cur_refactor.refactor()
-    cur_refactor.inorder_show_record()
-
-    del loaded
-
-    unwrapped_model = unwrap_model([model], (torchDDP, LocalDDP, Float16Module))
-    optimizer = get_megatron_optimizer(unwrapped_model)
-    opt_param_scheduler = get_optimizer_param_scheduler(optimizer)
-
-    #init model and save
-    print_rank_0(f"before deepspeed init")
-    ds_engine, _, _, _ = deepspeed.initialize(
-        model=model,
-        optimizer=optimizer,
-        args=args,
-        lr_scheduler=opt_param_scheduler,
-        mpu=mpu if args.no_pipeline_parallel else None)
-    print_rank_0(f"after deepspeed init")
-
-    print_rank_0(f"mega-ds checkpoint will be saved in {args.save}")
-    save_checkpoint(0, [ds_engine], optimizer, opt_param_scheduler)
-    print_rank_0(f"save checkpoint completed")
-
-
-if __name__ == "__main__":
-
-    initialize_megatron(extra_args_provider=add_extra_args)
-    convert_hf_to_mega_ds()
diff --git a/toolbox/Megatron-DeepSpeed/tools/linter.py b/toolbox/Megatron-DeepSpeed/tools/linter.py
deleted file mode 100644
index 5b14007666600746341e2b962b7b09e69f3019f0..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/linter.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import os
-import os.path as osp
-import pathlib
-import subprocess
-
-
-def recursively_lint_files():
-    """Recursively lint all python files in chosen subdirectories of megatron-lm"""
-
-    try:
-        import autopep8
-    except ModuleNotFoundError:
-        print("Please first install autopep8 via `pip install autopep8`")
-        return
-
-    # get all python file paths from top level directory
-    file_dir = str(pathlib.Path(__file__).parent.absolute())
-    working_dir = osp.join(file_dir, os.pardir)
-    all_py_paths = set(os.path.join(working_dir, fname)
-                       for fname in os.listdir(working_dir) if ".py" in fname)
-
-    # get all python file paths from chosen subdirectories
-    check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks']
-    for sub_dir in check_dirs:
-        for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)):
-            all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname))
-
-    print("Linting the following: ")
-    for py_path in all_py_paths:
-        print(py_path)
-        command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path)
-        subprocess.check_call(command)
-
-
-if __name__ == "__main__":
-    recursively_lint_files()
diff --git a/toolbox/Megatron-DeepSpeed/tools/merge_datasets.py b/toolbox/Megatron-DeepSpeed/tools/merge_datasets.py
deleted file mode 100644
index 63419350d2d36c3295ae265a64485ec58df41962..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/merge_datasets.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import os
-import sys
-import json
-import argparse
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir)))
-
-from megatron_ds.data import indexed_dataset
-
-
-def main(args):
-
-    prefixes = set()
-    for basename in os.listdir(args.input):
-        prefix, ext = os.path.splitext(basename)
-
-        if prefix in prefixes:
-            continue
-
-        if not os.path.isfile(os.path.join(args.input, basename)):
-            continue
-
-        ext_pair = '.bin' if ext == '.idx' else '.idx'
-        assert os.path.isfile(os.path.join(args.input, prefix) + ext_pair), \
-               f'ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}'
-
-        prefixes.add(prefix)
-
-    builder = None
-    for prefix in sorted(prefixes):
-        if builder is None:
-            dataset = indexed_dataset.make_dataset(os.path.join(args.input, prefix), 'infer')
-
-            if isinstance(dataset, indexed_dataset.MMapIndexedDataset):
-                builder = indexed_dataset.MMapIndexedDatasetBuilder(args.output_prefix + '.bin', dtype=dataset._index.dtype)
-            else:
-                builder = indexed_dataset.IndexedDatasetBuilder(args.output_prefix + '.bin')
-
-            del dataset
-
-        builder.merge_file_(os.path.join(args.input, prefix))
-
-    builder.finalize(args.output_prefix + '.idx')
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-
-    group = parser.add_argument_group(title='input data')
-    group.add_argument('--input', type=str, required=True,
-                       help='Path to directory containing all document files to merge')
-
-    group = parser.add_argument_group(title='output data')
-    group.add_argument('--output-prefix', type=str, required=True,
-                       help='Path to binary output file without suffix')
-
-    args = parser.parse_args()
-
-    assert os.path.isdir(args.input), \
-           f'ERROR: {args.input} is not a directory or does not exist'
-
-    assert os.path.isdir(os.path.dirname(args.output_prefix)), \
-           f'ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist'
-
-    main(args)
-
diff --git a/toolbox/Megatron-DeepSpeed/tools/openwebtext/README.md b/toolbox/Megatron-DeepSpeed/tools/openwebtext/README.md
deleted file mode 100644
index 7e6f10a0a7346a09d9586919dc239e381158f8ce..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/openwebtext/README.md
+++ /dev/null
@@ -1,59 +0,0 @@
-The following steps show how to prepare training dataset to train the mode.
-
-# Libraries to install
-
-```
-    pip install ftfy langdetect numpy torch pandas nltk sentencepiece boto3 tqdm regex bs4 newspaper3k htmlmin tldextract 
-    git clone https://github.com/mattilyra/LSH
-    cd LSH
-    python setup.py install
-``` 
-
-# Download the dataset
-
-1. Download the deduplicated URLs from [jcpeterson](https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ!cc4RgQQZ)
-2. Remove blacklisted URLs.
-```
-python blacklist_urls.py <path to the dowloaded deduplicated URLs> <filename for clean urls. e.g. clean_urls.txt>
-```
-3. Download the content from the clean urls with [openwebtext's utilities](https://github.com/eukaryote31/openwebtext/blob/master/download.py). 
-
-4. Merge the contents into one loose json file with 1 json per newline of the format `{'text': text, 'url': unique_url}`. It is important for the url to be unique.
-
-# Prepare the data for GPT training:
-
-1. Perform ftfy, english detection and remove documents with less than 128 tokens. This step can be sharded and run on shards.
-```
-python cleanup_dataset.py <input data file> <output cleaned data filename>
-```
-Additional cleanup (e.g. remove documents less than 512 characters or dataset specific cleaning like stories, realnews datasets) can be done using `cleanup_fix_dataset.py`. More details can be found by running `python cleanup_fix_dataset.py --help`.
-2. Using LSH, find possible duplicates and store then in a file for later processing. The code supports saving and loading fingerprints for recurrent deduplications, and is also multithreaded for faster processing. More details are can be found by `python find_duplicate.py --help`.
-```
-python find_duplicates.py --inputs <pairlist list of input cleaned data files and keys, e.g. cc.json cc_id news.json news_id> --output <output possible duplicate urls filename>
-```
-3. Based on similarity measure defind inside function `is_similar` (default: 0.9), group urls that are similar. Basically, for each group, only one url we should keep and remove the rest.
-```
-python group_duplicate_urls.py <possible duplicate urls file> <output file containing similar urls>
-```
-4. Remove similar documents that were detected in the last step.
-```
-python remove_group_duplicates.py <file containing simialr documents> <cleaned data file> <outputfile containing deduplicate data>
-```
-
-5. Shuffle the dataset.
-```
-shuf <cleaned deduped data file> -o train_data.json
-```
-
-# Deduplicating ngrams
-
-To deduplicate the downstream tasks (e.g. lambada, squad) from the training dataset, we run the following command.
-
-```
-python filter_ngrams.py --tasks <name of the task, e.g. lambada, squad> --dedup-dataset <training dataset to deduplicate> <json key> --output <output training dataset>
-```
-We use 13-grams by default for the deduplication. When we find a 13-gram match in a training document, we split the document into two pieces and remove the 13-gram along with 200 characters from the both side of the 13-gram. We also remove any splitted document with less than 200 characters or if a document got splitted more than 10 times. These parameters can be changed using corresponding arguments.
-
-Only for the lambada task, we need to provide the path, `--lambada-path <path of the lambada test data>`.
-
-Several other features (e.g. save and load dictionary) have been added, look at `python filter_ngrams.py --help` for details.
diff --git a/toolbox/Megatron-DeepSpeed/tools/openwebtext/add_id.py b/toolbox/Megatron-DeepSpeed/tools/openwebtext/add_id.py
deleted file mode 100644
index 7bea7ee8a2cdddb8e2e2490b114a6d9083ca6000..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/openwebtext/add_id.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import argparse
-import json
-import os
-import time
-
-"""
-This code adds id to each json object in a json file. User can add prefix
-to the ids.
-"""
-
-if __name__ == '__main__':
-
-    print('parsing the arguments ...')
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input-file', type=str, default=None, help='Input'\
-        ' json file where id needs to be added')
-    parser.add_argument('--output-file', type=str, default=None, help=\
-        'Output file name with id')
-    parser.add_argument('--id-prefix', type=str, default=None, help=\
-        'Id prefix')
-    parser.add_argument('--log-interval', type=int, default=100,
-                       help='Log interval')
-    args = parser.parse_args()
-
-    print('Adding ids to dataset ...')
-
-    f_input = open(args.input_file, 'r', encoding='utf-8')
-    f_output = open(args.output_file, 'wb')
-
-    unique_ids = 1
-    start_time = time.time()
-    for row in f_input:
-        each_row = json.loads(row)
-        adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids))
-        each_row['adlr_id'] = adlr_id_string
-        myjson = json.dumps(each_row, ensure_ascii=False)
-
-        f_output.write(myjson.encode('utf-8'))
-        f_output.write('\n'.encode('utf-8'))
-
-        if unique_ids % args.log_interval == 0:
-            print('    processed {:9d} documents in {:.2f} seconds ...'.format( \
-                    unique_ids, time.time() - start_time), flush=True)
-
-        unique_ids += 1
-
-    # Close the file.
-    f_input.close()
-    f_output.close()
-    
-    print('done :-)', flush=True)
diff --git a/toolbox/Megatron-DeepSpeed/tools/openwebtext/blacklist_urls.py b/toolbox/Megatron-DeepSpeed/tools/openwebtext/blacklist_urls.py
deleted file mode 100644
index bf68840b6d399ca6bb45960142d323900eb0076a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/openwebtext/blacklist_urls.py
+++ /dev/null
@@ -1,299 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-
-import glob
-import re
-import time
-import tldextract
-import sys
-
-
-# List of the domains to blacklist.
-domain_blacklist = set([
-    '500px',
-    'aapks',
-    'akamaihd',
-    'amazon',
-    'apple',
-    'artifactfire',
-    'artstation',
-    'awwni',
-    'bandcamp',
-    'battleforthenet',
-    'coinscalendar',
-    'dailymotion',
-    'deviantart',
-    'discord',
-    'discordapp',
-    'dlapkandroid',
-    'dropbox',
-    'e621',
-    'ebay',
-    'edealinfo',
-    'erome',
-    'eroshare',
-    'explosm',
-    'facebook',
-    'fbcdn',
-    'flickr',
-    'furaffinity',
-    'futhead',
-    'gatopardo',
-    'gfycat',
-    'gifsound',
-    'gifsoup',
-    'giphy',
-    'github',
-    'google',
-    'gunprime',
-    'gyazo',
-    'hotdealstar',
-    'imagefap',
-    'imageshack',
-    'imgflip',
-    'imgur',
-    'instagram',
-    'karmadecay',
-    'kryptocal',
-    'kym-cdn',
-    'liveleak',
-    'livememe',
-    'lmgtfy',
-    'magaimg',
-    'memegenerator',
-    'minorplanetcenter',
-    'minus',
-    'mobafire',
-    'morejpeg',
-    'nocookie',
-    'pcpartpicker',
-    'photobucket',
-    'pinimg',
-    'pinterest',
-    'pixiv',
-    'pornhub',
-    'prntscr',
-    'puu',
-    'qkme',
-    'quickmeme',
-    'radd',
-    'redd',
-    'reddit',
-    'reddit-stream',
-    'redditlog',
-    'redditmedia',
-    'reddituploads',
-    'redtube',
-    'reupp',
-    'reverb',
-    'roanoke',
-    'rollingstone',
-    'sli',
-    'soundcloud',
-    'soundgasm',
-    'spankbang',
-    'spotify',
-    'strawpoll',
-    'streamable',
-    'timeanddate',
-    'tinypic',
-    'touhouradio',
-    'tumblr',
-    'twimg',
-    'twitch',
-    'twitter',
-    'vid',
-    'vimeo',
-    'vine',
-    'vkaao',
-    'vocaroo',
-    'voyagefusion',
-    'walmart',
-    'wciu',
-    'wikimedia',
-    'wikipedia',
-    'xhamster',
-    'xkcd',
-    'xvideos',
-    'youtu',
-    'youtube',
-    'youtubedoubler',
-    'ytimg',
-    'zillexplorer',
-])
-
-def domain_is_in_blacklist(url):
-    domain = tldextract.extract(url).domain
-    return domain in domain_blacklist
-
-
-# List of extentions to blacklist.
-extentions_blacklist = (
-    '.3gp',
-    '.7z'
-    '.ai',
-    '.aif',
-    '.apk',
-    '.app',
-    '.avi',
-    '.bin',
-    '.bmp',
-    '.bz2',
-    '.css',
-    '.csv',
-    '.dat',
-    '.deb',
-    '.dmg',
-    '.doc',
-    '.docx',
-    '.exe',
-    '.gif',
-    '.gifv',
-    '.gz',
-    '.iso',
-    '.jar',
-    '.jpeg',
-    '.jpg',
-    '.js',
-    '.log',
-    '.mid',
-    '.midi',
-    '.mkv',
-    '.mov',
-    '.mp3',
-    '.mp4',
-    '.mpeg',
-    '.mpg',
-    '.ogg',
-    '.ogv',
-    '.otf',
-    '.pdf',
-    '.pkg',
-    '.png',
-    '.pps',
-    '.ppt',
-    '.pptx',
-    '.psd',
-    '.py',
-    '.qt',
-    '.ram',
-    '.rar',
-    '.sql',
-    '.svg',
-    '.swf',
-    '.tar.gz',
-    '.tar',
-    '.tgz',
-    '.tiff',
-    '.ttf',
-    '.txt',
-    '.wav',
-    '.webm',
-    '.wma',
-    '.wmv',
-    '.xls',
-    '.xlsx',
-    '.xml',
-    '.xz',
-    '.zip',
-)
-
-def extention_is_in_blacklist(url):
-    if url.split('?')[0].lower().endswith(extentions_blacklist):
-        return True
-    return False
-
-
-# Malformed urls.
-# This function is adapted from:
-#   https://stackoverflow.com/questions/7160737/python-how-to-validate-a-url-in-python-malformed-or-not
-url_regex = re.compile(
-    r'^(?:http)s?://' # http:// or https://
-    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
-    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
-    r'(?::\d+)?' # optional port
-    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
-def url_is_malformed(url):
-    return re.match(url_regex, url) is None
-
-
-def print_progress(prefix, start_time, urls_counter,
-                   domain_blacklist_counter,
-                   extention_blacklist_counter,
-                   short_url_counter, malformed_url_counter,
-                   duplicate_url_counter):
-    string = prefix + ' | '
-    string += 'time elapsed (s): {:.2f} | '.format(time.time() - start_time)
-    string += 'number of urls: {} | '.format(urls_counter)
-    string += 'domain blacklisted: {} | '.format(domain_blacklist_counter)
-    string += 'extention blacklisted: {} | '.format(extention_blacklist_counter)
-    string += 'short urls (<=8): {} | '.format(short_url_counter)
-    string += 'malformed urls: {} | '.format(malformed_url_counter)
-    string += 'duplicate urls: {}'.format(duplicate_url_counter)
-    print(string, flush=True)
-
-
-if __name__ == '__main__':
-
-
-    print('remove blacklisted urls ..')
-
-    # Path to the url files.
-    path = sys.argv[1]
-    # Output url file.
-    output = sys.argv[2]
-
-    # Get the list of url files.
-    files = glob.glob(path + '/*.txt')
-    print('> found {} files'.format(len(files)))
-
-    urls = set()
-    urls_counter = 0
-    domain_blacklist_counter = 0
-    extention_blacklist_counter = 0
-    short_url_counter = 0
-    malformed_url_counter = 0
-    duplicate_url_counter = 0
-    start_time = time.time()
-    for filename in files:
-        with open(filename, 'r') as f:
-            for line in f:
-                url = line.strip()
-                urls_counter += 1
-                if domain_is_in_blacklist(url):
-                    print('[DOMAIN BLACKLIST]: {}'.format(url), flush=True)
-                    domain_blacklist_counter += 1
-                elif extention_is_in_blacklist(url):
-                    print('[EXTENTION BLACKLIST]: {}'.format(url), flush=True)
-                    extention_blacklist_counter += 1
-                elif len(url) <= 8:
-                    print('[SHORT URL]: {}'.format(url), flush=True)
-                    short_url_counter += 1
-                elif url_is_malformed(url):
-                    print('[MALFORMED URL]: {}'.format(url), flush=True)
-                    malformed_url_counter += 1
-                elif url in urls:
-                    print('[DUPLICATE URL]: {}'.format(url), flush=True)
-                    duplicate_url_counter += 1
-                else:
-                    urls.add(url)
-                if urls_counter % 100000 == 0:
-                    print_progress('PROGRESS', start_time, urls_counter,
-                                   domain_blacklist_counter,
-                                   extention_blacklist_counter,
-                                   short_url_counter, malformed_url_counter,
-                                   duplicate_url_counter)
-
-    print_progress('FINAL', start_time, urls_counter,
-                   domain_blacklist_counter,
-                   extention_blacklist_counter,
-                   short_url_counter, malformed_url_counter,
-                   duplicate_url_counter)
-
-    # Write the final set of urls.
-    print('> writing cleaned up url list to {}'.format(output))
-    with open(output, 'w') as f:
-        for url in urls:
-            f.write(url + '\n')
-
-    print('done :-)')
diff --git a/toolbox/Megatron-DeepSpeed/tools/openwebtext/cleanup_dataset.py b/toolbox/Megatron-DeepSpeed/tools/openwebtext/cleanup_dataset.py
deleted file mode 100644
index 3a2eba4e8463bedbbc09ecca902c984dd2fd5314..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/openwebtext/cleanup_dataset.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-
-import ftfy
-import json
-from langdetect import detect
-import numpy as np
-import time
-import os
-import sys
-
-from tokenizer import Tokenizer
-
-MIN_DOCUMENT_LENGHT = 128
-
-
-def print_progress(prefix, start_time, num_docs, num_fixed_text,
-                   num_non_english_docs, chars_non_english_docs,
-                   num_small_docs, chars_small_docs):
-
-    string = prefix + ' | '
-    string += 'elapsed time: {:.2f} | '.format(time.time() - start_time)
-    string += 'documents: {} | '.format(num_docs)
-    string += 'fixed text: {} | '.format(num_fixed_text)
-    string += 'non-english: {} | '.format(num_non_english_docs)
-    string += 'non-english chars: {} | '.format(chars_non_english_docs)
-    string += 'small docs: {} | '.format(num_small_docs)
-    string += 'small docs chars: {}'.format(chars_small_docs)
-    print(string, flush=True)
-
-
-def filter_corpus(filename, out_filename, print_interval=10000):
-
-    print(' > filtering {}'.format(filename))
-
-    tokenizer = Tokenizer(cache_dir='./cache')
-
-    num_docs = 0
-    num_written_docs = 0
-    num_small_docs = 0
-    num_fixed_text = 0
-    num_non_english_docs = 0
-    chars_non_english_docs = 0
-    chars_small_docs = 0
-    start_time = time.time()
-    with open(out_filename, 'wb') as f:
-        with open(filename, 'r') as fin:
-            for line in fin:
-                try:
-                    num_docs += 1
-                    myjson = json.loads(line)
-                    # Fix text
-                    text = ftfy.fix_text(myjson['text'])
-                    if text != myjson['text']:
-                        num_fixed_text += 1
-                    myjson['text'] = text
-                    # Detect language.
-                    if detect(text) != 'en':
-                        print('[non-english text]', myjson)
-                        num_non_english_docs += 1
-                        chars_non_english_docs += len(text)
-                        continue
-                    # On average each token is 5 characters so 8 is an
-                    # upper bound.
-                    if len(text) < (8 * MIN_DOCUMENT_LENGHT):
-                        tokens = tokenizer.tokenize_document(text)
-                        if len(tokens) < MIN_DOCUMENT_LENGHT:
-                            print('[small document, skipping]:', myjson)
-                            num_small_docs += 1
-                            chars_small_docs += len(text)
-                            continue
-                    myjson = json.dumps(myjson, ensure_ascii=False)
-                    f.write(myjson.encode('utf-8'))
-                    f.write('\n'.encode('utf-8'))
-                    num_written_docs += 1
-                    if num_docs % print_interval == 0:
-                        print_progress('[PROGRESS]', start_time, num_docs,
-                                       num_fixed_text, num_non_english_docs,
-                                       chars_non_english_docs,
-                                       num_small_docs, chars_small_docs)
-                except Exception as e:
-                    print('    skipping ', line, e)
-
-    print_progress('[FINAL]', start_time, num_docs,
-                   num_fixed_text, num_non_english_docs,
-                   chars_non_english_docs,
-                   num_small_docs, chars_small_docs)
-
-
-if __name__ == '__main__':
-
-    print('building gpt2 dataset ...')
-
-    input_filename = sys.argv[1]
-    output_filename = sys.argv[2]
-
-    print('will be reading {}'.format(input_filename))
-    print('and will write the results to {}'.format(output_filename))
-
-    filter_corpus(input_filename, output_filename)
-
-
diff --git a/toolbox/Megatron-DeepSpeed/tools/openwebtext/cleanup_fix_dataset.py b/toolbox/Megatron-DeepSpeed/tools/openwebtext/cleanup_fix_dataset.py
deleted file mode 100644
index c7f6cf2dbb0e5294d79bd35e5a2c2b99d4102f8f..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/openwebtext/cleanup_fix_dataset.py
+++ /dev/null
@@ -1,178 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""
-Filter and clean documents:
-Capable to clean docs with less than 512 characters, less than
-256 characters and contains javascript, fix text and dataset specific
-cleaning like stories and realnews datasets.
-Program arguments have the details.
-"""
-
-import argparse
-from functools import partial
-import glob
-import ftfy
-import json
-from langdetect import detect
-import multiprocessing
-import os
-from pathlib import Path
-import re
-import time
-
-def process_doc(json_line, args):
-
-    # Read the line.
-    document = json.loads(json_line)
-    text = document['text']
-
-    output = {'remove_512': False, 'remove_256_javascript': False, \
-        'remove_512_non_english': False, 'ftfy_fix_text': False, \
-        'general_cleaning': False}
-
-    try:
-        # Reomove all docs with less than 512 characters
-        if "remove_512" in args.tasks:
-            if len(text) < 512:
-                output['remove_512'] = True
-                return output, text, document, True
-
-        # Remove docs if less than 256 character length and contains Javascript
-        if "remove_256_javascript" in args.tasks:
-            if len(text) < 256 and 'javascript' in text.lower():
-                output['remove_256_javascript'] = True
-                return output, text, document, True
-
-        # Remove docs < 512 and nonenglish
-        if "remove_512_non_english" in args.tasks:
-            if len(text) < 512 and detect(text) != 'en':
-                output['remove_512_non_english'] = True
-                return output, text, document, True
-
-        # Fix the text using ftfy, don't remove the text, hence return False
-        if "ftfy_fix_text" in args.tasks:
-            fixed_text = ftfy.fix_text(text)
-            output['ftfy_fix_text'] = True
-            return output, fixed_text, document, False
-
-        # Cleaning extra spaces and newlines
-        if "general_cleaning" in args.tasks:
-            cleaned_text = re.sub(r"  +|\b\n+ |\b\n+", " ", text)
-            #cleaned_text = re.sub(r"\n\n+", "\n\n", text) # used this for Gutenberg dataset
-            #cleaned_text = re.sub(r"\n", "\n\n", text) # Used this for realnews
-
-            # stories datasets
-            #cleaned_text = re.sub(r" \'", "'", text)
-            #cleaned_text = re.sub(r" \!", "!", cleaned_text)
-            #cleaned_text = re.sub(r" \.", ".", cleaned_text)
-            #cleaned_text = re.sub(r" \?", "?", cleaned_text)
-            #cleaned_text = re.sub(r" - ", "-", cleaned_text)
-            ##cleaned_text = re.sub(r"\" ", "\"", cleaned_text)
-            #cleaned_text = re.sub(r" @ ", "@", cleaned_text)
-
-            output['general_cleaning'] = True
-            return output, cleaned_text, document, False
-
-    except Exception as e:
-        print('Error: *************************\n{}\ntext: {}'.format(e, \
-            text), flush=True)
-        return output, text, document, True
-
-    # don't remove
-    return output, text, document, False
-
-
-def process_set(args, input_file, output_f_cleaned, output_f_filtered):
-
-    print(' > working on {} ...'.format(input_file), flush=True)
-    
-    num_docs = num_remove_512 = num_remove_java = num_remove_512_non_english \
-        = num_ftfy_fix_text = num_general_cleaning = 0
-
-    # Output file and counters.
-    output_cleaned = open(output_f_cleaned, 'wb')
-    output_filtered = open(output_f_filtered, 'wb')
-
-    start_time = time.time()
-
-    # Setup multi-processing.
-    num_workers = 40
-    fin = open(input_file, 'r', encoding='utf-8')
-    pool = multiprocessing.Pool(num_workers)
-    process_doc_partial = partial(process_doc, args=args)
-    processed_docs = pool.imap(process_doc_partial, fin, 500)
-
-    # Process documents.
-    for output, text, document, to_filter in processed_docs:
-        num_docs += 1
-
-        num_remove_512 += 1 if output['remove_512'] else 0
-        num_remove_java += 1 if output['remove_256_javascript'] else 0
-        num_remove_512_non_english += 1 if output['remove_512_non_english'] \
-            else 0
-        num_ftfy_fix_text += 1 if output['ftfy_fix_text'] else 0
-        num_general_cleaning += 1 if output['general_cleaning'] else 0
-
-        document['text'] = text
-        myjson = json.dumps(document, ensure_ascii=False)
-
-        if to_filter:
-            output_filtered.write(myjson.encode('utf-8'))
-            output_filtered.write('\n'.encode('utf-8'))
-        else:
-            output_cleaned.write(myjson.encode('utf-8'))
-            output_cleaned.write('\n'.encode('utf-8'))
-
-        if num_docs % args.log_interval == 0:
-            print('    processed {:9d} documents in {:.2f} seconds ...'.format(
-                num_docs, time.time() - start_time), flush=True)
-
-    # Close the file.
-    output_cleaned.close()
-    output_filtered.close()
-    fin.close()
-
-    # Print stats.
-    print('  >> total docs: {} remove_512 {} remove_256_javascript {} '\
-        'remove_512_non_english {} ftfy_fix_text {} general_cleaning {}'.\
-        format(num_docs, num_remove_512, num_remove_java,\
-        num_remove_512_non_english, num_ftfy_fix_text, \
-        num_general_cleaning), flush=True)
-
-if __name__ == '__main__':
-
-
-    print('parsing the arguments ...')
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input-files', nargs = '*', required=True, default=\
-                        None, help = 'Input json files that needs to be'\
-                        ' cleaned')
-    parser.add_argument('--tasks', nargs = '*', required=True, default=None,\
-                        help = 'Tasks to perform on the input files, ' \
-                        'such as remove_512, remove_256_javascript, ' \
-                        'remove_512_non_english, ftfy_fix_text, and ' \
-                        'general_cleaning. 256 or 512 means the number' \
-                        ' of characters.')
-
-    parser.add_argument('--output-path', type=str, default=None,
-                       help='Directory where the output should go')
-    parser.add_argument('--log-interval', type=int, default=100,
-                       help='Log interval')
-
-    args = parser.parse_args()
-
-    print('cleanup dataset ...')
-
-    for input_file in args.input_files:
-        input_filename, input_filename_ext = os.path.splitext(Path(input_file)\
-            .name)
-
-        output_f_cleaned = os.path.join(args.output_path, input_filename + \
-            "_cleaned" + input_filename_ext)
-        output_f_filtered = os.path.join(args.output_path, input_filename + \
-            "_filtered" + input_filename_ext)
-
-        process_set(args, input_file, output_f_cleaned, output_f_filtered)
-
-    print('done :-)', flush=True)
diff --git a/toolbox/Megatron-DeepSpeed/tools/openwebtext/filter_ngrams.py b/toolbox/Megatron-DeepSpeed/tools/openwebtext/filter_ngrams.py
deleted file mode 100644
index 7327a16541e102d9344b7486f9b4fe0c0844778d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/openwebtext/filter_ngrams.py
+++ /dev/null
@@ -1,479 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""
-Deduplicate downstream tasks from training dataset. 13-grams have been used.
-All split documents with less than 200 characters got filtered. Any document
-with more than 10 splits got filtered as well.
-"""
-
-import argparse
-from functools import partial
-import json
-import multiprocessing
-import nltk
-import pickle
-import re
-import string
-import sys
-import time
-
-def get_words(text):
-    # get all the lowercase words from text
-    words, positions = [], []
-    for match in re.finditer(r'\w+', text.lower()):
-        words.append(match.group(0))
-        positions.append(match.start())
-    return words, positions
-
-# splits the text
-def split_text(text, start_position, remove_char_each_side, seq):
-    # first part of the text
-    punctuations = ".!?"
-    pos = start_position - remove_char_each_side
-    text_first = ""
-    while pos > 0 and not text[pos] in punctuations:
-        pos -= 1
-    if pos > 0:
-        text_first = text[0:pos+1]
-
-    # add length of seq and remove_char_each_side
-    pos = start_position + len(seq) + remove_char_each_side
-
-    # last part of the text
-    text_second = ""
-    while pos < len(text) and not text[pos] in punctuations:
-        pos += 1
-    if pos + 1 < len(text):
-        text_second = text[pos+1:len(text)]
-
-    return text_first, text_second
-
-def check_and_clean_text(args, words, ngrams, text, start_position, \
-    text_buf_ngram_free, text_buf, local_ngram):
-
-    seq = " ".join(words)
-    if seq in ngrams:
-        print(" [matched]: {}".format(seq), flush=True)
-
-        if args.get_ngram_freq_only:
-            # increase freq of this seq and then only consider the later part
-            # of the text for further processing
-            if seq in local_ngram:
-                local_ngram[seq] += 1
-            else:
-                local_ngram[seq] = 1
-            #print(" [increased]: {} {}".format(seq, ngrams[seq]), flush=True)
-            if (start_position + len(seq) + 1) < len(text):
-                text_buf.append(text[start_position + len(seq) + 1:len(text)])
-            return False            
-
-        # split the text
-        text_first, text_second = split_text(text, start_position, \
-            args.remove_char_each_side, seq)
-
-        # first part of ngrams free
-        if len(text_first) > args.filter_text_char_len:
-            text_buf_ngram_free.append(text_first)
-
-        # add second part for further processing
-        if len(text_second) > args.filter_text_char_len:
-            text_buf.append(text_second)
-
-        return False # not ngram free
-
-    # ngram free
-    return True
-
-
-def free_ngram(line, args, key, ngrams, ngrams_freq_sorted):
-    # remove all the ngrams
-
-    try:
-        myjson = json.loads(line)
-        text_buf = [myjson[key]]
-    except Exception as e:
-        print("Error: {}".format(e), flush=True)
-        text_buf = []
-
-    text_buf_ngram_free = []
-    local_ngram = {}
-    while len(text_buf) > 0:
-
-        # get the first one from the buffer
-        text = text_buf.pop(0)
-        words, positions = get_words(text)
-        
-        ngram_free = True
-        # find each max n-grams and check dictionary
-        for i in range(len(words) - args.max_ngram_size + 1):
-            check_ngram_free = check_and_clean_text(args, words[i:\
-                i+args.max_ngram_size], ngrams, text, positions[i], \
-                text_buf_ngram_free, text_buf, local_ngram)
-
-            # the seq is ngram free? if yes, break
-            if not check_ngram_free:
-                ngram_free = False
-                break
-
-            # if max ngrams doesn't match, check if any other lower n-grams
-            # within max ngram macthes
-            for ngram_len, _ in ngrams_freq_sorted:
-                check_ngram_free = check_and_clean_text(args, words[i:\
-                    i+ngram_len], ngrams, text, positions[i], \
-                    text_buf_ngram_free, text_buf, local_ngram)
-
-                # same check as above
-                if not check_ngram_free:
-                    ngram_free = False
-                    break
-
-            # check break from lower than max ngram loop above
-            if not ngram_free:
-                break
-
-        # for the last max n-gram, check all the lower ngrams in it
-        if ngram_free and len(words) - args.max_ngram_size > 0:
-            # get the last words of the lax max ngram
-            last_seq_words = words[(len(words)-args.max_ngram_size):len(words)]
-            last_seq_start_position = len(words) - args.max_ngram_size
-
-            # check all n-grams lower than the max
-            for pos, (ngram_len, _) in enumerate(ngrams_freq_sorted):
-
-                # ignore the max ngram as has been considered already
-                if ngram_len == args.max_ngram_size:
-                    continue
-
-                # find each ngram of ngram_len in max n-grams and check
-                for i in range(len(last_seq_words) - ngram_len + 1):
-                    check_ngram_free = check_and_clean_text(args, \
-                        last_seq_words[i:i+ngram_len], ngrams, text,\
-                        positions[last_seq_start_position+i], \
-                        text_buf_ngram_free, text_buf, local_ngram)
-
-                    if not check_ngram_free:
-                        ngram_free = False
-                        break
-
-                if not ngram_free:
-                    break
-
-        # texts are ngram free
-        if ngram_free and not args.get_ngram_freq_only:
-            text_buf_ngram_free.append(text)
-
-    # check if the text has only been trimmed
-    trimmed = 0
-    if not args.get_ngram_freq_only and len(text_buf_ngram_free) == 1 and \
-        len(text_buf_ngram_free[0]) < len(myjson[key]):
-        trimmed = 1
-
-    return text_buf_ngram_free, trimmed, myjson, local_ngram
-
-# insert word sequence into dictionary
-def insert_dict(words, ngrams, pos):
-    seq = " ".join(words)
-    if seq not in ngrams:
-        ngrams[seq] = 0
-        #ngrams[seq] = pos
-
-# insert each ngram from text into the ngrams dictionary
-def compute_ngrams_insert_dict(args, text, ngrams):
-    words, positions = get_words(text)
-    if len(words) < args.min_ngram_size:
-        return
-
-    if len(words) < args.max_ngram_size:
-        insert_dict(words, ngrams, positions[0])
-
-    for i in range(len(words) - args.max_ngram_size+1):
-        insert_dict(words[i:i+args.max_ngram_size], ngrams, positions[i])
-
-
-# Build ngrams for the lambada dataset
-def process_task_lambda(args, task_file, ngrams):
-    print(' reading from {} and computing ngrams'.format(task_file))
-    with open(task_file, 'r') as f:
-        for line in f:
-            try:
-                myjson = json.loads(line)
-                text = myjson['text']
-                compute_ngrams_insert_dict(args, text, ngrams)
-            except Exception as e:
-                print('Error:', e)
-    print(" Entities in ngrams {}".format(len(ngrams)), flush=True)
-
-
-# Build ngrams for the dataset of the given task
-def process_task(args, task_name, ngrams):
-
-    print(' reading from {} and computing ngrams'.format('import datasets'))
-    print(" Current entities in ngrams {}".format(len(ngrams)), flush=True)
-    # using validation/test data from datasets
-    from datasets import load_dataset
-
-    entities_in_ngrams = len(ngrams)
-
-    # load the dataset
-    if task_name == 'squad':
-        dataset = load_dataset('squad_v2', split='validation')
-    elif task_name == 'natural_questions':
-        dataset = load_dataset('natural_questions', split='validation')
-    elif task_name == 'triviaqa':
-        dataset = load_dataset('trivia_qa', 'unfiltered', split='test')
-    elif task_name == 'webqa':
-        dataset = load_dataset('web_questions', split='test')
-    elif task_name == 'race':
-        dataset = load_dataset('race', 'all', split='test')
-    elif task_name == 'drop':
-        dataset = load_dataset('drop', split='validation')
-    elif task_name == 'coqa':
-        dataset = load_dataset('coqa', split='validation')
-    elif task_name == 'piqa':
-        dataset = load_dataset('piqa', split='test')
-    else:
-        print("Invalid task name: {}".format(task_name), flush=True)
-        return
-
-    # read the dataset and add to ngrams
-    for line in dataset:
-        try:
-            if task_name in ['squad', 'triviaqa', 'webqa', 'race', 'drop']:
-                text = line['question']
-                compute_ngrams_insert_dict(args, text, ngrams)
-            elif task_name == 'natural_questions':
-                text = line['question']['text']
-                compute_ngrams_insert_dict(args, text, ngrams)
-            elif task_name == 'coqa':
-                all_questions = line['questions']
-                for question in all_questions:
-                    compute_ngrams_insert_dict(args, question, ngrams)
-            elif task_name == 'piqa':
-                text = line['goal']
-                compute_ngrams_insert_dict(args, text, ngrams)
-        except Exception as e:
-            print('Error:', e)
-
-    print(" After task {} entities in ngrams {}, added {}".format(task_name, \
-            len(ngrams), len(ngrams) - entities_in_ngrams), flush=True)
-
-def compute_tasks_ngrams(args, ngrams):
-    start_time = time.time()
-    for _, task_name in enumerate(args.tasks):
-        print('Task: {}'.format(task_name), flush=True)
-        if task_name == 'lambada':
-            assert args.lambada_path is not None
-            process_task_lambda(args, args.lambada_path, ngrams)
-        else:
-            process_task(args, task_name, ngrams)
-    print(" Taken time to compute ngrams {:.2f}".format(time.time() - \
-        start_time), flush=True)
-
-def compute_ngram_freq_sorted(args, ngrams):
-    ngrams_freq = {}
-    for ngram_key in ngrams.keys():
-        length = len(ngram_key.split())
-        ngrams_freq[length] = ngrams_freq[length] + 1 if length in \
-            ngrams_freq else 1
-
-    ngrams_freq_sorted = sorted(ngrams_freq.items(), key=lambda item: item[0])
-    print(" Ngram frequencies: {}".format(ngrams_freq_sorted), flush=True)
-    print(" Entities in ngrams {} min_ngram_size {} max_ngram_size {}".format(\
-            len(ngrams), ngrams_freq_sorted[0][0], ngrams_freq_sorted[len(\
-            ngrams_freq_sorted) -1 ][0]), flush=True)
-    return ngrams_freq_sorted
-
-def get_ngrams_below_threshold(args, ngrams, ngrams_below_threshold, \
-    dedup_file, dedup_key, ngrams_freq_sorted):
-
-    start_time = time.time()
-    # get the ngrams frequency
-    args.get_ngram_freq_only = True
- 
-    # Open the large file to process in parallel
-    num_workers = args.num_threads 
-    pool = multiprocessing.Pool(num_workers)
-    fin = open(dedup_file, 'r', encoding='utf-8')
-    free_ngram_abt_partial=partial(free_ngram, args=args, key=dedup_key, \
-        ngrams=ngrams, ngrams_freq_sorted=ngrams_freq_sorted)
-    free_ngrams_abt = pool.imap(free_ngram_abt_partial, fin, 500)
- 
-    counter = 0
-    for _, _, _, local_ngram in free_ngrams_abt:
-        counter += 1
-        if counter % 1000 == 0:
-            print(' [compute_stat]> processed {} documents in {:.2f} seconds ...'.
-                    format(counter, time.time() - start_time), flush=True)
-        for local_key in local_ngram:
-            if local_key in ngrams:
-                ngrams[local_key] += 1
-        local_ngram = {}
-
-    print(' Time taken to compute statistics {:.2f} seconds'.format(time.time() - \
-        start_time), flush=True)
-    pool.close()
-    pool.join()
-
-    start_time = time.time()
-    counter_threshold = 0
-    # Get ngram below theadhold
-    for local_key, local_val in ngrams.items():
-        if ngrams[local_key] < args.key_threshold:
-            print(" [threshold] {} {}".format(local_key, local_val), flush=True)
-            counter_threshold += 1
-            ngrams_below_threshold[local_key] = 1
-            
-    print(' Ngrams below threshold {}'.format(counter_threshold), flush=True)
-    fin.close()
-
-def clean_ngrams_below_threshold(args, ngrams_below_threshold, dedup_file, \
-    dedup_key):
-
-    start_time = time.time()
-    # Now actually filter the dataset
-    args.get_ngram_freq_only = False
-    #id_prefix = '-'.join(args.tasks[::2])
-    id_prefix = '-'.join(args.tasks[::1])
-
-    # get the range of the size of the ngrams
-    ngrams_freq_sorted = compute_ngram_freq_sorted(args, ngrams_below_threshold)
-
-    # Open the large file to process in parallel
-    counter = splitted = ignored = split_mt_thld = trimmed_count = 0
-    num_workers = args.num_threads
-    pool = multiprocessing.Pool(num_workers)
-    fin = open(dedup_file, 'r', encoding='utf-8')
-    free_ngram_clean_partial=partial(free_ngram, args=args, key=dedup_key, \
-        ngrams=ngrams_below_threshold, ngrams_freq_sorted=ngrams_freq_sorted)
-    free_ngrams_clean = pool.imap(free_ngram_clean_partial, fin, 500)
- 
-    out_f = open(args.output, 'wb')
-
-    for text_buf_ngram_free, trimmed, myjson, _ in free_ngrams_clean:
-        counter += 1
-        try:
-
-            trimmed_count += trimmed
-
-            if len(text_buf_ngram_free) > 1:
-                splitted += 1
-            if len(text_buf_ngram_free) == 0:
-                ignored += 1
-            # more than 10 splits ignored
-            if len(text_buf_ngram_free) > args.splits_count:
-                text_buf_ngram_free = []
-                split_mt_thld += 1
-
-            if args.output is not None:
-                if "split_id" in myjson:
-                    use_prefix = myjson["split_id"] + "-"
-                else:
-                    use_prefix = ""
-
-                for i in range(len(text_buf_ngram_free)):
-                    split_id_string = id_prefix + '-{:010d}'.format(int(\
-                        counter)) + '-{:04d}'.format(int(i))
-                    myjson[dedup_key] = text_buf_ngram_free[i]
-                    myjson["split_id"] = use_prefix + split_id_string
-                    outjson = json.dumps(myjson, ensure_ascii=False)
-                    #outjson = json.dumps({"text":text_buf_ngram_free[i],
-                    #    id_prefix+"_split_id":split_id_string},
-                    #    ensure_ascii=False)
-                    out_f.write(outjson.encode('utf-8'))
-                    out_f.write('\n'.encode('utf-8'))
-
-            if counter % 1000 == 0:
-                print(' [final]> processed {} documents in {:.2f} seconds ...'.
-                    format(counter, time.time() - start_time), flush=True)
-        except Exception as e:
-            print('Error:', e)
-
-    print(' [final]> processed {} documents in {:.2f} seconds ...'.
-        format(counter, time.time() - start_time), flush=True)
-    
-    print(' Total docs {} splitted {} ignored {} splits > theshold {} trimmed'\
-        ' {}'.format(counter, splitted, ignored, split_mt_thld, trimmed_count)\
-        , flush=True)
-
-    pool.close()
-    pool.join()
-
-    out_f.close()
-    fin.close()
-
-if __name__ == '__main__':
-
-    # we use 13-grams, any text less than 200 characters got removed
-    # any text splitted more than 10 got removed as well
-
-    print('parsing the arguments ...')
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--tasks', nargs = '*', required=True, default=None, \
-                        help = 'Tasks to use for deduplication: currently '
-                        ' suuport [lambada, squad, natural_questions,'
-                        ' triviaqa, webqa, race, drop, coqa, and piqa]')
-    parser.add_argument('--lambada-path', type=str, default=None,
-                       help='Only Lambada task needs the path')
-    parser.add_argument('--dedup-dataset', nargs = '*', default=None,
-                       help='Dataset to deduplicate with the key to use'
-                        ' e.g. cc.json text')
-    parser.add_argument('--output', type=str, default=None,
-                       help='Output file name to save dedup dataset')
-    parser.add_argument('--num-threads', type=int, default=40,
-                       help='Number of threads to use')
-    # Default dedup values
-    parser.add_argument('--max-ngram-size', type=int, default=13,
-                       help='Maximum size of ngram to use.')
-    parser.add_argument('--min-ngram-size', type=int, default=8,
-                       help='Minimum size of ngram to use.')
-    parser.add_argument('--filter-text-char-len', type=int, default=200,
-                       help='Remove any text below this length.')
-    parser.add_argument('--key-threshold', type=int, default=10,
-                       help='Number of keys to consider as threshold')
-    parser.add_argument('--save-dictionary', type=str, default=None,
-                       help='Save the dictionary')
-    parser.add_argument('--load-dictionary', type=str, default=None,
-                       help='Load the dictionary')
-    parser.add_argument('--splits-count', type=int, default=10,
-                       help='Remove any documents more than this many splits')
-    parser.add_argument('--remove-char-each-side', type=int, default=200,
-                       help='Maximum size of ngram to use.')
-
-    args = parser.parse_args()
-
-    assert len(args.dedup_dataset) == 2
-    dedup_file = args.dedup_dataset[0]
-    dedup_key = args.dedup_dataset[1]
-
-    # Setup multi-processing
-    num_workers = args.num_threads
-    if args.load_dictionary is None:
-
-        # Build ngrams
-        ngrams = {}
-        compute_tasks_ngrams(args, ngrams)
-
-        # get the range of the size of the ngrams
-        ngrams_freq_sorted = compute_ngram_freq_sorted(args, ngrams)
-
-        # get ngram freq from large file in parallel
-        # get ngrams below threshold
-        ngrams_below_threshold = {}
-        get_ngrams_below_threshold(args, ngrams, ngrams_below_threshold, \
-            dedup_file, dedup_key, ngrams_freq_sorted)
-
-        # save the dictionary if needed
-        if args.save_dictionary is not None:
-            with open(args.save_dictionary, 'wb') as save_dict_handle:
-                pickle.dump(ngrams_below_threshold, save_dict_handle)
-    else:
-        with open(args.load_dictionary, 'rb') as load_dict_handle:
-            ngrams_below_threshold = pickle.load(load_dict_handle)
-
-    # filter the large file
-    if args.output is not None:
-        clean_ngrams_below_threshold(args, ngrams_below_threshold, \
-            dedup_file, dedup_key)
-
-    print('done :-)')
diff --git a/toolbox/Megatron-DeepSpeed/tools/openwebtext/find_duplicates.py b/toolbox/Megatron-DeepSpeed/tools/openwebtext/find_duplicates.py
deleted file mode 100644
index 7f6cb577441abf95796103e998991f491fc762ba..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/openwebtext/find_duplicates.py
+++ /dev/null
@@ -1,292 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import argparse
-from functools import partial
-import itertools
-import json
-from lsh import cache, minhash
-import multiprocessing
-import numpy as np
-import time
-import pickle
-import sys
-import os
-
-# This function is adapted from:
-#   https://github.com/mattilyra/LSH/blob/master/examples/Introduction.ipynb
-def shingles(text, char_ngram=5):
-    return set(text[head:head + char_ngram]
-               for head in range(0, len(text) - char_ngram))
-
-
-# This function is adapted from:
-#  https://github.com/mattilyra/LSH/blob/master/examples/Introduction.ipynb
-def jaccard(set_a, set_b, args):
-    if len(set_a) < 1 or len(set_b) < 1:
-        return 0.0
-
-    intersection = set_a & set_b
-    union = set_a | set_b
-
-    if args.jaccard == 'min':
-        return len(intersection) / min(len(set_a), len(set_b))
-    elif args.jaccard == 'max':
-        return len(intersection) / max(len(set_a), len(set_b))
-    else:
-        return len(intersection) / len(union)
-
-def compute_fingerprint(line, key):
-    try:
-        myjson = json.loads(line)
-        url = myjson[key]
-        text = myjson['text']
-        fingerprint = hasher.fingerprint(text)
-    except Exception as e:
-        print('Error:', e)
-        return None, None, None, False
-
-    return url, text, fingerprint, True
-
-def url_pairs_to_remove(args, bucket_urls, url_doc):
-    remove_urls_list = []
-    deduped_local, counter_local = 0, 0
-    iteration = 0
-    while len(bucket_urls) > 1:
-        if args.heuristic_iter != -1 and \
-            iteration == args.heuristic_iter:
-            break
-
-        items = list(bucket_urls)
-        remove_urls = []
-        main_url = items[np.random.randint(0, len(items))]
-        main_dhingles = shingles(url_doc[main_url])
-
-        for i in range(0, len(items)):
-            counter_local += 1
-            other_url = items[i]
-            if other_url == main_url:
-                continue
-            other_shingles = shingles(url_doc[other_url])
-            try:
-                jaccard_sim = jaccard(main_dhingles, other_shingles, args)
-            except Exception as e:
-                print('Error:', e)
-                jaccard_sim = 0.0
-            if jaccard_sim > 0.5:
-                remove_urls.append({other_url: jaccard_sim})
-                deduped_local += 1
-                bucket_urls.remove(other_url)
-
-        bucket_urls.remove(main_url)
-        if len(remove_urls) > 0:
-            remove_urls_list.append({main_url: remove_urls})
-        iteration += 1
-    return remove_urls_list, deduped_local, counter_local
-
-def write_remove_urls_list(remove_urls_list, f_out):
-    if len(remove_urls_list) > 0:
-        for each_url_remove in remove_urls_list:
-            myjson = json.dumps(each_url_remove, ensure_ascii=False)
-            f_out.write(myjson.encode('utf-8'))
-            f_out.write('\n'.encode('utf-8'))
-
-def compute_jaccard(each_bin, num_bins, start_time_local):
-
-    remove_urls_list = []
-    deduped_local, counter_local, bucket_local = 0, 0, 0
-
-    for bucket_id in each_bin:
-        bucket_local += 1
-        if os.getpid() % num_bins == 0 and bucket_local % 100000 == 0:
-            print("Counter {}, progress {:.2f} time {:.2f}".\
-                format(bucket_local, float(bucket_local)/float(len(each_bin)),\
-                time.time() - start_time_local), flush=True)
-
-        if len(each_bin[bucket_id]) <= 1:
-            continue
-
-        bucket_urls = each_bin[bucket_id].copy()
-        remove_urls_list_sub, deduped_local_sub, counter_local_sub = \
-            url_pairs_to_remove(args, bucket_urls, url_doc)
-
-        deduped_local += deduped_local_sub
-        counter_local += counter_local_sub
-        if len(remove_urls_list_sub) > 0:
-            remove_urls_list.extend(remove_urls_list_sub)
-
-    return remove_urls_list, deduped_local, counter_local
-
-def find_pair_urls_parallel(args, lshcache, url_doc):
-    start_time = time.time()
-    f_out = open(args.output, 'wb')
-    deduped, counter = 0, 0
-
-    # compute jaccards of buckets in bin in parallel (parallelism
-    # limited to # of bins)
-    num_bins = len(lshcache.bins)
-    pool = multiprocessing.Pool(num_bins)
-    compute_jaccard_partial = partial(compute_jaccard, num_bins=num_bins, \
-        start_time_local=start_time)
-    # don't need to pass args and url_doc as they are already shared
-    compute_jaccard_iter = pool.imap(compute_jaccard_partial, lshcache.bins)
-
-    print("multiprocessing init took {:.2f}".format(time.time() - start_time),\
-        flush=True)
-    for remove_urls_list, deduped_local, counter_local in compute_jaccard_iter:
-        deduped += deduped_local
-        counter += counter_local
-        write_remove_urls_list(remove_urls_list, f_out)
-        print(' [write]> processed {} documents in {:.2f} '
-            'seoncds and deduped {} documents ...'.format(counter, time.time()\
-            - start_time, deduped), flush=True)
-
-    pool.close()
-    pool.join()
-    f_out.close()
-
-    print(' Taken time for jaccard similariries {:.2f} seconds'.format(\
-        time.time() - start_time), flush=True)
-
-def find_pair_urls_sequential(args, lshcache, url_doc):
-    start_time = time.time()
-    f_out = open(args.output, 'wb')
-    deduped, counter = 0, 0
-    for b in lshcache.bins:
-        for bucket_id in b:
-            if len(b[bucket_id]) <= 1:
-                continue
-
-            bucket_urls = b[bucket_id].copy()
-            remove_urls_list_sub, deduped_local_sub, counter_local_sub = \
-                url_pairs_to_remove(args, bucket_urls, url_doc)
-
-            deduped += deduped_local_sub
-            counter += counter_local_sub
-            write_remove_urls_list(remove_urls_list_sub, f_out)
-            if counter % 10000 == 0:
-                print(' [write]> processed {} documents in {:.2f} '
-                    'seoncds and deduped {} documents ...'.
-                    format(counter, time.time() - start_time,
-                    deduped), flush=True)
-    f_out.close()
-    print(' [write]> processed {} documents in {:.2f} '
-        'seoncds and deduped {} documents ...'.
-        format(counter, time.time() - start_time,
-        deduped), flush=True)
-
-if __name__ == '__main__':
-
-    print('parsing the arguments ...')
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--seed', type=int, default=1234,
-                       help='Random seed used for python, numpy')
-    parser.add_argument('--inputs', nargs = '*', default=None, help = \
-                        'Pairwise list of the input files and keys, '
-                        'e.g. --inputs cc.json cc_id news.json news_id')
-    parser.add_argument('--load-fingerprints', nargs = '*', default=None,
-                       help='Load fingerprints from a list of pickle files,'
-                        ' e.g. cc.pkl news.pkl')
-    parser.add_argument('--save-fingerprints', type=str, default=None,
-                       help='Save the fingerprints of the inputs.')
-    parser.add_argument('--output', type=str, default=None,
-                       help='Output file name that consists of all ids'
-                        ' with matching similarities')
-    parser.add_argument('--jaccard', type=str, default='union',
-                        choices=['union', 'min', 'max'], help='Jaccard'\
-                        ' similarity computation')
-    parser.add_argument('--heuristic-iter', type=int, default=1,
-                       help='Number of iterations to run the heuristics'
-                        ': use -1 for exact')
-    parser.add_argument('--num-bands', type=int, default=10,
-                       help='Number of bands to use in cache')
-    parser.add_argument('--num-seeds', type=int, default=100,
-                       help='Number of seeds to use for minhash. Note that'
-                        ' this value should be divisible by num-bands')
-    parser.add_argument('--jaccard-parallel', action='store_true',
-                       help='Use this to process large number of documents.')
-    args = parser.parse_args()
-
-    print('finding possible duplicate content ...')
-
-    # set seed and get an array of seeds of 100 integers
-    np.random.seed(args.seed)
-    seeds = np.random.randint(0, 1e6, size=args.num_seeds)
-
-    # initialize minhash and lsh cache
-    hasher = minhash.MinHasher(seeds=seeds, char_ngram=5, hashbytes=4)
-    lshcache = cache.Cache(num_bands=args.num_bands, hasher=hasher)
-
-    url_doc = {}
-
-    # load fingerprints from pickle file if needed
-    if args.load_fingerprints is not None:
-        for count_fp, fp_file_name in enumerate(args.load_fingerprints):
-            print("Loading fingerprints from pickle file {}".format(
-                fp_file_name), flush=True)
-            fp = open(fp_file_name, "rb")
-            if count_fp == 0:
-                # assign directory for the first pkl
-                lshcache = pickle.load(fp)
-                url_doc = pickle.load(fp)
-            else:
-                # append these to lshcache and url_doc
-                local_lshcache = pickle.load(fp)
-                local_url_doc = pickle.load(fp)
-                for url in local_lshcache.fingerprints.keys():
-                    url_doc[url] = local_url_doc[url]
-                    lshcache.add_fingerprint(local_lshcache.fingerprints[url], url)
-            fp.close()
-
-    counter = 0
-    start_time = time.time()
-
-    # compute finger prints of the inputs if any
-    # input file and the key to use as id
-    if args.inputs is not None:
-        print("Computing fingerprints", flush=True)
-        assert len(args.inputs) % 2 == 0
-        for input_file, key in zip(args.inputs[::2], args.inputs[1::2]):
-            print(' document processing {} with key {}'.format(input_file, key),
-                flush=True)
-
-            # compute fingerprints in parallel
-            num_workers = 40
-            pool = multiprocessing.Pool(num_workers)
-            fin = open(input_file, 'r', encoding='utf-8')
-            compute_fingerprint_partial = partial(compute_fingerprint, key=key)
-            compute_fingerprint_iter = pool.imap(compute_fingerprint_partial,
-                                                    fin, 512)
-            # traverse all the texts and add fingerprints
-            for url, text, fingerprint, flag in compute_fingerprint_iter:
-                counter += 1
-                if flag:
-                    url_doc[url] = text
-                    lshcache.add_fingerprint(fingerprint, url)
-                if counter % 10000 == 0:
-                    print(' [read]> processed {} documents in {:.2f} '
-                        'seconds ...'.format(counter, time.time() - \
-                        start_time), flush=True)
-
-            fin.close()
-            pool.close()
-            pool.join()
-
-    # Save the fingerprints if needed
-    if args.save_fingerprints is not None:
-        print("Saving fingerprints to pickle file {}".format(
-            args.save_fingerprints), flush=True)
-        with open(args.save_fingerprints, 'wb') as f_save:
-            pickle.dump(lshcache, f_save)
-            pickle.dump(url_doc, f_save)
-
-    # compute jaccard index of the input texts and write to file if needed
-    if args.output is not None:
-        print("Compute jaccard similarity", flush=True)
-        if args.jaccard_parallel:
-            find_pair_urls_parallel(args, lshcache, url_doc)
-        else:
-            find_pair_urls_sequential(args, lshcache, url_doc)
-
-    print('done :-)')
- 
diff --git a/toolbox/Megatron-DeepSpeed/tools/openwebtext/group_duplicate_url.py b/toolbox/Megatron-DeepSpeed/tools/openwebtext/group_duplicate_url.py
deleted file mode 100644
index 16a0354fde130d67de0fbb51e90658b1f16fc1c4..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/openwebtext/group_duplicate_url.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import json
-import time
-import sys
-
-
-if __name__ == '__main__':
-
-
-    print('grouping duplicate urls ...')
-
-    input = sys.argv[1]
-    output = sys.argv[2]
-    if len(sys.argv) > 3:
-        jaccard_similarity_threshold = float(sys.argv[3])
-    else:
-        jaccard_similarity_threshold = 0.7
-
-    url_to_index = {}
-    index_to_urls = []
-    counter = 0
-    start_time = time.time()
-    with open(input, 'r') as f:
-        for line in f:
-            counter += 1
-            myjson = json.loads(line)
-            urls = []
-            for main_url in myjson.keys():
-                urls.append(main_url)
-                for value in myjson[main_url]:
-                    for other_url, js in value.items():
-                        if js >= jaccard_similarity_threshold:
-                            urls.append(other_url)
-            current_index = -1
-            other_indices = set()
-            for url in urls:
-                if url in url_to_index:
-                    if current_index == -1:
-                        current_index = url_to_index[url]
-                    elif current_index != url_to_index[url]:
-                        other_indices.add(url_to_index[url])
-            if current_index == -1:
-                current_index = len(index_to_urls)
-                index_to_urls.append(set())
-            for url in urls:
-                url_to_index[url] = current_index
-                index_to_urls[current_index].add(url)
-            for index in other_indices:
-                for url in index_to_urls[index]:
-                    index_to_urls[current_index].add(url)
-                    url_to_index[url] = current_index
-                index_to_urls[index] = None
-
-            if counter % 100000 == 0:
-                print(' > processed {} lines in {} seconds ...'.format(
-                    counter, time.time() - start_time))
-
-
-    total_remove = 0
-    total_remain = 0
-    for urls in index_to_urls:
-        if urls is not None:
-            if len(urls) > 1:
-                total_remove += (len(urls) - 1)
-                total_remain += 1
-    print('out of {} urls, only {} are unique and {} should be removed'.format(
-        total_remove+total_remain, total_remain, total_remove))
-
-    with open(output, 'wb') as f:
-        for i, urls in enumerate(index_to_urls):
-            if urls is not None:
-                if len(urls) > 1:
-                    myjson = json.dumps({str(i): list(urls)},
-                                        ensure_ascii=False)
-                    f.write(myjson.encode('utf-8'))
-                    f.write('\n'.encode('utf-8'))
diff --git a/toolbox/Megatron-DeepSpeed/tools/openwebtext/merge_jsons.py b/toolbox/Megatron-DeepSpeed/tools/openwebtext/merge_jsons.py
deleted file mode 100644
index fb11fe45ba5d20b0bf05d9aeaad1758db0a33b3e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/openwebtext/merge_jsons.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-
-import glob
-import sys
-import json
-import argparse
-
-if __name__ == '__main__':
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--json_path", type=str, default=".",
-        help="path where all the json files are located")
-
-    parser.add_argument("--output_file", type=str, default="merged_output.json",
-        help="filename where the merged json should go")
-
-    args = parser.parse_args()
-
-    json_path = args.json_path
-    out_file = args.output_file
-
-    json_files = glob.glob(json_path + '/*.json')
-
-    counter = 0
-
-    with open(out_file, 'w') as outfile:
-        for fname in json_files:
-            counter += 1
-
-            if counter % 1024 == 0:
-                print("Merging at ", counter, flush=True)
-
-            with open(fname, 'r') as infile:
-                for row in infile:
-                    each_row = json.loads(row)
-                    outfile.write(row)
-
-
-    print("Merged file", out_file, flush=True)
-
-
diff --git a/toolbox/Megatron-DeepSpeed/tools/openwebtext/remove_group_duplicates.py b/toolbox/Megatron-DeepSpeed/tools/openwebtext/remove_group_duplicates.py
deleted file mode 100644
index 44b62d62c19f35ef555507f7a07fc2bb73c8ca51..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/openwebtext/remove_group_duplicates.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-
-import json
-import time
-import sys
-
-
-if __name__ == '__main__':
-
-    url_filename = sys.argv[1]
-    data_filename = sys.argv[2]
-    output_filename = sys.argv[3]
-
-    urls = set()
-    with open(url_filename, 'r') as f:
-        for line in f:
-            myjson = json.loads(line)
-            for key in myjson:
-                this_urls = myjson[key]
-                for i in range(1, len(this_urls)):
-                    urls.add(this_urls[i])
-    print('will be removing {} urls'.format(len(urls)), flush=True)
-
-    written_docs = 0
-    removed_docs = 0
-    removed_chars = 0
-    start_time = time.time()
-    with open(output_filename, 'wb') as fout:
-        with open(data_filename, 'r') as fin:
-            for line in fin:
-                try:
-                    myjson = json.loads(line)
-                    url = myjson['url']
-                    if url in urls:
-                        print('removing', myjson)
-                        removed_docs += 1
-                        removed_chars += len(myjson['text'])
-                        continue
-                    myjson = json.dumps(myjson, ensure_ascii=False)
-                    fout.write(myjson.encode('utf-8'))
-                    fout.write('\n'.encode('utf-8'))
-                    written_docs += 1
-                    if written_docs % 10000 == 0:
-                        print(' [PROCESSED] time (s): {:.2f} | written: {} '
-                              '| removed: {} (char: {})'.format(
-                                  time.time() - start_time,
-                                  written_docs, removed_docs, removed_chars))
-                except Exception as e:
-                    print('[SKIPPING]', line, e)
-
-    print(' [PROCESSED] time (s): {:.2f} | written: {} '
-          '| removed: {} (char: {})'.format(
-              time.time() - start_time,
-              written_docs, removed_docs, removed_chars))
-    print('done :-)')
diff --git a/toolbox/Megatron-DeepSpeed/tools/preprocess_data.py b/toolbox/Megatron-DeepSpeed/tools/preprocess_data.py
deleted file mode 100644
index bce377571a14d4454579b1070830e49b8cf676bd..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/preprocess_data.py
+++ /dev/null
@@ -1,431 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Processing large data for pretraining."""
-import argparse
-import math
-import json
-import os
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir)))
-import time
-import gzip
-import glob
-import torch
-import numpy as np
-import multiprocessing
-try:
-    import nltk
-    nltk_available = True
-except ImportError:
-    nltk_available = False
-
-import megatronspeed.megatron_adaptor
-from megatron.training.tokenizer import build_tokenizer
-from megatron.core.datasets import indexed_dataset
-
-
-# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
-class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars):
-
-    _period_context_fmt = r"""
-        \S*                          # some word material
-        %(SentEndChars)s             # a potential sentence ending
-        \s*                       #  <-- THIS is what I changed
-        (?=(?P<after_tok>
-            %(NonWord)s              # either other punctuation
-            |
-            (?P<next_tok>\S+)     #  <-- Normally you would have \s+ here
-        ))"""
-
-class IdentitySplitter(object):
-    def tokenize(self, *text):
-        return text
-
-
-class Encoder(object):
-    def __init__(self, args):
-        self.args = args
-
-    def initializer(self):
-        # Use Encoder class as a container for global data
-        Encoder.tokenizer = build_tokenizer(self.args)
-        if self.args.split_sentences:
-            if not nltk_available:
-                print("NLTK is not available to split sentences.")
-                exit()
-            if os.environ.get("NLTK_DATA"):
-                library = os.path.join(os.environ.get("NLTK_DATA"), "tokenizers", "punkt", f"{self.args.lang}.pickle")
-                url = f"file:{library}"
-            else:
-                library = os.path.join("tokenizers", "punkt", f"{self.args.lang}.pickle")
-                url = f"nltk:{library}"
-            splitter = nltk.load(url)
-            if self.args.keep_newlines:
-                # this prevents punkt from eating newlines after sentences
-                Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
-                    train_text = splitter._params,
-                    lang_vars = CustomLanguageVars())
-            else:
-                Encoder.splitter = splitter
-
-        else:
-            Encoder.splitter = IdentitySplitter()
-
-    def split(self, json_line):
-        data = json.loads(json_line)
-        output = {}
-        for key in self.args.json_keys:
-            text = data[key]
-            max_len = 1000000
-            tokens_list = [Encoder.splitter.tokenize(text[i:i+max_len]) for i in range(0, len(text), max_len)]
-            output[key] = [tokens for partial in tokens_list for tokens in partial]
-        return json.dumps(output), len(json_line)
-
-    def encode(self, json_line):
-        data = json.loads(json_line)
-        ids = {}
-        lens = {}
-        for key in self.args.json_keys:
-            text = data[key]
-            if isinstance(text, list):
-                sentences = text
-            else:
-                sentences = [text]
-            doc_ids = []
-            sentence_lens = []
-            for sentence in sentences:
-                sentence_ids = Encoder.tokenizer.tokenize(sentence)
-                if len(sentence_ids) > 0:
-                    doc_ids.extend(sentence_ids)
-                    sentence_lens.append(len(sentence_ids))
-            if len(doc_ids) > 0 and self.args.append_eod:
-                doc_ids.append(Encoder.tokenizer.eod)
-                sentence_lens[-1] += 1
-            ## 添加数据padding
-            if self.args.pad_2_maxlen:
-                padding_token = self.args.pad_id
-                diff = self.args.pad_2_maxlen - len(doc_ids)
-                pad = [padding_token] * diff
-                if diff >= 0:
-                    if self.args.pad_direction == 'right':
-                        doc_ids = doc_ids + pad
-                    elif self.args.pad_direction == 'left':
-                        doc_ids = pad + doc_ids
-                    else:
-                        raise ValueError("pad_direction should be choose from ['right', 'left']")
-                    sentence_lens[-1] += diff
-                else:
-                    doc_ids = doc_ids[abs(diff):]
-                    sentence_lens[-1] += diff
-            ids[key] = doc_ids
-            lens[key] = sentence_lens
-        return ids, lens, len(json_line)
-
-
-class Partition(object):
-    def __init__(self, args, workers):
-        self.args = args
-        self.workers = workers
-
-    def print_processing_stats(self, count, proc_start, total_bytes_processed):
-        if count % self.args.log_interval == 0:
-            current = time.time()
-            elapsed = current - proc_start
-            mbs = total_bytes_processed/elapsed/1024/1024
-            print(f"Processed {count} documents",
-                  f"({count/elapsed} docs/s, {mbs} MB/s).",
-                  file=sys.stderr)
-
-    def split_sentences(self, file_name):
-        input_file_name, output_file_name = file_name
-        print("Opening", input_file_name)
-        fin = open(input_file_name, 'r', encoding='utf-8')
-        fout = open(output_file_name, 'w')
-
-        encoder = Encoder(self.args)
-        pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer)
-        split_docs = pool.imap(encoder.split, fin, 32)
-
-        proc_start = time.time()
-        total_bytes_processed = 0
-        for i, (doc, bytes_processed) in enumerate(split_docs, start=1):
-            total_bytes_processed += bytes_processed
-            fout.write(doc + "\n")
-            self.print_processing_stats(i, proc_start, total_bytes_processed)
-
-        fin.close()
-        fout.close()
-
-
-    def process_json_file(self, file_name):
-        input_file_name, output_prefix = file_name
-        print("Opening", input_file_name)
-        fin = open(input_file_name, 'r', encoding='utf-8')
-
-        startup_start = time.time()
-        encoder = Encoder(self.args)
-        tokenizer = build_tokenizer(self.args)
-        pool = multiprocessing.Pool(self.workers, initializer=encoder.initializer)
-        encoded_docs = pool.imap(encoder.encode, fin, 32)
-
-        level = "document"
-        if self.args.split_sentences:
-            level = "sentence"
-
-        output_bin_files = {}
-        output_idx_files = {}
-        builders = {}
-
-        for key in self.args.json_keys:
-            output_bin_files[key] = "{}_{}_{}.bin".format(output_prefix,
-                                                          key, level)
-            output_idx_files[key] = "{}_{}_{}.idx".format(output_prefix,
-                                                          key, level)
-            builders[key] = indexed_dataset.IndexedDatasetBuilder(
-                output_bin_files[key],
-                dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size),
-            )
-
-        startup_end = time.time()
-        proc_start = time.time()
-        total_bytes_processed = 0
-        print("Time to startup:", startup_end - startup_start)
-        for i, (doc, sentence_lens, bytes_processed) in enumerate(encoded_docs, start=1):
-            total_bytes_processed += bytes_processed
-            for key in doc.keys():
-                builders[key].add_document(doc[key], sentence_lens[key])
-            self.print_processing_stats(i, proc_start, total_bytes_processed)
-
-        fin.close()
-        builders[key].finalize(output_idx_files[key])
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group(title='input data')
-    group.add_argument('--input', type=str, required=True,
-                       help='Path to input JSON')
-    group.add_argument('--json-keys', nargs='+', default=['text'],
-                       help='space separate listed of keys to extract from json')
-    group.add_argument('--split-sentences', action='store_true',
-                       help='Split documents into sentences.')
-    group.add_argument('--keep-newlines', action='store_true',
-                       help='Keep newlines between sentences when splitting.')
-
-    group = parser.add_argument_group(title='tokenizer')
-    group.add_argument('--tokenizer-type', type=str, required=True,
-                       choices=['BertWordPieceLowerCase','BertWordPieceCase',
-                                'GPT2BPETokenizer', 'SentencePieceTokenizer',
-                                'GPTSentencePieceTokenizer', 'Llama2Tokenizer',
-                                'NullTokenizer','Llama3Tokenizer'],
-                       help='What type of tokenizer to use.')
-    group.add_argument('--tokenizer-model', type=str, default=None,
-                       help='YTTM tokenizer model.')
-    group.add_argument('--vocab-file', type=str, default=None,
-                       help='Path to the vocab file')
-    group.add_argument('--vocab-size', default=786,
-                       help='size of vocab for use with NullTokenizer')
-    group.add_argument('--merge-file', type=str, default=None,
-                       help='Path to the BPE merge file (if necessary).')
-    group.add_argument('--append-eod', action='store_true',
-                       help='Append an <eod> token to the end of a document.')
-    group.add_argument('--lang', type=str, default='english',
-                       help='Language to use for NLTK-powered sentence splitting.')
-    group.add_argument('--pad-2-maxlen', type=int, default=None,
-                       help='padding sequence to max length')
-    group.add_argument('--pad-direction', type=str, default='right', choices=['right', 'left'],
-                       help='pad direction choose from [right, left]')
-    group.add_argument('--pad-id', type=int, default=None, 
-                       help='padding token id')
-    group = parser.add_argument_group(title='output data')
-    group.add_argument('--output-prefix', type=str, required=True,
-                       help='Path to binary output file without suffix')
-
-    group = parser.add_argument_group(title='runtime')
-    group.add_argument('--workers', type=int, required=True,
-                       help=('Number of worker processes to launch.'
-                             'A good default for fast pre-processing '
-                             'is: (workers * partitions) = available CPU cores.'))
-    group.add_argument('--partitions', type=int, default=1,
-                        help='Number of file partitions')
-    group.add_argument('--log-interval', type=int, default=1000,
-                       help='Interval between progress updates')
-    group.add_argument('--keep-sequential-samples', action='store_true',
-                       help='Ensure ordering of samples in .jsonl files is '
-                            'preserved when using partitions>1.')
-    args = parser.parse_args()
-    args.keep_empty = False
-
-    if args.tokenizer_type.lower().startswith('bert') and not args.split_sentences:
-        print("Are you sure you don't want to split sentences?")
-
-    # some default/dummy values for the tokenizer
-    args.rank = 1
-    args.make_vocab_size_divisible_by = 128
-    args.tensor_model_parallel_size = 1
-    args.vocab_extra_ids = 0
-
-    return args
-
-
-def get_file_name(args, file_id):
-    file_name, extension = os.path.splitext(args.input)
-    input_file_name = file_name + "_" + str(file_id) + extension
-    sentence_split_file = file_name + "_ss_" + str(file_id) + extension
-    output_prefix = args.output_prefix + "_" + str(file_id)
-    file_names = {
-        'partition': input_file_name,
-        'sentence_split': sentence_split_file,
-        'output_prefix': output_prefix}
-    return file_names
-
-
-def check_files_exist(in_ss_out_names, key, num_partitions):
-    for i in range(num_partitions):
-        if not os.path.exists(in_ss_out_names[i][key]):
-            return False
-    return True
-
-
-def main():
-    args = get_args()
-
-    if args.split_sentences:
-        if nltk_available:
-            nltk.download("punkt", quiet=True, download_dir=os.environ.get("NLTK_DATA"))
-        else:
-            raise Exception(
-                "nltk library required for sentence splitting is not available.")
-
-    in_ss_out_names = []
-    if args.partitions == 1:
-        file_name, extension = os.path.splitext(args.input)
-        sentence_split_file = file_name + "_ss" + extension
-        file_names = {
-            'partition': args.input,
-            'sentence_split': sentence_split_file,
-            'output_prefix': args.output_prefix}
-        in_ss_out_names.append(file_names)
-    else:
-        in_file_names = glob.glob(args.input)
-
-        # Count total number of lines across .jsonl files
-        if args.keep_sequential_samples:
-            total_sample_count = 0
-            for filename in in_file_names:
-                with open(filename, "r") as fin:
-                    for fc, _ in enumerate(fin):
-                        pass
-                total_sample_count += (fc + 1)
-            partition_size = math.ceil(total_sample_count / args.partitions)
-
-        # create .jsonl parition files
-        for idx in range(args.partitions):
-            in_ss_out_name = get_file_name(args, idx)
-            in_ss_out_names.append(in_ss_out_name)
-
-        # check to see if paritions were already created
-        partitions_present = check_files_exist(in_ss_out_names, 'partition', args.partitions)
-
-        # check to see if paritions with split sentences already created
-        split_sentences_present = check_files_exist(in_ss_out_names, 'sentence_split', args.partitions)
-
-        if not partitions_present and not split_sentences_present:
-            # populate .jsonl partition files from parent files
-            partitioned_input_files = []
-            for idx in range(args.partitions):
-                partitioned_input_file = open(in_ss_out_names[idx]['partition'], 'w')
-                partitioned_input_files.append(partitioned_input_file)
-
-            index = 0
-            if args.keep_sequential_samples: line_count = 0
-            for in_file_name in in_file_names:
-                # support for gzip files
-                if in_file_name.endswith(".gz"):
-                    fin = gzip.open(in_file_name, 'rt')
-                else:
-                    fin = open(in_file_name, 'r', encoding='utf-8')
-
-                for line in fin:
-                    partitioned_input_files[index].write(line)
-                    if args.keep_sequential_samples:
-                        line_count += 1
-                        if line_count % partition_size == 0:
-                            index += 1
-                    else:
-                        index = (index + 1)%args.partitions
-
-                fin.close()
-
-            for idx in range(args.partitions):
-                partitioned_input_files[idx].close()
-
-    assert args.workers % args.partitions == 0
-    partition = Partition(args, args.workers//args.partitions)
-
-    # check to see if paritions with split sentences already created
-    split_sentences_present = check_files_exist(in_ss_out_names, 'sentence_split', args.partitions)
-
-    # split sentences in partition files
-    if args.split_sentences and not split_sentences_present:
-        processes = []
-        for name in in_ss_out_names:
-            p = multiprocessing.Process(target=partition.split_sentences,
-                                        args=((name['partition'], name['sentence_split']),))
-            p.start()
-            processes.append(p)
-
-        for p in processes:
-            p.join()
-
-        if args.partitions == 1:
-            return
-
-
-    # encode partition files in parallel
-    processes = []
-    input_key = 'sentence_split' if args.split_sentences else 'partition'
-    for name in in_ss_out_names:
-        p = multiprocessing.Process(target=partition.process_json_file,
-                                    args=((name[input_key], name['output_prefix']),))
-        p.start()
-        processes.append(p)
-
-    for p in processes:
-        p.join()
-
-    if args.partitions == 1:
-        return
-
-    # merge bin/idx partitions
-    level = "document"
-    if args.split_sentences:
-        level = "sentence"
-
-    output_bin_files = {}
-    output_idx_files = {}
-    builders = {}
-    tokenizer = build_tokenizer(args)
-
-    for key in args.json_keys:
-        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
-                                                      key, level)
-        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
-                                                      key, level)
-        builders[key] = indexed_dataset.MMapIndexedDatasetBuilder(
-            output_bin_files[key],
-            dtype=indexed_dataset.DType.optimal_dtype(tokenizer.vocab_size),
-        )
-
-        for name in in_ss_out_names:
-            parition_output_prefix = name['output_prefix']
-            full_partition_output_prefix = "{}_{}_{}".format(parition_output_prefix,
-                                                             key, level)
-            builders[key].add_index(full_partition_output_prefix)
-        builders[key].finalize(output_idx_files[key])
-
-
-if __name__ == '__main__':
-
-    main()
\ No newline at end of file
diff --git a/toolbox/Megatron-DeepSpeed/tools/preprocess_data_nmt.py b/toolbox/Megatron-DeepSpeed/tools/preprocess_data_nmt.py
deleted file mode 100644
index 4035cc8f0f471ff309b2e056d959d636d5b36ac4..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/preprocess_data_nmt.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Processing nmt data for finetuning."""
-
-import argparse
-import json
-import multiprocessing
-import os
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir)))
-import time
-import torch
-from megatron_ds.tokenizer import build_tokenizer
-from megatron_ds.data import indexed_dataset
-
-
-class Encoder(object):
-    def __init__(self, args):
-        self.args = args
-
-    def initializer(self):
-        # Use Encoder class as a container for global data
-        Encoder.tokenizer = build_tokenizer(self.args)
-
-    def encode(self, text):
-        ids = {}
-        ids = Encoder.tokenizer.tokenize(text)
-        assert len(ids) > 0
-        return ids, len(text)
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group(title='input data')
-    group.add_argument('--input', type=str, required=True,
-                       help='Path to input JSON')
-
-    group = parser.add_argument_group(title='tokenizer')
-    group.add_argument('--tokenizer-type', type=str, default='YTTMTokenizer',
-                       choices=['BertWordPieceLowerCase','BertWordPieceCase',
-                                'GPT2BPETokenizer', 'SentencePieceTokenizer'],
-                       help='What type of tokenizer to use.')
-    group.add_argument('--vocab-file', type=str, default=None,
-                       help='Path to the vocab file')
-    group.add_argument('--merge-file', type=str, default=None,
-                       help='Path to the BPE merge file (if necessary).')
-
-    group = parser.add_argument_group(title='output data')
-    group.add_argument('--output-prefix', type=str, required=True,
-                       help='Path to binary output file without suffix')
-    group.add_argument('--dataset-impl', type=str, default='mmap',
-                       choices=['lazy', 'cached', 'mmap'])
-
-    group = parser.add_argument_group(title='runtime')
-    group.add_argument('--workers', type=int, default=1,
-                       help='Number of worker processes to launch')
-    group.add_argument('--log-interval', type=int, default=100,
-                       help='Interval between progress updates')
-    args = parser.parse_args()
-    args.keep_empty = False
-
-    # some default/dummy values for the tokenizer
-    args.rank = 0
-    args.make_vocab_size_divisible_by = 128
-    args.tensor_model_parallel_size = 1
-    args.vocab_extra_ids = 0
-
-    return args
-
-def main():
-    args = get_args()
-    startup_start = time.time()
-
-    print("Opening", args.input)
-    fin = open(args.input, 'r', encoding='utf-8')
-
-    encoder = Encoder(args)
-    tokenizer = build_tokenizer(args)
-    pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
-    encoded_sentences = pool.imap(encoder.encode, fin, 25)
-
-    print(f"Vocab size: {tokenizer.vocab_size}")
-    print(f"Output prefix: {args.output_prefix}")
-    output_bin_file = "{}.bin".format(args.output_prefix)
-    output_idx_file = "{}.idx".format(args.output_prefix)
-    builder = indexed_dataset.make_builder(output_bin_file,
-                                           impl=args.dataset_impl,
-                                           vocab_size=tokenizer.vocab_size)
-
-    startup_end = time.time()
-    proc_start = time.time()
-    total_bytes_processed = 0
-    print("Time to startup:", startup_end - startup_start)
-
-    for i, (sentence, bytes_processed) in enumerate(encoded_sentences, start=1):
-        total_bytes_processed += bytes_processed
-        builder.add_item(torch.IntTensor(sentence))
-        # documents contain only one sentence.
-        builder.end_document()
-        if i % args.log_interval == 0:
-            current = time.time()
-            elapsed = current - proc_start
-            mbs = total_bytes_processed/elapsed/1024/1024
-            print(f"Processed {i} sentences",
-                  f"({i/elapsed} sentences/s, {mbs} MB/s).",
-                  file=sys.stderr)
-
-    builder.finalize(output_idx_file)
-
-if __name__ == '__main__':
-    main()
-
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/README.md b/toolbox/Megatron-DeepSpeed/tools/retro/README.md
deleted file mode 100644
index 54c6854098c8241cdaa0ebac4b788cc6000c05b8..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/README.md
+++ /dev/null
@@ -1,226 +0,0 @@
-This directory contains a collection of tools for building the retrieval database and pretraining neighbors for Retro. This preprocessing pipeline is broken into 3 main stages:
-
-1. **Build retrieval chunk database** : Used for retrieving neighbors and continuation chunks, which are then passed through the retrieval encoder.
-2. **Build index for similarity search** : Train and build a search index for querying chunk neighbors.
-3. **Query pretraining neighbors** : For matching pretraining samples to database chunks. Neighbors are generated separately for training, validation, and test datasets.
-
-The following overview goes into more detail on the pipeline, code structure, usage, and pretraining.
-
-<!-- ################ contents ################ -->
-# Contents
-
-  * [Quick start](#quick-start)
-  * [Stages](#stages)
-  * [Code structure](#code-structure)
-  * [Arguments](#arguments)
-  <!-- * [Pretraining](#pretraining) -->
-
-<!-- ################ quick start ################ -->
-# Quick start
-
-See `examples/get_preprocess_cmd.sh` for example arguments.
-
-Key files:
-
-- `main.py` : Entry point.
-- `examples/get_preprocess_cmd.sh` : Build preprocessing command (for `main.py`).
-- `examples/preprocess_data.sh` : Run preprocessing (calls `get_preprocess_cmd.sh`, `main.py`).
-
-Use `--retro-tasks` to move through the preprocessing pipeline.
-
-- Simplest setup (builds everything): `--retro-tasks build`
-- Alternatively, for tuning compute resources, run stages independently:
-  - Build retrieval database: `--retro-tasks db-build`
-  - Build search index: `--retro-tasks index-build`
-  - Query neighbors: `--retro-tasks pretraining-query-neighbors`
-
-Sample code flow:
-
-- `main.py` : Entry point (e.g., using `--retro-tasks X`).
-- `db/build.py` : Build retrieval database.
-- `index/build.py` : Build search index. Calls the following two files:
-  - `index/train.py` : Train index on subset of database.
-  - `index/add.py` : Add database chunks to index.
-- `pretraining/query.py` : Query pretraining samples for database neighbors (saved to disk and used during pretraining).
-
-<!-- ################ stages ################ -->
-# Stages
-
-### Build retrieval chunk database
-
-This *database* (stored as a 2-D array, NOT a relational database) consists of a list of chunks (traditionally length 64) extracted from the original GPT token dataset. This is simply a consecutive, non-overlapping chunking of the token dataset. Chunking only takes place within a document, and therefore the final chunk of each document has length: 1 <= chunk_length <= max_chunk_length.
-
-We discard chunks that would convert to an empty Bert sequence (rare case, happens ~1/100,000 chunks in our case), since we use Bert embeddings for building our index. Thus, the total number of chunks in the database will be slightly less than a naive calculation.
-
-### Build index for similarity search
-
-To match pretraining chunks to database chunks, a search index must be built to perform this querying. We use Faiss (https://github.com/facebookresearch/faiss) for training and building this index. Generally, the index is trained on a subset of all chunks in the database (specified via `--retro-nchunks-sampled`). After training, all chunks are added into the index, to be available during querying.
-
-Indexes only accept 1-D floating point vectors for training and adding, so each chunk must first be embedded before passing to the index for either training or adding. We use Bert embeddings for this purpose, and the embeddings are generated automatically within the pipeline.
-
-### Query pretraining neighbors
-
-To ensure fast Retro pretraining, the database neighbors for pretraining samples are pre-computed and saved to disk, for efficient access within the Retro dataset. In this stage, the pretraining datasets (training, validation, and test) are iterated, each sample is broken into chunks, and the chunks are used for querying the index. Similar to when building the index, each chunk is embedded (via Bert) before querying the index.
-
-The saved neighbors are labeled with unique dataset properties (i.e., seed, sequence length, number of samples, etc.) to ensure the neighbors generated during preprocessing match the neighbors requested during pretraining.
-
-<!-- ################ code structure ################ -->
-# Code structure
-
-### `tools/retro/main.py`
-
-This is the main entry point for Retro preprocessing. Call `main.py --help` to see arguments. Additionally, some Retro arguments are in Megatron's core arguments, so also see `add_retro_args()` section of `megatron/arguments.py` for additional arguments. Two of the most important arguments to customize are `--retro-workdir` and `--retro-tasks`.
-
-- **`--retro-workdir`** : Set the directory in which the preprocessing pipeline saves its datasets and configuration files. This argument should remain consistent for a full pass through the pipeline, and for pretraining.
-
-- **`--retro-tasks`** : Set the stages of preprocessing to perform. As mentioned previously, the three high-level stages are: 1) build retrieval database, 2) build search index, and 3) query pretraining neighbors. `--retro-tasks` can be used to either run the full pipeline, or run each of these stages in isolation. The latter case is useful for tuning compute resources for each stage. For example, index training utilizes GPUs and requires relatively less time, while querying neighbors uses the CPU and is a relatively slow process. Example tasks include:
-
-  - **`--retro-tasks build`** : Run entire preprocessing pipeline.
-  - **`--retro-tasks db-build`** : Build retrieval database.
-  - **`--retro-tasks index-build`** : Train and build search index.
-  - **`--retro-tasks pretraining-query-neighbors`** : Query pretraining neighbors.
-
-Multiple tasks can be specified by separating with commas (e.g., `--retro-tasks db-build,index-build`). Additionally, various 'miscellaneous' tasks are currently including, primarily for validating data for each stage; these task names can be seen in `main.py`.
-
-### `tools/retro/examples`
-
-Example scripts for setting arguments and launch Retro preprocessing. The key files here are:
-
-- **`get_preprocess_cmd.sh`** : Sets up arguments and command for preprocessing. **Important note**: this script assumes a few environment variables are already set before it is called. Please see the `Environment vars.` section at the top of this file. Generally, environment variables must be set to determine the location of Retro workdirs, input datasets, and GPT and Bert model information.
-- **`preprocess_data.sh`** : Calls `get_preprocess_cmd.sh` to get arguments, and then calls `main.py` to launch preprocessing.
-- **`pretrain_model.sh`** : Example script for pretraining on Wikipedia data, after preprocessing is complete.
-
-### `tools/retro/db`
-
-Build the retrieval chunk database. The key files here are:
-
-- **`build.py`** : Entry point for building the database. This code is responsible for iterating the input datasets (i.e., `--data-path`), parsing each dataset into consecutive chunks, checking for empty Bert (Wordpiece) conversions, and storing this information to disk. Two databases are created: 1) the retrieval database, and 2) a sampled database used for training the search index.
-- **`dataset.py`** : Defines database class, for iterating or accessing chunks in the database. Each chunk contains its tokens, Bert conversion length, and dataset index.
-
-Input data:
-
-<!-- - Token datasets, as generated by `tools/preprocess_data.py`. Each dataset should include a `.bin` and `.idx` file. Multiple datasets can be specified by using a blended configuration (see `--data-path` in `megatron/arguments.py`). -->
-- Token datasets, as loaded by `gpt_dataset.py`. Multiple datasets can be specified by using a blended configuration (see `--data-path` in `megatron/arguments.py`).
-
-Output data:
-
-- **`<RETRO_WORKDIR>/db/merged/train.hdf5`** : The main retrieval database. (*Database* here is used to denote a list of indexed chunks, rather than a *relational database*.) The chunks in this database are added to the search index, and are used for retrieval during pretraining. This file contains a single dataset `'chunks'`, which contains 5 columns:
-
-  - `dataset_idx` : Dataset index, from list of blended indexed datasets.
-  - `document_idx` : Document index within dataset.
-  - `chunk_start_idx` : Chunk's starting token index within document.
-  - `chunk_end_idx` : Chunk's ending token index (exclusive) within document.
-  - `bert_chunk_length` : Length of Bert token sequence, after converting from GPT.
-
-- **`<RETRO_WORKDIR>/db/merged/sampled.hdf5`** : Subset of training database that is used for training the search index. This file has the same structure as detailed above. In general, this database is significanly smaller than the `train.hdf5` database, since the search index only needs a relatively small number of samples to understand the data's structure. After training, all chunks in the main database (`train.hdf5`) are *added* to the search index.
-
-### `tools/retro/index`
-
-Build the search index. The key files here are:
-
-- `build.py` : Entry point for building the search index. First, the index is trained on the sampled chunk database (see above) by calling `train.py`, and then all chunks for the full database are added to the index by calling `add.py`. Note that training requires first embedding (using Bert) all chunks (a parallel operation), and then loading these embeddings and training the index (a sequential operation), so it's best to change one's compute setup after all chunks have been embedded and saved to disk.
-- `indexes/faiss_base.py` : Wrapper class for building a Faiss index, following the standard `train()` and `add()` operations.
-- `indexes/faiss_par_add.py` : Similar to above, except it uses an embarrassingly parallel (multi-node, multi-process) `add()` operation. Vectors are first added to separate index copies, and then merged together.
-
-Input data:
-
-- **`<RETRO_WORKDIR>/db/merged/sampled.hdf5`** : Chunks used for training the search index.
-- **`<RETRO_WORKDIR>/db/merged/train.hdf5`** : Chunks used for adding to the *trained* search index.
-
-Output data:
-
-- **`<RETRO_WORKDIR>/index/<RETRO_INDEX_TYPE>/<RETRO_INDEX_STR>/added.faissindex`** : The final index, which has been trained and has had all database chunks added to it. This index is ready for querying neighbors. Here, `RETRO_INDEX_TYPE` and `RETRO_INDEX_STR` correspond to the same-name arguments `--retro-index-type` (e.g., `faiss-par-add`) and `--retro-index-str` (e.g., `OPQ32_256,IVF4194304_HNSW32,PQ32`).
-- **`<RETRO_WORKDIR>/index/<RETRO_INDEX_TYPE>/<RETRO_INDEX_STR>/empty.faissindex`** : Generally can be discarded once `added.faissindex` has been built, but this file contains the *post-training*, *pre-adding* index. Useful for debugging or building other indexes.
-
-### `tools/retro/pretraining`
-
-Query the pretraining datasets (training, validation, test) for their neighbors within the database. Neighbors are queried during preprocessing -- rather than during pretraining -- because querying is a fairly slow operation, so it would be a bottleneck if performed during pretraining. Queried neighbors are tagged with their unique identifying information (e.g., `train_indexmap_27662746ns_2048sl_1234s`), so as to avoid incorrect references during pretraining. The key files here are:
-
-- **`query.py`** : Entry point for querying. The pretraining datasets are iterated, and each chunk within each sample is queried using the search index. These neighbors are filtered by discarding any database chunks that fall within the same document as any chunk within a pretraining sample.
-- **`chunk_dataset.py`** : This creates an iterable 'chunk' dataset form of a pretraining dataset. This is just a light wrapper, but makes it easier to deterministically iterate and assign IDs to each chunk in a sample dataset.
-- **`retro_dataset.py`** : The Retro dataset used for pretraining (not used in preprocessing). Each sample returns the sample tokens, along with neighbor tokens for each chunk within the sample.
-
-Input data:
-
-- Token datasets, as loaded by `gpt_dataset.py`.
-- **`<RETRO_WORKDIR>/index/<RETRO_INDEX_TYPE>/<RETRO_INDEX_STR>/added.faissindex`** : The trained index, with all database chunks added to it (see previous section for details).
-
-Output data:
-
-- **`<RETRO_WORKDIR>/{train,valid,test}_XXns_YYsl_ZZs/WW.hdf5`** : These directories/files contain the indexes of neighbors for each chunk within each sample of the pretraining datasets. Each directory (e.g., `train_indexmap_2047435ns_2048sl_1234s`) contains a list of HDF5 files (e.g., one file might be called `0075700000-0075800000.hdf5`). Each HDF5 file contains a consecutive subset of neighbor IDs for a given chunk, for indexing into the main retrieval database. All HDF5 files taken together within a given directory, represent the entire set of neighbors for a dataset. The size of these HDF5 files is determined by the argument `--retro-block-size`. The `XX`, `YY`, `ZZ`, `WW` notation above denotes the dataset properties that are used for uniquely tagging the neighbor files, to ensure compatibility during model pretraining. These neighbor files are ultimated used by `retro_dataset.py` during pretraining, for building Retro samples.
-
-### `tools/retro/cli`
-
-Inspect preprocessed data. To use the CLI, open a Python terminal via the `python` command, and then load a Retro workdir with the following:
-
-```
-from tools.retro.cli import retro
-retro.init("/path/to/retro/workdir")
-```
-
-This initializes Megatron, and prepares the Retro data for inspection. See the printed usage for available functions. Several routines are included for viewing data in the retrieval database and viewing pretraining samples and neighbors. For example:
-
-```python
-retro.get_db_num_indexed_datasets() # 15
-retro.get_db_chunk_text(92874113) # 'research project at ...  and philosophy'
-retro.get_pt_sample('train', 62005) # '[16084, 26158, 25387 ..., 6898, 9568]'
-```
-
-Most methods within the CLI are prefixed to denote the data being inspected:
-
-- **'db'** : Retrieval database (i.e., chunk tokens, document IDs, and dataset IDs)
-- **'pt'** : Pretraining datasets (i.e., sample tokens and neighbor tokens)
-
-### `tools/retro/utils.py`
-
-A collection of utility methods. Most importantly, this contains:
-
-- **`def get_gpt_tokenizer()`** : Get the GPT tokenizer.
-- **`def get_bert_tokenizer()`** : Get the Bert tokenizer.
-- **`class GPTToTextDataset`** : Wrapper class that converts GPT (BPE) samples to raw text.
-
-### `tools/bert_embedding`
-
-Generate Bert embeddings. The main files here are:
-
-- **`embed.py`** : Entry point for generating embeddings, and contains the two main embedding classes, `BertEmbedder` and `DiskDataParallelBertEmbedder` (more below). This file contains code for generating Megatron embeddings, while the file below contains code for Huggingface embeddings.
-- **`huggingface.py`** : Used by `embed.py` when the embedder is configured (see below) to output Huggingface embeddings.
-- **`dataset.py`** : Wrapper class for converting a raw-text dataset to Bert (Wordpiece) tokens.
-
-The Bert embeddings can be configured along two axes. The first axis is the output type:
-
-- **`class BertEmbedder`** : This class takes a raw-text dataset as input, generates its embeddings, and returns a Numpy array. The main functions are `embed_text_dataset` (accepts a raw-text dataset) and `embed_text` (accepts a string).
-- **`class DiskDataParallelBertEmbedder`** : This class wraps `BertEmbedder`, and rather than returning a Numpy array, it saves the embeddings to disk. Additionally, this class automatically splits data across data parallel ranks (using interleaving), and also processes data in a specified `block_size` (e.g., 1,000,000).
-
-The second axis is the type of embedding model to use, controlled by the argument `--bert-embedder-type`:
-
-- **`--bert-embedder-type megatron`** : Use Megatron's Bert model. The specific model used is dependent on the loaded checkpoint, vocab file, and tokenizer.
-- **`--bert-embedder-type huggingface`** : Use Huggingface's `bert-large-cased`. (*Note*: Huggingface's inclusion is likely to be deprecated; and there is no ability to configure cased/uncased.)
-
-### Pretraining
-
-- **`pretrain_retro.py`** : Launch script for pretraining Retro. Similar to `pretrain_gpt.py`, except this script handles loading neighbor tokens and setting up the neighbor attention mask.
-<!-- - `megatron/data/gpt_dataset.py` : ? -->
-- **`megatron/model/retro_transformer.py`** : Implementation of Retro model, including the main transformer, the retrieval encoder, and chunked cross-attention layers. Note that currently, `retro_transformer.py` contains several classes that are nearly identical to `transformer.py`, except for 1 or 2 lines, due to code changes that are yet to be integrated.
-- **`tools/retro/pretraining/retro_dataset.py`** : The Retro dataset used for pretraining (not used in preprocessing). Each sample returns the sample tokens, along with neighbor tokens for each chunk within the sample.
-
-
-<!-- ################ arguments ################ -->
-# Arguments
-
-See `tools/retro/main.py`'s `add_retro_args()` and `megatron/arguments.py`'s `_add_retro_args()` for details and descriptions. Here we list some particularly important arguments:
-
-- `--retro-workdir` : Mentioned previously, this argument determines the directory in which a set of Retro data is stored (during preprocessing) and loaded (during pretraining). Any change in this directory during preprocessing may result in preprocessing starting over from scratch, and any change before pretraining will result in pretraining throwing an error.
-- Preprocessing
-  - `--retro-gpt-chunk-length` : Retro chunk length (e.g., 64 in original paper).
-  - `--retro-tasks` : Comma-separated list of preprocessing tasks. Generally, the `build` task is the simplest way to run the preprocessing pipeline. For finer control, individual stages can be run by using tasks (in order): `db-build`, `index-build`, and `pretraining-query-neighbors`.
-  - `--retro-index-str` : Faiss index string that defines the index configuration. This will vary based on data size, compute/disk setup, and user needs. For example, this string looks something like `IVF262144_HNSW32,Flat` or `OPQ32_256,IVF4194304_HNSW32,PQ32`.
-- Pretraining
-  - `--retro-add-retriever` : Must be used to select Retro model.
-  - `--retro-num-neighbors` : Number of neighbors to retrieve from the retrieval database (defaults to 2).
-  - `--retro-num-retrieved-chunks` : For each neighbor, the number consecutive chunks to retrieve, including the initial neighbor (defaults to 2).
-
-<!-- ################ pretraining ################ -->
-<!-- # Pretraining -->
-<!-- - New retro args in arguments.py (add_retro_args). -->
-<!-- - Most important arg is `--retro-add-retriever`. -->
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/__init__.py b/toolbox/Megatron-DeepSpeed/tools/retro/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/cli/__init__.py b/toolbox/Megatron-DeepSpeed/tools/retro/cli/__init__.py
deleted file mode 100644
index 2b607770ad066475b7a845fcdac12487d38737ae..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/cli/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-from .cli import retro
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/cli/__main__.py b/toolbox/Megatron-DeepSpeed/tools/retro/cli/__main__.py
deleted file mode 100644
index f5973d0a673962ee28bcd9ff3398388e360b195c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/cli/__main__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import os
-
-from . import retro
-
-
-if __name__ == "__main__":
-    retro.init(os.environ["RETRO_WORKDIR"])
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/cli/cli.py b/toolbox/Megatron-DeepSpeed/tools/retro/cli/cli.py
deleted file mode 100644
index 1d96480f3898b4f3284d97a970b7f13e5d114214..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/cli/cli.py
+++ /dev/null
@@ -1,299 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import json
-import numpy as np
-import os
-import torch
-import types
-
-from megatron_ds.global_vars import set_global_variables, set_retro_args
-from megatron_ds.initialize import (
-    initialize_megatron,
-    _initialize_distributed,
-    _set_random_seed,
-    _compile_dependencies,
-)
-from tools.retro.db.utils import (
-    get_indexed_dataset_infos as get_db_indexed_dataset_infos,
-    get_merged_train_dataset as get_db_dataset,
-)
-from tools.retro.main import add_retro_args
-from tools.retro.query.retro_dataset import get_retro_datasets
-from tools.retro.utils import get_args_path, get_bert_tokenizer, get_gpt_tokenizer
-
-
-def shorten_str(s, n):
-    s = "\\n".join(s.splitlines())
-    return s if len(s) <= n else "%s ... %s" % (s[:n//2], s[-n//2:])
-
-
-class retro:
-
-    args = None
-
-    ##############################################
-    # initialize.
-    ##############################################
-
-    @classmethod
-    def parse_dtype_str(cls, dtype_str):
-        return {
-            "torch.float16" : torch.float16,
-            "torch.float32" : torch.float32,
-            "torch.bfloat16" : torch.bfloat16,
-        }[dtype_str]
-
-    @classmethod
-    def init_megatron(cls, workdir):
-        '''Custom initialization of Megatron.'''
-
-        # Load args.
-        args_path = get_args_path(workdir)
-        assert os.path.exists(args_path), "args.json not found in workdir."
-        with open(args_path) as f:
-            cls.args = types.SimpleNamespace(**json.load(f))
-            cls.args.retro_workdir = workdir # just in case workdir moved
-            cls.args.rank = 0 # override env
-            cls.args.world_size = 1 # override env
-            cls.args.params_dtype = cls.parse_dtype_str(cls.args.params_dtype)
-
-        set_global_variables(cls.args)
-        set_retro_args(cls.args)
-        _initialize_distributed()
-        _set_random_seed(cls.args.seed, cls.args.data_parallel_random_init)
-        _compile_dependencies()
-
-    @classmethod
-    def init(cls, workdir):
-        '''Initialize Megatron, tokenizers, and datasets.'''
-
-        # Load args.
-        cls.init_megatron(workdir)
-
-        cls.tokenizers = types.SimpleNamespace(
-            gpt=get_gpt_tokenizer(),
-            bert=get_bert_tokenizer(),
-        )
-
-        # Load data.
-        cls.db_indexed_dataset_infos = get_db_indexed_dataset_infos()
-        cls.db_dataset = get_db_dataset()
-        pt_train_ds, pt_valid_ds, _ = get_retro_datasets(verify_sizes=False)
-        cls.pt_datasets = types.SimpleNamespace(
-            train=pt_train_ds,
-            valid=pt_valid_ds,
-        )
-
-        # Retrieve max saved neighbors.
-        for key in vars(cls.pt_datasets):
-            getattr(cls.pt_datasets, key).num_neighbors = \
-                cls.args.retro_query_num_neighbors_save
-
-        # Print usage.
-        cls.print_usage()
-
-    ##############################################
-    # utils.
-    ##############################################
-
-    @classmethod
-    def gpt_to_text(cls, token_ids):
-        '''GPT tokens to text.'''
-        return cls.tokenizers.gpt.detokenize(token_ids.tolist()
-                                             if isinstance(token_ids, np.ndarray)
-                                             else token_ids)
-
-    @classmethod
-    def text_to_bert(cls, text):
-        '''Text to Bert tokens.'''
-        return cls.tokenizers.bert.tokenize(text)
-
-    ##############################################
-    # chunk db.
-    ##############################################
-
-    @classmethod
-    def get_db_num_indexed_datasets(cls):
-        '''Number of indexed datasets within blendable dataset.'''
-        return len(cls.db_indexed_dataset_infos)
-
-    @classmethod
-    def get_db_indexed_dataset_infos(cls):
-        '''Dataset infos, including number of training & sampled sets.'''
-        return [(info["ratio"], info["name"])
-                for info in cls.db_indexed_dataset_infos]
-
-    @classmethod
-    def get_db_dataset(cls):
-        return cls.db_dataset
-
-    @classmethod
-    def get_db_num_chunks(cls):
-        '''Number of DB chunks.'''
-        return len(cls.get_db_dataset())
-
-    @classmethod
-    def get_db_chunk_gpt(cls, idx):
-        '''Get DB chunk as GPT token ids.'''
-        return cls.get_db_dataset()[idx]["text"].tolist()
-
-    @classmethod
-    def get_db_chunk_bert(cls, idx):
-        '''Get DB chunk as Bert token ids.'''
-        return cls.text_to_bert(cls.get_db_chunk_text(idx))
-
-    @classmethod
-    def get_db_chunk_text(cls, idx):
-        '''Get DB chunk as text.'''
-        return cls.gpt_to_text(cls.get_db_chunk_gpt(idx))
-
-    @classmethod
-    def get_db_chunk_and_continuation_text(cls, idx):
-        '''Get DB chunk along with continuation, as text.'''
-
-        # Modulus used here to match original implementation (i.e., last
-        # chunks continuation wraps around to first chunk).
-        return [
-            cls.get_db_chunk_text(idx),
-            cls.get_db_chunk_text((idx + 1) % len(cls.get_db_dataset())),
-        ]
-
-    ##############################################
-    # pretraining corpus.
-    ##############################################
-
-    @classmethod
-    def get_pt_num_samples_and_chunks(cls, data_key):
-        '''Number of samples & chunks (e.g., 32*n_samples) in corpus.'''
-        assert hasattr(cls.pt_datasets, data_key), \
-            "pretraining set '%s' not found (choices: %s)." % (
-                data_key, ", ".join(vars(cls.pt_datasets).keys()))
-        chunk_dataset = getattr(cls.pt_datasets, data_key).chunk_dataset
-        return (
-            len(chunk_dataset.sample_dataset),
-            len(chunk_dataset),
-        )
-
-    @classmethod
-    def get_pt_num_samples(cls, data_key):
-        '''Number of pretraining samples.'''
-        return cls.get_pt_num_samples_and_chunks(data_key)[0]
-
-    @classmethod
-    def get_pt_num_chunks(cls, data_key):
-        '''Number of pretraining chunks (e.g., 32*n_samples).'''
-        return cls.get_pt_num_samples_and_chunks(data_key)[1]
-
-    @classmethod
-    def get_pt_dataset(cls, data_key):
-        return getattr(cls.pt_datasets, data_key)
-
-    @classmethod
-    def get_pt_sample(cls, data_key, idx):
-        return getattr(cls.pt_datasets, data_key)[idx]
-
-    @classmethod
-    def get_neighbor_tokens(cls, sample_id, chunk_id, data_key="train"):
-        try:
-            sample = cls.get_pt_sample(data_key, sample_id)
-            sample_token_ids = sample["text"]
-            chunk_length = cls.args.retro_gpt_chunk_length
-            chunk_start_idx = chunk_id * chunk_length
-            chunk_end_idx = min(sample_token_ids.shape[0],
-                                chunk_start_idx + chunk_length)
-            chunk_token_ids = sample_token_ids[chunk_start_idx:chunk_end_idx]
-            neighbor_token_ids = sample["neighbor_tokens"][chunk_id]
-            return {
-                "chunk_tokens" : chunk_token_ids,
-                "neighbor_tokens" : neighbor_token_ids,
-            }
-        except:
-            return None
-
-    @classmethod
-    def print_neighbor_texts(cls, sample_id, chunk_id, data_key="train"):
-        tokens = cls.get_neighbor_tokens(sample_id, chunk_id, data_key)
-        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
-        try:
-            print("PRETRAINING CHUNK:")
-            print("  - %s" % shorten_str(cls.gpt_to_text(tokens["chunk_tokens"]), 150))
-            print("NEIGHBOR_CHUNKS:")
-            for token_ids in tokens["neighbor_tokens"]:
-                print("  - %s" % shorten_str(cls.gpt_to_text(token_ids), 150))
-        except:
-            print("<no neighbors for sample %d>" % sample_id)
-
-    ##############################################
-    # usage.
-    ##############################################
-
-    @classmethod
-    def print_usage(cls):
-        '''Print usage.'''
-
-        print()
-        print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
-        print("examples ... [ *note*: 'db' = chunk db; 'pt' = pretraining corpus. ]")
-        print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
-
-        print()
-        print("~~~~ indexed datasets ~~~~")
-        print("retro.get_db_num_indexed_datasets() : %s" %
-              cls.get_db_num_indexed_datasets())
-        print("retro.get_db_indexed_dataset_infos() :")
-        for i, (ratio,prefix) in enumerate(cls.get_db_indexed_dataset_infos()):
-            print("  %s(%f, %s)%s" % (
-                "[" if i == 0 else " ",
-                ratio,
-                prefix,
-                "]" if i == len(cls.db_indexed_dataset_infos) - 1 else ",",
-            ))
-
-        print()
-        print("~~~~ counts ~~~~")
-        print("retro.get_db_num_chunks : %d." % cls.get_db_num_chunks())
-
-        print()
-        for sq_key in ("sample", "chunk"):
-            for data_key in ("train", "valid"): # test?
-                print("retro.get_pt_num_%ss('%s') : %d." % (
-                    sq_key, data_key,
-                    getattr(cls, f"get_pt_num_{sq_key}s")(data_key)))
-
-        print()
-        print("~~~~ tokens, text ~~~~")
-        print("retro.get_db_chunk_gpt(chunk_id) : %s" %
-              shorten_str(str(retro.get_db_chunk_gpt(0)), 50))
-        print("retro.get_db_chunk_bert(chunk_id) : %s" %
-              shorten_str(str(retro.get_db_chunk_bert(0)), 50))
-        print("retro.get_db_chunk_text(chunk_id) : %s" %
-              shorten_str(retro.get_db_chunk_text(0).strip(), 50))
-        print("retro.get_db_chunk_and_continuation_text(chunk_id) :")
-        for i, t in enumerate(retro.get_db_chunk_and_continuation_text(0)):
-            print("  %s'%s'%s" % (
-                "[" if i == 0 else " ",
-                shorten_str(t.strip().replace("\n", " "), 50),
-                "]" if i == 1 else ",",
-            ))
-
-        sample = cls.get_pt_sample("train", 0)
-        sample_chunk_id = sample["neighbor_tokens"].shape[0] // 2
-        sample_neighbor_id = 0
-        print()
-        print("retro.get_pt_sample('train', sample_id) :")
-        print("  {")
-        for k, v in sample.items():
-            print("    '%s' : %s" % (k, shorten_str(str(v), 50)))
-        print("  }")
-
-        print()
-        print("(e.g., sample = retro.get_pt_sample(...))")
-        print()
-        print("  sample['text'].shape : %s" % str(sample["text"].shape))
-        print("  sample['neighbor_tokens'].shape : %s" % str(sample["neighbor_tokens"].shape))
-        print("  sample['text'] : %s" % shorten_str(str(sample["text"]), 50))
-        print("  sample['neighbor_tokens'][17][1] : %s" % shorten_str(str(sample["neighbor_tokens"][sample_chunk_id][sample_neighbor_id]), 50))
-        print("  retro.gpt_to_text(sample['text']) : %s" % shorten_str(cls.gpt_to_text(sample["text"]), 50))
-        print("  retro.gpt_to_text(sample['neighbor_tokens']) : %s" % shorten_str(cls.gpt_to_text(sample["neighbor_tokens"][sample_chunk_id][sample_neighbor_id]), 50))
-
-        print("+++++++++++++++++++++++++++++++++++++++++++++++++++")
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/db/__init__.py b/toolbox/Megatron-DeepSpeed/tools/retro/db/__init__.py
deleted file mode 100644
index d1bf23d9663c0d10699af9ca747e2a04725424d0..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/db/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-from .build import build_db
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/db/build.py b/toolbox/Megatron-DeepSpeed/tools/retro/db/build.py
deleted file mode 100644
index 22b67a03f88b9915d441990a664633afe6eeedb2..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/db/build.py
+++ /dev/null
@@ -1,497 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-from collections import defaultdict
-from concurrent.futures import as_completed, ProcessPoolExecutor
-from functools import reduce
-import glob
-import json
-import numpy as np
-import os
-from pathlib import Path
-import threading
-import torch
-from tqdm import tqdm
-import types
-
-from megatron_ds import get_retro_args, print_rank_0
-from megatron_ds.data.indexed_dataset import make_dataset as make_indexed_dataset
-from megatron_ds.tokenizer.tokenizer import (
-    _BertWordPieceTokenizer,
-    _GPT2BPETokenizer,
-)
-from tools.bert_embedding.utils import get_missing_blocks_by_rank
-from tools.retro.external_libs import h5py
-from tools.retro.utils import get_gpt_tokenizer, get_bert_tokenizer
-
-from .utils import (
-    get_indexed_dataset_infos,
-    get_indexed_dataset_infos_path,
-    get_individual_db_dir,
-    get_individual_chunk_db,
-    get_individual_doc_offsets,
-    get_merged_dataset,
-    get_merged_db_path_map,
-    save_indexed_dataset_infos,
-)
-
-
-def init_indexed_dataset_infos():
-    '''Gather meta-info about each indexed dataset.
-
-    The returned info array allows for easy access to the configuration, and
-    helps remove ambiguity.
-    '''
-
-    args = get_retro_args()
-
-    assert len(args.data_path) % 2 == 0, \
-        "currently, only blendable dataset is supported."
-
-    # Dataset infos.
-    infos = []
-    for i in range(0, len(args.data_path), 2):
-        ratio = float(args.data_path[i])
-        prefix = args.data_path[i + 1]
-        path = prefix + ".bin"
-        name = os.path.basename(prefix)
-        assert os.path.exists(path), "couldn't find '%s'." % path
-        infos.append({
-            "ratio" : ratio,
-            "prefix" : prefix,
-            "path" : path,
-            "name" : name,
-            "db_dir" : get_individual_db_dir(name),
-            "dataset" : make_indexed_dataset(prefix, "mmap", True),
-        })
-
-    return infos
-
-
-def build_partial_db(
-        dataset_idx,
-        n_datasets,
-        indexed_dataset,
-        block_id,
-        n_blocks,
-        block,
-        proc_id,
-        n_procs,
-        tokenizers,
-):
-    '''Process a document index range of the indexed dataset.
-
-    The chunk database is built in parallel blocks, since de-tokenizing &
-    re-tokenizing for Bert-length computation is expensive. This method
-    iterates each document and extracts sequential 'chunk-length' sequences
-    from each document.
-    '''
-
-    args = get_retro_args()
-
-    # Document start/end indexes.
-    doc_range = block["range"]
-    n_docs = doc_range[1] - doc_range[0]
-    n_docs_per_proc = int(np.ceil(n_docs / n_procs))
-    doc_start_id = doc_range[0] + proc_id * n_docs_per_proc
-    doc_end_id = min(doc_range[1], doc_start_id + n_docs_per_proc)
-
-    # Print progress.
-    progress_proc_ids = set(range(n_procs)) \
-        if torch.distributed.get_rank() == 0 else set()
-    if proc_id in progress_proc_ids:
-        print(" > building partial chunk db, proc %d / %d, docs %d:%d / %d."%(
-            proc_id,
-            n_procs,
-            doc_start_id,
-            doc_end_id,
-            n_docs,
-        ))
-
-    # Progress bars (snapshot of overall progress).
-    doc_id_iter = range(doc_start_id, doc_end_id)
-    pbar = tqdm(doc_id_iter) \
-        if proc_id in progress_proc_ids else \
-           doc_id_iter
-
-    # Iterate documents & parse chunks.
-    chunk_db_valid = []
-    chunk_db_invalid = []
-    doc_size_map = {}
-    for doc_id in pbar:
-
-        # Progress description.
-        try:
-            pbar.set_description("ds %d / %d, block %d / %d, proc %d / %d." % (
-                dataset_idx,
-                n_datasets,
-                block_id,
-                n_blocks,
-                proc_id,
-                n_procs))
-        except:
-            pass
-
-        # Remove EOD token.
-        doc = indexed_dataset.get(doc_id)
-        if doc[-1].item() == tokenizers.gpt.eod:
-            doc = doc[:-1]
-        doc_len = len(doc)
-
-        # Chunk start/end indexes.
-        chunk_start_idxs = list(range(0, doc_len, args.retro_gpt_chunk_length))
-        chunk_end_idxs = [min(doc_len, s + args.retro_gpt_chunk_length)
-                          for s in chunk_start_idxs]
-
-        # Re-tokenize each chunk to Bert/Wordpiece (empty bert -> 'invalid').
-        doc_size_map[doc_id] = 0
-        for i, chunk_start_idx in enumerate(chunk_start_idxs):
-
-            # Re-tokenize.
-            chunk_end_idx = chunk_end_idxs[i]
-            gpt_token_ids = indexed_dataset.get(
-                idx=doc_id,
-                offset=chunk_start_idx,
-                length=chunk_end_idx - chunk_start_idx,
-            )
-            text = tokenizers.gpt.detokenize(gpt_token_ids.tolist())
-            bert_token_ids = tokenizers.bert.tokenize(text)
-
-            # 'Valid' for non-empty Bert chunks; 'invalid' otherwise.
-            if len(bert_token_ids) == 0:
-                _chunk_db = chunk_db_invalid
-            else:
-                _chunk_db = chunk_db_valid
-                doc_size_map[doc_id] += 1
-            _chunk_db.append((
-                doc_id,
-                chunk_start_idx,
-                chunk_end_idx,
-                len(bert_token_ids),
-            ))
-
-    return proc_id, chunk_db_valid, chunk_db_invalid, doc_size_map
-
-
-def build_individual_db(dataset_idx, n_datasets, dataset_info, tokenizers):
-    '''Process a single indexed dataset & extract chunks.'''
-
-    args = get_retro_args()
-
-    # Make directory.
-    db_dir = dataset_info["db_dir"]
-    os.makedirs(db_dir, exist_ok=True)
-
-    # Indexed dataset.
-    indexed_dataset = dataset_info["dataset"]
-
-    # Missing db blocks.
-    n_missing_world, missing_db_blocks = get_missing_blocks_by_rank(
-        db_dir,
-        len(indexed_dataset),
-        args.retro_doc_block_size,
-        validate=lambda f : f["chunks_valid"].shape == (0,) \
-            or f["chunks_valid"].shape[1] == 4)
-
-    # Prevent missing-path-write race condition.
-    torch.distributed.barrier()
-
-    if not missing_db_blocks:
-        return
-
-    # Num processes.
-    if n_missing_world == 1:
-        n_procs = 128
-    elif n_missing_world <= 2:
-        n_procs = 64
-    elif n_missing_world <= 4:
-        n_procs = 32
-    elif n_missing_world <= 8:
-        n_procs = 16
-    else:
-        n_procs = 8
-
-    # Process documents in parallel.
-    with ProcessPoolExecutor(max_workers=n_procs) as executor:
-        for block_idx, block in enumerate(missing_db_blocks):
-
-            if block is not None:
-
-                db_path = block["path"]
-
-                # Build partial dbs.
-                print_rank_0(' > build partial dbs.')
-                futures = []
-                for proc_id in range(n_procs): # not true process id
-                    futures.append(executor.submit(
-                        build_partial_db,
-                        dataset_idx,
-                        n_datasets,
-                        indexed_dataset,
-                        block_idx,
-                        len(missing_db_blocks),
-                        block,
-                        proc_id,
-                        n_procs,
-                        tokenizers,
-                    ))
-                partial_chunk_dbs = []
-                for future in as_completed(futures):
-                    partial_chunk_dbs.append(future.result())
-
-                # Concatenate chunks.
-                partial_chunk_dbs.sort(key=lambda item:item[0]) # sort by proc_id
-                chunk_db_valid = [item
-                                  for partial_chunk_db in partial_chunk_dbs
-                                  for item in partial_chunk_db[1]]
-                chunk_db_invalid = [item
-                                    for partial_chunk_db in partial_chunk_dbs
-                                    for item in partial_chunk_db[2]]
-
-                # Convert to numpy.
-                print_rank_0(' > converting chunk db to numpy.')
-                chunk_db_valid = np.array(chunk_db_valid, dtype="uint32")
-                chunk_db_invalid = np.array(chunk_db_invalid, dtype="uint32")
-
-                # Document offsets.
-                doc_sizes = [(d, s)
-                             for partial_chunk_db in partial_chunk_dbs
-                             for d, s in partial_chunk_db[3].items()]
-                doc_sizes.sort(key = lambda item : item[0])
-                doc_offsets = np.cumsum([item[1] for item in doc_sizes]) \
-                                .astype("uint64")
-                doc_offsets = np.stack((
-                    np.array([item[0] for item in doc_sizes], dtype="uint64"),
-                    doc_offsets), axis=1)
-
-                # Save DB.
-                print_rank_0(" > saving individual db.")
-                with h5py.File(db_path, "w") as f:
-                    dset = f.create_dataset("chunks_valid", data=chunk_db_valid)
-                    dset = f.create_dataset("chunks_invalid",
-                                            data=chunk_db_invalid)
-                    dset = f.create_dataset("doc_offsets", data=doc_offsets)
-
-            # Wait for all ranks to finish block.
-            print_rank_0(" > waiting for all ranks to finish block.")
-            torch.distributed.barrier()
-
-    print_rank_0(" > finished saving individual db.")
-
-
-def build_individual_dbs(indexed_dataset_infos):
-    '''Iterate each indexed dataset & process its chunks.'''
-
-    args = get_retro_args()
-
-    # Tokenizers.
-    tokenizers = types.SimpleNamespace(
-        gpt=get_gpt_tokenizer(),
-        bert=get_bert_tokenizer(),
-    )
-
-    # Build individual DBs.
-    print_rank_0(" > build individual chunk dbs.")
-    for ds_idx, ds_info in enumerate(indexed_dataset_infos):
-
-        # Progress.
-        print_rank_0(" > building individual db, dataset %d / %d ... '%s'." % (
-            ds_idx,
-            len(indexed_dataset_infos),
-            ds_info["name"],
-        ))
-
-        # Process single dataset.
-        build_individual_db(ds_idx, len(indexed_dataset_infos),
-                            ds_info, tokenizers)
-
-
-def update_chunk_counts(indexed_dataset_infos):
-    '''Set n_chunks_train & n_chunks sampled for each individual DB.'''
-
-    args = get_retro_args()
-
-    if torch.distributed.get_rank() != 0:
-        return
-
-    # Data ratio sum (for setting index training chunks).
-    data_ratio_sum = sum([ d["ratio"] for d in indexed_dataset_infos ])
-
-    # Training split size (split at document level).
-    train_fraction = float(args.split.split(",")[0]) / 100
-    assert train_fraction > 0 and train_fraction <= 1
-
-    # Set n_chunks (including n_chunks_sampled for unambiguity).
-    print_rank_0(" > compute n_chunks.")
-    for ds_index, ds_info in enumerate(indexed_dataset_infos):
-
-        db_dir = ds_info["db_dir"]
-        db_paths = sorted(glob.glob(db_dir + "/*.hdf5"))
-
-        # Update counts.
-        ds_info["n_docs"] = len(ds_info["dataset"].doc_idx) - 1
-        ds_info["n_docs_train"] = int(train_fraction * ds_info["n_docs"])
-        ds_info["n_chunks"] = 0 # previously, 'n_chunks_valid'
-        ds_info["n_chunks_train"] = 0
-        ds_info["n_chunks_invalid"] = 0
-        for db_path in tqdm(db_paths, "%d/%d, %s" % (
-                ds_index, len(indexed_dataset_infos), ds_info["name"])):
-           with h5py.File(db_path, "r") as f:
-                ds_info["n_chunks"] += len(f["chunks_valid"])
-                ds_info["n_chunks_invalid"] += len(f["chunks_invalid"])
-                ds_info["n_chunks_train"] += \
-                    (np.copy(f["chunks_valid"][:, 0]) < ds_info["n_docs_train"]) \
-                    .sum().item()
-
-        ds_info["n_chunks_sampled"] = int(args.retro_index_ntrain *
-                                          ds_info["ratio"] / data_ratio_sum)
-
-        # Verify counts.
-        assert ds_info["n_chunks_train"] <= ds_info["n_chunks"], \
-            "n_train (%d) > n_total (%d)." % (
-                ds_info["n_chunks_train"], ds_info["n_chunks"])
-        assert ds_info["n_chunks_sampled"] <= ds_info["n_chunks_train"], \
-            "n_sampled (%d) > n_train (%d)." % (
-                ds_info["n_chunks_sampled"], ds_info["n_chunks_train"])
-
-
-def merge_dbs(indexed_dataset_infos, db_type):
-    '''Merge individual DBs into single DB.'''
-
-    if torch.distributed.get_rank() != 0:
-        return
-
-    print(" > build %s chunk db." % db_type)
-
-    # Count chunks.
-    if db_type == "sampled":
-        n_chunks_key = "n_chunks_sampled"
-        n_docs_key = None
-    elif db_type == "train":
-        n_chunks_key = "n_chunks_train"
-        n_docs_key = "n_docs_train"
-    elif db_type == "valid":
-        n_docs_key = None
-    else:
-        raise Exception("handle db_type '%s'." % db_type)
-
-    if db_type == "valid":
-        n_chunks = sum(m["n_chunks"] - m["n_chunks_train"]
-                       for m in indexed_dataset_infos)
-    else:
-        n_chunks = sum(m[n_chunks_key] for m in indexed_dataset_infos)
-        n_docs = None if n_docs_key is None else \
-            sum(m[n_docs_key] for m in indexed_dataset_infos)
-
-    # DB path.
-    db_path = get_merged_db_path_map()[db_type]
-
-    # Delete existing chunk db if incorrect size.
-    if os.path.exists(db_path):
-
-        try:
-
-            f = h5py.File(db_path)
-            n_alloc = len(f["chunks"])           # total allocated
-            n_written = f["n_written"][0].item() # total written
-            f.close()
-
-            if n_chunks != n_alloc or n_chunks != n_written:
-                os.remove(db_path)
-
-        except Exception as e:
-            if isinstance(e, OSError):
-                os.remove(db_path)
-            elif isinstance(e, KeyError):
-                f.close()
-                os.remove(db_path)
-            else:
-                raise e
-
-    # Build merged chunk db.
-    if not os.path.exists(db_path):
-
-        os.makedirs(os.path.dirname(db_path), exist_ok=True)
-        f = h5py.File(db_path, "w")
-
-        # Initialize output arrays.
-        merged_chunk_db = \
-            f.create_dataset("chunks", (n_chunks, 5), dtype="uint32")
-        merged_doc_offsets = None if n_docs_key is None else \
-            f.create_dataset("doc_offsets", (n_docs, 3), dtype="uint64")
-        n_written = f.create_dataset("n_written", (1,), dtype="uint64")
-        n_written[0] = 0
-
-        # Iterate indexed datasets & collect chunks.
-        chunk_start_index = 0
-        doc_start_index = 0
-        doc_start_offset = 0
-        for ds_idx, ds_info in enumerate(indexed_dataset_infos):
-            print(" > merging dbs; '%s', dataset %d / %d ... '%s'." %
-                  (db_type, ds_idx, len(indexed_dataset_infos), ds_info["name"]))
-            individual_chunk_db = get_individual_chunk_db(ds_idx, ds_info)
-            individual_doc_offsets = None if n_docs_key is None else \
-                get_individual_doc_offsets(ds_idx, ds_info)
-
-            if db_type == "valid":
-                individual_chunk_db = \
-                    individual_chunk_db[ds_info["n_chunks_train"]:]
-                if n_docs_key is None:
-                    individual_doc_offsets = None
-                else:
-                    train_doc_offset = \
-                        individual_doc_offsets[ds_info["n_docs_train"] - 1, 2]
-                    individual_doc_offsets = \
-                        np.copy(individual_doc_offsets[ds_info["n_docs_train"]:])
-                    individual_doc_offsets[:, 2] -= train_doc_offset
-
-                    print("~~~")
-                    print(individual_doc_offsets)
-                    print(train_doc_offset)
-                    raise Exception("test me.")
-            else:
-                individual_chunk_db = \
-                    individual_chunk_db[:ds_info[n_chunks_key]]
-                individual_doc_offsets = None if n_docs_key is None else \
-                    np.copy(individual_doc_offsets[:ds_info[n_docs_key]])
-
-            merged_chunk_db[chunk_start_index:chunk_start_index+len(individual_chunk_db)] = individual_chunk_db
-            chunk_start_index += len(individual_chunk_db)
-            n_written[0] = chunk_start_index
-            if n_docs_key is not None:
-                individual_doc_offsets[:, 2] += doc_start_offset
-                doc_end_index = doc_start_index + individual_doc_offsets.shape[0]
-                merged_doc_offsets[doc_start_index:doc_end_index] = \
-                    individual_doc_offsets
-                doc_start_index = doc_end_index
-                doc_start_offset = individual_doc_offsets[-1, 2].item()
-
-        f.close()
-
-
-def build_db():
-    '''Extract token chunks from each indexed dataset.
-
-    Iterate each document of each indexed dataset, extract that document's
-    chunks, and save to a 'DB' (hdf5 file).
-    '''
-
-    # Indexed dataset info.
-    indexed_dataset_infos = init_indexed_dataset_infos()
-
-    # Build dbs.
-    build_individual_dbs(indexed_dataset_infos)
-
-    # Single-process going forward.
-    if torch.distributed.get_rank() != 0:
-        return
-
-    # Update n_chunks & save indexed dataset infos.
-    if not os.path.exists(get_indexed_dataset_infos_path()):
-        update_chunk_counts(indexed_dataset_infos)
-        save_indexed_dataset_infos(indexed_dataset_infos)
-    indexed_dataset_infos = get_indexed_dataset_infos()
-
-    # Merge dbs.
-    merge_dbs(indexed_dataset_infos, "sampled")
-    merge_dbs(indexed_dataset_infos, "train")
-    merge_dbs(indexed_dataset_infos, "valid")
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/db/dataset.py b/toolbox/Megatron-DeepSpeed/tools/retro/db/dataset.py
deleted file mode 100644
index 08f4af21debe081f54d433471beae27ca3cd2270..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/db/dataset.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import json
-import numpy as np
-import torch
-from tqdm import tqdm
-
-from megatron_ds import get_args, print_rank_0
-from tools.retro.external_libs import h5py
-from tools.retro.utils import get_gpt_tokenizer
-
-
-class DBDataset(torch.utils.data.Dataset):
-    '''Dataset for iterating chunks.
-
-    Requires:
-    - List of indexed datasets
-    - Chunk index array, with format:
-        [dataset_idx, doc_id, start_idx, end_idx, bert_length])
-    '''
-
-    def __init__(self, db_path, indexed_datasets, chunks, max_chunk_length):
-
-        assert chunks.shape[1] == 5, "expected 5 columns (dataset_idx, " \
-        "doc_idx, token_start_idx, token_end_idx, bert_chunk_length); " \
-        "found %d columns." % chunks.shape[1]
-
-        self.db_path = db_path
-        self.indexed_datasets = indexed_datasets
-        self.chunks = chunks
-        self.doc_chunk_map = None
-
-        self.max_chunk_length = max_chunk_length
-        self.eod_token_id = get_gpt_tokenizer().eod
-
-    def __len__(self):
-        return self.chunks.shape[0]
-
-    def __getitem__(self, chunk_id):
-
-        # Chunk start/end indexes.
-        indexed_dataset_id, doc_id, token_start_idx, token_end_idx, _ = \
-            [ value.item() for value in self.chunks[chunk_id] ]
-        chunk_length = token_end_idx - token_start_idx
-        indexed_dataset = self.indexed_datasets[indexed_dataset_id]
-
-        # Chunk token ids.
-        token_ids = indexed_dataset.get(doc_id,
-                                        offset=token_start_idx,
-                                        length=chunk_length)
-
-        # Extend chunks to max_chunk_length by padding with EOD tokens.
-        if chunk_length != self.max_chunk_length:
-            assert chunk_length < self.max_chunk_length, "invalid chunk len."
-            token_ids = token_ids.tolist()
-            token_ids += [self.eod_token_id] * \
-                (self.max_chunk_length - chunk_length)
-
-        return {
-            "doc_id" : doc_id,
-            "text" : np.array(token_ids, dtype=np.int64),
-        }
-
-    def load_doc_tuples(self):
-        '''Load the dataset & document ids.
-
-        Load the dataset id & document id of each chunk in the database, to
-        be used for causality filtering during querying.
-        '''
-        self.doc_tuples = np.zeros(shape=(len(self), 2), dtype="uint32")
-        block_size = int(1e6)
-        for start_idx in tqdm(range(0, len(self), block_size)):
-            end_idx = min(len(self), start_idx + block_size)
-            self.doc_tuples[start_idx:end_idx]=self.chunks[start_idx:end_idx,:2]
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/db/utils.py b/toolbox/Megatron-DeepSpeed/tools/retro/db/utils.py
deleted file mode 100644
index 062368e8c80f0579ac720fdf0f68515289cadd11..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/db/utils.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-from collections import defaultdict
-import glob
-import json
-import numpy as np
-import os
-from tqdm import tqdm
-
-from megatron_ds import get_retro_args, print_rank_0
-from megatron_ds.data.indexed_dataset import make_dataset as make_indexed_dataset
-from tools.retro.external_libs import h5py
-
-from .dataset import DBDataset
-
-
-def get_base_db_workdir():
-    '''Sub-directory for DB data.'''
-    args = get_retro_args()
-    return os.path.join(args.retro_workdir, "db")
-
-
-def get_indexed_dataset_infos_path():
-    '''Path to indexed dataset meta-infos.'''
-    return os.path.join(get_base_db_workdir(), "indexed_dataset_infos.json")
-
-
-def save_indexed_dataset_infos(indexed_dataset_infos):
-    '''Save dataset order & meta-info.'''
-
-    # Remove 'dataset' field.
-    clean_infos = []
-    for info in indexed_dataset_infos:
-        info = dict(info)
-        del info["dataset"]
-        clean_infos.append(info)
-
-    # Save.
-    with open(get_indexed_dataset_infos_path(), "w") as f:
-        json.dump(clean_infos, f, indent=4)
-
-
-def get_indexed_dataset_infos():
-    '''Load indexed dataset meta-infos.'''
-
-    # Load json.
-    path = get_indexed_dataset_infos_path()
-    with open(path) as f:
-        infos = json.load(f)
-
-    # Add indexed datasets.
-    for info in infos:
-        info["dataset"] = make_indexed_dataset(info["prefix"], "mmap", True)
-
-    return infos
-
-
-def get_individual_db_dir(name):
-    '''Individual DB's directory.'''
-    return os.path.join(get_base_db_workdir(), "individual", name)
-
-
-def get_individual_chunk_db(ds_id, ds_info):
-    '''Load individual dataset's chunk DB.'''
-    db_paths = sorted(glob.glob(ds_info["db_dir"] + "/*hdf5"))
-    # *Note*: convert to dataset, rather than copying to memory.
-    db = np.zeros((ds_info["n_chunks"], 5), dtype="uint32")
-    db[:, 0] = ds_id
-    start_idx = 0
-    for db_path in db_paths:
-        f = h5py.File(db_path, "r")
-        n_chunks_current = f["chunks_valid"].shape[0]
-        db[start_idx:(start_idx+n_chunks_current), 1:] = f["chunks_valid"]
-        start_idx += n_chunks_current
-        f.close()
-
-    assert start_idx == ds_info["n_chunks"]
-
-    return db
-
-
-def get_individual_doc_offsets(ds_id, ds_info):
-    '''Load individual dataset's chunk DB.'''
-    paths = sorted(glob.glob(ds_info["db_dir"] + "/*hdf5"))
-    # *Note*: convert to dataset, rather than copying to memory.
-    doc_offsets = np.zeros((ds_info["n_docs"], 3), dtype="uint64")
-    doc_offsets[:, 0] = ds_id
-    start_idx = 0
-    start_offset = 0
-    for path in paths:
-        with h5py.File(path) as f:
-            current_doc_offsets = np.copy(f["doc_offsets"])
-            current_doc_offsets[:, 1] += start_offset
-            current_ndocs = current_doc_offsets.shape[0]
-            doc_offsets[start_idx:(start_idx+current_ndocs), 1:] = \
-                current_doc_offsets
-            start_idx += current_ndocs
-            start_offset = current_doc_offsets[-1, 1].item()
-
-    return doc_offsets
-
-
-def get_merged_db_path_map():
-    '''Paths to merged datasets.'''
-    base_dir = get_base_db_workdir()
-    return {
-        "sampled" : os.path.join(base_dir, "merged", "sampled.hdf5"),
-        "train" : os.path.join(base_dir, "merged", "train.hdf5"),
-        "valid" : os.path.join(base_dir, "merged", "valid.hdf5"),
-    }
-
-
-def get_merged_dataset(db_type, indexed_dataset_infos=None):
-    '''Get merged dataset.'''
-
-    args = get_retro_args()
-
-    if not indexed_dataset_infos:
-        indexed_dataset_infos = get_indexed_dataset_infos()
-
-    # Load chunks.
-    db_path = get_merged_db_path_map()[db_type]
-    f = h5py.File(db_path, "r")
-    chunks = f["chunks"]
-
-    # DB dataset.
-    indexed_datasets = [ info["dataset"] for info in indexed_dataset_infos ]
-    dataset = DBDataset(db_path, indexed_datasets, chunks,
-                        args.retro_gpt_chunk_length)
-
-    return dataset
-
-
-def get_merged_sampled_dataset(indexed_dataset_infos=None):
-    return get_merged_dataset("sampled", indexed_dataset_infos)
-
-
-def get_merged_train_dataset(indexed_dataset_infos=None):
-    return get_merged_dataset("train", indexed_dataset_infos)
-
-
-def get_merged_valid_dataset(indexed_dataset_infos=None):
-    return get_merged_dataset("valid", indexed_dataset_infos)
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/examples/get_dataset_configs.sh b/toolbox/Megatron-DeepSpeed/tools/retro/examples/get_dataset_configs.sh
deleted file mode 100644
index 3a61a059f365be9cc633ab178eca46c3149ad3ed..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/examples/get_dataset_configs.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-# Small English Wikipedia dataset (~2M chunks).
-get_wiki_tiny_config() {
-    RETRO_INDEX_STR="IVF4096_HNSW4,Flat"
-    RETRO_NCHUNKS_SAMPLED=2281307
-    RETRO_GPT_TRAIN_SAMPLES=31250
-    LR_DECAY_SAMPLES=2
-    LR_WARMUP_SAMPLES=1
-    RETRO_GPT_EVAL_INTERVAL=2000
-    RETRO_GPT_EVAL_ITERS=100
-    RETRO_EF_SEARCH=4
-    RETRO_NPROBE=64
-    DATALOADER_TYPE=cyclic
-}
-
-# English Wikipedia dataset (~67M chunks).
-get_wiki_config() {
-    RETRO_INDEX_STR="IVF262144_HNSW32,Flat"
-    RETRO_NCHUNKS_SAMPLED=66625331
-    RETRO_GPT_TRAIN_SAMPLES=2037248
-    LR_DECAY_SAMPLES=2
-    LR_WARMUP_SAMPLES=1
-    RETRO_GPT_EVAL_INTERVAL=2000
-    RETRO_GPT_EVAL_ITERS=100
-    RETRO_EF_SEARCH=16
-    RETRO_NPROBE=4096
-    DATALOADER_TYPE=cyclic
-}
-
-# Full corpus (~5B chunks).
-get_corpus_config() {
-    RETRO_INDEX_STR="OPQ64_128,IVF4194304_HNSW32,PQ64"
-    RETRO_NCHUNKS_SAMPLED=300000000
-    RETRO_GPT_TRAIN_SAMPLES=192000000
-    LR_DECAY_SAMPLES=166400000
-    LR_WARMUP_SAMPLES=162761
-    RETRO_GPT_EVAL_INTERVAL=2000
-    RETRO_GPT_EVAL_ITERS=50
-    RETRO_EF_SEARCH=32
-    RETRO_NPROBE=4096
-    DATALOADER_TYPE=single
-}
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/examples/get_preprocess_cmd.sh b/toolbox/Megatron-DeepSpeed/tools/retro/examples/get_preprocess_cmd.sh
deleted file mode 100644
index 1ba29d0b96e590ecce9498a5be1d3ecc79003b00..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/examples/get_preprocess_cmd.sh
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/bin/bash
-
-# Build preprocessing command for Retro.
-
-set -u
-DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-
-################ Required environment variables. ################
-# Required environment variables:
-# - REPO_DIR : Root directory of Megatron codebase.
-# - RETRO_WORKDIR : Root directory of this Retro project's processed data. (For
-#     example, this project directory might be for a blended dataset, while
-#     another project directory might be for just a Wikipedia dataset, and
-#     another for just Book Corpus data, etc.) This project directory will
-#     contain a complete set of processed data, including the retrieval
-#     database, search index, and pretraining neighbors.
-# - RETRO_TASKS : One of 'build', 'db-build', 'index-build', or
-#     'pretraining-query-neighbors'. See 'Retro tasks' below for task
-#     descriptions.
-# - DATA_BLEND_SCRIPT : Path to blended dataset definition file.
-# - GPT_VOCAB_FILE : GPT vocab file.
-# - GPT_MERGE_FILE : GPT merge file.
-# - GPT_TOKENIZER : GPT tokenizer type (e.g., GPT2BPETokenizer)
-# - BERT_LOAD_PATH : Bert checkpoint directory.
-# - BERT_VOCAB_FILE : Bert vocab file.
-# - BERT_TOKENIZER : Bert tokenizer type (e.g., BertWordPieceLowerCase,
-#     BertWordPieceCase).
-# - BERT_EMBEDDER_TYPE : One of 'megatron' or 'huggingface'.
-# - EXTRA_ARGS : Extra arguments (else, leave empty).
-
-################ Data blend. ################
-. ${DATA_BLEND_SCRIPT}
-DATA_PATH=${DATA_BLEND}
-
-################ Retro setup. ################
-RETRO_GPT_SEQ_LENGTH=2048
-RETRO_GPT_CHUNK_LENGTH=64
-RETRO_GPT_MICRO_BATCH_SIZE=1 # *8
-RETRO_GPT_GLOBAL_BATCH_SIZE=256
-
-################ Retro tasks. ################
-# The '--retro-tasks' argument is a comma-separated list of tasks to run, in
-# sequential order. For a quick start, simply set this to 'build' to run the
-# entire preprocessing pipeline. For finer control, you may specify the list of
-# tasks to run. This is desirable for tuning computational resources. For
-# example, training the search index is relatively fast and utilizes GPUs,
-# while querying the search index is relatively slow, CPU-only, and memory
-# intensive (i.e., multiple populated search indexes are loaded simultaneously).
-
-# *Note* : Once the task(s) below have been completed -- by running either
-#    1) 'build', or 2) the sequential combination of 'db-build', 'index-build',
-#    and 'pretraining-query-neighbors' -- we are ready to pretrain Retro by
-#    calling pretrain_retro.py.
-
-# ---- Option #1 : Run entire pipeline. ----
-
-# RETRO_TASKS="build" # (*note*: default tasks)
-
-# ---- Option #2 : Run specific stages. ----
-# *Note*: Run the following stages in the given order. Optionally, tune your
-#   cluster setup for each stage, as described above.
-
-# RETRO_TASKS="db-build" # ....................... run 1st
-# RETRO_TASKS="index-build" # .................... run 2nd
-# RETRO_TASKS="pretraining-query-neighbors" # .... run 3rd
-
-################ Megatron args. ################
-MEGATRON_ARGS=" \
-    --seed 1234 \
-    --distributed-timeout-minutes 600 \
-    --tokenizer-type ${BERT_TOKENIZER} \
-    --tensor-model-parallel-size 1 \
-    --pipeline-model-parallel-size 1 \
-    --num-layers 24 \
-    --hidden-size 1024 \
-    --num-attention-heads 16 \
-    --micro-batch-size ${RETRO_GPT_MICRO_BATCH_SIZE} \
-    --global-batch-size ${RETRO_GPT_GLOBAL_BATCH_SIZE} \
-    --seq-length 512 \
-    --max-position-embeddings 512 \
-    --train-samples ${RETRO_GPT_TRAIN_SAMPLES} \
-    --load ${BERT_LOAD_PATH} \
-    --exit-on-missing-checkpoint \
-    --no-load-optim \
-    --data-path ${DATA_PATH} \
-    --vocab-file ${BERT_VOCAB_FILE} \
-    --data-impl mmap \
-    --split 98,2,0 \
-    --distributed-backend nccl \
-    --lr 0.0001 \
-    --lr-decay-style linear \
-    --min-lr 1.0e-5 \
-    --lr-decay-samples ${LR_DECAY_SAMPLES} \
-    --lr-warmup-samples ${LR_WARMUP_SAMPLES} \
-    --weight-decay 1e-2 \
-    --clip-grad 1.0 \
-    --eval-interval ${RETRO_GPT_EVAL_INTERVAL} \
-    --eval-iters ${RETRO_GPT_EVAL_ITERS} \
-    --fp16 \
-    --DDP-impl local \
-    --dataloader-type ${DATALOADER_TYPE} \
-    --no-data-sharding \
-    --no-gradient-accumulation-fusion \
-    --no-async-tensor-model-parallel-allreduce \
-"
-
-################ Retro args. ################
-RETRO_ARGS=" \
-    --bert-embedder-type ${BERT_EMBEDDER_TYPE} \
-    --output-bert-embeddings \
-    \
-    --retro-gpt-vocab-file ${GPT_VOCAB_FILE} \
-    --retro-gpt-merge-file ${GPT_MERGE_FILE} \
-    --retro-gpt-tokenizer-type ${GPT_TOKENIZER} \
-    --retro-gpt-seq-length ${RETRO_GPT_SEQ_LENGTH} \
-    --retro-gpt-chunk-length ${RETRO_GPT_CHUNK_LENGTH} \
-    --retro-bert-vocab-file ${BERT_VOCAB_FILE} \
-    --retro-bert-tokenizer-type ${BERT_TOKENIZER} \
-    \
-    --retro-tasks ${RETRO_TASKS} \
-    --retro-index-str ${RETRO_INDEX_STR} \
-    --retro-ef-search ${RETRO_EF_SEARCH} \
-    --retro-nprobe ${RETRO_NPROBE} \
-    \
-    --retro-workdir ${RETRO_WORKDIR} \
-    --retro-nchunks-sampled ${RETRO_NCHUNKS_SAMPLED} \
-    \
-    --retro-return-doc-ids \
-"
-
-################ Command. ################
-RETRO_PREPROCESS_CMD=" \
-    ./tools/retro/main.py \
-    ${MEGATRON_ARGS} \
-    ${RETRO_ARGS} \
-    ${EXTRA_ARGS} \
-"
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/examples/preprocess_data.sh b/toolbox/Megatron-DeepSpeed/tools/retro/examples/preprocess_data.sh
deleted file mode 100644
index 74cdf1823d3fe79542811df5c79996eb4cfa82ca..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/examples/preprocess_data.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash
-
-set -u
-unset NCCL_DEBUG
-
-NPROCS=8 # NPROCS must be <= number of GPUs.
-
-set_current_dir() {
-    DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-}
-
-################ Dataset configs. ################
-# This script contains methods to customize arguments to specific dataset
-# types. Customize this script as needed for your datasets.
-set_current_dir
-. $DIR/get_dataset_configs.sh
-
-################ Environment variables. ################
-# *Note*: See 'Required environment variables' in 'get_preprocess_cmd.sh' for
-# a description of the required environment variables. These variables can be
-# set however a user would like. In our setup, we use another bash script
-# (location defined by $RETRO_ENV_VARS) that sets all the environment variables
-# at once.
-. $RETRO_ENV_VARS
-
-######## Environment vars. ########
-set_current_dir
-. ${DIR}/get_preprocess_cmd.sh
-
-echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
-echo "DIR = '$DIR'."
-echo "RETRO_PREPROCESS_CMD = '$RETRO_PREPROCESS_CMD'."
-echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
-
-######## Command. ########
-FULL_CMD="\
-    pwd && cd ${REPO_DIR} && pwd && \
-    export PYTHONPATH=$PYTHONPATH:${REPO_DIR} && \
-    python -m torch.distributed.run \
-    --nproc_per_node ${NPROCS} \
-    --nnodes 1 \
-    --node_rank ${NODE_RANK} \
-    --master_addr ${MASTER_ADDR} \
-    --master_port 6000 \
-    $RETRO_PREPROCESS_CMD \
-"
-echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
-echo "FULL_CMD = '$FULL_CMD'."
-echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
-eval $FULL_CMD
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/examples/pretrain_model.sh b/toolbox/Megatron-DeepSpeed/tools/retro/examples/pretrain_model.sh
deleted file mode 100644
index 367d87ce63a459c7baa27ed470e565982f8cb630..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/examples/pretrain_model.sh
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/bin/bash
-
-##################################################
-# Example script for pretraining Retro.
-##################################################
-
-set -u
-unset NCCL_DEBUG
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-
-NPROCS=8 # NPROCS must be <= number of GPUs.
-
-################ Dataset configs. ################
-# This script contains methods to customize arguments to specific dataset
-# types. Customize this script as needed for your datasets.
-DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-. $DIR/get_dataset_configs.sh
-
-################ Environment variables. ################
-# *Note*: See 'Required environment variables' in 'get_preprocess_cmd.sh' for
-# a description of the required environment variables. These variables can be
-# set however a user would like. In our setup, we use another bash script
-# (location defined by $RETRO_ENV_VARS) that sets all the environment variables
-# at once.
-. $RETRO_ENV_VARS
-
-################ Data blend. ################
-. ${DATA_BLEND_SCRIPT}
-DATA_PATH=${DATA_BLEND}
-
-######## Retro setup. ########
-RETRO_ADD_RETRIEVER=0
-RETRO_CYCLIC_TRAIN_ITERS=750000
-RETRO_NUM_NEIGHBORS=2
-
-######## Arguments. ########
-CHECKPOINT_DIR=${RETRO_WORKDIR}/checkpoints/${RETRO_ADD_RETRIEVER}
-TENSORBOARD_DIR="${CHECKPOINT_DIR}/tensorboard"
-mkdir -p ${TENSORBOARD_DIR}
-ARGS=" \
-    --save-interval 1000 \
-    --save ${CHECKPOINT_DIR} \
-    --load ${CHECKPOINT_DIR} \
-    --tensorboard-dir ${TENSORBOARD_DIR} \
-    --log-interval 5 \
-    --tensor-model-parallel-size 1 \
-    --pipeline-model-parallel-size 1 \
-    --num-layers 12 \
-    --hidden-size 768 \
-    --num-attention-heads 12 \
-    --seq-length 2048 \
-    --max-position-embeddings 2048 \
-    --micro-batch-size 4 \
-    --global-batch-size 256 \
-    --train-samples ${RETRO_GPT_TRAIN_SAMPLES}  \
-    --lr-decay-samples ${LR_DECAY_SAMPLES} \
-    --lr-warmup-samples ${LR_WARMUP_SAMPLES} \
-    --lr 6.0e-4 \
-    --min-lr 6.0e-5 \
-    --lr-decay-style cosine \
-    --eval-interval ${RETRO_GPT_EVAL_INTERVAL} \
-    --eval-iters ${RETRO_GPT_EVAL_ITERS} \
-    --data-path ${DATA_PATH} \
-    --vocab-file ${GPT_VOCAB_FILE} \
-    --merge-file ${GPT_MERGE_FILE} \
-    --split 98,2,0 \
-    --clip-grad 1.0 \
-    --weight-decay 0.1 \
-    --adam-beta1 0.9 \
-    --adam-beta2 0.95 \
-    --init-method-std 0.023 \
-    --log-params-norm \
-    --log-num-zeros-in-grad \
-    --fp16 \
-    --DDP-impl local \
-    --dataloader-type ${DATALOADER_TYPE} \
-    --no-data-sharding \
-    --no-gradient-accumulation-fusion \
-"
-
-if [ "$RETRO_ADD_RETRIEVER" = "0" ]; then
-    SCRIPT=pretrain_gpt.py
-else
-    ARGS="${ARGS} \
-    --retro-add-retriever \
-    --retro-workdir ${RETRO_WORKDIR} \
-    --retro-cyclic-train-iters ${RETRO_CYCLIC_TRAIN_ITERS} \
-    --retro-num-neighbors ${RETRO_NUM_NEIGHBORS} \
-    "
-    SCRIPT=pretrain_retro.py
-fi
-
-echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
-echo "SCRIPT = '$SCRIPT'."
-echo "ARGS = '$ARGS'."
-echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
-
-python -m torch.distributed.run \
-    --nproc_per_node ${NPROCS} \
-    --nnodes 1 \
-    --node_rank 0 \
-    --master_addr localhost \
-    --master_port 6000 \
-    ${SCRIPT} \
-    ${ARGS} \
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/external_libs.py b/toolbox/Megatron-DeepSpeed/tools/retro/external_libs.py
deleted file mode 100644
index 1a160b842ce55abbbb3cb1bcce2124c44dc73a2d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/external_libs.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import importlib
-
-required_libs = [
-    "faiss",
-    "h5py",
-    "transformers", # for huggingface bert
-]
-
-for lib in required_libs:
-    try:
-        globals()[lib] = importlib.import_module(lib)
-    except ImportError as e:
-        raise Exception(f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'.")
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/index/__init__.py b/toolbox/Megatron-DeepSpeed/tools/retro/index/__init__.py
deleted file mode 100644
index 5b17f7f0fe7d1a9ba2a969493423275d08c6cd44..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/index/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-from .build import add_to_index, build_index, train_index
-# from .index import Index
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/index/build.py b/toolbox/Megatron-DeepSpeed/tools/retro/index/build.py
deleted file mode 100644
index a7290da73666152df74782301dcbaae1f4817a07..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/index/build.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import numpy as np
-import os
-import shutil
-import torch
-from tqdm import tqdm
-
-from megatron_ds import get_retro_args, print_rank_0
-from tools.bert_embedding import DiskDataParallelBertEmbedder
-from tools.retro.db.utils import (
-    get_indexed_dataset_infos,
-    get_merged_sampled_dataset,
-    get_merged_train_dataset,
-)
-from tools.retro.external_libs import h5py
-from tools.retro.index.factory import IndexFactory
-from tools.retro.utils import GPTToTextDataset
-
-from .utils import (
-    get_training_data_block_dir,
-    get_training_data_block_paths,
-    get_training_data_merged_path,
-    get_training_data_root_dir,
-)
-
-
-##################################################
-# Train index.
-##################################################
-
-
-def get_empty_index_path():
-    '''Path of empty index.'''
-    args = get_retro_args()
-    index = IndexFactory.get_index(args.retro_index_type)
-    empty_index_path = index.get_empty_index_path()
-    return empty_index_path
-
-
-def get_block_nload(block_path, load_fraction):
-    with h5py.File(block_path) as fi:
-        return int(load_fraction * fi["data"].shape[0])
-
-
-def merge_embedding_blocks():
-
-    if torch.distributed.get_rank() != 0:
-        return
-
-    args = get_retro_args()
-
-    # Get block, merged paths.
-    load_fraction = args.retro_index_train_load_fraction
-    block_paths = get_training_data_block_paths()
-    bin_path = get_training_data_merged_path()
-
-    # Skip, if already built.
-    if os.path.exists(bin_path):
-        return
-
-    # Merge blocks.
-    with open(bin_path, "wb") as fo:
-        byte_offset = 0
-        for block_idx, block_path in \
-            enumerate(tqdm(block_paths, "merge train embeddings")):
-            with h5py.File(block_path) as fi:
-
-                nload = get_block_nload(block_path, load_fraction)
-                block = np.array(fi["data"][:nload], copy = False)
-
-                fo.write(block.tobytes())
-
-                byte_offset += block.size * block.itemsize
-                fo.seek(byte_offset)
-
-
-def embed_db():
-    '''Embed DB chunks.
-
-    Store chunks in blocks on disk. These blocks will later be merged into
-    a single dataset for training the index.
-    '''
-
-    args = get_retro_args()
-
-    merged_train_data_path = get_training_data_merged_path()
-    if os.path.exists(merged_train_data_path):
-        return
-
-    # Get db dataset.
-    gpt_dataset = get_merged_sampled_dataset()
-    text_dataset = GPTToTextDataset(gpt_dataset)
-
-    # Embed dataset.
-    embedder = DiskDataParallelBertEmbedder(args.retro_bert_batch_size,
-                                            args.retro_bert_max_chunk_length,
-                                            args.retro_block_size,
-                                            args.bert_embedder_type)
-    embedder.embed_text_dataset("index",
-                                get_training_data_block_dir(),
-                                text_dataset)
-
-    # Merge embeddings.
-    merge_embedding_blocks()
-
-
-def train_on_embeddings():
-    '''Train index on embedded DB chunks.'''
-    args = get_retro_args()
-    index = IndexFactory.get_index(args.retro_index_type)
-    index.train()
-
-
-def remove_embeddings():
-    '''Remove embeddings after training.'''
-    torch.distributed.barrier()
-    if torch.distributed.get_rank() != 0:
-        return
-    empty_index_path = get_empty_index_path()
-    assert os.path.isfile(empty_index_path)
-    shutil.rmtree(get_training_data_root_dir(), ignore_errors=True)
-
-
-def train_index():
-    '''Train index on DB chunks.'''
-
-    args = get_retro_args()
-
-    # Check if trained index already exists.
-    if not os.path.isfile(get_empty_index_path()):
-
-        # Embed training chunks.
-        embed_db()
-
-        # Train index on embeddings.
-        train_on_embeddings()
-
-    # Wait for (single-process) training to complete.
-    torch.distributed.barrier()
-
-    # Remove embeddings.
-    if args.retro_index_delete_training_embeddings:
-        remove_embeddings()
-
-
-##################################################
-# Add to index.
-##################################################
-
-
-def add_to_index():
-    '''Add DB chunks to index.'''
-
-    args = get_retro_args()
-
-    # Get index.
-    index = IndexFactory.get_index(args.retro_index_type)
-
-    # Get text dataset.
-    gpt_dataset = get_merged_train_dataset()
-    text_dataset = GPTToTextDataset(gpt_dataset)
-
-    # Add to index.
-    output_index_path = index.add(text_dataset)
-
-    return output_index_path
-
-
-##################################################
-# Build index (train + add).
-##################################################
-
-
-def build_index():
-    '''Build index.
-
-    Building index involves sequentially running stages above:
-    - Train index (on sampled training chunks).
-    - Add to index (on all training chunks).
-    '''
-
-    # Train index.
-    train_index()
-
-    # Add to index.
-    add_to_index()
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/index/factory.py b/toolbox/Megatron-DeepSpeed/tools/retro/index/factory.py
deleted file mode 100644
index 3e247efeaefe622d389aa460bf3a95f900cc5f1e..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/index/factory.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-from .indexes import FaissBaseIndex, FaissParallelAddIndex
-
-
-class IndexFactory:
-    '''Get index.
-
-    Index type generally read from argument '--retro-index-ty'.
-    '''
-
-    @classmethod
-    def get_index_class(cls, index_type):
-        return {
-            "faiss-base" : FaissBaseIndex,
-            "faiss-par-add" : FaissParallelAddIndex,
-        }[index_type]
-
-    @classmethod
-    def get_index(cls, index_type):
-        index_class = cls.get_index_class(index_type)
-        index = index_class()
-        return index
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/index/index.py b/toolbox/Megatron-DeepSpeed/tools/retro/index/index.py
deleted file mode 100644
index b4e27f0b7177e92f45f1ea3073f42077a4f4ebe5..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/index/index.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import abc
-import numpy as np
-import os
-import torch
-
-from megatron_ds import get_retro_args
-from tools.retro.external_libs import faiss
-
-from .utils import get_index_dir
-
-
-class Index(abc.ABC):
-
-    '''Abstract base class for indexes.
-
-    *Note* : While currently only Faiss-based classes are implemented, in the
-    future, this class will be extended with other types of indexes that have
-    different performance-accuracy trade-offs.
-
-    The primary methods to override are:
-    - train() : Train index on the sampled training chunks.
-    - add() : Add all training chunks to index.
-    '''
-
-    @classmethod
-    def c_verbose(cls, index, v):
-        '''Make index object verbose.'''
-        assert isinstance(v, bool)
-        faiss.ParameterSpace().set_index_parameter(index, "verbose", v)
-
-    def get_empty_index_path(self):
-        args = get_retro_args()
-        return os.path.join(
-            get_index_dir(),
-            "empty_%.3f.faissindex" % args.retro_index_train_load_fraction,
-        )
-
-    def get_empty_index(self):
-        return faiss.read_index(self.get_empty_index_path())
-
-    def get_added_index_path(self):
-        args = get_retro_args()
-        return os.path.join(
-            get_index_dir(),
-            "added_%.3f_%.3f.faissindex" % (
-                args.retro_index_train_load_fraction,
-                args.retro_index_add_load_fraction,
-            ),
-        )
-
-    def get_added_index(self):
-        return faiss.read_index(self.get_added_index_path())
-
-    @abc.abstractmethod
-    def train(self, *args):
-        pass
-
-    @abc.abstractmethod
-    def add(self, *args):
-        pass
-
-    def embed_text_dataset_block(self, embedder, text_dataset, _range):
-        '''Embed a range of a text dataset.'''
-        sub_dataset = torch.utils.data.Subset(text_dataset, range(*_range))
-        return embedder.embed_text_dataset(sub_dataset)
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/index/indexes/__init__.py b/toolbox/Megatron-DeepSpeed/tools/retro/index/indexes/__init__.py
deleted file mode 100644
index 30e8a3c1111ec7610d8d9e4a0955f2ff49c13e4d..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/index/indexes/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-from .faiss_base import FaissBaseIndex
-from .faiss_par_add import FaissParallelAddIndex
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/index/indexes/faiss_base.py b/toolbox/Megatron-DeepSpeed/tools/retro/index/indexes/faiss_base.py
deleted file mode 100644
index 53ada6b638c788a49c51b26084fefab8554051db..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/index/indexes/faiss_base.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-"""
-This class implements a simple, un-optimized wrapper around a Faiss index, that
-implements the Index interface (see ..index.py). While this class is
-instantiable, it is meant to be extended with optimizations in classes that
-inherit from this class (see FaissParAddIndex, for an example).
-"""
-
-from datetime import timedelta
-import numpy as np
-import os
-import torch
-from tqdm import tqdm
-
-from megatron_ds import get_retro_args, print_rank_0
-from tools.bert_embedding import BertEmbedder
-from tools.retro.external_libs import faiss
-from tools.retro.index.index import Index
-from tools.retro.index.utils import (
-    get_training_data_merged_path,
-    num_samples_to_block_ranges,
-)
-
-
-class FaissBaseIndex(Index):
-
-    def _train(self):
-        '''Train index (rank 0's method).'''
-
-        args = get_retro_args()
-
-        assert torch.distributed.get_rank() == 0
-
-        # Set num threads (torch.distributed reset it to 1).
-        # faiss.omp_set_num_threads(32)
-        faiss.omp_set_num_threads(64)
-        # faiss.omp_set_num_threads(128)
-
-        empty_index_path = self.get_empty_index_path()
-
-        # Index already exists? -> return.
-        if os.path.isfile(empty_index_path):
-            return
-
-        # Load data.
-        merged_path = get_training_data_merged_path()
-        inp = np.memmap(
-	    merged_path,
-            dtype = "f4",
-	    mode = "r",
-        ).reshape((-1, args.hidden_size))
-
-        # Init index.
-        index = faiss.index_factory(args.retro_index_nfeats,
-                                    args.retro_index_str)
-
-        # Move to GPU.
-        print("> move faiss index to gpu.")
-        index_ivf = faiss.extract_index_ivf(index)
-        clustering_index = \
-            faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(index_ivf.d))
-        index_ivf.clustering_index = clustering_index
-        print("> finished moving to gpu.")
-        self.c_verbose(index, True)
-        self.c_verbose(index_ivf, True)
-        self.c_verbose(index_ivf.quantizer, True)
-        self.c_verbose(index_ivf.clustering_index, True)
-
-        # Train index.
-        index.train(inp)
-
-        # Save index.
-        faiss.write_index(index, empty_index_path)
-
-    def train(self):
-        '''Train index.'''
-
-        # Single process only.
-        if torch.distributed.get_rank() == 0:
-            self._train()
-
-        torch.distributed.barrier()
-
-    def _add(self, text_dataset):
-        '''Add to index (rank 0's method).'''
-
-        assert torch.distributed.get_rank() == 0
-
-        args = get_retro_args()
-
-        dataset_sample_ranges = num_samples_to_block_ranges(len(text_dataset))
-
-        # Set num threads (torch.distributed reset it to 1).
-        faiss.omp_set_num_threads(64)
-
-        # Bert embedder.
-        embedder = BertEmbedder(args.retro_bert_batch_size,
-                                args.retro_bert_max_chunk_length,
-                                args.bert_embedder_type)
-
-        # Empty/added index paths.
-        empty_index_path = self.get_empty_index_path()
-        added_index_path = self.get_added_index_path()
-
-        # Skip adding, if index exists.
-        if os.path.isfile(added_index_path):
-            return
-
-        # Read trained index.
-        index = faiss.read_index(empty_index_path)
-
-        # Iterate data blocks & add.
-        for sample_range in tqdm(dataset_sample_ranges, "faiss_base.add"):
-
-            # Embed text.
-            embeds = self.embed_text_dataset_block(
-                embedder, text_dataset, sample_range)
-
-            # Add to index.
-            index.add(embeds)
-
-        # Write index.
-        faiss.write_index(index, added_index_path)
-
-    def add(self, text_dataset):
-        '''Add to index.'''
-
-        # Single process only.
-        if torch.distributed.get_rank() == 0:
-            self._add(text_dataset)
-
-        # Wait for rank 0.
-        torch.distributed.barrier()
-
-        # Get output index path, for return.
-        return self.get_added_index_path()
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/index/indexes/faiss_par_add.py b/toolbox/Megatron-DeepSpeed/tools/retro/index/indexes/faiss_par_add.py
deleted file mode 100644
index 9483b70df72a5ca47f5ea2bd59fd6b21d7d00537..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/index/indexes/faiss_par_add.py
+++ /dev/null
@@ -1,162 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-"""Multi-process & multi-node version of Faiss's index.add().
-
-This class inherits from FaissBaseIndex, and optimizes the 'add()' method by
-making it multi-node and multi-process, with bit-wise equivalence to
-FaissBaseIndex. This allows 'add()' to scale out to very large datasets, since
-the vast majority of the computational effort is embarrassingly parallel.
-"""
-
-import numpy as np
-import os
-import psutil
-import shutil
-import torch
-from tqdm import tqdm
-
-from megatron_ds import get_retro_args, print_rank_0
-from tools.bert_embedding import BertEmbedder
-from tools.bert_embedding.utils import get_missing_blocks_by_rank
-from tools.retro.external_libs import faiss, h5py
-from tools.retro.index.utils import get_added_codes_dir, get_added_code_paths
-
-from .faiss_base import FaissBaseIndex
-
-
-class FaissParallelAddIndex(FaissBaseIndex):
-
-    def encode_block(self, index, embedder, text_dataset, block):
-        '''Encode sub-dataset block, to be later added to index.
-
-        Encode the data subset, generally in blocks of 1M vectors each. For
-        each block, the empty/trained index is loaded, codes are computed
-        via index.sa_encode(), and the resulting codes are saved to disk.
-        '''
-
-        args = get_retro_args()
-
-        # Embed block.
-        embeddings = self.embed_text_dataset_block(
-            embedder,
-            text_dataset,
-            block["range"],
-        )
-
-        # Encode block.
-        print_rank_0("encode.")
-        codes = index.sa_encode(embeddings)
-
-        # Save neighbors.
-        print_rank_0("save codes.")
-        os.makedirs(os.path.dirname(block["path"]), exist_ok=True)
-        with h5py.File(block["path"], "w") as f:
-            f.create_dataset("data", data=codes)
-
-    def encode(self, text_dataset):
-        '''Encode text dataset, to be later added to index.'''
-
-        args = get_retro_args()
-        codes_dir = get_added_codes_dir()
-
-        # Index.
-        index = self.get_empty_index()
-
-        # Bert embedder.
-        embedder = BertEmbedder(args.retro_bert_batch_size,
-                                args.retro_bert_max_chunk_length,
-                                args.bert_embedder_type)
-
-        # Missing code blocks.
-        def validate(f):
-            assert len(f["data"].shape) == 2
-        n_missing_blocks, missing_code_blocks = get_missing_blocks_by_rank(
-            codes_dir,
-            len(text_dataset),
-            args.retro_block_size,
-            validate=validate,
-        )
-
-        # Encode each block.
-        for block_index, block in enumerate(missing_code_blocks):
-
-            if block is not None:
-
-                # Progress.
-                print_rank_0("encode block %d / %d ... %s." % (
-                    block_index,
-                    len(missing_code_blocks),
-                    block["path"],
-                ))
-
-                # Query block neighbors.
-                self.encode_block(index, embedder, text_dataset, block)
-
-            # Synchronize progress across all ranks. (for easier observation)
-            print_rank_0(" > waiting for other ranks to finish block.")
-            torch.distributed.barrier()
-
-    def add_codes(self):
-
-        if torch.distributed.get_rank() != 0:
-            return
-
-        added_index_path = self.get_added_index_path()
-        if os.path.exists(added_index_path):
-            return
-
-        args = get_retro_args()
-
-        # Index.
-        print_rank_0("read empty index.")
-        index = self.get_empty_index()
-        index_ivf = faiss.extract_index_ivf(index)
-
-        # Add codes.
-        print_rank_0("add codes.")
-        code_paths = get_added_code_paths()
-        pbar = tqdm(code_paths)
-        for code_path in pbar:
-            pbar.set_description("add codes, mem %.3f gb, %.1f%%" % (
-                psutil.virtual_memory()[3] / 1024**3,
-                psutil.virtual_memory()[2],
-            ))
-            with h5py.File(code_path) as f:
-
-                nload = int(args.retro_index_add_load_fraction*f["data"].shape[0])
-                offset = int(os.path.basename(code_path).split("-")[0])
-                xids = np.arange(offset, offset + nload)
-                codes = np.copy(f["data"][:nload])
-                index_ivf.add_sa_codes(codes, xids)
-
-        # Update index's ntotal.
-        index.ntotal = index_ivf.ntotal
-
-        # Write index.
-        print_rank_0("write added index.")
-        faiss.write_index(index, added_index_path)
-
-    def remove_codes(self):
-        '''Remove added codes after adding to index.'''
-        if torch.distributed.get_rank() != 0:
-            return
-        assert os.path.isfile(self.get_added_index_path())
-
-        args = get_retro_args()
-        if args.retro_index_delete_added_codes:
-            raise Exception("remove?")
-            shutil.rmtree(get_added_codes_dir(), ignore_errors=True)
-
-    def add(self, text_dataset):
-
-        # Encode chunks.
-        self.encode(text_dataset)
-
-        # Add codes to index.
-        self.add_codes()
-
-        # Wait for (single-process) adding to complete.
-        torch.distributed.barrier()
-
-        # Remove codes.
-        self.remove_codes()
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/index/utils.py b/toolbox/Megatron-DeepSpeed/tools/retro/index/utils.py
deleted file mode 100644
index 7b0775b502988e4db0d63784f683c5b982e94dfb..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/index/utils.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import concurrent
-import gc
-import glob
-import numpy as np
-import os
-import psutil
-import time
-import torch
-from tqdm import tqdm
-
-from megatron_ds import get_retro_args, print_rank_0
-from tools.retro.db.utils import get_indexed_dataset_infos
-from tools.retro.external_libs import h5py
-
-
-def get_index_dir():
-    """Create sub-directory for this index."""
-
-    args = get_retro_args()
-
-    # Directory path.
-    index_dir_path = os.path.join(
-        args.retro_workdir,
-        "index",
-        args.retro_index_type,
-        args.retro_index_str,
-    )
-
-    # Make directory.
-    os.makedirs(index_dir_path, exist_ok=True)
-
-    return index_dir_path
-
-
-def num_samples_to_block_ranges(num_samples):
-    '''Split a range (length num_samples) into sequence of block ranges
-    of size block_size.'''
-    args = get_retro_args()
-    block_size = args.retro_block_size
-    start_idxs = list(range(0, num_samples, block_size))
-    end_idxs = [min(num_samples, s + block_size) for s in start_idxs]
-    ranges = list(zip(start_idxs, end_idxs))
-    return ranges
-
-
-def get_training_data_root_dir():
-    args = get_retro_args()
-    return os.path.join(args.retro_workdir, "index", "train_emb")
-
-
-def get_training_data_block_dir():
-    return os.path.join(get_training_data_root_dir(), "blocks")
-
-
-def get_training_data_block_paths():
-    return sorted(glob.glob(get_training_data_block_dir() + "/*.hdf5"))
-
-
-def get_training_data_merged_path():
-    args = get_retro_args()
-    return os.path.join(get_training_data_root_dir(),
-                        "train_%.3f.bin" % args.retro_index_train_load_fraction)
-
-
-def get_added_codes_dir():
-    return os.path.join(get_index_dir(), "add_codes")
-
-
-def get_added_code_paths():
-    return sorted(glob.glob(get_added_codes_dir() + "/*.hdf5"))
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/main.py b/toolbox/Megatron-DeepSpeed/tools/retro/main.py
deleted file mode 100644
index 72e35fdc89fbebc5a599f21aa175979f389be5f1..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/main.py
+++ /dev/null
@@ -1,242 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-"""Preprocess data for Retro.
-
-Stages (see argument '--retro-tasks'):
-- Build chunk database (DB).
-- Build index (train, add).
-- Query pretraining neighbors.
-"""
-
-import json
-import os
-import torch
-
-from megatron_ds import get_args, initialize_megatron, print_rank_0
-from megatron_ds.global_vars import set_retro_args
-from tools.retro.db import build_db
-from tools.retro.index import add_to_index, build_index, train_index
-from tools.retro.query import query_pretraining_neighbors
-from tools.retro.utils import get_args_path
-
-
-def add_retro_args(parser):
-    """Retro preprocesing arguments.
-
-    *Note* : Arguments prefixed with '--retro-gpt-*' or '--retro-bert-*' are
-    included and named as such to more easily handle managing both models
-    running at the same time. Megatron is not optimized to run two models at
-    once, so this naming convention makes it clearer.
-    """
-
-    group = parser.add_argument_group(title="Retro preprocessing.")
-
-    # Basic args.
-    group.add_argument("--retro-tasks", default="build",
-                       help="Comma-separated list of tasks to run. Run entire "
-                       "preprocesing pipeline by using '--retro-tasks build'. "
-                       "Alternatively, run individual stages with tasks (in "
-                       "this order) 'db-build', 'index-build', or "
-                       "'query-pretraining-neighbors'. For example, "
-                       "'--retro-tasks db-build,index-build,"
-                       "query-pretraining-neighbors' is equivalent to "
-                       "'--retro-tasks build'; or the argument can contain "
-                       "a subset of these tasks. Stages must always be run "
-                       "in the correct order (listed above).")
-    group.add_argument("--retro-block-size", type=int, default=100000,
-                       help="Number of chunks to process at a time when "
-                       "generating Bert embeddings and querying the search "
-                       "index. Partial results for each block are generally "
-                       "saved to disk in separate files.")
-    group.add_argument("--retro-doc-block-size", type=int, default=100000,
-                       help="Number of documents to processe at time when "
-                       "processing token datasets into chunk databases. The "
-                       "partial chunk database for each block is saved into "
-                       "a separate file.")
-
-    # GPT args.
-    group.add_argument('--retro-gpt-seed', type=int, default=1234,
-                       help='Random seed used for python, numpy, '
-                       'pytorch, and cuda.')
-    group.add_argument('--retro-gpt-data-impl', type=str, default='infer',
-                       choices=['lazy', 'cached', 'mmap', 'infer'],
-                       help='Implementation of indexed datasets.')
-    group.add_argument('--retro-gpt-data-path', nargs='*', required=True,
-                       help='Path to the training dataset. Accepted format:'
-                       '1) a single data path, 2) multiple datasets in the'
-                       'form: dataset1-weight dataset1-path dataset2-weight '
-                       'dataset2-path ... It is used with --split when a '
-                       'single dataset used for all three: train, valid '
-                       'and test. It is exclusive to the other '
-                       '--*-data-path args')
-    group.add_argument('--retro-gpt-split', type=str, default='969,30,1',
-                       help='Comma-separated list of proportions for training,'
-                       ' validation, and test split. For example the split '
-                       '`90,5,5` will use 90%% of data for training, 5%% for '
-                       'validation and 5%% for test.')
-    group.add_argument('--retro-gpt-mmap-warmup', action='store_true',
-                       help='Warm up mmap files.')
-    group.add_argument("--retro-gpt-eval-interval", type=int, required=True,
-                       help="GPT evaluation interval.")
-    group.add_argument("--retro-gpt-eval-iters", type=int, required=True,
-                       help="GPT evaluation iterations.")
-    group.add_argument("--retro-gpt-tokenizer-type", required=True,
-                       help="GPT tokenizer type.")
-    group.add_argument("--retro-gpt-vocab-file", help="GPT vocab file.")
-    group.add_argument("--retro-gpt-merge-file", help="GPT merge file.")
-    group.add_argument("--retro-gpt-tokenizer-model",
-                       help="GPT tokenizer model file.")
-    group.add_argument("--retro-gpt-seq-length", type=int, required=True,
-                       help="GPT sequence length.")
-    group.add_argument("--retro-gpt-global-batch-size", type=int, required=True,
-                       help="GPT global batch size.")
-    group.add_argument("--retro-gpt-chunk-length", type=int, default=64,
-                       help="GPT chunk length.")
-
-    # Bert args.
-    group.add_argument("--retro-bert-vocab-file", required=True,
-                       help="Bert vocab file.")
-    group.add_argument("--retro-bert-tokenizer-type", required=True,
-                       help="Bert tokenizer type (for when using "
-                       "'--bert-embedder-type megatron').")
-    group.add_argument("--retro-bert-batch-size", type=int, default=128,
-                       help="Micro-batch size for processing Bert embeddings.")
-    group.add_argument("--retro-bert-max-chunk-length", type=int, default=256,
-                       help="Maximum sequence length for Bert embeddings. "
-                       "(Named 'chunk' here in reference to these Bert "
-                       "sequences being converted from GPT chunks.)")
-
-    # Index args.
-    group.add_argument("--retro-index-nfeats", "-f", type=int, default=1024,
-                       help="Dimension of Bert embeddings. Bert-large is "
-                       "commonly used, so this value defaults to 1024.")
-    group.add_argument("--retro-index-type", default="faiss-par-add",
-                       choices=["faiss-base", "faiss-par-add"],
-                       help="A 'faiss-base' index is a simple, un-optimized "
-                       "wrapper around a Faiss index. A 'faiss-par-add' index "
-                       "optimizes the 'add()' method by making it multi-node "
-                       "and multi-process, but with bit-wise equivalent "
-                       "results.")
-    group.add_argument("--retro-index-str", required=True,
-                       help="Index string used for calling "
-                       "faiss.index_factory(). For example, "
-                       "'IVF262144_HNSW32,Flat' or "
-                       "'OPQ32_256,IVF4194304_HNSW32,PQ32'.")
-    group.add_argument("--retro-index-ntrain", type=int, required=True,
-                       help="Number of database chunks to use for training "
-                       "the index. This value must be less or equal to the "
-                       "total number of chunks in the database.")
-    group.add_argument("--retro-index-train-load-fraction",
-                       type=float, default=1.,
-                       help="Fraction of sampled chunks to use for training "
-                       "the index. Useful when our total sampled embeddings "
-                       "use too much memory; lowering the load fraction is "
-                       "less costly than re-embedding a new sampled dataset "
-                       "from scratch.")
-    group.add_argument("--retro-index-add-load-fraction",
-                       type=float, default=1.,
-                       help="Fraction of database chunks to use for adding to "
-                       "the index. Useful when our total index size would "
-                       "use too much memory; lowering the load fraction is "
-                       "less costly than re-designing our token datasets.")
-    group.add_argument("--retro-index-no-delete-training-embeddings",
-                       action='store_false',
-                       dest="retro_index_delete_training_embeddings",
-                       help="Skip deleting training embeddings for the search "
-                       "index. Useful for debugging.")
-    group.add_argument("--retro-index-no-delete-added-codes",
-                       action='store_false',
-                       dest="retro_index_delete_added_codes",
-                       help="Skip deleting added codes for the search "
-                       "index. Useful for debugging.")
-
-    # Query args.
-    group.add_argument("--retro-query-ef-search", type=int, default=256,
-                       help="Index ef-search parameter for HNSW during querying.")
-    group.add_argument("--retro-query-nprobe", type=int, default=65536,
-                       help="Index nprobe parameter for IVF during querying.")
-    group.add_argument("--retro-query-num-neighbors-query", type=int, default=200,
-                       help="Number of neighbors to retrieve when calling "
-                       "index.search().")
-    group.add_argument("--retro-query-num-neighbors-save", type=int, default=20,
-                       help="Number of neighbors to save to disk after "
-                       "the index's returned neighbors. If longer than target "
-                       "value, neighbors truncated; and if shorter than target "
-                       "value, neighbors are padded with -1's.")
-
-    # Enforce argument naming convention.
-    for action in group._group_actions:
-        prefix = action.dest.split("_")[0]
-        assert prefix == "retro", \
-            "Retro args must be prefixed with '--retro-*', for consistent " \
-            "styling. Please fix '%s'." % ", ".join(action.option_strings)
-
-    return parser
-
-
-def save_args(args):
-    '''Save copy of args within retro workdir.'''
-
-    def default_dump(obj):
-        if isinstance(obj, torch.dtype):
-            return str(obj)
-        else:
-            raise Exception("specialize for <%s>." % type(obj).__name__)
-
-    if torch.distributed.get_rank() == 0:
-        args_path = get_args_path(args.retro_workdir)
-        with open(args_path, "w") as f:
-            json.dump(vars(args), f, indent=4, default=default_dump)
-
-    torch.distributed.barrier()
-
-
-if __name__ == "__main__":
-
-    # Initalize Megatron.
-    initialize_megatron(extra_args_provider=add_retro_args)
-
-    # Split retro tasks.
-    args = get_args()
-    args.retro_tasks = args.retro_tasks.split(",")
-
-    # Save/set retro args.
-    os.makedirs(args.retro_workdir, exist_ok=True)
-    save_args(args)
-    set_retro_args(args)
-
-    # Select task to run.
-    for task in args.retro_tasks:
-
-        print_rank_0("start '%s'." % task)
-
-        # Run all stages.
-        if task == "build":
-            build_db()
-            torch.distributed.barrier()
-            build_index()
-            torch.distributed.barrier()
-            query_pretraining_neighbors()
-
-        # DB (i.e., chunk db).
-        elif task == "db-build":
-            build_db()
-
-        # Index.
-        elif task == "index-build":
-            build_index() # calls both train + add.
-        elif task == "index-train":
-            train_index() # train only
-        elif task == "index-add":
-            add_to_index() # add only
-
-        # Pretraining.
-        elif task == "query-pretraining-neighbors":
-            query_pretraining_neighbors()
-
-        else:
-            raise Exception("specialize for task '%s'." % task)
-
-        torch.distributed.barrier()
-
-        print_rank_0("end '%s'." % task)
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/query/__init__.py b/toolbox/Megatron-DeepSpeed/tools/retro/query/__init__.py
deleted file mode 100644
index 8ea709941b18979cf43da0103255b7ce5c98e5ee..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/query/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-from .query import query_pretraining_neighbors
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/query/chunk_dataset.py b/toolbox/Megatron-DeepSpeed/tools/retro/query/chunk_dataset.py
deleted file mode 100644
index b795985ed52c3f1e8ebccaec82419f013833ff2a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/query/chunk_dataset.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import os
-import torch
-
-from megatron_ds import get_retro_args, print_rank_0
-from megatron_ds.data.gpt_dataset import build_train_valid_test_datasets \
-    as build_gpt_train_valid_test_datasets
-from megatron_ds.training import (
-    build_train_valid_test_datasets as build_pretraining_train_valid_test_datasets,
-    update_train_iters,
-)
-from tools.retro.db.utils import get_indexed_dataset_infos
-from tools.retro.utils import get_num_chunks_per_sample
-
-from .utils import get_neighbor_dirname, get_query_workdir
-
-
-class ChunkDataset(torch.utils.data.Dataset):
-    '''Pretraining chunk dataset wraps a standard GPT dataset.
-
-    This dataset conceptually divides each sample (e.g., length 2048)
-    into chunks (e.g., length 64) and restructures them into a list of
-    chunks (e.g., length num_samples * num_chunks_per_sample).
-    '''
-
-    def __init__(self, sample_dataset, chunk_length):
-
-        super().__init__()
-
-        self.sample_dataset = sample_dataset
-
-        self.chunk_length = chunk_length
-        self.n_chunks_per_sample = get_num_chunks_per_sample()
-        self.n_samples = len(sample_dataset)
-        self.n_chunks = self.n_samples * self.n_chunks_per_sample
-
-    def __len__(self):
-        return self.n_chunks
-
-    def __getitem__(self, idx):
-
-        # Convert global chunk index to global sample index & local chunk index.
-        sample_idx = idx // self.n_chunks_per_sample
-        chunk_idx = idx % self.n_chunks_per_sample
-
-        # Extract sample data.
-        sample = self.sample_dataset[sample_idx]
-        sample_token_ids = sample["text"]
-        sample_doc_ids = sample["doc_ids"]
-
-        # Chunk start/end token idxs.
-        token_start_idx = chunk_idx * self.chunk_length
-        token_end_idx = token_start_idx + self.chunk_length
-        chunk_token_ids = sample_token_ids[token_start_idx:token_end_idx]
-
-        # Sample.
-        return {
-            "doc_ids" : sample_doc_ids,
-            "text" : chunk_token_ids,
-        }
-
-
-def verify_indexed_dataset_order():
-    '''Verify pretraining order same as DB order.'''
-
-    args = get_retro_args()
-
-    # DB dataset prefixes.
-    db_indexed_dataset_infos = get_indexed_dataset_infos()
-    db_prefixes = [ info["prefix"] for info in db_indexed_dataset_infos ]
-
-    # Verify order & prefixes.
-    assert len(args.data_path) >= 2, "blendable dataset supported only."
-    pretraining_prefixes = args.data_path[1:None:2]
-
-    if len(db_prefixes) != len(pretraining_prefixes):
-        raise Exception("inconsistent dataset count between db & pretraining.")
-    if db_prefixes != pretraining_prefixes:
-        raise Exception("inconsistent dataset order between db & pretraining.")
-
-
-def train_valid_test_datasets_provider(train_val_test_num_samples):
-    """Build train, valid, and test datasets."""
-
-    args = get_retro_args()
-
-    print_rank_0('> building train, validation, and test datasets '
-                 'for GPT ...')
-    train_ds, valid_ds, test_ds = build_gpt_train_valid_test_datasets(
-        data_prefix=args.retro_gpt_data_path,
-        data_impl=args.retro_gpt_data_impl,
-        splits_string=args.retro_gpt_split,
-        train_valid_test_num_samples=train_val_test_num_samples,
-        seq_length=args.retro_gpt_seq_length,
-        seed=args.retro_gpt_seed,
-        skip_warmup=(not args.retro_gpt_mmap_warmup),
-        return_doc_ids=args.retro_return_doc_ids)
-    print_rank_0("> finished creating pretrained GPT datasets ...")
-
-    return train_ds, valid_ds, test_ds
-
-
-def get_chunk_dataset_map():
-    '''Get train, valid, test chunk datasets.'''
-
-    args = get_retro_args()
-
-    # Update train iters.
-    update_train_iters(args)
-
-    args.iteration = 0
-    args.consumed_train_samples = 0
-
-    # Verify indexed dataset order.
-    verify_indexed_dataset_order()
-
-    # Datasets.
-    print_rank_0(" > datasets.")
-    train_ds, valid_ds, test_ds = build_pretraining_train_valid_test_datasets(
-        train_valid_test_datasets_provider)
-
-    sample_dataset_map = {
-        "train" : train_ds,
-        "valid" : valid_ds,
-        "test" : test_ds,
-    }
-
-    # Info dict.
-    chunk_dataset_map = {
-        key : {
-            "neighbor_dir" : get_neighbor_dirname(key, sample_ds),
-            "data" : ChunkDataset(sample_ds, args.retro_gpt_chunk_length),
-        }
-        for key, sample_ds in sample_dataset_map.items() if sample_ds
-    }
-
-    return chunk_dataset_map
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/query/query.py b/toolbox/Megatron-DeepSpeed/tools/retro/query/query.py
deleted file mode 100644
index dc54355ff2f21302c984e0c6283b4205f462b50a..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/query/query.py
+++ /dev/null
@@ -1,252 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import numpy as np
-import os
-import psutil
-import time
-import torch
-from tqdm import tqdm
-
-from megatron_ds import get_retro_args, print_rank_0
-from tools.bert_embedding import BertEmbedder
-from tools.bert_embedding.utils import get_missing_blocks_by_rank
-from tools.retro.db.utils import \
-    get_merged_train_dataset as get_db_merged_train_dataset
-from tools.retro.external_libs import faiss, h5py
-from tools.retro.index.factory import IndexFactory
-from tools.retro.index.utils import get_index_dir
-from tools.retro.utils import GPTToTextDataset
-
-from .chunk_dataset import get_chunk_dataset_map as get_query_dataset_map
-
-
-def get_index(ondisk=False):
-    '''Read index from disk.'''
-
-    args = get_retro_args()
-
-    # Load index.
-    index_wrapper = IndexFactory.get_index(args.retro_index_type)
-    index_dir = get_index_dir()
-    added_index_path = index_wrapper.get_added_index_path()
-    if ondisk:
-        index = faiss.read_index(added_index_path, faiss.IO_FLAG_MMAP)
-    else:
-        index = faiss.read_index(added_index_path)
-
-    # Search parameters.
-    faiss.ParameterSpace().set_index_parameter(index, "efSearch",
-                                               args.retro_query_ef_search)
-    faiss.ParameterSpace().set_index_parameter(index, "nprobe",
-                                               args.retro_query_nprobe)
-
-    return index
-
-
-def embed_block(gpt_dataset, block, embedder):
-    '''Embed block of chunks.'''
-    text_block_dataset = torch.utils.data.Subset(
-        GPTToTextDataset(gpt_dataset),
-        range(*block["range"]),
-    )
-    return embedder.embed_text_dataset(text_block_dataset)
-
-
-def query_embeddings(db_dataset, index,
-                     embeddings, chunk_id_range,
-                     sample_map, n_chunks_per_sample,
-                     verbose=True):
-    '''Query neighbors of a block of embeddings.'''
-
-    args = get_retro_args()
-
-    # Query neighbor ids.
-    if verbose: print_rank_0("search.")
-    t = time.time()
-    assert index.ntotal > 0, "check we don't accidentally have an empty index."
-    _, query_neighbor_ids = \
-        index.search(embeddings, args.retro_query_num_neighbors_query)
-    if verbose: print_rank_0("  time : %.3f sec." % (time.time() - t))
-
-    # Filter banned neighbor ids.
-    if verbose: print_rank_0("filter banned neighbor ids.")
-    filtered_neighbor_ids = np.full(
-        shape=(len(query_neighbor_ids), args.retro_query_num_neighbors_save),
-        fill_value=-1,
-        dtype="int64",
-    )
-    min_chunk_id, max_chunk_id = chunk_id_range
-    for chunk_id in range(min_chunk_id, max_chunk_id):
-
-        sample_id = chunk_id // n_chunks_per_sample
-        sample = sample_map[sample_id]
-        sample_dataset_idx = sample["dataset_idx"].item()
-        sample_doc_ids = sample["doc_ids"].tolist()
-        sample_doc_tuples = [(sample_dataset_idx, d) for d in sample_doc_ids]
-        
-        # Get valid neighbors (!= -1).
-        query_row = [ i for i in query_neighbor_ids[chunk_id-min_chunk_id]
-                      if i >= 0 ]
-
-        # Filter row.
-        filtered_row = [ i for i in query_row
-                         if tuple(db_dataset.doc_tuples[i].tolist())
-                         not in sample_doc_tuples ]
-        filtered_row = filtered_row[:args.retro_query_num_neighbors_save]
-        filtered_row += \
-            [-1] * (args.retro_query_num_neighbors_save - len(filtered_row))
-        filtered_neighbor_ids[chunk_id-min_chunk_id] = filtered_row
-
-    return query_neighbor_ids, filtered_neighbor_ids
-
-
-def query_embedding_block(db_dataset, index,
-                          embeddings, chunk_id_range,
-                          sample_map, n_chunks_per_sample):
-
-    query_neighbor_ids = []
-    filtered_neighbor_ids = []
-
-    # Query in sub-blocks.
-    partial_block_size = 1000
-    for partial_start_idx in tqdm(
-            range(0, len(embeddings), partial_block_size),
-            "search",
-    ):
-        partial_end_idx = min(len(embeddings),
-                              partial_start_idx + partial_block_size)
-        partial_embeddings = embeddings[partial_start_idx:partial_end_idx]
-        partial_chunk_id_range = (
-            chunk_id_range[0] + partial_start_idx,
-            chunk_id_range[0] + partial_end_idx,
-        )
-        partial_query_neighbor_ids, partial_filtered_neighbor_ids = \
-            query_embeddings(db_dataset, index,
-                             partial_embeddings, partial_chunk_id_range,
-                             sample_map, n_chunks_per_sample,
-                             verbose=False)
-        query_neighbor_ids.append(partial_query_neighbor_ids)
-        filtered_neighbor_ids.append(partial_filtered_neighbor_ids)
-
-    # Concatenate.
-    query_neighbor_ids = np.concatenate(query_neighbor_ids, axis=0)
-    filtered_neighbor_ids = np.concatenate(filtered_neighbor_ids, axis=0)
-
-    return query_neighbor_ids, filtered_neighbor_ids
-
-
-def query_block_neighbors(db_dataset, query_dataset,
-                          index, embedder,
-                          block):
-    '''Query neighbors of a dataset block (i.e., range).'''
-
-    args = get_retro_args()
-    n_chunks_per_sample = query_dataset.n_chunks_per_sample
-
-    # Sample map.
-    sample_ids = sorted(list(set(chunk_id // n_chunks_per_sample
-                                 for chunk_id in range(*block["range"]))))
-    sample_map = {}
-    for i in sample_ids:
-        sample = query_dataset.sample_dataset[i]
-        sample_map[i] = {
-            "dataset_idx" : sample["dataset_idx"],
-            "doc_ids" : sample["doc_ids"],
-        }
-
-    # Embed block.
-    embeddings = embed_block(query_dataset, block, embedder)
-
-    # Query embeddings.
-    _, filtered_neighbor_ids = query_embedding_block(
-        db_dataset, index,
-        embeddings, block["range"],
-        sample_map, n_chunks_per_sample)
-
-    # Save neighbors.
-    print_rank_0("save neighbors.")
-    os.makedirs(os.path.dirname(block["path"]), exist_ok=True)
-    f = h5py.File(block["path"], "w")
-    f.create_dataset("neighbors", data=filtered_neighbor_ids)
-    f.close()
-
-
-def query_dataset_neighbors(db_dataset, query_dataset,
-                            prefix, neighbor_dir,
-                            index, embedder):
-    '''Query neighbors of each chunk within a dataset.'''
-
-    args = get_retro_args()
-
-    def validate(f):
-        assert f["neighbors"].shape[1] == args.retro_query_num_neighbors_save, \
-            "neighbors.shape == %s; num_neighbors_target == %d." % (
-                str(f["neighbors"].shape),
-                args.retro_num_neighbors_target,
-            )
-    n_missing_blocks, missing_neighbor_blocks = get_missing_blocks_by_rank(
-        neighbor_dir,
-        len(query_dataset),
-        args.retro_block_size,
-        validate=validate,
-    )
-
-    # Query each block.
-    for block_index, block in enumerate(missing_neighbor_blocks):
-
-        if block is not None:
-
-            # Progress.
-            print_rank_0("query '%s' block %d / %d ... %s ... mem %.3f gb, %.1f%%." % (
-                prefix,
-                block_index,
-                len(missing_neighbor_blocks),
-                os.path.basename(block["path"]),
-                psutil.virtual_memory()[3] / 1024**3,
-                psutil.virtual_memory()[2],
-            ))
-
-            # Query block neighbors.
-            query_block_neighbors(db_dataset, query_dataset,
-                                  index, embedder,
-                                  block)
-
-        # Synchronize progress across all ranks. (for easier observation)
-        print_rank_0(" > waiting for other ranks to finish block.")
-        torch.distributed.barrier()
-
-
-def query_pretraining_neighbors():
-    '''Query pretraining datasets (train & valid).'''
-
-    args = get_retro_args()
-
-    # Num threads.
-    faiss.omp_set_num_threads(64)
-
-    # Load chunk db dataset.
-    print_rank_0("load chunk db dataset.")
-    db_dataset = get_db_merged_train_dataset()
-    db_dataset.load_doc_tuples()
-
-    # Load index.
-    print_rank_0(" > get index.")
-    index = get_index()
-
-    # Load datasets.
-    print_rank_0(" > get dataset map.")
-    query_dataset_map = get_query_dataset_map()
-
-    # Bert embedder.
-    embedder = BertEmbedder(args.retro_bert_batch_size,
-                            args.retro_bert_max_chunk_length,
-                            args.bert_embedder_type)
-
-    # Query each (i.e., train, valid, test) dataset.
-    print_rank_0(" > query.")
-    for prefix, info in query_dataset_map.items():
-        print_rank_0(" > query '%s' dataset ... %d samples." %
-                     (prefix, len(info["data"])))
-        query_dataset_neighbors(db_dataset, info["data"],
-                                prefix, info["neighbor_dir"],
-                                index, embedder)
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/query/retro_dataset.py b/toolbox/Megatron-DeepSpeed/tools/retro/query/retro_dataset.py
deleted file mode 100644
index 38bba2532a6cb3c53755bd018320e5d78b2680fd..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/query/retro_dataset.py
+++ /dev/null
@@ -1,169 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import numpy as np
-import os
-import torch
-
-from megatron_ds import get_args, get_retro_args
-from tools.bert_embedding.utils import BlockPathMap
-from tools.retro.db.utils import get_merged_train_dataset as get_db_dataset
-from tools.retro.external_libs import h5py
-
-from .chunk_dataset import get_chunk_dataset_map
-from .utils import get_neighbor_dirname
-
-
-class RetroDataset(torch.utils.data.Dataset):
-    '''Dataset of retro samples.
-
-    Each sample contains the original GPT sample, along with the token IDs
-    of each neighbor of each chunk within the sequence. Neighbor array has
-    shape (num_chunks_per_sample, num_neighbors, num_retrieved_tokens).
-    '''
-
-    def __init__(self,
-                 num_neighbors,
-                 num_retrieved_chunks,
-                 block_size,
-                 db_dataset,
-                 chunk_dataset,
-                 neighbor_path_map):
-        '''Note: chunk dataset wraps original GPT dataset (see
-        chunk_dataset.py).'''
-
-        super().__init__()
-
-        self.num_neighbors = num_neighbors
-        self.num_retrieved_chunks = num_retrieved_chunks
-        self.block_size = block_size
-        self.db_dataset = db_dataset
-        self.chunk_dataset = chunk_dataset
-        self.neighbor_path_map = neighbor_path_map
-
-    def __len__(self):
-        return len(self.chunk_dataset.sample_dataset)
-
-    def __getitem__(self, sample_idx):
-
-        n_chunks_per_sample = self.chunk_dataset.n_chunks_per_sample
-
-        # Get standard sample.
-        sample = self.chunk_dataset.sample_dataset[sample_idx]
-
-        # Sample idx to chunk idxs.
-        chunk_idxs = list(range(
-            sample_idx * n_chunks_per_sample,
-            (sample_idx + 1) * n_chunks_per_sample,
-        ))
-
-        # Collect retrieved tokens.
-        all_retrieved_chunk_ids = []
-        all_retrieved_token_ids = []
-        for chunk_idx in chunk_idxs:
-
-            # Neighbor chunk ids.
-            neighbor_path = self.neighbor_path_map[chunk_idx]
-            with h5py.File(neighbor_path, "r") as f:
-                neighbor_chunk_ids = f["neighbors"] \
-                    [chunk_idx % self.block_size, :self.num_neighbors].tolist()
-
-            # Retrieved (neighbor + continuation) token ids.
-            retrieved_chunk_ids = []
-            retrieved_token_ids = []
-            for neighbor_chunk_id in neighbor_chunk_ids:
-                current_chunk_ids = [
-                    i % len(self.db_dataset)
-                    for i in range(
-                            neighbor_chunk_id,
-                            neighbor_chunk_id + self.num_retrieved_chunks)]
-                current_token_ids = [self.db_dataset[ci]["text"]
-                                     for ci in current_chunk_ids]
-                retrieved_chunk_ids.append(current_chunk_ids)
-                retrieved_token_ids.append(current_token_ids)
-
-            # Collect retrieved tokens.
-            all_retrieved_chunk_ids.append(retrieved_chunk_ids)
-            all_retrieved_token_ids.append(retrieved_token_ids)
-
-        # Reshape retrieved tokens.
-        all_retrieved_chunk_ids = np.array(all_retrieved_chunk_ids) \
-            .reshape((n_chunks_per_sample, self.num_neighbors, -1))
-        all_retrieved_token_ids = np.array(all_retrieved_token_ids) \
-            .reshape((n_chunks_per_sample, self.num_neighbors, -1))
-
-        # Sample.
-        sample = {
-            **sample,
-            "neighbor_chunks" : all_retrieved_chunk_ids,
-            "neighbor_tokens" : all_retrieved_token_ids,
-        }
-
-        return sample
-
-
-def get_retro_datasets(verify_sizes=True):
-    '''Get train, valid, test retro datasets.'''
-
-    args = get_args()
-    retro_args = get_retro_args()
-
-    # DB dataset.
-    db_dataset = get_db_dataset()
-
-    # Retro datasets.
-    chunk_ds_info_map = get_chunk_dataset_map()
-    retro_dataset_map = {}
-    for data_key, chunk_ds_info in chunk_ds_info_map.items():
-
-        chunk_dataset = chunk_ds_info["data"]
-        neighbor_dir = chunk_ds_info["neighbor_dir"]
-        neighbor_path_map = BlockPathMap.from_dir(neighbor_dir,
-                                                  retro_args.retro_block_size)
-
-        # Verify dataset prefixes.
-        expected_dir = get_neighbor_dirname(data_key, chunk_dataset.sample_dataset)
-        assert expected_dir == neighbor_dir, \
-            "inconsistent dataset source; '%s' vs. '%s'." % \
-            (expected_dir, neighbor_dir)
-
-        # Verify num chunks.
-        n_sample_chunks = len(chunk_dataset)
-        n_neighbor_chunks = neighbor_path_map.max_idx
-
-        if not os.path.isdir(neighbor_dir):
-            if torch.distributed.get_rank() == 0:
-                raise Exception("neighbor directory '%s' not found; please "
-                                "compare --train-samples, --seq-length, --seed, "
-                                "--eval-iters, and --eval-interval, with "
-                                "retro preprocessing args." %
-                                neighbor_dir)
-            torch.distributed.barrier()
-            exit()
-
-        if verify_sizes and n_sample_chunks != n_neighbor_chunks:
-            if torch.distributed.get_rank() == 0:
-                print("neighbor_dir : %s" % neighbor_dir)
-                print("neighbor_path_map : %s" % neighbor_path_map)
-                raise Exception("num sampled chunks (%d) != num neighbor chunks "
-                                "(%d); did you complete querying the entire "
-                                "pretraining dataset?"
-                                % (n_sample_chunks, n_neighbor_chunks))
-            torch.distributed.barrier()
-            exit()
-
-        # Retro dataset.
-        retro_dataset_map[data_key] = RetroDataset(
-            num_neighbors=args.retro_num_neighbors,
-            num_retrieved_chunks=args.retro_num_retrieved_chunks,
-            block_size=retro_args.retro_block_size,
-            db_dataset=db_dataset,
-            chunk_dataset=chunk_dataset,
-            neighbor_path_map=neighbor_path_map,
-        )
-
-    # Extract datasets.
-    train_ds = retro_dataset_map.get("train", None)
-    valid_ds = retro_dataset_map.get("valid", None)
-    test_ds = retro_dataset_map.get("test", None)
-
-    return train_ds, valid_ds, test_ds
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/query/utils.py b/toolbox/Megatron-DeepSpeed/tools/retro/query/utils.py
deleted file mode 100644
index 41acc797f415245ad3a6cd922da11a39555b3772..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/query/utils.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import hashlib
-import os
-
-from megatron_ds import get_retro_args
-
-
-def get_query_workdir():
-    args = get_retro_args()
-    return os.path.join(args.retro_workdir, "query")
-
-
-def get_neighbor_dirname(key, dataset):
-    hashes = ",".join([ d.desc_hash for d in dataset.datasets ])
-    hash = hashlib.md5(hashes.encode()).hexdigest()
-    return os.path.join(get_query_workdir(), os.path.basename(f"{key}_{hash}"))
diff --git a/toolbox/Megatron-DeepSpeed/tools/retro/utils.py b/toolbox/Megatron-DeepSpeed/tools/retro/utils.py
deleted file mode 100644
index cba68c7967826b95569022f73cee98b7013b4b3c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/retro/utils.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-
-import os
-import torch
-import types
-
-from megatron_ds import get_retro_args
-from megatron_ds.tokenizer.tokenizer import (
-    _BertWordPieceTokenizer,
-    _GPT2BPETokenizer,
-    _GPTSentencePieceTokenizer,
-)
-
-
-def get_args_path(workdir):
-    '''Argument copy stored within retro workdir.'''
-    return os.path.join(workdir, "args.json")
-
-
-def get_num_chunks_per_sample():
-    '''Compute seq_length // chunk_length.'''
-    args = get_retro_args()
-    sample_length = args.retro_gpt_seq_length
-    chunk_length = args.retro_gpt_chunk_length
-    assert sample_length % chunk_length == 0
-    return sample_length // chunk_length
-
-
-def get_gpt_tokenizer():
-    '''GPT (BPE) tokenizer.'''
-    args = get_retro_args()
-    tokenizer_type = args.retro_gpt_tokenizer_type
-    if tokenizer_type == "GPT2BPETokenizer":
-        assert args.retro_gpt_vocab_file and args.retro_gpt_merge_file
-        return _GPT2BPETokenizer(
-            vocab_file=args.retro_gpt_vocab_file,
-            merge_file=args.retro_gpt_merge_file,
-        )
-    elif tokenizer_type == 'GPTSentencePieceTokenizer':
-        assert args.retro_gpt_tokenizer_model is not None
-        return _GPTSentencePieceTokenizer(args.retro_gpt_tokenizer_model)
-    else:
-        raise Exception("unrecognized gpt tokenizer, '%s'." % tokenizer_type)
-
-
-def get_bert_tokenizer():
-    '''Bert (Wordpiece) tokenizer.'''
-    args = get_retro_args()
-    lower_case = {
-        "BertWordPieceLowerCase" : True,
-        "BertWordPieceCase" : False,
-    }[args.retro_bert_tokenizer_type]
-    return _BertWordPieceTokenizer(
-        vocab_file=args.retro_bert_vocab_file,
-        lower_case=lower_case,
-    )
-
-
-class GPTToTextDataset(torch.utils.data.Dataset):
-    '''Dataset to convert GPT tokens to text.'''
-
-    def __init__(self, gpt_dataset):
-
-        super().__init__()
-
-        self.gpt_dataset = gpt_dataset
-        self.gpt_tokenizer = get_gpt_tokenizer()
-
-    def __len__(self):
-        return len(self.gpt_dataset)
-
-    def __getitem__(self, idx):
-        gpt_token_ids = self.gpt_dataset[idx]["text"].tolist()
-        text = self.gpt_tokenizer.detokenize(gpt_token_ids)
-        return {"text": text}
diff --git a/toolbox/Megatron-DeepSpeed/tools/run_text_generation_server.py b/toolbox/Megatron-DeepSpeed/tools/run_text_generation_server.py
deleted file mode 100644
index e08b1d55cdcdb39c51043383c5a3e9e036c7e1e8..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/run_text_generation_server.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-"""Sample Generate GPT"""
-import os
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir)))
-import socket
-from megatron_ds import get_args
-from megatron_ds import print_rank_0
-from megatron_ds.core import mpu
-from megatron_ds.checkpointing import load_checkpoint
-from megatron_ds.initialize import initialize_megatron
-from megatron_ds.model import GPTModel
-from megatron_ds.training import get_model
-from megatron_ds.arguments import core_transformer_config_from_args
-from megatron_ds.text_generation_server import MegatronServer
-from megatron_ds.text_generation import generate_and_post_process
-from megatron_ds.text_generation import beam_search_and_post_process
-import torch
-
-def model_provider(pre_process=True, post_process=True):
-    """Build the model."""
-
-    config = core_transformer_config_from_args(get_args())
-
-    print_rank_0('building GPT model ...')
-    model = GPTModel(config=config, num_tokentypes=0, parallel_output=False, pre_process=pre_process, post_process=post_process)
-
-    return model
-
-def add_text_generate_args(parser):
-    group = parser.add_argument_group(title='text generation')
-
-    group.add_argument("--temperature", type=float, default=1.0,
-                       help='Sampling temperature.')
-    group.add_argument("--top_p", type=float, default=0.0,
-                       help='Top p sampling.')
-    group.add_argument("--top_k", type=int, default=0,
-                       help='Top k sampling.')
-    group.add_argument("--out-seq-length", type=int, default=1024,
-                       help='Size of the output generated text.')
-    return parser
-
-
-if __name__ == "__main__":
-    initialize_megatron(extra_args_provider=add_text_generate_args,
-                        args_defaults={'tokenizer_type': 'GPT2BPETokenizer',
-                                       'no_load_rng': True,
-                                       'no_load_optim': True})
-
-    args = get_args()
-    if args.num_layers_per_virtual_pipeline_stage is not None:
-        print("Interleaved pipeline schedule is not yet supported for text generation.")
-        exit()
-    # Set up model and load checkpoint
-    model = get_model(model_provider, wrap_with_ddp=False)
-
-    if args.load is not None:
-        _ = load_checkpoint(model, None, None)
-
-    assert len(model) == 1, "Above condition should have caught this"
-    model = model[0]
-    if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
-        server = MegatronServer(model)
-        server.run("0.0.0.0")
-
-    while True:
-        choice = torch.cuda.LongTensor(1)
-        torch.distributed.broadcast(choice, 0)
-        if choice[0].item() == 0:
-            try:
-                generate_and_post_process(model)
-            except ValueError as ve:
-                pass
-        elif choice[0].item() == 1:
-            try:
-                beam_search_and_post_process(model)
-            except ValueError as ve:
-                pass
diff --git a/toolbox/Megatron-DeepSpeed/tools/text_generation_cli.py b/toolbox/Megatron-DeepSpeed/tools/text_generation_cli.py
deleted file mode 100644
index 223928cf686f0cb3f5b39f5681ac16074aac044c..0000000000000000000000000000000000000000
--- a/toolbox/Megatron-DeepSpeed/tools/text_generation_cli.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-import sys
-import json
-import requests
-
-
-if __name__ == "__main__":
-    url = sys.argv[1]
-    url = 'http://' + url + '/api'
-    headers = {'Content-Type': 'application/json'}
-
-    while True:
-        sentence = input("Enter prompt: ")
-        tokens_to_generate = int(eval(input("Enter number of tokens to generate: ")))
-
-        data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate}
-        response = requests.put(url, data=json.dumps(data), headers=headers)
-
-        if response.status_code != 200:
-            print(f"Error {response.status_code}: {response.json()['message']}")
-        else:
-            print("Megatron Response: ")
-            print(response.json()['text'][0])