diff --git a/.gitignore b/.gitignore index 5829c86c7903c3c592fefc0945bb15e3cdbc711d..edf76c3ae88cbf5f4da7553948943b1533fa4844 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ *.pyc -data/ \ No newline at end of file +data/ + +.vscode + +cs285.egg-info/ \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..34062964d5806660fe112d1807b21e0f8dc6a600 --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2021, 张聪明 +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index 0cef3f054f42663bd299f86956e3a118ef680093..d89eafdd31d0132eeed16f7868ad4998a070ad3f 100644 --- a/README.md +++ b/README.md @@ -1 +1,10 @@ Assignments for [Berkeley CS 285: Deep Reinforcement Learning, Decision Making, and Control](http://rail.eecs.berkeley.edu/deeprlcourse/). + +# Progress log +## hw1 +2021/5/18: the code and solution.md are finished, but I am still looking into some of the details, such as the absolute training performance. + +[See solution.md for details](hw1/solution.md) + +[Corresponding CSDN blog post](https://blog.csdn.net/qq_39537898/article/details/116905668) + diff --git a/hw1/README.md b/hw1/README.md index 15612f446f773ff45b63531727a6aaf56f65ccc1..a551c01af9075ad28c2528379c77ed88535c41a6 100644 --- a/hw1/README.md +++ b/hw1/README.md @@ -1,3 +1,7 @@ +# Solution File +To skip straight to the results, see [solution.md](solution.md) + + ## Setup You can run this code on your own machine or on Google Colab. 
diff --git a/hw1/cs285.egg-info/PKG-INFO b/hw1/cs285.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..cbad8d265a1a8daa81aca1f6fbc2264e9e5d6655 --- /dev/null +++ b/hw1/cs285.egg-info/PKG-INFO @@ -0,0 +1,10 @@ +Metadata-Version: 1.0 +Name: cs285 +Version: 0.1.0 +Summary: UNKNOWN +Home-page: UNKNOWN +Author: UNKNOWN +Author-email: UNKNOWN +License: UNKNOWN +Description: UNKNOWN +Platform: UNKNOWN diff --git a/hw1/cs285.egg-info/SOURCES.txt b/hw1/cs285.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..9168387e643bfa573e2c9be74b2af4b1ce3e33fd --- /dev/null +++ b/hw1/cs285.egg-info/SOURCES.txt @@ -0,0 +1,5 @@ +README.md +cs285.egg-info/PKG-INFO +cs285.egg-info/SOURCES.txt +cs285.egg-info/dependency_links.txt +cs285.egg-info/top_level.txt \ No newline at end of file diff --git a/hw1/cs285.egg-info/dependency_links.txt b/hw1/cs285.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/hw1/cs285.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/hw1/cs285.egg-info/top_level.txt b/hw1/cs285.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f6195061fc04ac5f6601db5a73dbe9dae999e14 --- /dev/null +++ b/hw1/cs285.egg-info/top_level.txt @@ -0,0 +1 @@ +cs285 diff --git a/hw1/cs285/infrastructure/logger.py b/hw1/cs285/infrastructure/logger.py index a64931c00287565fba63862add4d39eed188db1f..e60dff524eb4f44df23b5c8dc3ff4f8b2dcc13b7 100644 --- a/hw1/cs285/infrastructure/logger.py +++ b/hw1/cs285/infrastructure/logger.py @@ -30,7 +30,6 @@ class Logger: # reshape the rollouts videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] - # max rollout length max_videos_to_save = np.min([max_videos_to_save, len(videos)]) max_length = videos[0].shape[0] diff --git a/hw1/cs285/infrastructure/pytorch_util.py b/hw1/cs285/infrastructure/pytorch_util.py index bc7a408170607f7a5ee58adc135ebc02cd5ef563..a10f410678d6feb251d7e5525e7b819f976db08f 100644 --- a/hw1/cs285/infrastructure/pytorch_util.py +++ b/hw1/cs285/infrastructure/pytorch_util.py @@ -45,10 +45,24 @@ def build_mlp( if isinstance(output_activation, str): output_activation = _str_to_activation[output_activation] - # TODO: return a MLP. This should be an instance of nn.Module + # DONE TODO: return a MLP. This should be an instance of nn.Module # Note: nn.Sequential is an instance of nn.Module. 
- raise NotImplementedError - + # layers = [('linear1',nn.Linear(input_size,size)),('activation1',activation)] + # for i in range(2, n_layers): + # layers.append((f'linear{i}',nn.Linear(size,size))) + # layers.append((f'activation{i}',activation)) + # layers.extend([(f'linear{n_layers+1}',nn.Linear(size,output_size)),(f'activation{n_layers+1}',output_activation)]) + # model = nn.Sequential(OrderedDict(layers)) # need to from collections import OrderedDict + # return model + layers = [] + layers.append(nn.Linear(input_size,size)) + layers.append(activation) + for i in range(n_layers-1): + layers.append(nn.Linear(size,size)) + layers.append(activation) + layers.append(nn.Linear(size,output_size)) + layers.append(output_activation) + return nn.Sequential(*layers) device = None diff --git a/hw1/cs285/infrastructure/replay_buffer.py b/hw1/cs285/infrastructure/replay_buffer.py index 60148e79a96caa8c664411fa833f93695d85c3b9..8003b3077eaca4578a599a2671dbdac25319bcac 100644 --- a/hw1/cs285/infrastructure/replay_buffer.py +++ b/hw1/cs285/infrastructure/replay_buffer.py @@ -76,8 +76,8 @@ class ReplayBuffer(object): ## HINT 1: use np.random.permutation to sample random indices ## HINT 2: return corresponding data points from each array (i.e., not different indices from each array) ## HINT 3: look at the sample_recent_data function below - - return TODO, TODO, TODO, TODO, TODO + indices = np.random.permutation(self.obs.shape[0])[:batch_size] + return self.obs[indices], self.acs[indices], self.rews[indices], self.next_obs[indices], self.terminals[indices] def sample_recent_data(self, batch_size=1): return ( diff --git a/hw1/cs285/infrastructure/rl_trainer.py b/hw1/cs285/infrastructure/rl_trainer.py index bb27972e367f3b515ea8611cb4e73d0f1d985ad3..6d68e5fa93493bf6479c978989681992060274b0 100644 --- a/hw1/cs285/infrastructure/rl_trainer.py +++ b/hw1/cs285/infrastructure/rl_trainer.py @@ -1,7 +1,7 @@ from collections import OrderedDict import numpy as np import time - +import pickle import gym import torch @@ -45,6 +45,7 @@ class RL_Trainer(object): # Maximum length for episodes self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps + global MAX_VIDEO_LEN MAX_VIDEO_LEN = self.params['ep_len'] # Is this env continuous, or self.discrete? @@ -155,18 +156,22 @@ class RL_Trainer(object): train_video_paths: paths which also contain videos for visualization purposes """ - # TODO decide whether to load training data or use the current policy to collect more data + # DONE TODO decide whether to load training data or use the current policy to collect more data # HINT: depending on if it's the first iteration or not, decide whether to either # (1) load the data. 
In this case you can directly return as follows # ``` return loaded_paths, 0, None ``` # (2) collect `self.params['batch_size']` transitions + if itr==0: + with open(load_initial_expertdata, 'rb') as f: + loaded_paths = pickle.loads(f.read()) + return loaded_paths, 0, None - # TODO collect `batch_size` samples to be used for training + # DONE TODO collect `batch_size` samples to be used for training # HINT1: use sample_trajectories from utils # HINT2: you want each of these collected rollouts to be of length self.params['ep_len'] print("\nCollecting data to be used for training...") - paths, envsteps_this_batch = TODO + paths, envsteps_this_batch = utils.sample_trajectories(self.env, collect_policy, batch_size, self.params['ep_len']) # collect more rollouts with the same policy, to be saved as videos in tensorboard # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN @@ -174,6 +179,7 @@ if self.log_video: print('\nCollecting train rollouts to be used for saving videos...') ## TODO look in utils and implement sample_n_trajectories + # print('In Collect trajectory MAX_VIDEO_LEN:',MAX_VIDEO_LEN,'id:',id(MAX_VIDEO_LEN)) train_video_paths = utils.sample_n_trajectories(self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) return paths, envsteps_this_batch, train_video_paths @@ -184,25 +190,27 @@ all_logs = [] for train_step in range(self.params['num_agent_train_steps_per_iter']): - # TODO sample some data from the data buffer + # DONE TODO sample some data from the data buffer # HINT1: use the agent's sample function # HINT2: how much data = self.params['train_batch_size'] - ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = TODO + ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.agent.sample(self.params['train_batch_size']) - # TODO use the sampled data to train an agent + # DONE TODO use the sampled data to train an agent # HINT: use the agent's train function # HINT: keep the agent's training log for debugging - train_log = TODO + train_log = self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch) all_logs.append(train_log) return all_logs def do_relabel_with_expert(self, expert_policy, paths): print("\nRelabelling collected observations with labels from an expert policy...") - # TODO relabel collected obsevations (from our policy) with labels from an expert policy + # DONE TODO relabel collected observations (from our policy) with labels from an expert policy # HINT: query the policy (using the get_action function) with paths[i]["observation"] # and replace paths[i]["action"] with these expert labels - + for i in range(len(paths)): + action = expert_policy.get_action(paths[i]["observation"]) + paths[i]["action"] = action return paths #################################### @@ -212,11 +220,13 @@ # collect eval trajectories, for logging print("\nCollecting data for eval...") + # print('ep_len',self.params['ep_len']) eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len']) # save eval rollouts as videos in tensorboard event file if self.log_video and train_video_paths != None: print('\nCollecting video rollouts eval') + # print('In Perform logging MAX_VIDEO_LEN:',MAX_VIDEO_LEN,'id:',id(MAX_VIDEO_LEN)) eval_video_paths = utils.sample_n_trajectories(self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) #save train/eval videos @@ -229,6 +239,10 @@ class
RL_Trainer(object): # save eval metrics if self.log_metrics: # returns, for logging + # print('log_metrics eval paths length',len(eval_paths)) + # for eval_path in eval_paths: + # print('log_metrics eval path length',len(eval_path)) + # print('eval return',len(eval_path["reward"])) train_returns = [path["reward"].sum() for path in paths] eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths] diff --git a/hw1/cs285/infrastructure/utils.py b/hw1/cs285/infrastructure/utils.py index d894480b2c121bf8e5da8b3050be7ef2eec3023c..125e297b8a8ece70d2b184c1bd550b6f1b79551c 100644 --- a/hw1/cs285/infrastructure/utils.py +++ b/hw1/cs285/infrastructure/utils.py @@ -7,7 +7,7 @@ import time def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): # initialize env for the beginning of a new rollout - ob = TODO # HINT: should be the output of resetting the env + ob = env.reset() # HINT: should be the output of resetting the env # init vars obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] @@ -27,7 +27,7 @@ def sample_trajectory(env, policy, max_path_length, render=False, render_mode=(' # use the most recent ob to decide what to do obs.append(ob) - ac = TODO # HINT: query the policy's get_action function + ac = policy.get_action(ob) # HINT: query the policy's get_action function ac = ac[0] acs.append(ac) @@ -39,9 +39,9 @@ def sample_trajectory(env, policy, max_path_length, render=False, render_mode=(' next_obs.append(ob) rewards.append(rew) - # TODO end the rollout if the rollout ended + # DONE TODO end the rollout if the rollout ended # HINT: rollout can end due to done, or due to max_path_length - rollout_done = TODO # HINT: this is either 0 or 1 + rollout_done = 1 if steps>=max_path_length else done # HINT: this is either 0 or 1 terminals.append(rollout_done) if rollout_done: @@ -53,15 +53,16 @@ def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, r """ Collect rollouts until we have collected min_timesteps_per_batch steps. - TODO implement this function + DONE TODO implement this function Hint1: use sample_trajectory to get each path (i.e. rollout) that goes into paths Hint2: use get_pathlength to count the timesteps collected in each path """ timesteps_this_batch = 0 paths = [] while timesteps_this_batch < min_timesteps_per_batch: - - TODO + path = sample_trajectory(env, policy, max_path_length, render) + paths.append(path) + timesteps_this_batch += get_pathlength(path) return paths, timesteps_this_batch @@ -69,12 +70,13 @@ def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, ren """ Collect ntraj rollouts. - TODO implement this function + DONE TODO implement this function Hint1: use sample_trajectory to get each path (i.e. 
rollout) that goes into paths """ paths = [] - - TODO + for i in range(ntraj): + path = sample_trajectory(env, policy, max_path_length, render) + paths.append(path) return paths diff --git a/hw1/cs285/policies/MLP_policy.py b/hw1/cs285/policies/MLP_policy.py index c8e1fd7d4fa7f8f8b97865200f20daea353f32d9..6c2ae10a9a5a053b1cbda3a502e06fe233101025 100644 --- a/hw1/cs285/policies/MLP_policy.py +++ b/hw1/cs285/policies/MLP_policy.py @@ -79,9 +79,8 @@ class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): observation = obs else: observation = obs[None] - - # TODO return the action that the policy prescribes - raise NotImplementedError + # DONE TODO return the action that the policy prescribes + return ptu.to_numpy(self.forward(ptu.from_numpy(observation))) # update/train this policy def update(self, observations, actions, **kwargs): @@ -93,7 +92,11 @@ class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): # return more flexible objects, such as a # `torch.distributions.Distribution` object. It's up to you! def forward(self, observation: torch.FloatTensor) -> Any: - raise NotImplementedError + # DONE TODO raise NotImplementedError + if self.discrete: + return self.logits_na(observation) + else: + return self.mean_net(observation) ##################################################### @@ -108,8 +111,12 @@ class MLPPolicySL(MLPPolicy): self, observations, actions, adv_n=None, acs_labels_na=None, qvals=None ): - # TODO: update the policy and return the loss - loss = TODO + # DONE TODO: update the policy and return the loss + self.optimizer.zero_grad() # zeroes the gradient buffers of all parameters + loss = self.loss(self.forward(ptu.from_numpy(observations)),ptu.from_numpy(actions)) + # note: we cannot use get_action here, since to_numpy would detach the tensor and drop its grad_fn + loss.backward() # backprop + self.optimizer.step() # does the update return { # You can add extra logging information here, but keep this line 'Training Loss': ptu.to_numpy(loss), diff --git a/hw1/cs285/scripts/run_hw1.ipynb b/hw1/cs285/scripts/run_hw1.ipynb index 476b63265b7ee3a7e7c5b14c7ca74f7b8b54f581..877ebf1fad7d91cf55672401e495db945a356c79 100644 --- a/hw1/cs285/scripts/run_hw1.ipynb +++ b/hw1/cs285/scripts/run_hw1.ipynb @@ -551,4 +551,4 @@ ] } ] -} +} \ No newline at end of file diff --git a/hw1/cs285/scripts/run_hw1.py b/hw1/cs285/scripts/run_hw1.py index 2a4a73de424f327c45ac1047e13aeece8af4728a..377f01e3c7493e3311bf14e54760f511311da83e 100644 --- a/hw1/cs285/scripts/run_hw1.py +++ b/hw1/cs285/scripts/run_hw1.py @@ -112,7 +112,12 @@ def main(): ################### ### RUN TRAINING ################### - + # To sweep eval_batch_size over repeated runs, uncomment this block and comment out the two lines below + # add_item = 1000 + # for step in range(10): + # params['eval_batch_size'] = add_item*(step+1) + # trainer = BC_Trainer(params) + # trainer.run_training_loop() trainer = BC_Trainer(params) trainer.run_training_loop() diff --git a/hw1/solution.md b/hw1/solution.md new file mode 100644 index 0000000000000000000000000000000000000000..49532a1073228b62b3a337c8fd691522591ca466 --- /dev/null +++ b/hw1/solution.md @@ -0,0 +1,264 @@ +# Preface +The prerequisite for running anything is having the dependencies installed, so make sure you have read README.md and installation.md. If you do not need the preface, skip straight to the **Running the programs and answers** section. None of the code below uses the GPU, since my machine's GPU is too old to be supported. + +## Install mujoco: +``` +mkdir ~/.mujoco +cd ~/.mujoco +wget https://www.roboti.us/download/mujoco200_linux.zip +unzip mujoco200_linux.zip +mv mujoco200_linux mujoco200 +rm mujoco200_linux.zip +cp . +``` +The .txt license key used here has to be applied for: [https://www.roboti.us/license.html](https://www.roboti.us/license.html) + + +## Set up the conda environment first +A. (Recommended) Install with conda: + +1. Install conda + +2. Create a conda environment that will contain python 3: +``` +conda create -n cs285 python=3.6 +``` + +3. activate the environment (do this every time you open a new terminal and want to run code): +``` +source activate cs285 +``` + +4. Install the requirements into this conda environment +``` +pip install --user -r requirements.txt +``` + +5. Allow your code to be able to see 'cs285' +``` +cd +$ pip install -e . +``` +## Notes on errors encountered when running +That is not everything: a few extra system dependencies are needed on top of the Python packages, otherwise you will hit errors. +### error: command 'gcc' failed with exit status 1 while installing eventlet +```bash +sudo apt-get install libosmesa6-dev +``` +Reference: [https://github.com/ethz-asl/reinmav-gym/issues/35](https://github.com/ethz-asl/reinmav-gym/issues/35) +### error: [Errno 2] No such file or directory: 'patchelf': 'patchelf' +``` +conda install anaconda patchelf +``` +Reference: [https://github.com/openai/mujoco-py/issues/147](https://github.com/openai/mujoco-py/issues/147)
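Before moving on, it can save time to sanity-check the MuJoCo + gym install from Python. Below is a minimal sketch, assuming the old gym API pinned by the homework's requirements (`reset()` returns an observation, `step()` returns a 4-tuple) and that `mujoco200` plus the license key are in place under `~/.mujoco`:

```python
# Minimal install check: importing mujoco_py triggers its (slow, one-time) build,
# and making a MuJoCo environment exercises the license key.
# Assumes the gym/mujoco-py versions from hw1's requirements (old gym API).
import gym
import mujoco_py  # noqa: F401

env = gym.make("Ant-v2")
ob = env.reset()
for _ in range(10):
    ob, rew, done, _ = env.step(env.action_space.sample())
    if done:
        ob = env.reset()
env.close()
print("MuJoCo + gym look fine; observation dim =", ob.shape)
```

If this script fails with the gcc or patchelf errors above, fix those first and re-run it.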
+ +# Running the programs and answers +## BC Task +### Run +These are the run commands the PDF gives for how you can run the Ant task; the last three flags are added for what the PDF asks for. Reporting the mean and standard deviation implies that `eval_batch_size` should be larger than `ep_len`. + +For example, with `ep_len` = 1000 and `eval_batch_size` = 10000 you collect roughly 10 trajectories, and the `Eval_AverageReturn` and `Eval_StdReturn` printed by the program are the mean and std, respectively (a sketch for pulling these scalars back out of the TensorBoard logs follows Q1.3 below). +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Ant.pkl \ +--env_name Ant-v2 --exp_name bc_ant --n_iter 1 \ +--expert_data cs285/expert_data/expert_data_Ant-v2.pkl \ +--video_log_freq -1 \ +--ep_len 1000 \ +--eval_batch_size 10000 \ +-ngpu +``` + +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Hopper.pkl \ +--env_name Hopper-v2 --exp_name bc_hopper --n_iter 1 \ +--expert_data cs285/expert_data/expert_data_Hopper-v2.pkl \ +--video_log_freq -1 \ +--ep_len 1000 \ +--eval_batch_size 10000 \ +-ngpu +``` + +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/HalfCheetah.pkl \ +--env_name HalfCheetah-v2 --exp_name bc_halfcheetah --n_iter 1 \ +--expert_data cs285/expert_data/expert_data_HalfCheetah-v2.pkl \ +--video_log_freq -1 \ +--ep_len 1000 \ +--eval_batch_size 10000 \ +-ngpu +``` + +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Humanoid.pkl \ +--env_name Humanoid-v2 --exp_name bc_humanoid --n_iter 1 \ +--expert_data cs285/expert_data/expert_data_Humanoid-v2.pkl \ +--video_log_freq -1 \ +--ep_len 1000 \ +--eval_batch_size 10000 \ +-ngpu +``` + +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Walker2d.pkl \ +--env_name Walker2d-v2 --exp_name bc_walker2d --n_iter 1 \ +--expert_data cs285/expert_data/expert_data_Walker2d-v2.pkl \ +--video_log_freq -1 \ +--ep_len 1000 \ +--eval_batch_size 10000 \ +-ngpu +``` + +### Q 1.2 +We need to run two environments and compare the means and standard deviations; in the Ant-v2 environment the BC policy should reach at least 30% of the expert policy. All of the answers here are based on `eval_batch_size = 10000`, not the default 1000. Running the commands above produces the data in the table below: + + +Environment | Expert Mean | BC Mean | BC Std | % of Expert +---- | :---------------: | :----------------: | :-----: | :-----: +Ant-v2 | 4713.65 | 4444.24 | 822.1373 | 94.24% +Hopper-v2 | 3772.67 | 2013.19 | 503.24 | 53.36% +HalfCheetah-v2 | 4205.777 | 3886.59 | 71.47 | 92.41% +Humanoid-v2 | 10344.51 | 4881.17 | 8.62 | 47.18% +Walker2d-v2 | 5566.84 | 4098.05 | 1181.29 | 73.16% + + +### Q 1.3 +This asks us to vary a hyperparameter to see how the agent's training is affected; the tunable ones can be seen among the arguments parsed in run_hw1.py. +```bash +for ((i=100; i<=1000; i=i+100)) +do + python cs285/scripts/run_hw1.py \ + --expert_policy_file cs285/policies/experts/Ant.pkl \ + --env_name Hopper-v2 --exp_name bc_ant --n_iter 1 \ + --expert_data cs285/expert_data/expert_data_Hopper-v2.pkl \ + --video_log_freq -1 \ + --train_batch_size "$i" \ + -ngpu +done +``` +However, run this way it seems the required plot cannot be produced or exported directly; I could only change things inside the code, and the sweep is not logged step by step either. Still, the scalars can be exported to CSV and plotted separately. Among the reference repos, one author writes the sweep directly to TensorBoard, although his answers differ quite a bit from mine for Q1.2.
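Since the sweep results only live in the TensorBoard event files, here is a rough sketch of how the logged scalars could be pulled out and plotted outside TensorBoard. It assumes the `tensorboard` package's `EventAccumulator` is available and that the run directories sit under `data/`; the glob pattern is just a placeholder for whatever folder names the sweep actually produced:

```python
# Pull Eval_AverageReturn / Eval_StdReturn out of each run's event files and plot
# them with error bars. Eval_AverageReturn and Eval_StdReturn are the mean/std of
# the per-trajectory eval returns collected that iteration
# (roughly eval_batch_size / ep_len trajectories).
import glob
import matplotlib.pyplot as plt
from tensorboard.backend.event_processing import event_accumulator

def last_scalar(run_dir, tag):
    ea = event_accumulator.EventAccumulator(run_dir)
    ea.Reload()
    return ea.Scalars(tag)[-1].value  # last logged value for this tag

run_dirs = sorted(glob.glob("data/*bc_ant*"))  # placeholder pattern for the sweep runs
means = [last_scalar(d, "Eval_AverageReturn") for d in run_dirs]
stds = [last_scalar(d, "Eval_StdReturn") for d in run_dirs]

plt.errorbar(range(len(means)), means, yerr=stds, fmt="o-")
plt.xlabel("sweep run index (e.g. increasing train_batch_size)")
plt.ylabel("Eval_AverageReturn")
plt.tight_layout()
plt.savefig("q1_3_sweep.png")
```

The same approach also works for the DAgger learning curves in Q2.2, except that each run then contributes one scalar per iteration instead of a single point.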
+ +As the table above shows, Humanoid trains the worst, so for the comparison I just take this environment and vary the expert data used for training. +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Humanoid.pkl \ +--env_name Humanoid-v2 --exp_name bc_humanoid --n_iter 1 \ +--expert_data cs285/expert_data/expert_data_Humanoid-v2.pkl \ +--video_log_freq -1 \ +-ngpu +``` +Finally I tried `ep_len` and found that it determines the length of each training rollout; with that the comparison showed up clearly, and the agent even learns better than the expert policy... +```python +# HINT2: you want each of these collected rollouts to be of length self.params['ep_len'] +print("\nCollecting data to be used for training...") +paths, envsteps_this_batch = utils.sample_trajectories(self.env, collect_policy, batch_size, self.params['ep_len'], render=False, render_mode=('rgb_array')) +``` +Then the commented-out block in run_hw1.py needs to be adapted to loop over different values of ep_len... [==**but there is a problem with this**==] +See the Q1.2 analysis in the blog post for details: [https://blog.csdn.net/qq_39537898/article/details/116905668](https://blog.csdn.net/qq_39537898/article/details/116905668) + +Result plot: + +![result plot](../image/hw1_3.png) + +But something still feels slightly off: +1. First, when ep_len is not given it defaults to 1000, so why does the return of later runs with the same parameters keep getting larger the more times I train? -> Resolved (noted in Notion): it does not actually get larger; with the default 1000 it stays around the 4000 range. -> That analysis still contains mistakes, though; see the blog post for details: [https://blog.csdn.net/qq_39537898/article/details/116905668](https://blog.csdn.net/qq_39537898/article/details/116905668) +2. In the later DAgger runs, as the iterations restart you can see that the train return on the expert-labelled data also drops. Does that mean I misunderstood BC and DAgger? -> No: we train on expert-labelled data, and different episodes give different rewards, so overall this is fine. + +## DAgger Task +### Run +Running with an ep_len of 200 steps is what Part 3 of the PDF specifies, and the 200-step runs are the ones to submit. +In the Ant-v2 environment: +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Ant.pkl \ +--env_name Ant-v2 --exp_name dagger_ant --n_iter 10 \ +--do_dagger --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \ +--ep_len 200 \ +-ngpu +``` +Performance of the agent used for training; the one on the right, for example, failed and flipped over, but since two rollouts are collected the other one keeps going. +
+ +
+ +Agent performance under eval: +
+ +
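For context on what `--do_dagger` changes, here is a compressed sketch of one iteration, paraphrasing `run_training_loop` / `do_relabel_with_expert` in `cs285/infrastructure/rl_trainer.py`. It leans on the homework scaffold's own objects (`agent`, `collect_policy`, `expert_policy`, `utils`) and is a simplification, not the exact code path:

```python
# Paraphrase of one DAgger iteration (the real code also handles logging and videos).
import pickle
from cs285.infrastructure import utils

def dagger_iteration(itr, env, agent, collect_policy, expert_policy, params,
                     expert_data_path):
    if itr == 0:
        # iteration 0 is plain behavior cloning on the stored expert rollouts
        with open(expert_data_path, "rb") as f:
            paths = pickle.load(f)
    else:
        # later iterations: roll out the *learner*, then let the expert relabel its states
        paths, _ = utils.sample_trajectories(
            env, collect_policy, params["batch_size"], params["ep_len"])
        for path in paths:
            path["action"] = expert_policy.get_action(path["observation"])
    agent.add_to_replay_buffer(paths)
    for _ in range(params["num_agent_train_steps_per_iter"]):
        ob, ac, re, next_ob, term = agent.sample(params["train_batch_size"])
        agent.train(ob, ac, re, next_ob, term)  # supervised regression onto the labels
```

This also explains observation 2 in Q1.3 above: from iteration 1 onward the "training" data are the learner's own rollouts (relabelled by the expert), not the expert's rollouts, so their returns can legitimately drop.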
+ +Humanoid-v2: +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Humanoid.pkl \ +--env_name Humanoid-v2 --exp_name dagger_humanoid --n_iter 10 \ +--do_dagger --expert_data cs285/expert_data/expert_data_Humanoid-v2.pkl \ +--ep_len 200 \ +-ngpu +``` + +Performance of the agent used for training: +
+ +
+ +Agent performance under eval: +
+ +
+ +This is partly because ep_len is too short, and partly because `rollout_done = 1 if steps>=max_path_length else done` now takes the value of `done` directly. +For comparison, here is a longer run with the default ep_len of 1000: + +With only `n_iter=5`, you can see that the number of iterations is far too small and the agent has not had time to learn; the reason it gets stuck is that step returns done, i.e. the rollout cannot continue from that state. (At first I wrote 0 instead of done, which produced some very glitchy motion.) +
+ +
+ +After 95 iterations it manages to learn the whole motion: +
+ +
+ +### Q 2.2 +This question asks us to compare the DAgger results against BC for Ant and one other environment, and to plot the mean and standard deviation under DAgger; these are displayed directly in TensorBoard. + +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Ant.pkl \ +--env_name Ant-v2 --exp_name dagger_ant --n_iter 100 \ +--do_dagger --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \ +--video_log_freq -1 \ +--ep_len 200 \ +-ngpu +``` +Results from running the [Ant-v2] command above: +![result plot](../image/hw1_q21.png) +![result plot](../image/hw1_q22.png) + +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Humanoid.pkl \ +--env_name Humanoid-v2 --exp_name dagger_humanoid --n_iter 100 \ +--do_dagger --expert_data cs285/expert_data/expert_data_Humanoid-v2.pkl \ +--video_log_freq -1 \ +--ep_len 200 \ +-ngpu +``` +Results from running the [Humanoid-v2] command above: +![result plot](../image/hw1_q23.png) +![result plot](../image/hw1_q24.png) + + + + + +# References +A few GitHub repos consulted while writing the code: +1. [https://github.com/cww97/cs285_fall2020_cww/tree/main/hw1](https://github.com/cww97/cs285_fall2020_cww/tree/main/hw1) +2. [https://github.com/vincentkslim/cs285_homework_fall2020/tree/master](https://github.com/vincentkslim/cs285_homework_fall2020/tree/master) +3. [https://github.com/mdeib/berkeley-deep-RL-pytorch-solutions](https://github.com/mdeib/berkeley-deep-RL-pytorch-solutions) \ No newline at end of file diff --git a/image/ant_train-n10-ep200.gif b/image/ant_train-n10-ep200.gif new file mode 100755 index 0000000000000000000000000000000000000000..04eb400a1b713aa9931f524741f3401b3f394100 Binary files /dev/null and b/image/ant_train-n10-ep200.gif differ diff --git a/image/ant_val-n10-ep200.gif b/image/ant_val-n10-ep200.gif new file mode 100644 index 0000000000000000000000000000000000000000..973dd7daa458d8c86d8b534047f684de60dde12d Binary files /dev/null and b/image/ant_val-n10-ep200.gif differ diff --git a/image/human-n5-ep100.gif b/image/human-n5-ep100.gif new file mode 100755 index 0000000000000000000000000000000000000000..5da652ae6c6871bc5dff7cd2962a03a2328690ee Binary files /dev/null and b/image/human-n5-ep100.gif differ diff --git a/image/human-n95-ep100.gif b/image/human-n95-ep100.gif new file mode 100755 index 0000000000000000000000000000000000000000..be6c7a1d1a46602fdbec024846bc3055c04d828f Binary files /dev/null and b/image/human-n95-ep100.gif differ diff --git a/image/human_train-n10-ep200.gif b/image/human_train-n10-ep200.gif new file mode 100644 index 0000000000000000000000000000000000000000..fd34bf0006dd250ceee16223f4f56dd31856746d Binary files /dev/null and b/image/human_train-n10-ep200.gif differ diff --git a/image/human_val-n10-ep200.gif b/image/human_val-n10-ep200.gif new file mode 100644 index 0000000000000000000000000000000000000000..fdde622c4b3892559e22a267dbd6ff01d3beae42 Binary files /dev/null and b/image/human_val-n10-ep200.gif differ diff --git a/image/hw1_3.png b/image/hw1_3.png new file mode 100644 index 0000000000000000000000000000000000000000..fbd3f831735b8a7324c41aa1cbf304081ec3c061 Binary files /dev/null and b/image/hw1_3.png differ diff --git a/image/hw1_q2.png b/image/hw1_q2.png new file mode 100644 index 0000000000000000000000000000000000000000..94198377387e1ec7516140f2b39a87bd5559ac15 Binary files /dev/null and b/image/hw1_q2.png differ diff --git a/image/hw1_q21.png b/image/hw1_q21.png new file mode 100644 index 0000000000000000000000000000000000000000..dbaa7199d72f941026147e58722d9d73a0835da8 Binary files /dev/null and b/image/hw1_q21.png differ diff --git a/image/hw1_q22.png b/image/hw1_q22.png new file mode 100644 index
0000000000000000000000000000000000000000..dcf90fdfd6f382f85ffcdb16f93922f84945a21d Binary files /dev/null and b/image/hw1_q22.png differ diff --git a/image/hw1_q23.png b/image/hw1_q23.png new file mode 100644 index 0000000000000000000000000000000000000000..00a539f594266c5027c724d9a57b8578df81ebf6 Binary files /dev/null and b/image/hw1_q23.png differ diff --git a/image/hw1_q24.png b/image/hw1_q24.png new file mode 100644 index 0000000000000000000000000000000000000000..a455c2d2790db819ce874357d2c0c0a5cd0c49ad Binary files /dev/null and b/image/hw1_q24.png differ