diff --git a/.gitignore b/.gitignore index 5829c86c7903c3c592fefc0945bb15e3cdbc711d..edf76c3ae88cbf5f4da7553948943b1533fa4844 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ *.pyc -data/ \ No newline at end of file +data/ + +.vscode + +cs285.egg-info/ \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..34062964d5806660fe112d1807b21e0f8dc6a600 --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2021, 张聪明 +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index 0cef3f054f42663bd299f86956e3a118ef680093..d89eafdd31d0132eeed16f7868ad4998a070ad3f 100644 --- a/README.md +++ b/README.md @@ -1 +1,10 @@ Assignments for [Berkeley CS 285: Deep Reinforcement Learning, Decision Making, and Control](http://rail.eecs.berkeley.edu/deeprlcourse/). + +# Progress log +## hw1 +2021/5/18: the code and solution.md are finished, but I am still looking into some of the details, such as the absolute training performance. + +[See solution.md for details](hw1/solution.md) + +[Corresponding CSDN blog post](https://blog.csdn.net/qq_39537898/article/details/116905668) + diff --git a/hw1/README.md b/hw1/README.md index 15612f446f773ff45b63531727a6aaf56f65ccc1..a551c01af9075ad28c2528379c77ed88535c41a6 100644 --- a/hw1/README.md +++ b/hw1/README.md @@ -1,3 +1,7 @@ +# Solution File +To skip straight to the results, see [solution.md](solution.md) + + ## Setup You can run this code on your own machine or on Google Colab. 
diff --git a/hw1/cs285.egg-info/PKG-INFO b/hw1/cs285.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..cbad8d265a1a8daa81aca1f6fbc2264e9e5d6655 --- /dev/null +++ b/hw1/cs285.egg-info/PKG-INFO @@ -0,0 +1,10 @@ +Metadata-Version: 1.0 +Name: cs285 +Version: 0.1.0 +Summary: UNKNOWN +Home-page: UNKNOWN +Author: UNKNOWN +Author-email: UNKNOWN +License: UNKNOWN +Description: UNKNOWN +Platform: UNKNOWN diff --git a/hw1/cs285.egg-info/SOURCES.txt b/hw1/cs285.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..9168387e643bfa573e2c9be74b2af4b1ce3e33fd --- /dev/null +++ b/hw1/cs285.egg-info/SOURCES.txt @@ -0,0 +1,5 @@ +README.md +cs285.egg-info/PKG-INFO +cs285.egg-info/SOURCES.txt +cs285.egg-info/dependency_links.txt +cs285.egg-info/top_level.txt \ No newline at end of file diff --git a/hw1/cs285.egg-info/dependency_links.txt b/hw1/cs285.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/hw1/cs285.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/hw1/cs285.egg-info/top_level.txt b/hw1/cs285.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f6195061fc04ac5f6601db5a73dbe9dae999e14 --- /dev/null +++ b/hw1/cs285.egg-info/top_level.txt @@ -0,0 +1 @@ +cs285 diff --git a/hw1/cs285/infrastructure/logger.py b/hw1/cs285/infrastructure/logger.py index a64931c00287565fba63862add4d39eed188db1f..e60dff524eb4f44df23b5c8dc3ff4f8b2dcc13b7 100644 --- a/hw1/cs285/infrastructure/logger.py +++ b/hw1/cs285/infrastructure/logger.py @@ -30,7 +30,6 @@ class Logger: # reshape the rollouts videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] - # max rollout length max_videos_to_save = np.min([max_videos_to_save, len(videos)]) max_length = videos[0].shape[0] diff --git a/hw1/cs285/infrastructure/pytorch_util.py b/hw1/cs285/infrastructure/pytorch_util.py index bc7a408170607f7a5ee58adc135ebc02cd5ef563..a10f410678d6feb251d7e5525e7b819f976db08f 100644 --- a/hw1/cs285/infrastructure/pytorch_util.py +++ b/hw1/cs285/infrastructure/pytorch_util.py @@ -45,10 +45,24 @@ def build_mlp( if isinstance(output_activation, str): output_activation = _str_to_activation[output_activation] - # TODO: return a MLP. This should be an instance of nn.Module + # DONE TODO: return a MLP. This should be an instance of nn.Module # Note: nn.Sequential is an instance of nn.Module. 
- raise NotImplementedError - + # layers = [('linear1',nn.Linear(input_size,size)),('activation1',activation)] + # for i in range(2, n_layers): + # layers.append((f'linear{i}',nn.Linear(size,size))) + # layers.append((f'activation{i}',activation)) + # layers.extend([(f'linear{n_layers+1}',nn.Linear(size,output_size)),(f'activation{n_layers+1}',output_activation)]) + # model = nn.Sequential(OrderedDict(layers)) # need to from collections import OrderedDict + # return model + layers = [] + layers.append(nn.Linear(input_size,size)) + layers.append(activation) + for i in range(n_layers-1): + layers.append(nn.Linear(size,size)) + layers.append(activation) + layers.append(nn.Linear(size,output_size)) + layers.append(output_activation) + return nn.Sequential(*layers) device = None diff --git a/hw1/cs285/infrastructure/replay_buffer.py b/hw1/cs285/infrastructure/replay_buffer.py index 60148e79a96caa8c664411fa833f93695d85c3b9..8003b3077eaca4578a599a2671dbdac25319bcac 100644 --- a/hw1/cs285/infrastructure/replay_buffer.py +++ b/hw1/cs285/infrastructure/replay_buffer.py @@ -76,8 +76,8 @@ class ReplayBuffer(object): ## HINT 1: use np.random.permutation to sample random indices ## HINT 2: return corresponding data points from each array (i.e., not different indices from each array) ## HINT 3: look at the sample_recent_data function below - - return TODO, TODO, TODO, TODO, TODO + indices = np.random.permutation(self.obs.shape[0])[:batch_size] + return self.obs[indices], self.acs[indices], self.rews[indices], self.next_obs[indices], self.terminals[indices] def sample_recent_data(self, batch_size=1): return ( diff --git a/hw1/cs285/infrastructure/rl_trainer.py b/hw1/cs285/infrastructure/rl_trainer.py index bb27972e367f3b515ea8611cb4e73d0f1d985ad3..6d68e5fa93493bf6479c978989681992060274b0 100644 --- a/hw1/cs285/infrastructure/rl_trainer.py +++ b/hw1/cs285/infrastructure/rl_trainer.py @@ -1,7 +1,7 @@ from collections import OrderedDict import numpy as np import time - +import pickle import gym import torch @@ -45,6 +45,7 @@ class RL_Trainer(object): # Maximum length for episodes self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps + global MAX_VIDEO_LEN MAX_VIDEO_LEN = self.params['ep_len'] # Is this env continuous, or self.discrete? @@ -155,18 +156,22 @@ class RL_Trainer(object): train_video_paths: paths which also contain videos for visualization purposes """ - # TODO decide whether to load training data or use the current policy to collect more data + # DONE TODO decide whether to load training data or use the current policy to collect more data # HINT: depending on if it's the first iteration or not, decide whether to either # (1) load the data. 
In this case you can directly return as follows # ``` return loaded_paths, 0, None ``` # (2) collect `self.params['batch_size']` transitions + if itr==0: + with open(load_initial_expertdata, 'rb') as f: + loaded_paths = pickle.loads(f.read()) + return loaded_paths, 0, None - # TODO collect `batch_size` samples to be used for training + # DONE TODO collect `batch_size` samples to be used for training # HINT1: use sample_trajectories from utils # HINT2: you want each of these collected rollouts to be of length self.params['ep_len'] print("\nCollecting data to be used for training...") - paths, envsteps_this_batch = TODO + paths, envsteps_this_batch = utils.sample_trajectories(self.env, collect_policy, batch_size, self.params['ep_len']) # collect more rollouts with the same policy, to be saved as videos in tensorboard # note: here, we collect MAX_NVIDEO rollouts, each of length MAX_VIDEO_LEN @@ -174,6 +179,7 @@ if self.log_video: print('\nCollecting train rollouts to be used for saving videos...') ## TODO look in utils and implement sample_n_trajectories + # print('In Collect trajectory MAX_VIDEO_LEN:',MAX_VIDEO_LEN,'id:',id(MAX_VIDEO_LEN)) train_video_paths = utils.sample_n_trajectories(self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) return paths, envsteps_this_batch, train_video_paths @@ -184,25 +190,27 @@ all_logs = [] for train_step in range(self.params['num_agent_train_steps_per_iter']): - # TODO sample some data from the data buffer + # DONE TODO sample some data from the data buffer # HINT1: use the agent's sample function # HINT2: how much data = self.params['train_batch_size'] - ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = TODO + ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.agent.sample(self.params['train_batch_size']) - # TODO use the sampled data to train an agent + # DONE TODO use the sampled data to train an agent # HINT: use the agent's train function # HINT: keep the agent's training log for debugging - train_log = TODO + train_log = self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch) all_logs.append(train_log) return all_logs def do_relabel_with_expert(self, expert_policy, paths): print("\nRelabelling collected observations with labels from an expert policy...") - # TODO relabel collected obsevations (from our policy) with labels from an expert policy + # DONE TODO relabel collected observations (from our policy) with labels from an expert policy # HINT: query the policy (using the get_action function) with paths[i]["observation"] # and replace paths[i]["action"] with these expert labels - + for i in range(len(paths)): + action = expert_policy.get_action(paths[i]["observation"]) + paths[i]["action"] = action return paths #################################### @@ -212,11 +220,13 @@ # collect eval trajectories, for logging print("\nCollecting data for eval...") + # print('ep_len',self.params['ep_len']) eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len']) # save eval rollouts as videos in tensorboard event file if self.log_video and train_video_paths != None: print('\nCollecting video rollouts eval') + # print('In Perform logging MAX_VIDEO_LEN:',MAX_VIDEO_LEN,'id:',id(MAX_VIDEO_LEN)) eval_video_paths = utils.sample_n_trajectories(self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) #save train/eval videos @@ -229,6 +239,10 @@ class
RL_Trainer(object): # save eval metrics if self.log_metrics: # returns, for logging + # print('log_metrics eval paths length',len(eval_paths)) + # for eval_path in eval_paths: + # print('log_metrics eval path length',len(eval_path)) + # print('eval return',len(eval_path["reward"])) train_returns = [path["reward"].sum() for path in paths] eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths] diff --git a/hw1/cs285/infrastructure/utils.py b/hw1/cs285/infrastructure/utils.py index d894480b2c121bf8e5da8b3050be7ef2eec3023c..125e297b8a8ece70d2b184c1bd550b6f1b79551c 100644 --- a/hw1/cs285/infrastructure/utils.py +++ b/hw1/cs285/infrastructure/utils.py @@ -7,7 +7,7 @@ import time def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): # initialize env for the beginning of a new rollout - ob = TODO # HINT: should be the output of resetting the env + ob = env.reset() # HINT: should be the output of resetting the env # init vars obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] @@ -27,7 +27,7 @@ def sample_trajectory(env, policy, max_path_length, render=False, render_mode=(' # use the most recent ob to decide what to do obs.append(ob) - ac = TODO # HINT: query the policy's get_action function + ac = policy.get_action(ob) # HINT: query the policy's get_action function ac = ac[0] acs.append(ac) @@ -39,9 +39,9 @@ def sample_trajectory(env, policy, max_path_length, render=False, render_mode=(' next_obs.append(ob) rewards.append(rew) - # TODO end the rollout if the rollout ended + # DONE TODO end the rollout if the rollout ended # HINT: rollout can end due to done, or due to max_path_length - rollout_done = TODO # HINT: this is either 0 or 1 + rollout_done = 1 if steps>=max_path_length else done # HINT: this is either 0 or 1 terminals.append(rollout_done) if rollout_done: @@ -53,15 +53,16 @@ def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, r """ Collect rollouts until we have collected min_timesteps_per_batch steps. - TODO implement this function + DONE TODO implement this function Hint1: use sample_trajectory to get each path (i.e. rollout) that goes into paths Hint2: use get_pathlength to count the timesteps collected in each path """ timesteps_this_batch = 0 paths = [] while timesteps_this_batch < min_timesteps_per_batch: - - TODO + path = sample_trajectory(env, policy, max_path_length, render) + paths.append(path) + timesteps_this_batch += get_pathlength(path) return paths, timesteps_this_batch @@ -69,12 +70,13 @@ def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, ren """ Collect ntraj rollouts. - TODO implement this function + DONE TODO implement this function Hint1: use sample_trajectory to get each path (i.e. 
rollout) that goes into paths """ paths = [] - - TODO + for i in range(ntraj): + path = sample_trajectory(env, policy, max_path_length, render) + paths.append(path) return paths diff --git a/hw1/cs285/policies/MLP_policy.py b/hw1/cs285/policies/MLP_policy.py index c8e1fd7d4fa7f8f8b97865200f20daea353f32d9..6c2ae10a9a5a053b1cbda3a502e06fe233101025 100644 --- a/hw1/cs285/policies/MLP_policy.py +++ b/hw1/cs285/policies/MLP_policy.py @@ -79,9 +79,8 @@ class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): observation = obs else: observation = obs[None] - - # TODO return the action that the policy prescribes - raise NotImplementedError + # DONE TODO return the action that the policy prescribes + return ptu.to_numpy(self.forward(ptu.from_numpy(observation))) # update/train this policy def update(self, observations, actions, **kwargs): @@ -93,7 +92,11 @@ class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): # return more flexible objects, such as a # `torch.distributions.Distribution` object. It's up to you! def forward(self, observation: torch.FloatTensor) -> Any: - raise NotImplementedError + # DONE TODO raise NotImplementedError + if self.discrete: + return self.logits_na(observation) + else: + return self.mean_net(observation) ##################################################### @@ -108,8 +111,12 @@ class MLPPolicySL(MLPPolicy): self, observations, actions, adv_n=None, acs_labels_na=None, qvals=None ): - # TODO: update the policy and return the loss - loss = TODO + # DONE TODO: update the policy and return the loss + self.optimizer.zero_grad() # zeroes the gradient buffers of all parameters + loss = self.loss(self.forward(ptu.from_numpy(observations)),ptu.from_numpy(actions)) + # note: we cannot use get_action here, since to_numpy would detach the tensor and drop its grad_fn + loss.backward() # backprop + self.optimizer.step() # does the update return { # You can add extra logging information here, but keep this line 'Training Loss': ptu.to_numpy(loss), diff --git a/hw1/cs285/scripts/run_hw1.ipynb b/hw1/cs285/scripts/run_hw1.ipynb index 476b63265b7ee3a7e7c5b14c7ca74f7b8b54f581..877ebf1fad7d91cf55672401e495db945a356c79 100644 --- a/hw1/cs285/scripts/run_hw1.ipynb +++ b/hw1/cs285/scripts/run_hw1.ipynb @@ -551,4 +551,4 @@ ] } ] -} +} \ No newline at end of file diff --git a/hw1/cs285/scripts/run_hw1.py b/hw1/cs285/scripts/run_hw1.py index 2a4a73de424f327c45ac1047e13aeece8af4728a..377f01e3c7493e3311bf14e54760f511311da83e 100644 --- a/hw1/cs285/scripts/run_hw1.py +++ b/hw1/cs285/scripts/run_hw1.py @@ -112,7 +112,12 @@ def main(): ################### ### RUN TRAINING ################### - + # To sweep eval_batch_size over repeated runs, uncomment this block and comment out the two lines below + # add_item = 1000 + # for step in range(10): + # params['eval_batch_size'] = add_item*(step+1) + # trainer = BC_Trainer(params) + # trainer.run_training_loop() trainer = BC_Trainer(params) trainer.run_training_loop() diff --git a/hw1/solution.md b/hw1/solution.md new file mode 100644 index 0000000000000000000000000000000000000000..49532a1073228b62b3a337c8fd691522591ca466 --- /dev/null +++ b/hw1/solution.md @@ -0,0 +1,264 @@ +# Preface +The prerequisite for running anything is having the dependencies installed, so make sure you have read README.md and installation.md. If you do not need the preface, skip straight to the **Running the programs and answers** section. None of the code below uses the GPU, since my machine's GPU is too old to be supported. + +## Install mujoco: +``` +mkdir ~/.mujoco +cd ~/.mujoco +wget https://www.roboti.us/download/mujoco200_linux.zip +unzip mujoco200_linux.zip +mv mujoco200_linux mujoco200 +rm mujoco200_linux.zip +cp . +``` +The .txt license key used here has to be applied for: [https://www.roboti.us/license.html](https://www.roboti.us/license.html) + + +## Set up the conda environment first +A. (Recommended) Install with conda: + +1. Install conda + +2. Create a conda environment that will contain python 3: +``` +conda create -n cs285 python=3.6 +``` + +3. activate the environment (do this every time you open a new terminal and want to run code): +``` +source activate cs285 +``` + +4. Install the requirements into this conda environment +``` +pip install --user -r requirements.txt +``` + +5. Allow your code to be able to see 'cs285' +``` +cd +$ pip install -e . +``` +## Notes on errors encountered when running +That is not everything: a few extra system dependencies are needed on top of the Python packages, otherwise you will hit errors. +### error: command 'gcc' failed with exit status 1 while installing eventlet +```bash +sudo apt-get install libosmesa6-dev +``` +Reference: [https://github.com/ethz-asl/reinmav-gym/issues/35](https://github.com/ethz-asl/reinmav-gym/issues/35) +### error: [Errno 2] No such file or directory: 'patchelf': 'patchelf' +``` +conda install anaconda patchelf +``` +Reference: [https://github.com/openai/mujoco-py/issues/147](https://github.com/openai/mujoco-py/issues/147)
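Before moving on, it can save time to sanity-check the MuJoCo + gym install from Python. Below is a minimal sketch, assuming the old gym API pinned by the homework's requirements (`reset()` returns an observation, `step()` returns a 4-tuple) and that `mujoco200` plus the license key are in place under `~/.mujoco`:

```python
# Minimal install check: importing mujoco_py triggers its (slow, one-time) build,
# and making a MuJoCo environment exercises the license key.
# Assumes the gym/mujoco-py versions from hw1's requirements (old gym API).
import gym
import mujoco_py  # noqa: F401

env = gym.make("Ant-v2")
ob = env.reset()
for _ in range(10):
    ob, rew, done, _ = env.step(env.action_space.sample())
    if done:
        ob = env.reset()
env.close()
print("MuJoCo + gym look fine; observation dim =", ob.shape)
```

If this script fails with the gcc or patchelf errors above, fix those first and re-run it.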
+ +# Running the programs and answers +## BC Task +### Run +These are the run commands the PDF gives for how you can run the Ant task; the last three flags are added for what the PDF asks for. Reporting the mean and standard deviation implies that `eval_batch_size` should be larger than `ep_len`. + +For example, with `ep_len` = 1000 and `eval_batch_size` = 10000 you collect roughly 10 trajectories, and the `Eval_AverageReturn` and `Eval_StdReturn` printed by the program are the mean and std, respectively (a sketch for pulling these scalars back out of the TensorBoard logs follows Q1.3 below). +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Ant.pkl \ +--env_name Ant-v2 --exp_name bc_ant --n_iter 1 \ +--expert_data cs285/expert_data/expert_data_Ant-v2.pkl \ +--video_log_freq -1 \ +--ep_len 1000 \ +--eval_batch_size 10000 \ +-ngpu +``` + +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Hopper.pkl \ +--env_name Hopper-v2 --exp_name bc_hopper --n_iter 1 \ +--expert_data cs285/expert_data/expert_data_Hopper-v2.pkl \ +--video_log_freq -1 \ +--ep_len 1000 \ +--eval_batch_size 10000 \ +-ngpu +``` + +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/HalfCheetah.pkl \ +--env_name HalfCheetah-v2 --exp_name bc_halfcheetah --n_iter 1 \ +--expert_data cs285/expert_data/expert_data_HalfCheetah-v2.pkl \ +--video_log_freq -1 \ +--ep_len 1000 \ +--eval_batch_size 10000 \ +-ngpu +``` + +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Humanoid.pkl \ +--env_name Humanoid-v2 --exp_name bc_humanoid --n_iter 1 \ +--expert_data cs285/expert_data/expert_data_Humanoid-v2.pkl \ +--video_log_freq -1 \ +--ep_len 1000 \ +--eval_batch_size 10000 \ +-ngpu +``` + +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Walker2d.pkl \ +--env_name Walker2d-v2 --exp_name bc_walker2d --n_iter 1 \ +--expert_data cs285/expert_data/expert_data_Walker2d-v2.pkl \ +--video_log_freq -1 \ +--ep_len 1000 \ +--eval_batch_size 10000 \ +-ngpu +``` + +### Q 1.2 +We need to run two environments and compare the means and standard deviations; in the Ant-v2 environment the BC policy should reach at least 30% of the expert policy. All of the answers here are based on `eval_batch_size = 10000`, not the default 1000. Running the commands above produces the data in the table below: + + +Environment | Expert Mean | BC Mean | BC Std | % of Expert +---- | :---------------: | :----------------: | :-----: | :-----: +Ant-v2 | 4713.65 | 4444.24 | 822.1373 | 94.24% +Hopper-v2 | 3772.67 | 2013.19 | 503.24 | 53.36% +HalfCheetah-v2 | 4205.777 | 3886.59 | 71.47 | 92.41% +Humanoid-v2 | 10344.51 | 4881.17 | 8.62 | 47.18% +Walker2d-v2 | 5566.84 | 4098.05 | 1181.29 | 73.16% + + +### Q 1.3 +This asks us to vary a hyperparameter to see how the agent's training is affected; the tunable ones can be seen among the arguments parsed in run_hw1.py. +```bash +for ((i=100; i<=1000; i=i+100)) +do + python cs285/scripts/run_hw1.py \ + --expert_policy_file cs285/policies/experts/Ant.pkl \ + --env_name Hopper-v2 --exp_name bc_ant --n_iter 1 \ + --expert_data cs285/expert_data/expert_data_Hopper-v2.pkl \ + --video_log_freq -1 \ + --train_batch_size "$i" \ + -ngpu +done +``` +However, run this way it seems the required plot cannot be produced or exported directly; I could only change things inside the code, and the sweep is not logged step by step either. Still, the scalars can be exported to CSV and plotted separately. Among the reference repos, one author writes the sweep directly to TensorBoard, although his answers differ quite a bit from mine for Q1.2.
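Since the sweep results only live in the TensorBoard event files, here is a rough sketch of how the logged scalars could be pulled out and plotted outside TensorBoard. It assumes the `tensorboard` package's `EventAccumulator` is available and that the run directories sit under `data/`; the glob pattern is just a placeholder for whatever folder names the sweep actually produced:

```python
# Pull Eval_AverageReturn / Eval_StdReturn out of each run's event files and plot
# them with error bars. Eval_AverageReturn and Eval_StdReturn are the mean/std of
# the per-trajectory eval returns collected that iteration
# (roughly eval_batch_size / ep_len trajectories).
import glob
import matplotlib.pyplot as plt
from tensorboard.backend.event_processing import event_accumulator

def last_scalar(run_dir, tag):
    ea = event_accumulator.EventAccumulator(run_dir)
    ea.Reload()
    return ea.Scalars(tag)[-1].value  # last logged value for this tag

run_dirs = sorted(glob.glob("data/*bc_ant*"))  # placeholder pattern for the sweep runs
means = [last_scalar(d, "Eval_AverageReturn") for d in run_dirs]
stds = [last_scalar(d, "Eval_StdReturn") for d in run_dirs]

plt.errorbar(range(len(means)), means, yerr=stds, fmt="o-")
plt.xlabel("sweep run index (e.g. increasing train_batch_size)")
plt.ylabel("Eval_AverageReturn")
plt.tight_layout()
plt.savefig("q1_3_sweep.png")
```

The same approach also works for the DAgger learning curves in Q2.2, except that each run then contributes one scalar per iteration instead of a single point.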
+ +As the table above shows, Humanoid trains the worst, so for the comparison I just take this environment and vary the expert data used for training. +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Humanoid.pkl \ +--env_name Humanoid-v2 --exp_name bc_humanoid --n_iter 1 \ +--expert_data cs285/expert_data/expert_data_Humanoid-v2.pkl \ +--video_log_freq -1 \ +-ngpu +``` +Finally I tried `ep_len` and found that it determines the length of each training rollout; with that the comparison showed up clearly, and the agent even learns better than the expert policy... +```python +# HINT2: you want each of these collected rollouts to be of length self.params['ep_len'] +print("\nCollecting data to be used for training...") +paths, envsteps_this_batch = utils.sample_trajectories(self.env, collect_policy, batch_size, self.params['ep_len'], render=False, render_mode=('rgb_array')) +``` +Then the commented-out block in run_hw1.py needs to be adapted to loop over different values of ep_len... [==**but there is a problem with this**==] +See the Q1.2 analysis in the blog post for details: [https://blog.csdn.net/qq_39537898/article/details/116905668](https://blog.csdn.net/qq_39537898/article/details/116905668) + +Result plot: + +![result plot](../image/hw1_3.png) + +But something still feels slightly off: +1. First, when ep_len is not given it defaults to 1000, so why does the return of later runs with the same parameters keep getting larger the more times I train? -> Resolved (noted in Notion): it does not actually get larger; with the default 1000 it stays around the 4000 range. -> That analysis still contains mistakes, though; see the blog post for details: [https://blog.csdn.net/qq_39537898/article/details/116905668](https://blog.csdn.net/qq_39537898/article/details/116905668) +2. In the later DAgger runs, as the iterations restart you can see that the train return on the expert-labelled data also drops. Does that mean I misunderstood BC and DAgger? -> No: we train on expert-labelled data, and different episodes give different rewards, so overall this is fine. + +## DAgger Task +### Run +Running with an ep_len of 200 steps is what Part 3 of the PDF specifies, and the 200-step runs are the ones to submit. +In the Ant-v2 environment: +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Ant.pkl \ +--env_name Ant-v2 --exp_name dagger_ant --n_iter 10 \ +--do_dagger --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \ +--ep_len 200 \ +-ngpu +``` +Performance of the agent used for training; the one on the right, for example, failed and flipped over, but since two rollouts are collected the other one keeps going. +
+ +
+ +Agent performance under eval: +
+ +
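For context on what `--do_dagger` changes, here is a compressed sketch of one iteration, paraphrasing `run_training_loop` / `do_relabel_with_expert` in `cs285/infrastructure/rl_trainer.py`. It leans on the homework scaffold's own objects (`agent`, `collect_policy`, `expert_policy`, `utils`) and is a simplification, not the exact code path:

```python
# Paraphrase of one DAgger iteration (the real code also handles logging and videos).
import pickle
from cs285.infrastructure import utils

def dagger_iteration(itr, env, agent, collect_policy, expert_policy, params,
                     expert_data_path):
    if itr == 0:
        # iteration 0 is plain behavior cloning on the stored expert rollouts
        with open(expert_data_path, "rb") as f:
            paths = pickle.load(f)
    else:
        # later iterations: roll out the *learner*, then let the expert relabel its states
        paths, _ = utils.sample_trajectories(
            env, collect_policy, params["batch_size"], params["ep_len"])
        for path in paths:
            path["action"] = expert_policy.get_action(path["observation"])
    agent.add_to_replay_buffer(paths)
    for _ in range(params["num_agent_train_steps_per_iter"]):
        ob, ac, re, next_ob, term = agent.sample(params["train_batch_size"])
        agent.train(ob, ac, re, next_ob, term)  # supervised regression onto the labels
```

This also explains observation 2 in Q1.3 above: from iteration 1 onward the "training" data are the learner's own rollouts (relabelled by the expert), not the expert's rollouts, so their returns can legitimately drop.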
+ +Humanoid-v2: +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Humanoid.pkl \ +--env_name Humanoid-v2 --exp_name dagger_humanoid --n_iter 10 \ +--do_dagger --expert_data cs285/expert_data/expert_data_Humanoid-v2.pkl \ +--ep_len 200 \ +-ngpu +``` + +Performance of the agent used for training: +
+ +
+ +Agent performance under eval: +
+ +
+ +This is partly because ep_len is too short, and partly because `rollout_done = 1 if steps>=max_path_length else done` now takes the value of `done` directly. +For comparison, here is a longer run with the default ep_len of 1000: + +With only `n_iter=5`, you can see that the number of iterations is far too small and the agent has not had time to learn; the reason it gets stuck is that step returns done, i.e. the rollout cannot continue from that state. (At first I wrote 0 instead of done, which produced some very glitchy motion.) +
+ +
+ +After 95 iterations it manages to learn the whole motion: +
+ +
+ +### Q 2.2 +This question asks us to compare the DAgger results against BC for Ant and one other environment, and to plot the mean and standard deviation under DAgger; these are displayed directly in TensorBoard. + +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Ant.pkl \ +--env_name Ant-v2 --exp_name dagger_ant --n_iter 100 \ +--do_dagger --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \ +--video_log_freq -1 \ +--ep_len 200 \ +-ngpu +``` +Results from running the [Ant-v2] command above: +![result plot](../image/hw1_q21.png) +![result plot](../image/hw1_q22.png) + +```bash +python cs285/scripts/run_hw1.py \ +--expert_policy_file cs285/policies/experts/Humanoid.pkl \ +--env_name Humanoid-v2 --exp_name dagger_humanoid --n_iter 100 \ +--do_dagger --expert_data cs285/expert_data/expert_data_Humanoid-v2.pkl \ +--video_log_freq -1 \ +--ep_len 200 \ +-ngpu +``` +Results from running the [Humanoid-v2] command above: +![result plot](../image/hw1_q23.png) +![result plot](../image/hw1_q24.png) + + + + + +# References +A few GitHub repos consulted while writing the code: +1. [https://github.com/cww97/cs285_fall2020_cww/tree/main/hw1](https://github.com/cww97/cs285_fall2020_cww/tree/main/hw1) +2. [https://github.com/vincentkslim/cs285_homework_fall2020/tree/master](https://github.com/vincentkslim/cs285_homework_fall2020/tree/master) +3. [https://github.com/mdeib/berkeley-deep-RL-pytorch-solutions](https://github.com/mdeib/berkeley-deep-RL-pytorch-solutions) \ No newline at end of file diff --git a/image/ant_train-n10-ep200.gif b/image/ant_train-n10-ep200.gif new file mode 100755 index 0000000000000000000000000000000000000000..04eb400a1b713aa9931f524741f3401b3f394100 Binary files /dev/null and b/image/ant_train-n10-ep200.gif differ diff --git a/image/ant_val-n10-ep200.gif b/image/ant_val-n10-ep200.gif new file mode 100644 index 0000000000000000000000000000000000000000..973dd7daa458d8c86d8b534047f684de60dde12d Binary files /dev/null and b/image/ant_val-n10-ep200.gif differ diff --git a/image/human-n5-ep100.gif b/image/human-n5-ep100.gif new file mode 100755 index 0000000000000000000000000000000000000000..5da652ae6c6871bc5dff7cd2962a03a2328690ee Binary files /dev/null and b/image/human-n5-ep100.gif differ diff --git a/image/human-n95-ep100.gif b/image/human-n95-ep100.gif new file mode 100755 index 0000000000000000000000000000000000000000..be6c7a1d1a46602fdbec024846bc3055c04d828f Binary files /dev/null and b/image/human-n95-ep100.gif differ diff --git a/image/human_train-n10-ep200.gif b/image/human_train-n10-ep200.gif new file mode 100644 index 0000000000000000000000000000000000000000..fd34bf0006dd250ceee16223f4f56dd31856746d Binary files /dev/null and b/image/human_train-n10-ep200.gif differ diff --git a/image/human_val-n10-ep200.gif b/image/human_val-n10-ep200.gif new file mode 100644 index 0000000000000000000000000000000000000000..fdde622c4b3892559e22a267dbd6ff01d3beae42 Binary files /dev/null and b/image/human_val-n10-ep200.gif differ diff --git a/image/hw1_3.png b/image/hw1_3.png new file mode 100644 index 0000000000000000000000000000000000000000..fbd3f831735b8a7324c41aa1cbf304081ec3c061 Binary files /dev/null and b/image/hw1_3.png differ diff --git a/image/hw1_q2.png b/image/hw1_q2.png new file mode 100644 index 0000000000000000000000000000000000000000..94198377387e1ec7516140f2b39a87bd5559ac15 Binary files /dev/null and b/image/hw1_q2.png differ diff --git a/image/hw1_q21.png b/image/hw1_q21.png new file mode 100644 index 0000000000000000000000000000000000000000..dbaa7199d72f941026147e58722d9d73a0835da8 Binary files /dev/null and b/image/hw1_q21.png differ diff --git a/image/hw1_q22.png b/image/hw1_q22.png new file mode 100644 index
0000000000000000000000000000000000000000..dcf90fdfd6f382f85ffcdb16f93922f84945a21d Binary files /dev/null and b/image/hw1_q22.png differ diff --git a/image/hw1_q23.png b/image/hw1_q23.png new file mode 100644 index 0000000000000000000000000000000000000000..00a539f594266c5027c724d9a57b8578df81ebf6 Binary files /dev/null and b/image/hw1_q23.png differ diff --git a/image/hw1_q24.png b/image/hw1_q24.png new file mode 100644 index 0000000000000000000000000000000000000000..a455c2d2790db819ce874357d2c0c0a5cd0c49ad Binary files /dev/null and b/image/hw1_q24.png differ