diff --git a/README.md b/README.md index 2c23686f5d0e3af5fbe0f8da734358b6a4b4107a..c727b10519cf8cfb5910a7ccb92d12e82988fbcc 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,11 @@ Assignments for [Berkeley CS 285: Deep Reinforcement Learning, Decision Making, [对应CSDN博客地址 点击此处](https://blog.csdn.net/qq_39537898/article/details/117064479) +## hw3 +2021/6/9 完成了第三次作业的solution.md +[详情请点击此处见solution.md](hw3/solution.md) + +[对应CSDN博客地址 点击此处](https://blog.csdn.net/qq_39537898/article/details/117673108) diff --git a/hw1/solution.md b/hw1/solution.md index 49532a1073228b62b3a337c8fd691522591ca466..30f327517be0df51bc2111bd1564442f1795e051 100644 --- a/hw1/solution.md +++ b/hw1/solution.md @@ -23,11 +23,19 @@ A. (Recommended) Install with conda: ``` conda create -n cs285 python=3.6 ``` +温馨提示 建议装3.7,详情见hw3的[solution](../hw3/solution.md) +``` +conda create -n py37 python=3.7 +``` + 3. activate the environment (do this every time you open a new terminal and want to run code): ``` source activate cs285 ``` +``` +conda activate py37 +``` 4. Install the requirements into this conda environment ``` diff --git a/hw2/README.md b/hw2/README.md index 0e481c044bcc27fc807bf44a259817c60428721f..383e4456dee6f8a235ab87f31b7e390c18a5860e 100644 --- a/hw2/README.md +++ b/hw2/README.md @@ -1,3 +1,6 @@ +# Solution File +直接跳过看结果请点击[solution.md](solution.md) + ## Setup You can run this code on your own machine or on Google Colab. diff --git a/hw2/solution.md b/hw2/solution.md index eded00f868269ee2f8cd735a4a419c8fd8b99fad..ea091ea9d9e56d3211f41b17159174e2e2de627f 100644 --- a/hw2/solution.md +++ b/hw2/solution.md @@ -7,7 +7,7 @@ ## 编译运行前 使得cs285用在 ``` -cd +cd $ pip install -e . 
``` diff --git a/hw3/README.md b/hw3/README.md index b6a7c9ce11e2de6f7d6e437745a4897f46d6ef4f..64f1be8440b7a73111c43b4f41efeee2ac7f7dd8 100644 --- a/hw3/README.md +++ b/hw3/README.md @@ -1,3 +1,6 @@ +# Solution File +直接跳过看结果请点击[solution.md](solution.md) + ## Setup You can run this code on your own machine or on Google Colab. diff --git a/hw3/cs285/DataViz.ipynb b/hw3/cs285/DataViz.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..7158ecbc0205f55ab7602c4ac416d2d4619224f2 --- /dev/null +++ b/hw3/cs285/DataViz.ipynb @@ -0,0 +1,1489 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/udi/anaconda3/envs/cs285/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:458: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", + "/home/udi/anaconda3/envs/cs285/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:459: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", + "/home/udi/anaconda3/envs/cs285/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:460: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", + "/home/udi/anaconda3/envs/cs285/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:461: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_quint16 
= np.dtype([(\"quint16\", np.uint16, 1)])\n", + "/home/udi/anaconda3/envs/cs285/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:462: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", + "/home/udi/anaconda3/envs/cs285/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:465: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", + " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" + ] + } + ], + "source": [ + "import os\n", + "import tensorflow as tf\n", + "import glob\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "%matplotlib inline\n", + "\n", + "figsize=(5.7, 3)\n", + "export_dir = os.path.join('../../image')\n", + "\n", + "sns.set_theme()\n", + "sns.set_context(\"paper\")\n", + "\n", + "\n", + "# From read_results.py\n", + "def get_section_results(file):\n", + " \"\"\"\n", + " requires tensorflow\n", + " \"\"\"\n", + " X = []\n", + " Y = []\n", + " for e in tf.train.summary_iterator(file):\n", + " for v in e.summary.value:\n", + " if v.tag == 'Train_EnvstepsSoFar':\n", + " X.append(v.simple_value)\n", + " elif v.tag == 'Train_AverageReturn':\n", + " Y.append(v.simple_value)\n", + " return X, Y\n", + "\n", + "data_dir = os.path.join('data')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Question 1" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Train_EnvstepsSoFarConfigvalue
01.0Train_AverageReturn480.655731
110001.0Train_AverageReturn402.399994
220001.0Train_AverageReturn379.799988
330001.0Train_AverageReturn430.100006
440001.0Train_AverageReturn407.200012
............
93930001.0Train_BestReturn1833.900024
94940001.0Train_BestReturn1833.900024
95950001.0Train_BestReturn1833.900024
96960001.0Train_BestReturn1833.900024
97970001.0Train_BestReturn1833.900024
\n", + "

197 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " Train_EnvstepsSoFar Config value\n", + "0 1.0 Train_AverageReturn 480.655731\n", + "1 10001.0 Train_AverageReturn 402.399994\n", + "2 20001.0 Train_AverageReturn 379.799988\n", + "3 30001.0 Train_AverageReturn 430.100006\n", + "4 40001.0 Train_AverageReturn 407.200012\n", + ".. ... ... ...\n", + "93 930001.0 Train_BestReturn 1833.900024\n", + "94 940001.0 Train_BestReturn 1833.900024\n", + "95 950001.0 Train_BestReturn 1833.900024\n", + "96 960001.0 Train_BestReturn 1833.900024\n", + "97 970001.0 Train_BestReturn 1833.900024\n", + "\n", + "[197 rows x 3 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def get_section_results_(file):\n", + " \"\"\"\n", + " requires tensorflow\n", + " \"\"\"\n", + " X = []\n", + " Y = []\n", + " Z = []\n", + " for e in tf.train.summary_iterator(file):\n", + " for v in e.summary.value:\n", + " if v.tag == 'Train_EnvstepsSoFar':\n", + " X.append(v.simple_value)\n", + " elif v.tag == 'Train_AverageReturn':\n", + " Y.append(v.simple_value)\n", + " elif v.tag == 'Train_BestReturn':\n", + " Z.append(v.simple_value)\n", + " return X, Y, Z\n", + "\n", + "def read_q1_data(question):\n", + " full_data = pd.DataFrame()\n", + " for folder in os.listdir(data_dir):\n", + " split = [s.strip() for s in folder.split('_')]\n", + " if 'MsPacman-v0' in split and question in split:\n", + " \n", + " logdir = os.path.join(data_dir, folder, 'events*')\n", + " eventfile = glob.glob(logdir)[0]\n", + " \n", + " X, Y, Z = get_section_results_(eventfile) # Y is Train_AverageReturn\n", + " X.pop()\n", + " full_data = pd.DataFrame({'Train_EnvstepsSoFar': X, \n", + " 'Config': 'Train_AverageReturn',\n", + " 'value': Y})\n", + " X.pop()\n", + " data = pd.DataFrame({'Train_EnvstepsSoFar': X, \n", + " 'Config': 'Train_BestReturn',\n", + " 'value': Z})\n", + " full_data = full_data.append(data)\n", + " \n", + " return full_data\n", + "\n", + "full_q1_data = 
read_q1_data('q1')\n", + "full_q1_data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=figsize)\n", + "ax = sns.lineplot(data=full_q1_data, x='Train_EnvstepsSoFar', y='value', hue='Config')\n", + "ax.set(xlabel='Training Steps', ylabel='Reward')\n", + "plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))\n", + "plt.savefig(os.path.join(export_dir, 'hw3_q1.png'), dpi=200, bbox_inches='tight')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Question 2" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ConfigTrain_EnvstepsSoFarTrain_AverageReturn
0dqn1.0-225.694712
1dqn10001.0-195.819229
2dqn20001.0-175.776810
3dqn30001.0-158.668772
4dqn40001.0-145.344304
............
44doubledqn440001.080.762199
45doubledqn450001.069.649890
46doubledqn460001.079.524618
47doubledqn470001.073.411114
48doubledqn480001.068.117202
\n", + "

98 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " Config Train_EnvstepsSoFar Train_AverageReturn\n", + "0 dqn 1.0 -225.694712\n", + "1 dqn 10001.0 -195.819229\n", + "2 dqn 20001.0 -175.776810\n", + "3 dqn 30001.0 -158.668772\n", + "4 dqn 40001.0 -145.344304\n", + ".. ... ... ...\n", + "44 doubledqn 440001.0 80.762199\n", + "45 doubledqn 450001.0 69.649890\n", + "46 doubledqn 460001.0 79.524618\n", + "47 doubledqn 470001.0 73.411114\n", + "48 doubledqn 480001.0 68.117202\n", + "\n", + "[98 rows x 3 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def read_q2_data(question):\n", + " full_data = pd.DataFrame()\n", + " for folder in os.listdir(data_dir):\n", + " split = [s.strip() for s in folder.split('_')]\n", + " if 'LunarLander-v3' in split and question in split:\n", + " config_list = split[split.index(question)+1] # get dqn/ddqn after q2\n", + " logdir = os.path.join(data_dir, folder, 'events*')\n", + " eventfile = glob.glob(logdir)[0]\n", + "\n", + " X, Y = get_section_results(eventfile) # Y is Train_AverageReturn\n", + " X.pop()\n", + " data = pd.DataFrame({'Config':config_list,\n", + " 'Train_EnvstepsSoFar': X, \n", + " 'Train_AverageReturn': Y})\n", + " \n", + " if full_data.empty:\n", + " full_data = data\n", + " elif not (full_data['Config'].str.contains(config_list)).any():\n", + " full_data = full_data.append(data)\n", + " else:\n", + " full_data['Temp_Return'] = data.loc[:,'Train_AverageReturn']\n", + " \n", + " # for another config set temp to zero\n", + " full_data.loc[full_data['Config']!=config_list,'Temp_Return'] = 0\n", + " \n", + " # add temp adn return together\n", + " full_data['Train_AverageReturn'] = full_data.drop('Train_EnvstepsSoFar', axis=1).sum(axis=1)\n", + "\n", + " del full_data['Temp_Return']\n", + " full_data.loc[:, 'Train_AverageReturn'] /= 3\n", + "\n", + " return full_data\n", + "\n", + "full_q2_data = read_q2_data('q2')\n", + "full_q2_data" + ] + }, + { + "cell_type": "code", + 
"execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=figsize)\n", + "sns.lineplot(data=full_q2_data, x='Train_EnvstepsSoFar', y='Train_AverageReturn', hue='Config')\n", + "plt.savefig(os.path.join(export_dir, 'hw3_q2.png'), dpi=200, bbox_inches='tight')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Question 3" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ConfigTrain_EnvstepsSoFarTrain_AverageReturn
0hparam11.0-208.769302
1hparam110001.0-190.801102
2hparam120001.0-175.555710
3hparam130001.0-156.841217
4hparam140001.0-145.930069
............
44hparam2440001.0142.106491
45hparam2450001.0141.662888
46hparam2460001.0159.932724
47hparam2470001.0146.924301
48hparam2480001.0144.613419
\n", + "

196 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " Config Train_EnvstepsSoFar Train_AverageReturn\n", + "0 hparam1 1.0 -208.769302\n", + "1 hparam1 10001.0 -190.801102\n", + "2 hparam1 20001.0 -175.555710\n", + "3 hparam1 30001.0 -156.841217\n", + "4 hparam1 40001.0 -145.930069\n", + ".. ... ... ...\n", + "44 hparam2 440001.0 142.106491\n", + "45 hparam2 450001.0 141.662888\n", + "46 hparam2 460001.0 159.932724\n", + "47 hparam2 470001.0 146.924301\n", + "48 hparam2 480001.0 144.613419\n", + "\n", + "[196 rows x 3 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def read_q3_data(question):\n", + " full_data = pd.DataFrame()\n", + " for folder in os.listdir(data_dir):\n", + " split = [s.strip() for s in folder.split('_')]\n", + " if 'LunarLander-v3' in split and question in split:\n", + " config_list = split[split.index(question)+1] # get dqn/ddqn after q2\n", + " logdir = os.path.join(data_dir, folder, 'events*')\n", + " eventfile = glob.glob(logdir)[0]\n", + "\n", + " X, Y = get_section_results(eventfile) # Y is Train_AverageReturn\n", + " X.pop()\n", + " data = pd.DataFrame({'Config':config_list,\n", + " 'Train_EnvstepsSoFar': X, \n", + " 'Train_AverageReturn': Y})\n", + "\n", + " full_data = full_data.append(data)\n", + " \n", + " return full_data\n", + "\n", + "full_q3_data = read_q3_data('q3')\n", + "full_q3_data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=figsize)\n", + "sns.lineplot(data=full_q3_data, x='Train_EnvstepsSoFar', y='Train_AverageReturn', hue='Config')\n", + "plt.savefig(os.path.join(export_dir, 'hw3_q31.png'), dpi=200, bbox_inches='tight')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Question 4" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ConfigTrain_EnvstepsSoFarEval_AverageReturn
010_101013.029.928572
110_1011507.0197.000000
210_1021984.0200.000000
310_1031984.0200.000000
410_1042144.0199.666672
510_1053288.0200.000000
610_1064436.0200.000000
710_1074633.0200.000000
810_1084633.0200.000000
910_1094633.0200.000000
01_1001013.025.687500
11_10011167.038.000000
21_10021617.0132.500000
31_10032345.0124.750000
41_10043015.0147.000000
51_10053503.0200.000000
61_10063648.0152.333328
71_10074251.088.800003
81_10084765.0200.000000
91_10094765.0200.000000
0ac_1_11013.025.000000
1ac_1_111125.09.595238
2ac_1_121184.09.348837
3ac_1_131230.09.302325
4ac_1_141268.09.325582
5ac_1_151317.09.441860
6ac_1_161345.09.465117
7ac_1_171381.09.488372
8ac_1_181410.09.227273
9ac_1_191453.09.372093
0100_11013.037.363636
1100_111443.088.199997
2100_122112.0158.333328
3100_132631.0133.500000
4100_143496.0102.000000
5100_153967.0125.750000
6100_164596.092.400002
7100_175247.0123.500000
8100_185785.0159.666672
9100_196564.0200.000000
\n", + "
" + ], + "text/plain": [ + " Config Train_EnvstepsSoFar Eval_AverageReturn\n", + "0 10_10 1013.0 29.928572\n", + "1 10_10 11507.0 197.000000\n", + "2 10_10 21984.0 200.000000\n", + "3 10_10 31984.0 200.000000\n", + "4 10_10 42144.0 199.666672\n", + "5 10_10 53288.0 200.000000\n", + "6 10_10 64436.0 200.000000\n", + "7 10_10 74633.0 200.000000\n", + "8 10_10 84633.0 200.000000\n", + "9 10_10 94633.0 200.000000\n", + "0 1_100 1013.0 25.687500\n", + "1 1_100 11167.0 38.000000\n", + "2 1_100 21617.0 132.500000\n", + "3 1_100 32345.0 124.750000\n", + "4 1_100 43015.0 147.000000\n", + "5 1_100 53503.0 200.000000\n", + "6 1_100 63648.0 152.333328\n", + "7 1_100 74251.0 88.800003\n", + "8 1_100 84765.0 200.000000\n", + "9 1_100 94765.0 200.000000\n", + "0 ac_1_1 1013.0 25.000000\n", + "1 ac_1_1 11125.0 9.595238\n", + "2 ac_1_1 21184.0 9.348837\n", + "3 ac_1_1 31230.0 9.302325\n", + "4 ac_1_1 41268.0 9.325582\n", + "5 ac_1_1 51317.0 9.441860\n", + "6 ac_1_1 61345.0 9.465117\n", + "7 ac_1_1 71381.0 9.488372\n", + "8 ac_1_1 81410.0 9.227273\n", + "9 ac_1_1 91453.0 9.372093\n", + "0 100_1 1013.0 37.363636\n", + "1 100_1 11443.0 88.199997\n", + "2 100_1 22112.0 158.333328\n", + "3 100_1 32631.0 133.500000\n", + "4 100_1 43496.0 102.000000\n", + "5 100_1 53967.0 125.750000\n", + "6 100_1 64596.0 92.400002\n", + "7 100_1 75247.0 123.500000\n", + "8 100_1 85785.0 159.666672\n", + "9 100_1 96564.0 200.000000" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def get_section_results(file,tag):\n", + " \"\"\"\n", + " requires tensorflow\n", + " \"\"\"\n", + " X = []\n", + " Y = []\n", + " for e in tf.train.summary_iterator(file):\n", + " for v in e.summary.value:\n", + " if v.tag == 'Train_EnvstepsSoFar':\n", + " X.append(v.simple_value)\n", + " elif v.tag == 'Eval_AverageReturn':\n", + " Y.append(v.simple_value)\n", + " return X, Y\n", + "\n", + "def read_q4_data(question):\n", + " full_data = pd.DataFrame()\n", + " Y_tag 
= 'Eval_AverageReturn'\n", + " for folder in os.listdir(data_dir):\n", + " split = [s.strip() for s in folder.split('_')]\n", + " if 'CartPole-v0' in split and question in split:\n", + " config_list = split[split.index(question)+1:split.index('CartPole-v0')]\n", + " config_list = '_'.join(config_list)\n", + " logdir = os.path.join(data_dir, folder, 'events*')\n", + " eventfile = glob.glob(logdir)[0]\n", + " \n", + " X, Y = get_section_results(eventfile, Y_tag) # Y is Train_AverageReturn\n", + " data = pd.DataFrame({'Config':config_list,\n", + " 'Train_EnvstepsSoFar': X, \n", + " Y_tag: Y})\n", + "\n", + " full_data = full_data.append(data)\n", + " \n", + " return full_data\n", + "\n", + "full_q4_data = read_q4_data('q4')\n", + "full_q4_data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=figsize)\n", + "sns.lineplot(data=full_q4_data, x='Train_EnvstepsSoFar', y='Eval_AverageReturn', hue='Config')\n", + "plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))\n", + "plt.savefig(os.path.join(export_dir, 'hw3_q4.png'), dpi=200, bbox_inches='tight')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Question 5" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ConfigTrain_EnvstepsSoFarEval_AverageReturn
010_105011.08.888889
110_1055076.07.228070
210_10105143.07.846154
310_10155195.08.826087
410_10205250.08.760870
510_10255313.08.183674
610_10305352.08.204082
710_10355417.08.200000
810_10405465.08.431373
910_10455502.08.695652
01_1005011.012.264706
11_10055322.070.000000
21_100105611.062.285713
31_100156636.0144.666672
41_100207338.085.000000
51_100257857.0581.500000
61_100308113.01000.000000
71_100358880.01000.000000
81_100409022.01000.000000
91_100459022.01000.000000
\n", + "
" + ], + "text/plain": [ + " Config Train_EnvstepsSoFar Eval_AverageReturn\n", + "0 10_10 5011.0 8.888889\n", + "1 10_10 55076.0 7.228070\n", + "2 10_10 105143.0 7.846154\n", + "3 10_10 155195.0 8.826087\n", + "4 10_10 205250.0 8.760870\n", + "5 10_10 255313.0 8.183674\n", + "6 10_10 305352.0 8.204082\n", + "7 10_10 355417.0 8.200000\n", + "8 10_10 405465.0 8.431373\n", + "9 10_10 455502.0 8.695652\n", + "0 1_100 5011.0 12.264706\n", + "1 1_100 55322.0 70.000000\n", + "2 1_100 105611.0 62.285713\n", + "3 1_100 156636.0 144.666672\n", + "4 1_100 207338.0 85.000000\n", + "5 1_100 257857.0 581.500000\n", + "6 1_100 308113.0 1000.000000\n", + "7 1_100 358880.0 1000.000000\n", + "8 1_100 409022.0 1000.000000\n", + "9 1_100 459022.0 1000.000000" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def read_q5_data(question):\n", + " full_data = pd.DataFrame()\n", + " Y_tag = 'Eval_AverageReturn'\n", + " for folder in os.listdir(data_dir):\n", + " split = [s.strip() for s in folder.split('_')]\n", + " if 'InvertedPendulum-v2' in split and question in split:\n", + " config_list = split[split.index(question)+1:split.index('InvertedPendulum-v2')]\n", + " config_list = '_'.join(config_list)\n", + " logdir = os.path.join(data_dir, folder, 'events*')\n", + " eventfile = glob.glob(logdir)[0]\n", + " \n", + " X, Y = get_section_results(eventfile, Y_tag) # Y is Train_AverageReturn\n", + " data = pd.DataFrame({'Config':config_list,\n", + " 'Train_EnvstepsSoFar': X, \n", + " Y_tag: Y})\n", + "\n", + " full_data = full_data.append(data)\n", + " \n", + " return full_data\n", + "\n", + "full_q5_data = read_q5_data('q5')\n", + "full_q5_data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=figsize)\n", + "sns.lineplot(data=full_q5_data, x='Train_EnvstepsSoFar', y='Eval_AverageReturn', hue='Config')\n", + "plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))\n", + "plt.savefig(os.path.join(export_dir, 'hw3_q5_11.png'), dpi=200, bbox_inches='tight')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ConfigTrain_EnvstepsSoFarEval_AverageReturn
010_1030000.0-75.416679
110_1060000.0-73.736649
210_1090000.0-75.464218
310_10120000.0-80.089256
410_10150000.0-41.744469
............
14510_104380000.0143.853867
14610_104410000.0164.700409
14710_104440000.0161.115509
14810_104470000.0168.482147
14910_104500000.0158.386398
\n", + "

150 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " Config Train_EnvstepsSoFar Eval_AverageReturn\n", + "0 10_10 30000.0 -75.416679\n", + "1 10_10 60000.0 -73.736649\n", + "2 10_10 90000.0 -75.464218\n", + "3 10_10 120000.0 -80.089256\n", + "4 10_10 150000.0 -41.744469\n", + ".. ... ... ...\n", + "145 10_10 4380000.0 143.853867\n", + "146 10_10 4410000.0 164.700409\n", + "147 10_10 4440000.0 161.115509\n", + "148 10_10 4470000.0 168.482147\n", + "149 10_10 4500000.0 158.386398\n", + "\n", + "[150 rows x 3 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def read_q5_data(question):\n", + " full_data = pd.DataFrame()\n", + " Y_tag = 'Eval_AverageReturn'\n", + " for folder in os.listdir(data_dir):\n", + " split = [s.strip() for s in folder.split('_')]\n", + " if 'HalfCheetah-v2' in split and question in split:\n", + " config_list = split[split.index(question)+1:split.index('HalfCheetah-v2')]\n", + " config_list = '_'.join(config_list)\n", + " logdir = os.path.join(data_dir, folder, 'events*')\n", + " eventfile = glob.glob(logdir)[0]\n", + " \n", + " X, Y = get_section_results(eventfile, Y_tag) # Y is Train_AverageReturn\n", + " data = pd.DataFrame({'Config':config_list,\n", + " 'Train_EnvstepsSoFar': X, \n", + " Y_tag: Y})\n", + "\n", + " full_data = full_data.append(data)\n", + " \n", + " return full_data\n", + "\n", + "full_q5_data = read_q5_data('q5')\n", + "full_q5_data" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=figsize)\n", + "sns.lineplot(data=full_q5_data, x='Train_EnvstepsSoFar', y='Eval_AverageReturn', hue='Config')\n", + "plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))\n", + "plt.savefig(os.path.join(export_dir, 'hw3_q5_2.png'), dpi=200, bbox_inches='tight')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cs285", + "language": "python", + "name": "cs285" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/hw3/cs285/agents/ac_agent.py b/hw3/cs285/agents/ac_agent.py index 8aa4ff98e59cffd16226299269b74c8d16beaa16..37669b88bfa3bb2c47382c0feeaceed33ee9762e 100644 --- a/hw3/cs285/agents/ac_agent.py +++ b/hw3/cs285/agents/ac_agent.py @@ -39,10 +39,17 @@ class ACAgent(BaseAgent): # for agent_params['num_actor_updates_per_agent_update'] steps, # update the actor + for i in range(self.agent_params['num_critic_updates_per_agent_update']): + loss_critic = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n) + + advantage = self.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) + + for i in range(self.agent_params['num_actor_updates_per_agent_update']): + loss_actor = self.actor.update(ob_no, ac_na, advantage) loss = OrderedDict() - loss['Critic_Loss'] = TODO - loss['Actor_Loss'] = TODO + loss['Critic_Loss'] = loss_critic + loss['Actor_Loss'] = loss_actor return loss @@ -53,7 +60,10 @@ class ACAgent(BaseAgent): # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') # HINT: Remember to cut off the V(s') term (ie set it 
to 0) at terminal states (ie terminal_n=1) # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) - adv_n = TODO + value_s = self.critic.forward_np(ob_no) + value_s_prime = self.critic.forward_np(next_ob_no) + qsa_value = re_n + self.gamma * value_s_prime * (1-terminal_n) + adv_n = qsa_value - value_s if self.standardize_advantages: adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) diff --git a/hw3/cs285/agents/dqn_agent.py b/hw3/cs285/agents/dqn_agent.py index 8b071ff8fbf3805e9c59575ec0d6175a15ade563..5cceb95c9c063ee420dbb56f009c53e33292a4f5 100644 --- a/hw3/cs285/agents/dqn_agent.py +++ b/hw3/cs285/agents/dqn_agent.py @@ -46,37 +46,41 @@ class DQNAgent(object): # TODO store the latest observation ("frame") into the replay buffer # HINT: the replay buffer used here is `MemoryOptimizedReplayBuffer` # in dqn_utils.py - self.replay_buffer_idx = TODO + self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs) eps = self.exploration.value(self.t) # TODO use epsilon greedy exploration when selecting action - perform_random_action = TODO + # random if minus eps or at start point it will return True or False + perform_random_action = np.random.random() < eps or self.t < self.learning_starts if perform_random_action: # HINT: take random action # with probability eps (see np.random.random()) # OR if your current step number (see self.t) is less that self.learning_starts - action = TODO + action = np.random.randint(self.num_actions) else: # HINT: Your actor will take in multiple previous observations ("frames") in order # to deal with the partial observability of the environment. Get the most recent # `frame_history_len` observations using functionality from the replay buffer, # and then use those observations as input to your actor. 
- action = TODO + obs = self.replay_buffer.encode_recent_observation() + action = self.actor.get_action(obs) # TODO take a step in the environment using the action from the policy # HINT1: remember that self.last_obs must always point to the newest/latest observation # HINT2: remember the following useful function that you've seen before: #obs, reward, done, info = env.step(action) - TODO + self.last_obs, reward, done, info = self.env.step(action) # TODO store the result of taking this action into the replay buffer # HINT1: see your replay buffer's `store_effect` function # HINT2: one of the arguments you'll need to pass in is self.replay_buffer_idx from above - TODO + # store_effect(self, idx, action, reward, done): + self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward, done) # TODO if taking this step resulted in done, reset the env (and the latest observation) - TODO + if done: + self.last_obs = self.env.reset() def sample(self, batch_size): if self.replay_buffer.can_sample(self.batch_size): @@ -92,14 +96,13 @@ class DQNAgent(object): ): # TODO fill in the call to the update function using the appropriate tensors - log = self.critic.update( - TODO - ) + # update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n) + log = self.critic.update(ob_no, ac_na, next_ob_no, re_n, terminal_n) # TODO update the target network periodically # HINT: your critic already has this functionality implemented if self.num_param_updates % self.target_update_freq == 0: - TODO + self.critic.update_target_network() self.num_param_updates += 1 diff --git a/hw3/cs285/critics/bootstrapped_continuous_critic.py b/hw3/cs285/critics/bootstrapped_continuous_critic.py index b410eff61452c78589b6c1ecb74ffc70f92be30b..758f7e2dc26e9b8c5f3fbfed5b6c1030963e9128 100644 --- a/hw3/cs285/critics/bootstrapped_continuous_critic.py +++ b/hw3/cs285/critics/bootstrapped_continuous_critic.py @@ -85,5 +85,21 @@ class BootstrappedContinuousCritic(nn.Module, BaseCritic): # to 0) when a 
terminal state is reached # HINT: make sure to squeeze the output of the critic_network to ensure # that its dimensions match the reward + # print(self.num_grad_steps_per_target_update,'num_target_updates', self.num_target_updates) + for i in range(self.num_grad_steps_per_target_update * self.num_target_updates): + if i % self.num_grad_steps_per_target_update == 0: + value_s_prime = self.forward_np(next_ob_no) + targets = reward_n + self.gamma * value_s_prime * (1-terminal_n) + targets = ptu.from_numpy(targets) + # print("In it ",i) + + predictions = self.forward(ptu.from_numpy(ob_no)) + + assert predictions.shape == targets.shape + loss = self.loss(predictions, targets) + + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() return loss.item() diff --git a/hw3/cs285/critics/dqn_critic.py b/hw3/cs285/critics/dqn_critic.py index 5ff5e9dfc8eed5947540ffe3501c32083086caf4..a40a66a0479ca21a2bdacadf09842244d3a96b4a 100644 --- a/hw3/cs285/critics/dqn_critic.py +++ b/hw3/cs285/critics/dqn_critic.py @@ -66,21 +66,23 @@ class DQNCritic(BaseCritic): q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1) # TODO compute the Q-values from the target network - qa_tp1_values = TODO + qa_tp1_values = self.q_net_target(next_ob_no) if self.double_q: # You must fill this part for Q2 of the Q-learning portion of the homework. # In double Q-learning, the best action is selected using the Q-network that # is being updated, but the Q-value for this action is obtained from the # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for more details. 
- TODO + next_qa_value = self.q_net(next_ob_no) + actions = next_qa_value.argmax(1) + q_tp1 = torch.gather(qa_tp1_values, 1, actions.unsqueeze(1)).squeeze(1) else: q_tp1, _ = qa_tp1_values.max(dim=1) # TODO compute targets for minimizing Bellman error # HINT: as you saw in lecture, this would be: #currentReward + self.gamma * qValuesOfNextTimestep * (not terminal) - target = TODO + target = reward_n + self.gamma * q_tp1 * (1 - terminal_n) target = target.detach() assert q_t_values.shape == target.shape diff --git a/hw3/cs285/infrastructure/dqn_utils.py b/hw3/cs285/infrastructure/dqn_utils.py index a8ae1bb1617a88b0ce2b9e69db2d8f4426ec9f52..8091f0dba226c902f18d79359bf5f8a06b9e8ae7 100644 --- a/hw3/cs285/infrastructure/dqn_utils.py +++ b/hw3/cs285/infrastructure/dqn_utils.py @@ -41,8 +41,8 @@ def get_env_kwargs(env_name): kwargs = { 'learning_starts': 50000, 'target_update_freq': 10000, - 'replay_buffer_size': int(1e6), - 'num_timesteps': int(2e8), + 'replay_buffer_size': int(1e5), + 'num_timesteps': int(1e6), 'q_func': create_atari_q_network, 'learning_freq': 4, 'grad_norm_clipping': 10, @@ -122,7 +122,7 @@ def atari_exploration_schedule(num_timesteps): return PiecewiseSchedule( [ (0, 1.0), - (1e6, 0.1), + (1e5, 0.1), (num_timesteps / 8, 0.01), ], outside_value=0.01 ) @@ -132,7 +132,7 @@ def atari_ram_exploration_schedule(num_timesteps): return PiecewiseSchedule( [ (0, 0.2), - (1e6, 0.1), + (1e5, 0.1), (num_timesteps / 8, 0.01), ], outside_value=0.01 ) @@ -234,7 +234,10 @@ class PiecewiseSchedule(object): raised when outside value is requested. 
""" idxes = [e[0] for e in endpoints] + # print(idxes == sorted(idxes)) + # print(idxes,'shape of ',sorted(idxes)) assert idxes == sorted(idxes) + # print(idxes,'shape of ',sorted(idxes)) self._interpolation = interpolation self._outside_value = outside_value self._endpoints = endpoints diff --git a/hw3/cs285/infrastructure/rl_trainer.py b/hw3/cs285/infrastructure/rl_trainer.py index b0fa472eb385a2f67229aa6f6c6f72ee6aa51d3c..2ef597d978ed254098f17373deb079a111652e95 100644 --- a/hw3/cs285/infrastructure/rl_trainer.py +++ b/hw3/cs285/infrastructure/rl_trainer.py @@ -210,12 +210,43 @@ class RL_Trainer(object): envsteps_this_batch: the sum over the numbers of environment steps in paths train_video_paths: paths which also contain videos for visualization purposes """ - # TODO: get this from Piazza + # DONE : get this from Piazza + if itr == 0: + if initial_expertdata is not None: + with open(initial_expertdata,'rb') as f: + paths = pickle.loads(f.read()) + # after paths collecting and we need to save expert_data while itr==0 + # TODO 这里的expertdata是否是第一次的? 需要确认一下 + # if save_expert_data_to_disk: + # with open('expert_data_{}.pkl'.format(self.params['env_name']), 'wb') as file: + # pickle.dump(paths, file) + return paths, 0, None + + + print("\nCollecting data to be used for training... ") + paths, envsteps_this_batch = utils.sample_trajectories(self.env, collect_policy, num_transitions_to_sample, self.params['ep_len']) + + train_video_paths = None + if self.logvideo: + print("\nCollecting train rollouts to be used for saving videos... 
") + train_video_paths = utils.sample_n_trajectories(self.env, collect_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) + + return paths, envsteps_this_batch, train_video_paths def train_agent(self): # TODO: get this from Piazza + # print("\nTraining agent using sampled data from replay buffer...") + train_logs = [] + for train_step in range(self.params['num_agent_train_steps_per_iter']): + # sample some data from the data buffer + ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.agent.sample(self.params['train_batch_size']) + + # use the sample data to train an agent + train_log = self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch) + train_logs.append(train_log) + return train_logs #################################### #################################### diff --git a/hw3/cs285/infrastructure/utils.py b/hw3/cs285/infrastructure/utils.py index eabdc393203ca15673c8872ea33b46700b0feb6a..9da5df139ba6cd811555116b7d673886700ad3c0 100644 --- a/hw3/cs285/infrastructure/utils.py +++ b/hw3/cs285/infrastructure/utils.py @@ -55,7 +55,47 @@ def mean_squared_error(a, b): ############################################ def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): - # TODO: get this from Piazza + # TODO: get this from Piazza -> get from hw2 since I don't have Piazza + # initialize env for the beginning of a new rollout + ob = env.reset() + + # init vars + obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] + steps = 0 + while True: + # render image of the simulated env + if render: + if 'rgb_array' in render_mode: + if hasattr(env, 'sim'): + image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1]) + else: + image_obs.append(env.render(mode=render_mode)) + if 'human' in render_mode: + env.render(mode=render_mode) + time.sleep(env.model.opt.timestep) + + # use the most recent ob to decide what to do + obs.append(ob) + ac = policy.get_action(ob) + ac = 
ac[0] + acs.append(ac) + + # take the action and record results + ob, rew, done, _ = env.step(ac) + + # record result of taking that action + steps += 1 + next_obs.append(ob) + rewards.append(rew) + + # rollout done if max path length or done through step + rollout_done = 1 if steps >= max_path_length else done + terminals.append(rollout_done) + + if rollout_done: + break + + return Path(obs, image_obs, acs, rewards, next_obs, terminals) def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): """ @@ -64,6 +104,13 @@ def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, r """ # TODO: get this from Piazza + timesteps_this_batch = 0 + paths = [] + while timesteps_this_batch < min_timesteps_per_batch: + path = sample_trajectory(env, policy, max_path_length, render) + paths.append(path) + timesteps_this_batch += get_pathlength(path) + return paths, timesteps_this_batch def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): @@ -71,7 +118,12 @@ def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, ren Collect ntraj rollouts using policy """ # TODO: get this from Piazza - + paths = [] + + for i in range(ntraj): + path = sample_trajectory(env, policy, max_path_length, render) + paths.append(path) + return paths ############################################ diff --git a/hw3/cs285/policies/MLP_policy.py b/hw3/cs285/policies/MLP_policy.py index 4c2184a7589f3bff310c6ca4d68224ae4909c07c..380a723e1bf5715a47486e9000d16e1f94867870 100644 --- a/hw3/cs285/policies/MLP_policy.py +++ b/hw3/cs285/policies/MLP_policy.py @@ -87,7 +87,15 @@ class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): # query the policy with observation(s) to get selected action(s) def get_action(self, obs: np.ndarray) -> np.ndarray: # TODO: get this from Piazza - return action + # for this one remember to ptu.to_numpy as it required + if 
len(obs.shape) > 1: + observation = obs + else: + observation = obs[None] + + # it's distribution so it need sample for actions + action_distribution = self.forward(ptu.from_numpy(observation)) + return ptu.to_numpy(action_distribution.sample()) # update/train this policy def update(self, observations, actions, **kwargs): @@ -100,7 +108,13 @@ class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): # `torch.distributions.Distribution` object. It's up to you! def forward(self, observation: torch.FloatTensor): # TODO: get this from Piazza - return action_distribution + if self.discrete: + prob_action = self.logits_na(observation) + return distributions.Categorical(logits = prob_action) + else: + mean_prob = self.mean_net(observation) + std_prob = torch.diag(self.logstd.exp()) + return distributions.MultivariateNormal(mean_prob, scale_tril = std_prob) ##################################################### @@ -110,5 +124,15 @@ class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): class MLPPolicyAC(MLPPolicy): def update(self, observations, actions, adv_n=None): # TODO: update the policy and return the loss - loss = TODO + observations = ptu.from_numpy(observations) + actions = ptu.from_numpy(actions) + advantages = ptu.from_numpy(adv_n) + + log_pi = self.forward(observations).log_prob(actions) + loss = torch.neg(torch.mean(torch.mul(log_pi, advantages))) + + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + return loss.item() diff --git a/hw3/cs285/policies/argmax_policy.py b/hw3/cs285/policies/argmax_policy.py index a7e443c3b668a49bcf1e4bcb6c64a9f0578d0cf1..b7110fe86af1cf6e62356f22a9ee2e81e1da55fc 100644 --- a/hw3/cs285/policies/argmax_policy.py +++ b/hw3/cs285/policies/argmax_policy.py @@ -14,6 +14,6 @@ class ArgMaxPolicy(object): ## TODO return the action that maxinmizes the Q-value # at the current observation as the output - actions = TODO + actions = self.critic.qa_values(observation).argmax(-1) - return action.squeeze() \ No 
newline at end of file + return actions.squeeze() \ No newline at end of file diff --git a/hw3/cs285/scripts/run_hw3_actor_critic.py b/hw3/cs285/scripts/run_hw3_actor_critic.py index 0b6f1f08040f1fb5b05bcc18e903416ed25a67ee..d5f918c7b2d5c6d5935c941cf0f46e1a52cf076a 100644 --- a/hw3/cs285/scripts/run_hw3_actor_critic.py +++ b/hw3/cs285/scripts/run_hw3_actor_critic.py @@ -123,4 +123,7 @@ def main(): if __name__ == "__main__": + start = time.perf_counter() main() + end = time.perf_counter() + print(f"The whole programm run time: {(end - start)/60:0.4f} mins") diff --git a/hw3/cs285/scripts/run_hw3_dqn.py b/hw3/cs285/scripts/run_hw3_dqn.py index c8ecab0e6ea0cbf2c611636130873520ad2c1259..5795b5bea415a54ecbad58c7a08f37bf322da4dd 100644 --- a/hw3/cs285/scripts/run_hw3_dqn.py +++ b/hw3/cs285/scripts/run_hw3_dqn.py @@ -91,4 +91,7 @@ def main(): if __name__ == "__main__": + start = time.perf_counter() main() + end = time.perf_counter() + print(f"The whole programm run time: {(end - start)/60:0.4f} mins") diff --git a/hw3/solution.md b/hw3/solution.md new file mode 100644 index 0000000000000000000000000000000000000000..4ef0651013d68348a8f4e237f5c474ead2d851e2 --- /dev/null +++ b/hw3/solution.md @@ -0,0 +1,222 @@ +# 前言 +详细写的记录见博客:[CSDN 链接:https://blog.csdn.net/qq_39537898/article/details/117673108](https://blog.csdn.net/qq_39537898/article/details/117673108) +参考: +1. [https://github.com/vincentkslim/cs285_homework_fall2020/tree/master/hw3](https://github.com/vincentkslim/cs285_homework_fall2020/tree/master/hw3) +2. [https://github.com/fokx/cs285_fall2020/tree/master/hw3](https://github.com/fokx/cs285_fall2020/tree/master/hw3) + +## 编译运行前 +使得cs285用在 +``` +cd +$ pip install -e . +``` +关于环境的要求: +```bash +pip3 install atari-py +``` +还有一个是报错提示的: +`Exception: ROM is missing for ms_pacman, see https://github.com/openai/atari-py#roms for instructions` +### ROM +>In order to import ROMS, you need to download Roms.rar from the Atari 2600 VCS ROM +Collection and extract the .rar file. +
Once you've done that, run: +`python -m atari_py.import_roms <path to folder>` +
This should print out the names of ROMs as it imports them. The ROMs will be copied to your atari_py installation directory. + +需要在这里下载并安装[http://www.atarimania.com/rom_collection_archive_atari_2600_roms.html](http://www.atarimania.com/rom_collection_archive_atari_2600_roms.html) + +但是这里有个问题,有人在issue上也提到了 python得3.7 而不能保持原来的3.6 不然没法`python -m atari_py.import_roms `执行成功这句话 +所以我们又得切到python3.7的环境进行 +### conda 环境配置 +```bash +conda activate py37 +``` +提前先把这两个error的依赖装好了: +1. error: command 'gcc' failed with exit status 1 while installing eventlet +```bash +sudo apt-get install libosmesa6-dev +``` +参考地址:[https://github.com/ethz-asl/reinmav-gym/issues/35](https://github.com/ethz-asl/reinmav-gym/issues/35) +2. error: [Errno 2] No such file or directory: 'patchelf': 'patchelf' +``` +conda install anaconda patchelf +``` +参考地址:[https://github.com/openai/mujoco-py/issues/147](https://github.com/openai/mujoco-py/issues/147) + +这个好像有点慢.. 这是另一种方法,同一个参考地址 +```bash +sudo apt-get update +sudo apt install patchelf +``` + + +再直接走requirements.txt就好了(就是有时候清华源不稳总断 大家就自行解决吧) +```bash +pip install --user -r requirements.txt +``` + +### Notebook +运行Notebook里的代码时要添加conda环境,进入py37环境 +```bash +conda install ipykernel +python -m ipykernel install --user --name=py37 +``` +```bash +conda install tensorflow seaborn +``` + + +# PDF问题 +看tensorboard数据的命令行: +```bash +tensorboard --logdir data +``` +## Question 1: basic Q-learning performance. 
(DQN) + +### 运行指令 + +emmm 这不看不知道 一看吓一跳,这Iteration是!2x10的八次方`'num_timesteps': int(2e8)` 也就是200,000,000,这真的是一个很大很大很大的数字哈!但是我发现其实不需要200,000,000这么多次的迭代,就可以达到平均reward大于1000 +```bash +python cs285/scripts/run_hw3_dqn.py --env_name MsPacman-v0 --exp_name q1 +``` + +但是这里可以修改 +```python +'replay_buffer_size': int(1e6), +'num_timesteps': int(2e8), +``` +不过需要注意的是如果`'num_timesteps': int(1e7)`以下的话,`'replay_buffer_size': int(1e6)`这个和下面关于1e6的部分部分是需要变的 +然后我用的就是1e6,然后运行了269.2109 mins!!巨长(但是可以并行运行其他的python没啥大问题) + +整个的迭代和return的示意图如下: + +![运行结果示意图](../image/hw3_q1.png) + +## Question 2: double Q-learning (DDQN). +### 运行指令 +此运行在配置为:CPU i7-9700 内存16G 无GPU参与 +```bash +python cs285/scripts/run_hw3_dqn.py --env_name LunarLander-v3 --exp_name q2_dqn_1 --seed 1 +python cs285/scripts/run_hw3_dqn.py --env_name LunarLander-v3 --exp_name q2_dqn_2 --seed 2 +python cs285/scripts/run_hw3_dqn.py --env_name LunarLander-v3 --exp_name q2_dqn_3 --seed 3 +``` +运行时间大致为:15.9101 mins到16.1607 mins之间 + +```bash +python cs285/scripts/run_hw3_dqn.py --env_name LunarLander-v3 --exp_name q2_doubledqn_1 --double_q --seed 1 +python cs285/scripts/run_hw3_dqn.py --env_name LunarLander-v3 --exp_name q2_doubledqn_2 --double_q --seed 2 +python cs285/scripts/run_hw3_dqn.py --env_name LunarLander-v3 --exp_name q2_doubledqn_3 --double_q --seed 3 +``` +运行时间大致为:17.0641 mins到17.9384 mins之间 + +此问题主要是对比DQN和DDQN的优劣,首先两者的Q-value都是根据网络学习而来的,DDQN是以当前policy的参数进入到下一个policy参数的Q值计算中,这样的好处是:能减小DQN对于value estimate的虚高现象,具体建议看一下第八节课[b站链接](https://www.bilibili.com/video/BV1dJ411W78A?p=8) +不同之处在代码: +```python +qa_tp1_values = self.q_net_target(next_ob_no) +if self.double_q: + # You must fill this part for Q2 of the Q-learning portion of the homework. + # In double Q-learning, the best action is selected using the Q-network that + # is being updated, but the Q-value for this action is obtained from the + # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for more details. 
+ next_qa_value = self.q_net(next_ob_no) + actions = next_qa_value.argmax(1) + q_tp1 = torch.gather(qa_tp1_values, 1, actions.unsqueeze(1)).squeeze(1) +else: + q_tp1, _ = qa_tp1_values.max(dim=1) + +target = reward_n + self.gamma * q_tp1 * (1 - terminal_n) +``` +### 结果分析 +首先因为PDF里说了DQN和DDQN各随机运行三次来避免considerable variance between runs,然后结果分析像hw2里一样是由DataViz.ipynb求和取平均做图的: + +![运行结果示意图](../image/hw3_q2.png) + +可以比较明显看出来double dqn的方法在后面迭代中return逐步增加,但是我感觉... 我这图隐隐约约有点不太对劲 emmm 差距不明显后面反而降下来了... +-[] 这里可能有问题需要重新回来看一下 + +这是另一台主机的随机结果:(好像是一样的趋势 emmm) + +![运行结果示意图](../image/hw3_q2_acom.png) + +## Question 3: experimenting with hyperparameters +这里的超参数调整得自己去py文件里改?好像是的..哎,也可以在命令行里加就行,PDF中给了选项: learning rates, neural network architecture, exploration schedule or exploration rule (e.g.you may implement an alternative to $\epsilon$-greedy), etc. + +```bash +python cs285/scripts/run_hw3_dqn.py --env_name LunarLander-v3 --exp_name q3_hparam1 +python cs285/scripts/run_hw3_dqn.py --env_name LunarLander-v3 --exp_name q3_hparam2 --batch_size 64 +python cs285/scripts/run_hw3_dqn.py --env_name LunarLander-v3 --exp_name q3_hparam3 --batch_size 64 +``` +默认大概需要16.7583 mins +那就改num_timesteps和batch_size两项,首先默认的是:`'gamma': 1` 然后`'batch_size': 32`,修改这个都需要到dqn_utils.py里面改 +对于hparam1我们就改`'gamma': 0.9` 运行时长:20.9463 mins +hparam2改`'batch_size': 64` 运行时长:17.2012 mins +hparam3改`'gamma': 0.9` & `'batch_size': 64` 运行时长:21.5064 mins +运行完了对比图: + +![运行结果示意图](../image/hw3_q3.png) + +结果上来看,增加network的batch_size会更好,但是时长会增加,gamma0.9效果很差... 
是不是应该0.99来着,我记得课程说这个是一般0.99 + +然后又花了半小时去输出gamma:0.99的时候: + +![运行结果示意图](../image/hw3_q31.png) + +对于hparam1我们就改`'gamma': 0.99` 运行时长:16.2794 mins +hparam2改`'batch_size': 64` 运行时长:17.2012 mins +hparam3改`'gamma': 0.99` & `'batch_size': 64` 运行时长:17.1102 mins + +## Question 4: Sanity check with Cartpole + +```bash +python cs285/scripts/run_hw3_actor_critic.py --env_name CartPole-v0 -n 100 -b 1000 --exp_name q4_ac_1_1 -ntu 1 -ngsptu 1 +``` +大概运行时长:0.3502 mins +```bash +python cs285/scripts/run_hw3_actor_critic.py --env_name CartPole-v0 -n 100 -b 1000 --exp_name q4_100_1 -ntu 100 -ngsptu 1 +python cs285/scripts/run_hw3_actor_critic.py --env_name CartPole-v0 -n 100 -b 1000 --exp_name q4_1_100 -ntu 1 -ngsptu 100 +python cs285/scripts/run_hw3_actor_critic.py --env_name CartPole-v0 -n 100 -b 1000 --exp_name q4_10_10 -ntu 10 -ngsptu 10 +``` +大概运行时长:0.5576 mins - 0.6268 mins + +![运行结果示意图](../image/hw3_q4.png) + + +## Question 5: Run actor-critic with more difficult tasks +是自己选择参数即可,主要是需要运行两个环境,如果运行前面的都不报错,唯独这两个报错的话,可能是mujoco的环境没有安装好的原因...(因为我在另一台主机上就报错了) +```bash +python cs285/scripts/run_hw3_actor_critic.py --env_name InvertedPendulum-v2 --ep_len 1000 --discount 0.95 -n 100 -l 2 -s 64 -b 5000 -lr 0.01 --exp_name q5_10_10 -ntu 10 -ngsptu 10 +``` + +然后呢我主要是看到前一个问题中10_10这组参数不错,emmm就选择的是这组 + +这是InvertedPendulum-v2环境下的图,大概运行时间:4.2144mins(有q1同时在运行) + +![运行结果示意图](../image/hw3_q5_1.png) + +但是效果有点太太太差了... 
所以我又试了上面第二好的也就是1_100的数据长这样: +```bash +python cs285/scripts/run_hw3_actor_critic.py --env_name InvertedPendulum-v2 --ep_len 1000 --discount 0.95 -n 100 -l 2 -s 64 -b 5000 -lr 0.01 --exp_name q5_1_100 -ntu 1 -ngsptu 100 +``` + +emm,从PDF对我们的期望是,大概100迭代后,这个return should be around 1000,所以完成挺好的 + +![运行结果示意图](../image/hw3_q5_11.png) + + +第二个环境呢是HalfCheetah-v2 首先还是采取10_10的看一下,大致运行时间:30.32754mins(有q1在同时运行) +```bash +python cs285/scripts/run_hw3_actor_critic.py --env_name HalfCheetah-v2 --ep_len 150 --discount 0.90 --scalar_log_freq 1 -n 150 -l 2 -s 32 -b 30000 -eb 1500 -lr 0.02 --exp_name q5_10_10 -ntu 10 -ngsptu 10 +``` +这是HalfCheetah-v2环境下的图: + +![运行结果示意图](../image/hw3_q5_2.png) + +感觉这个环境下10_10的就挺不错的 --- 然后我收回我的话1_100吧: 29.8561mins +```bash +python cs285/scripts/run_hw3_actor_critic.py --env_name HalfCheetah-v2 --ep_len 150 --discount 0.90 --scalar_log_freq 1 -n 150 -l 2 -s 32 -b 30000 -eb 1500 -lr 0.02 --exp_name q5_1_100 -ntu 1 -ngsptu 100 +``` + +应该差不多就算成功了? + +![运行结果示意图](../image/hw3_q5_22.png) + diff --git a/image/hw3_q1.png b/image/hw3_q1.png new file mode 100644 index 0000000000000000000000000000000000000000..c897abdbc5ad2a751a9fad219221bb8953796c3c Binary files /dev/null and b/image/hw3_q1.png differ diff --git a/image/hw3_q2.png b/image/hw3_q2.png new file mode 100644 index 0000000000000000000000000000000000000000..d2a17d5f6cf2d964df6f2e9df63ad705e1e59a81 Binary files /dev/null and b/image/hw3_q2.png differ diff --git a/image/hw3_q2_acom.png b/image/hw3_q2_acom.png new file mode 100644 index 0000000000000000000000000000000000000000..90b0c17bcd34be7219dea1b598afb5bd36dc7a78 Binary files /dev/null and b/image/hw3_q2_acom.png differ diff --git a/image/hw3_q3.png b/image/hw3_q3.png new file mode 100644 index 0000000000000000000000000000000000000000..fe597d27b8a12e7bc5ee3c5ee8f2fb19784f378f Binary files /dev/null and b/image/hw3_q3.png differ diff --git a/image/hw3_q31.png b/image/hw3_q31.png new file mode 100644 index 
0000000000000000000000000000000000000000..11827450b4846b3bf2fef003795df9121ad3f4d6 Binary files /dev/null and b/image/hw3_q31.png differ diff --git a/image/hw3_q4.png b/image/hw3_q4.png new file mode 100644 index 0000000000000000000000000000000000000000..a8487bbf6bdd512305dd7c69c9c05d44014c6383 Binary files /dev/null and b/image/hw3_q4.png differ diff --git a/image/hw3_q5_1.png b/image/hw3_q5_1.png new file mode 100644 index 0000000000000000000000000000000000000000..dd3e9711c6bde7e0b61a5cc3a895601140104617 Binary files /dev/null and b/image/hw3_q5_1.png differ diff --git a/image/hw3_q5_11.png b/image/hw3_q5_11.png new file mode 100644 index 0000000000000000000000000000000000000000..243b4d3767e9a61e4e2b3067a23abe008275431b Binary files /dev/null and b/image/hw3_q5_11.png differ diff --git a/image/hw3_q5_2.png b/image/hw3_q5_2.png new file mode 100644 index 0000000000000000000000000000000000000000..a07f997ea79e0280eb60d65c69c604806f8b3bc6 Binary files /dev/null and b/image/hw3_q5_2.png differ diff --git a/image/hw3_q5_22.png b/image/hw3_q5_22.png new file mode 100644 index 0000000000000000000000000000000000000000..66349b05dd15d78def5ea2acf4e041fdfb7c4fdd Binary files /dev/null and b/image/hw3_q5_22.png differ