-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathenvironment.py
More file actions
262 lines (209 loc) · 7.56 KB
/
environment.py
File metadata and controls
262 lines (209 loc) · 7.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
# Import modules
import gym
from copy import deepcopy
from env.PendulumEnv import PendulumEnv
from env.Acrobot import AcrobotEnv
import env.MinAtar as MinAtar
import numpy as np
class Environment:
    """
    A wrapper around OpenAI Gym environments that supports logging and
    enforces a per-episode limit on the number of time steps.
    """

    def __init__(self, config, seed, monitor=False, monitor_after=0):
        """
        Constructor

        Parameters
        ----------
        config : dict
            The environment configuration file
        seed : int
            The seed to use for all random number generators
        monitor : bool
            Whether or not to render the scenes as the agent learns, by
            default False
        monitor_after : int
            If monitor is True, how many timesteps should pass before
            the scene is rendered, by default 0.
        """
        # Reward-overwriting and fixed-start-state configuration
        self.overwrite_rewards = config["overwrite_rewards"]
        self.rewards = config["rewards"]
        self.start_state = np.array(config["start_state"])

        # Step and episode bookkeeping
        self.steps = 0
        self.episodes = 0

        # Rendering (monitoring) bookkeeping
        self.monitor = monitor
        self.steps_until_monitor = monitor_after

        self.env_name = config["env_name"]
        self.env = env_factory(config)
        print("Seeding environment:", seed)
        self.env.seed(seed=seed)

        self.steps_per_episode = config["steps_per_episode"]
        # Push the wrapped OpenAI gym environment's own timeout past ours
        # so that this wrapper always times out before the gym one does
        self.env._max_episode_steps = self.steps_per_episode + 10

        # Mirror the wrapped environment's info dict when it exposes one
        self.info = self.env.info if "info" in dir(self.env) else {}

    @property
    def action_space(self):
        """
        Gets the action space of the Gym environment

        Returns
        -------
        gym.spaces.Space
            The action space
        """
        return self.env.action_space

    @property
    def observation_space(self):
        """
        Gets the observation space of the Gym environment

        Returns
        -------
        gym.spaces.Space
            The observation space
        """
        return self.env.observation_space

    def seed(self, seed):
        """
        Seeds the environment with a random seed

        Parameters
        ----------
        seed : int
            The random seed to seed the environment with
        """
        self.env.seed(seed)

    def reset(self):
        """
        Resets the environment: zeroes the step counter, resets the
        wrapped environment, and increments the total episode count.

        Returns
        -------
        2-tuple of array_like, dict
            The new starting state and an info dictionary
        """
        self.steps = 0
        self.episodes += 1

        state = self.env.reset()
        # When the user configured a fixed start state, force it onto the
        # wrapped environment instead of the sampled one
        if self.start_state.shape[0] != 0:
            state = self.start_state
            self.env.state = state

        return state, {"orig_state": state}

    def render(self):
        """
        Renders the current frame
        """
        self.env.render()

    def step(self, action):
        """
        Takes a single environmental step

        Parameters
        ----------
        action : array_like of float
            The action array. The number of elements in this array should be
            the same as the action dimension.

        Returns
        -------
        float, array_like of float, bool, dict
            The reward and next state as well as a flag specifying if the
            current episode has been completed and an info dictionary
        """
        # Only start rendering once the monitor countdown has elapsed
        if self.monitor and self.steps_until_monitor < 0:
            self.render()

        self.steps += 1
        if self.steps_until_monitor >= 0:
            self.steps_until_monitor -= 1

        # Take the underlying environmental step
        next_state, reward, done, info = self.env.step(action)
        info["orig_state"] = next_state

        # On natural termination, optionally substitute the goal reward
        if done:
            info["steps_exceeded"] = False
            if self.overwrite_rewards:
                reward = self.rewards["goal"]
            return next_state, reward, done, info

        # Otherwise optionally substitute the per-timestep reward
        if self.overwrite_rewards:
            reward = self.rewards["timestep"]

        # Enforce the wrapper's own episode time-step limit
        if self.steps >= self.steps_per_episode > 0:
            done = True
            info["steps_exceeded"] = True

        return next_state, reward, done, info
def env_factory(config):
    """
    Instantiates and returns an environment given an environment name.

    Parameters
    ----------
    config : dict
        The environment config. Must contain "env_name" and "seed";
        individual environments read additional keys such as
        "continuous", "rows"/"cols", or "use_minimal_action_set".

    Returns
    -------
    gym.Env
        The environment to train on

    Raises
    ------
    ValueError
        If a MinAtar environment is specified as "MinAtar/<game>"
        instead of "MinAtar<game>"
    """
    name = config["env_name"]
    seed = config["seed"]
    env = None

    # NOTE(review): several branches below reference names (pp, ppp, pens,
    # pensp, mcs, Bimodal, Bandit, ContinuousCartPoleEnv, the gridworld
    # environments, PuddleWorldEnv, CGW, ContinuousGridWorld) that are not
    # imported at the top of this file -- confirm the imports exist
    # elsewhere before selecting those environments.
    if name == "Pendulum-v0":
        env = PendulumEnv(seed=seed, continuous_action=config["continuous"])
    elif name == "PendulumPenalty-v0":
        env = pp.PendulumEnv(seed=seed, continuous_action=config["continuous"])
    elif name == "PositivePendulumPenalty-v0":
        env = ppp.PendulumEnv(seed=seed,
                              continuous_action=config["continuous"])
    elif name == "PendulumNoShaped-v0":
        env = pens.PendulumEnv(seed=seed,
                               continuous_action=config["continuous"])
    elif name == "PendulumNoShapedPenalty-v0":
        env = pensp.PendulumEnv(seed=seed,
                                continuous_action=config["continuous"])
    elif name == "MountainCarShaped":
        env = mcs.MountainCar()
    elif name == "Bimodal":
        # Bug fix: the original condition was the redundant
        # `name == "Bimodal" or name == "Bimodal"`; collapsed to one test.
        reward_variance = config.get("reward_variance", True)
        env = Bimodal(seed, reward_variance)
    elif name == "Bandit":
        # NOTE(review): the config key is "n_action" (singular) -- confirm
        # this matches the config files, since "n_actions" would silently
        # fall back to the default of 10.
        n_actions = config.get("n_action", 10)
        env = Bandit(seed, n_actions)
    elif name == "ContinuousCartpole-v0":
        env = ContinuousCartPoleEnv()
    elif name == "IndexGridworld":
        env = IndexGridworldEnv(config["rows"], config["cols"])
        env.seed(seed)
    elif name == "XYGridworld":
        env = XYGridworldEnv(config["rows"], config["cols"])
        env.seed(seed)
    elif name == "Gridworld":
        env = GridworldEnv(config["rows"], config["cols"])
        env.seed(seed)
    elif name == "PuddleWorld-v1":
        env = PuddleWorldEnv(continuous=config["continuous"], seed=seed)
    elif name == "Acrobot-v1":
        env = AcrobotEnv(seed=seed, continuous_action=config["continuous"])
    elif name == "CGW":
        env = CGW.GridWorld()
    elif name == "ContinuousGridWorld":
        env = ContinuousGridWorld.GridWorld()
    elif "minatar" in name.lower():
        if "/" in name:
            # Bug fix: the second literal was missing its f-prefix, so
            # "{name}" was emitted verbatim instead of being interpolated.
            raise ValueError(f"specify environment as MinAtar{name} rather "
                             f"than MinAtar/{name}")
        minimal_actions = config.get("use_minimal_action_set", True)
        stripped_name = name[7:].lower()  # Strip off the "MinAtar" prefix
        env = MinAtar.GymEnv(
            stripped_name,
            use_minimal_action_set=minimal_actions,
        )
    else:
        # Ensure we use the base gym environment. `gym.make` returns a
        # TimeLimit wrapper, but we want the underlying environment alone.
        env = gym.make(name).env
        env.seed(seed)

    print(config)
    return env