-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathenvironment.py
More file actions
262 lines (209 loc) · 7.56 KB
/
environment.py
File metadata and controls
262 lines (209 loc) · 7.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
# Import modules
import gym
from copy import deepcopy
from env.PendulumEnv import PendulumEnv
from env.Acrobot import AcrobotEnv
import env.MinAtar as MinAtar
import numpy as np
class Environment:
    """
    A wrapper around OpenAI Gym environments that supports logging and
    enforces a per-episode limit on the number of time steps.
    """

    def __init__(self, config, seed, monitor=False, monitor_after=0):
        """
        Constructor

        Parameters
        ----------
        config : dict
            The environment configuration file
        seed : int
            The seed to use for all random number generators
        monitor : bool
            Whether or not to render the scenes as the agent learns, by
            default False
        monitor_after : int
            If monitor is True, how many timesteps should pass before
            the scene is rendered, by default 0.
        """
        # Reward-overwriting and fixed-start-state configuration
        self.overwrite_rewards = config["overwrite_rewards"]
        self.rewards = config["rewards"]
        self.start_state = np.array(config["start_state"])

        # Step and episode bookkeeping
        self.steps = 0
        self.episodes = 0

        # Rendering (monitoring) bookkeeping
        self.monitor = monitor
        self.steps_until_monitor = monitor_after

        self.env_name = config["env_name"]
        self.env = env_factory(config)
        print("Seeding environment:", seed)
        self.env.seed(seed=seed)

        self.steps_per_episode = config["steps_per_episode"]
        # Push the wrapped OpenAI gym environment's own timeout past ours
        # so that this wrapper always times out before the gym one does
        self.env._max_episode_steps = self.steps_per_episode + 10

        # Mirror the wrapped environment's info dict when it exposes one
        self.info = self.env.info if "info" in dir(self.env) else {}

    @property
    def action_space(self):
        """
        Gets the action space of the Gym environment

        Returns
        -------
        gym.spaces.Space
            The action space
        """
        return self.env.action_space

    @property
    def observation_space(self):
        """
        Gets the observation space of the Gym environment

        Returns
        -------
        gym.spaces.Space
            The observation space
        """
        return self.env.observation_space

    def seed(self, seed):
        """
        Seeds the environment with a random seed

        Parameters
        ----------
        seed : int
            The random seed to seed the environment with
        """
        self.env.seed(seed)

    def reset(self):
        """
        Resets the environment: zeroes the step counter, resets the
        wrapped environment, and increments the total episode count.

        Returns
        -------
        2-tuple of array_like, dict
            The new starting state and an info dictionary
        """
        self.steps = 0
        self.episodes += 1

        state = self.env.reset()
        # When the user configured a fixed start state, force it onto the
        # wrapped environment instead of the sampled one
        if self.start_state.shape[0] != 0:
            state = self.start_state
            self.env.state = state

        return state, {"orig_state": state}

    def render(self):
        """
        Renders the current frame
        """
        self.env.render()

    def step(self, action):
        """
        Takes a single environmental step

        Parameters
        ----------
        action : array_like of float
            The action array. The number of elements in this array should be
            the same as the action dimension.

        Returns
        -------
        float, array_like of float, bool, dict
            The reward and next state as well as a flag specifying if the
            current episode has been completed and an info dictionary
        """
        # Only start rendering once the monitor countdown has elapsed
        if self.monitor and self.steps_until_monitor < 0:
            self.render()

        self.steps += 1
        if self.steps_until_monitor >= 0:
            self.steps_until_monitor -= 1

        # Take the underlying environmental step
        next_state, reward, done, info = self.env.step(action)
        info["orig_state"] = next_state

        # On natural termination, optionally substitute the goal reward
        if done:
            info["steps_exceeded"] = False
            if self.overwrite_rewards:
                reward = self.rewards["goal"]
            return next_state, reward, done, info

        # Otherwise optionally substitute the per-timestep reward
        if self.overwrite_rewards:
            reward = self.rewards["timestep"]

        # Enforce the wrapper's own episode time-step limit
        if self.steps >= self.steps_per_episode > 0:
            done = True
            info["steps_exceeded"] = True

        return next_state, reward, done, info
def env_factory(config):
    """
    Instantiates and returns an environment given an environment name.

    Parameters
    ----------
    config : dict
        The environment config. Must contain "env_name" and "seed";
        individual environments read additional keys such as
        "continuous", "rows"/"cols", or "use_minimal_action_set".

    Returns
    -------
    gym.Env
        The environment to train on

    Raises
    ------
    ValueError
        If a MinAtar environment is specified as "MinAtar/<game>"
        instead of "MinAtar<game>"
    """
    name = config["env_name"]
    seed = config["seed"]
    env = None

    # NOTE(review): several branches below reference names (pp, ppp, pens,
    # pensp, mcs, Bimodal, Bandit, ContinuousCartPoleEnv, the gridworld
    # environments, PuddleWorldEnv, CGW, ContinuousGridWorld) that are not
    # imported at the top of this file -- confirm the imports exist
    # elsewhere before selecting those environments.
    if name == "Pendulum-v0":
        env = PendulumEnv(seed=seed, continuous_action=config["continuous"])
    elif name == "PendulumPenalty-v0":
        env = pp.PendulumEnv(seed=seed, continuous_action=config["continuous"])
    elif name == "PositivePendulumPenalty-v0":
        env = ppp.PendulumEnv(seed=seed,
                              continuous_action=config["continuous"])
    elif name == "PendulumNoShaped-v0":
        env = pens.PendulumEnv(seed=seed,
                               continuous_action=config["continuous"])
    elif name == "PendulumNoShapedPenalty-v0":
        env = pensp.PendulumEnv(seed=seed,
                                continuous_action=config["continuous"])
    elif name == "MountainCarShaped":
        env = mcs.MountainCar()
    elif name == "Bimodal":
        # Bug fix: the original condition was the redundant
        # `name == "Bimodal" or name == "Bimodal"`; collapsed to one test.
        reward_variance = config.get("reward_variance", True)
        env = Bimodal(seed, reward_variance)
    elif name == "Bandit":
        # NOTE(review): the config key is "n_action" (singular) -- confirm
        # this matches the config files, since "n_actions" would silently
        # fall back to the default of 10.
        n_actions = config.get("n_action", 10)
        env = Bandit(seed, n_actions)
    elif name == "ContinuousCartpole-v0":
        env = ContinuousCartPoleEnv()
    elif name == "IndexGridworld":
        env = IndexGridworldEnv(config["rows"], config["cols"])
        env.seed(seed)
    elif name == "XYGridworld":
        env = XYGridworldEnv(config["rows"], config["cols"])
        env.seed(seed)
    elif name == "Gridworld":
        env = GridworldEnv(config["rows"], config["cols"])
        env.seed(seed)
    elif name == "PuddleWorld-v1":
        env = PuddleWorldEnv(continuous=config["continuous"], seed=seed)
    elif name == "Acrobot-v1":
        env = AcrobotEnv(seed=seed, continuous_action=config["continuous"])
    elif name == "CGW":
        env = CGW.GridWorld()
    elif name == "ContinuousGridWorld":
        env = ContinuousGridWorld.GridWorld()
    elif "minatar" in name.lower():
        if "/" in name:
            # Bug fix: the second literal was missing its f-prefix, so
            # "{name}" was emitted verbatim instead of being interpolated.
            raise ValueError(f"specify environment as MinAtar{name} rather "
                             f"than MinAtar/{name}")
        minimal_actions = config.get("use_minimal_action_set", True)
        stripped_name = name[7:].lower()  # Strip off the "MinAtar" prefix
        env = MinAtar.GymEnv(
            stripped_name,
            use_minimal_action_set=minimal_actions,
        )
    else:
        # Ensure we use the base gym environment. `gym.make` returns a
        # TimeLimit wrapper, but we want the underlying environment alone.
        env = gym.make(name).env
        env.seed(seed)

    print(config)
    return env