sarsa.py
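"""Semi-gradient TD control with linear function approximation on MountainCar.

Observations are normalized and mapped through RBF features; an epsilon-greedy
policy acts on a per-action linear value function, and the proxy reward from
the RobustRewardEnv wrapper is tracked alongside its 'performance' statistic.
"""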
# heavily based on mountain-car-SARSA-AC
import numpy as np
import gym
import matplotlib.pyplot as plt
from sklearn.kernel_approximation import RBFSampler
import sklearn.pipeline
import sklearn.preprocessing
from wrappers import RobustRewardEnv
# Normalize the raw state and turn it into a feature vector
def featurize_state(state, scaler, featurizer):
    # Scale, then transform into RBF feature space
    scaled = scaler.transform([state])
    featurized = featurizer.transform(scaled)
    return featurized
# Linear action-value estimate: Q(s, a) = phi(s) . w_a
def Q(state, action, weight):
    value = state.dot(weight[action])
    return value
# Epsilon-greedy policy
def policy(state, weight, nA, epsilon=0.1):
    A = np.ones(nA, dtype=float) * epsilon / nA
    best_action = np.argmax([Q(state, a, weight) for a in range(nA)])
    A[best_action] += (1.0 - epsilon)
    sample = np.random.choice(nA, p=A)
    return sample
def train(num_episodes, discount_factor=.99, alpha=.01):
    env = RobustRewardEnv('MountainCar-v0')
    nA = env.action_space.n

    # Parameter vector: one weight vector per action, sized to the featurizer output (4 x 100 = 400 features)
    weight = np.zeros((nA, 400))

    # Per-episode statistics for plotting
    ep_rewards = np.zeros(num_episodes)
    ep_performances = np.zeros(num_episodes)

    # Get statistics over observation-space samples for normalization
    observation_examples = np.array([env.observation_space.sample() for x in range(10000)])
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)

    # Radial basis function samplers convert states to features for nonlinear function approximation
    featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
    ])
    # Fit the featurizer to our scaled inputs
    featurizer.fit(scaler.transform(observation_examples))

    # Main training loop
    for e in range(num_episodes):
        state = env.reset()
        state = featurize_state(state, scaler, featurizer)

        while True:
            # env.render()

            # Sample an action from our policy
            action = policy(state, weight, nA)

            # Step the environment and featurize the next state
            next_state, reward, done, info = env.step(action)
            next_state = featurize_state(next_state, scaler, featurizer)

            # Action the policy would take in the next state (not used by the greedy target below)
            next_action = policy(next_state, weight, nA)

            # Statistics for plotting
            ep_rewards[e] += reward
            ep_performances[e] += info['performance']

            # TD target: greedy (Q-learning style) backup; the on-policy SARSA target is kept for reference
            # target = reward + discount_factor * Q(next_state, next_action, weight)
            target = reward + discount_factor * max([next_state.dot(weight[a]) for a in range(nA)])
            td_error = Q(state, action, weight) - target

            # Gradient of the squared TD error w.r.t. weight[action]
            # (a finite-difference check of this gradient is sketched after this function)
            dw = (td_error).dot(state)

            # Update the weights for the taken action
            weight[action] -= alpha * dw

            if done:
                break

            # Move to the next state
            state = next_state

    env.close()
    return ep_rewards, ep_performances
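# --- Optional gradient check (a minimal sketch, not part of the training loop) ---
# The update in train() uses dL/dw[a] = (Q(s, a, w) - target) * phi(s) for the
# loss L = 0.5 * (Q(s, a, w) - target)^2 with the target held constant. This
# helper compares that analytic gradient against central finite differences on
# random data; the helper name, shapes, and tolerance are illustrative choices.
def check_gradient(eps=1e-6, tol=1e-4):
    rng = np.random.RandomState(0)
    state = rng.randn(1, 400)           # stand-in for a featurized state
    weight = rng.randn(3, 400)          # MountainCar-v0 has 3 actions
    action, target = 1, 0.5             # arbitrary fixed action and target
    td_error = Q(state, action, weight) - target
    analytic = (td_error).dot(state)    # same expression as in train()
    numeric = np.zeros(400)
    for i in range(400):
        w_plus, w_minus = weight.copy(), weight.copy()
        w_plus[action, i] += eps
        w_minus[action, i] -= eps
        q_plus = Q(state, action, w_plus)[0]
        q_minus = Q(state, action, w_minus)[0]
        numeric[i] = (0.5 * (q_plus - target) ** 2 - 0.5 * (q_minus - target) ** 2) / (2 * eps)
    return np.max(np.abs(analytic - numeric)) < tol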
if __name__ == '__main__':
    num_episodes = 200
    ep_rewards, ep_performances = train(num_episodes=num_episodes)
    print('average proxy rew: {}'.format(ep_rewards[-100:].mean()))
    print('average true rew: {}'.format(ep_performances[-100:].mean()))

    # Plot the proxy reward and true performance (red) over all episodes
    plt.figure()
    plt.plot(np.arange(num_episodes), ep_rewards)
    plt.plot(np.arange(num_episodes), ep_performances, c='r')
    plt.show()