function [Q, sumQ] = OffpolicyQlearning150816(qldata3, gamma, alpha, numtraces)
% OFF POLICY Q LEARNING
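%
% Inputs (the column layout of qldata3 is inferred from the indexing in the
% body below, not from a formal spec):
%   qldata3   - one row per time step; column 1 appears to be a within-episode
%               step counter (==1 marks the first row of an episode), column 2
%               a state index, column 3 an action index, column 4 a reward
%   gamma     - discount factor used when accumulating returns
%   alpha     - learning rate of the Q-table update
%   numtraces - maximum number of sampled episodes (iterations)
% Outputs:
%   Q         - learned state-action value table (states x actions)
%   sumQ      - sum(Q(:)) recorded after each iteration, trimmed to the
%               iterations actually run
%
% Example call (hypothetical parameter values):
%   [Q, sumQ] = OffpolicyQlearning150816(qldata3, 0.99, 0.1, 300000);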
% initialisation of variables
sumQ=zeros(numtraces,1);  % records sum(Q(:)) after each iteration
nact=numel(unique(qldata3(:,3)))-1;  % number of actions (minus one, presumably excluding a non-action marker value in column 3)
ncl=numel(unique(qldata3(:,2)));  % number of distinct states
Q=zeros(ncl, nact);  % Q-table, states x actions
maxavgQ=1;  % reference value for the convergence test below
modu=100;
listi=find(qldata3(:,1)==1);  % row index of the first step of each episode in the dataset
nrepi=numel(listi);  % number of episodes in the dataset
jj=1;  % write index into sumQ
for j=1:numtraces
    i=listi(floor(rand()*(nrepi-2))+1);  % pick one episode at random (the final episodes are never selected, keeping the i+1 look-ahead below within bounds)
    trace = [];
    while qldata3(i+1,1)~=1  % read the following rows until the next episode starts
        S1=qldata3(i+1,2);  % state
        a1=qldata3(i+1,3);  % action
        r1=qldata3(i+1,4);  % reward
        step = [ r1, S1, a1 ];
        trace = [trace ; step];
        i=i+1;
    end
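    % trace now holds one [reward, state, action] row per visited time step of
    % the sampled episode (reading starts at row i+1, so the episode's first
    % row itself never enters the trace).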
    tracelength = size(trace,1);
    return_t = trace(tracelength,1);  % last reward = return for the penultimate state-action pair
    for t=tracelength-1:-1:1  % step through the time steps in reverse order
        s = trace(t,2);  % state index at time t
        a = trace(t,3);  % action index at time t
        Q(s,a) = (1-alpha)*Q(s,a) + alpha*return_t;  % move Q(s,a) towards the observed discounted return
        return_t = return_t*gamma + trace(t,1);  % return at time t-1 = reward at t + gamma * return at t
    end
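    % The backward pass above is equivalent to Q(s,a) <- Q(s,a) + alpha*(G_t - Q(s,a)),
    % where G_t = r_{t+1} + gamma*r_{t+2} + gamma^2*r_{t+3} + ... is the discounted
    % return observed from time t onward in the sampled trace.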
    sumQ(jj,1)=sum(sum(Q));  % record the total Q mass after this iteration
    jj=jj+1;
    if mod(j,500*modu)==0  % every 50,000 iterations, check whether learning has stopped improving
        s=mean(sumQ(j-49999:j));  % mean of sum(Q(:)) over the last 50,000 iterations
        d=(s-maxavgQ)/maxavgQ;  % relative change with respect to the previous checkpoint
        if abs(d)<0.001  % less than 0.1% relative change: consider it converged
            break  % exit routine
        end
        maxavgQ=s;
    end
end
sumQ(jj:end)=[];  % trim the unused tail of sumQ if the loop exited early
end