Commit bf167e77 authored by Steven Cordwell

merge from master

parents ea7f86b8 6f70f434
# -*- coding: utf-8 -*-

#import mdp

def str_base(num, base, numerals='0123456789abcdefghijklmnopqrstuvwxyz'):
    if base < 2 or base > len(numerals):
        raise ValueError("str_base: base must be between 2 and %i" %
                         len(numerals))
    if num == 0:
        return '0'
    if num < 0:
        sign = '-'
        num = -num
    else:
        sign = ''
    result = ''
    while num:
        result = numerals[num % base] + result
        num //= base
    return sign + result
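
# Illustrative examples (not part of the original file): str_base(17, 3)
# returns '122' and str_base(-5, 2) returns '-101'; for non-negative numbers it
# is the inverse of int(s, base), which is used below to encode board states.
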
class TicTacToeMDP(object):
    """Model the game of TicTacToe (noughts and crosses) as an MDP."""

    def __init__(self):
        """Set up the transition and reward dictionaries and the board data."""
        self.P = {}
        for a in xrange(9):
            self.P[a] = {}
        self.R = {}
        # some board states are equal, just rotations of other states
        self.rotorder = []
        #self.rotorder.append([0, 1, 2, 3, 4, 5, 6, 7, 8])
        self.rotorder.append([6, 3, 0, 7, 4, 1, 8, 5, 2])
        self.rotorder.append([8, 7, 6, 5, 4, 3, 2, 1, 0])
        self.rotorder.append([2, 5, 8, 1, 4, 7, 0, 3, 6])
        # The valid number of cells belonging to either the player or the
        # opponent: (player, opponent)
        self.nXO = ((0, 0),
                    (1, 1),
                    (2, 2),
                    (3, 3),
                    (4, 4),
                    (0, 1),
                    (1, 2),
                    (2, 3),
                    (3, 4))
        # The winning positions: the three rows, the three columns and the
        # two diagonals of the board
        self.wins = ([1, 1, 1, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 1, 1, 1, 0, 0, 0],
                     [0, 0, 0, 0, 0, 0, 1, 1, 1],
                     [1, 0, 0, 1, 0, 0, 1, 0, 0],
                     [0, 1, 0, 0, 1, 0, 0, 1, 0],
                     [0, 0, 1, 0, 0, 1, 0, 0, 1],
                     [1, 0, 0, 0, 1, 0, 0, 0, 1],
                     [0, 0, 1, 0, 1, 0, 1, 0, 0])

    def rotate(self, state):
        #rotations = []
        identity = []
        #rotations.append(state)
        identity.append(int("".join(str(x) for x in state), 3))
        for k in range(3):
            #rotations.append(tuple(state[self.rotorder[k][kk]]
            #                       for kk in xrange(9)))
            # Convert the state from base 3 number to integer.
            #identity.append(int("".join(str(x) for x in rotations[k + 1]), 3))
            identity.append(int("".join(str(state[self.rotorder[k][kk]])
                                        for kk in xrange(9)), 3))
        # return the rotation with the smallest identity number
        #idx = identity.index(min(identity))
        #return (identity[idx], rotations[idx])
        return min(identity)

    def unrotate(self, move, rotation):
        rotation -= 1
        # return the move
        return self.rotorder[rotation][move]
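
    # Illustrative example (not part of the original file): a lone X in any
    # corner reduces to the same canonical identity, e.g. for an instance ttt
    #     ttt.rotate([1,0,0, 0,0,0, 0,0,0]) == ttt.rotate([0,0,1, 0,0,0, 0,0,0]) == 1
    # because one board is a rotation of the other.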

    def isLegal(self, state, action):
        """Return True if the cell chosen by action is empty."""
        if state[action] == 0:
            return True
        else:
            return False

    def isWon(self, state, who):
        """Return True if the player numbered `who` has a winning line."""
        # Check to see if there are any wins
        for w in self.wins:
            S = sum(1 if (w[k] == 1 and state[k] == who) else 0
                    for k in xrange(9))
            if S == 3:
                # We have a win
                return True
        # There were no wins so return False
        return False

    def isDraw(self, state):
        """Return True if the board has no empty cells left."""
        try:
            state.index(0)
            return False
        except ValueError:
            return True
        except:
            raise

    def isValid(self, state):
        """Return True if the counts of player and opponent cells are valid."""
        # S1 is the sum of the player's cells
        S1 = sum(1 if x == 1 else 0 for x in state)
        # S2 is the sum of the opponent's cells
        S2 = sum(1 if x == 2 else 0 for x in state)
        if (S1, S2) in self.nXO:
            return True
        else:
            return False

    def run(self):
        """Enumerate the valid board states and build the P and R mappings."""
        l = (0, 1, 2)
        # Iterate through a generator of all the combinations
        for s in ([a0, a1, a2, a3, a4, a5, a6, a7, a8] for a0 in l for a1 in l
                  for a2 in l for a3 in l for a4 in l for a5 in l
                  for a6 in l for a7 in l for a8 in l):
            if self.isValid(s):
                self.transition(s)
        # Convert P and R to ijv lists
        # Iterate through up to the theoretically maximum value of s
        for s in xrange(int('222211110', 3)):
            pass
        # The conversion to ijv lists is not implemented yet, so for now return
        # the raw dictionaries built by transition()
        return (self.P, self.R)

    def toTuple(self, state):
        """Convert a state identity number back to a 9-cell board tuple."""
        state = str_base(state, 3)
        state = ''.join('0' for x in range(9 - len(state))) + state
        return tuple(int(x) for x in state)

    def transition(self, state):
        """Add the transitions and rewards for every move from this state."""
        #TODO: the state needs to be rotated before anything else is done!!!
        idn_s = int("".join(str(x) for x in state), 3)
        legal_a = [x for x in xrange(9) if state[x] == 0]
        for a in legal_a:
            s = [x for x in state]
            s[a] = 1
            is_won = self.isWon(s, 1)
            legal_m = [x for x in xrange(9) if s[x] == 0]
            for m in legal_m:
                s_new = [x for x in s]
                s_new[m] = 2
                idn_s_new = self.rotate(s_new)
                if (idn_s, idn_s_new) not in self.P[a]:
                    self.P[a][(idn_s, idn_s_new)] = len(legal_m)
                if (idn_s, idn_s_new) not in self.R:
                    if is_won:
                        self.R[(idn_s, idn_s_new)] = 1
                    elif self.isWon(s_new, 2):
                        self.R[(idn_s, idn_s_new)] = -1
                    else:
                        self.R[(idn_s, idn_s_new)] = 0
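
# Sketch of the data layout built by transition() above (an interpretation,
# not part of the original file): self.P[a][(s, s')] stores len(legal_m), the
# number of legal opponent replies after the player marks cell a (presumably
# the denominator of each transition probability), and self.R[(s, s')] is +1
# for a player win, -1 for an opponent win and 0 otherwise. Converting these
# dictionaries to the ijv/sparse format used by the toolbox is still a TODO in
# run().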

if __name__ == "__main__":
    P, R = TicTacToeMDP().run()
    #ttt = mdp.ValueIteration(P, R, 1)
@@ -93,13 +93,14 @@ http://www.inra.fr/mia/T/MDPtoolbox/.
# POSSIBILITY OF SUCH DAMAGE.
from math import ceil, log, sqrt
from random import randint, random
from random import random
from time import time
from numpy import absolute, array, diag, empty, mean, mod, multiply
from numpy import ndarray, ones, zeros
from numpy.random import rand
from numpy import ndarray, ones, where, zeros
from numpy.random import rand, randint
from scipy.sparse import csr_matrix as sparse
from scipy.sparse import coo_matrix, dok_matrix
# __all__ = ["check", "checkSquareStochastic"]
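
# Note on the import change above: numpy.random.randint(low, high) samples from
# [low, high), excluding the upper bound, whereas the standard library's
# random.randint(a, b) includes b. This is why calls further down in this diff
# change from randint(0, N - 1) to randint(0, N).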
@@ -371,15 +372,15 @@ def checkSquareStochastic(Z):
    return(None)

def exampleForest(S=3, r1=4, r2=2, p=0.1):
def exampleForest(S=3, r1=4, r2=2, p=0.1, is_sparse=False):
    """Generate an MDP example based on a simple forest management scenario.

    This function is used to generate a transition probability (A×S×S) array P
    and a reward (S×A) matrix R that model the following problem.
    A forest is managed by two actions: 'Wait' and 'Cut'.
    An action is decided each year with first the objective to maintain an old
    forest for wildlife and second to make money selling cut wood.
    Each year there is a probability ``p`` that a fire burns the forest.
    This function is used to generate a transition probability
    (``A`` × ``S`` × ``S``) array ``P`` and a reward (``S`` × ``A``) matrix
    ``R`` that model the following problem. A forest is managed by two actions:
    'Wait' and 'Cut'. An action is decided each year with first the objective
    to maintain an old forest for wildlife and second to make money selling cut
    wood. Each year there is a probability ``p`` that a fire burns the forest.

    Here is how the problem is modelled.
    Let {1, 2 . . . ``S`` } be the states of the forest, with ``S`` being the
@@ -471,12 +472,23 @@ def exampleForest(S=3, r1=4, r2=2, p=0.1):
    # | . . . |                  | . . . |
    # | . . 1-p |                | . . . |
    # | p 0 0....0 1-p |         | 1 0..........0 |
    P = zeros((2, S, S))
    P[0, :, :] = (1 - p) * diag(ones(S - 1), 1)
    P[0, :, 0] = p
    P[0, S - 1, S - 1] = (1 - p)
    P[1, :, :] = zeros((S, S))
    P[1, :, 0] = 1
    if is_sparse:
        P = []
        rows = range(S) * 2
        cols = [0] * S + range(1, S) + [S - 1]
        vals = [p] * S + [1 - p] * S
        P.append(coo_matrix((vals, (rows, cols)), shape=(S, S)).tocsr())
        rows = range(S)
        cols = [0] * S
        vals = [1] * S
        P.append(coo_matrix((vals, (rows, cols)), shape=(S, S)).tocsr())
    else:
        P = zeros((2, S, S))
        P[0, :, :] = (1 - p) * diag(ones(S - 1), 1)
        P[0, :, 0] = p
        P[0, S - 1, S - 1] = (1 - p)
        P[1, :, :] = zeros((S, S))
        P[1, :, 0] = 1
    # Definition of Reward matrix R1 associated to action Wait and
    # R2 associated to action Cut
    # | 0 |                      | 0 |
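
# Usage sketch for exampleForest (assuming the module is importable as `mdp`;
# the shapes follow the docstring above):
#
#     import mdp
#     P, R = mdp.exampleForest(S=3, r1=4, r2=2, p=0.1)
#     assert P.shape == (2, 3, 3) and R.shape == (3, 2)
#     P_sp, _ = mdp.exampleForest(is_sparse=True)  # P_sp: a list of 2 CSR matrices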
@@ -528,10 +540,15 @@ def exampleRand(S, A, is_sparse=False, mask=None):
    # if the user hasn't specified a mask, then we will make a random one now
    if mask is None:
        mask = rand(A, S, S)
        for a in range(A):
        if is_sparse:
            # create a mask that has roughly two thirds of the cells set to 0
            #for a in range(A):
            mask[mask <= 2/3.0] = 0
            mask[mask > 2/3.0] = 1
        else:
            r = random()
            mask[a][mask[a, :, :] < r] = 0
            mask[a][mask[a, :, :] >= r] = 1
            mask[mask < r] = 0
            mask[mask >= r] = 1
    else:
        # the mask needs to be SxS or AxSxS
        try:
@@ -542,28 +559,40 @@ def exampleRand(S, A, is_sparse=False, mask=None):
    # generate the transition and reward matrices based on S, A and mask
    if is_sparse:
        # definition of transition matrix : square stochastic matrix
        P = []
        P = [None] * A
        # definition of reward matrix (values between -1 and +1)
        R = []
        R = [None] * A
        for a in xrange(A):
            if mask.ndim == 3:
                PP = mask[a, :, :] * rand(S, S)
                for s in range(S):
                    if mask[a, s, :].sum() == 0:
                        PP[s, randint(0, S - 1)] = 1
                    PP[s, :] = PP[s, :] / PP[s, :].sum()
                P.append(sparse(PP))
                R.append(sparse(mask[a] * (2*rand(S, S) - ones((S, S)))))
            if mask.shape == (A, S, S):
                m = mask[a]  # mask[a, :, :]
            else:
                PP = mask * rand(S, S)
                for s in range(S):
                    if mask[s, :].sum() == 0:
                        PP[s, randint(0, S - 1)] = 1
                    PP[s, :] = PP[s, :] / PP[s, :].sum()
                P.append(sparse(PP))
                R.append(sparse(mask * (2*rand(S, S) - ones((S, S)))))
        P = tuple(P)
        R = tuple(R)
                m = mask
            # it may be more efficient to implement this by constructing lists
            # of rows, columns and values then creating a coo_matrix, but this
            # works for now
            PP = dok_matrix((S, S))
            RR = dok_matrix((S, S))
            for s in xrange(S):
                n = int(m[s].sum())  # m[s, :]
                if n == 0:
                    PP[s, randint(0, S)] = 1
                else:
                    rows = s * ones(n, dtype=int)
                    cols = where(m[s])[0]  # m[s, :]
                    vals = rand(n)
                    vals = vals / vals.sum()
                    reward = 2*rand(n) - ones(n)
                    # I want to do this: PP[rows, cols] = vals, but it doesn't
                    # seem to work, as val[-1] is stored as the value for each
                    # row, column pair. Therefore the loop is needed.
                    for x in xrange(n):
                        PP[rows[x], cols[x]] = vals[x]
                        RR[rows[x], cols[x]] = reward[x]
            # PP.tocsr() takes the same amount of time as PP.tocoo().tocsr()
            # so constructing PP and RR as coo_matrix in the first place is
            # probably "better"
            P[a] = PP.tocsr()
            R[a] = RR.tocsr()
    else:
        # definition of transition matrix : square stochastic matrix
        P = zeros((A, S, S))
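
# A coo_matrix-based construction along the lines suggested by the comments
# above (an illustrative sketch, not part of the committed code; the helper
# name _sparse_random_rows is hypothetical). It collects the row, column and
# value lists first and converts once, instead of filling a dok_matrix element
# by element.
from numpy import ones, where
from numpy.random import rand, randint
from scipy.sparse import coo_matrix

def _sparse_random_rows(m, S):
    # m is an S x S 0/1 mask; returns (P, R) as CSR matrices where every row
    # of P sums to 1 and R holds values in (-1, 1) on the masked cells
    rows, cols, pvals, rvals = [], [], [], []
    for s in xrange(S):
        idx = where(m[s])[0]
        n = len(idx)
        if n == 0:
            # no cell is allowed in this row, so pick one state at random
            rows.append(s)
            cols.append(randint(0, S))
            pvals.append(1.0)
            rvals.append(0.0)
            continue
        v = rand(n)
        rows.extend([s] * n)
        cols.extend(idx.tolist())
        pvals.extend((v / v.sum()).tolist())
        rvals.extend((2 * rand(n) - ones(n)).tolist())
    P = coo_matrix((pvals, (rows, cols)), shape=(S, S)).tocsr()
    R = coo_matrix((rvals, (rows, cols)), shape=(S, S)).tocsr()
    return P, R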
@@ -574,7 +603,7 @@ def exampleRand(S, A, is_sparse=False, mask=None):
                P[a, :, :] = mask[a] * rand(S, S)
                for s in range(S):
                    if mask[a, s, :].sum() == 0:
                        P[a, s, randint(0, S - 1)] = 1
                        P[a, s, randint(0, S)] = 1
                    P[a, s, :] = P[a, s, :] / P[a, s, :].sum()
                R[a, :, :] = (mask[a, :, :] * (2*rand(S, S) -
                              ones((S, S), dtype=int)))
@@ -582,7 +611,7 @@ def exampleRand(S, A, is_sparse=False, mask=None):
                P[a, :, :] = mask * rand(S, S)
                for s in range(S):
                    if mask[a, s, :].sum() == 0:
                        P[a, s, randint(0, S - 1)] = 1
                        P[a, s, randint(0, S)] = 1
                    P[a, s, :] = P[a, s, :] / P[a, s, :].sum()
                R[a, :, :] = mask * (2*rand(S, S) - ones((S, S), dtype=int))
    # we want to return the generated transition and reward matrices
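
# Usage sketch for exampleRand (assuming the module is importable as `mdp`; the
# generated values are random):
#
#     import mdp
#     P, R = mdp.exampleRand(S=10, A=3, is_sparse=True)
#     # with is_sparse=True, P and R hold one S x S CSR matrix per action and
#     # every row of each P[a] sums to 1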
@@ -720,9 +749,9 @@ class MDP(object):
        except AttributeError:
            raise TypeError("bellman: V must be a numpy array or matrix.")

        Q = empty((self.S, self.A))
        Q = empty((self.A, self.S))
        for aa in range(self.A):
            Q[:, aa] = self.R[aa] + self.discount * self.P[aa].dot(V)
            Q[aa] = self.R[aa] + self.discount * self.P[aa].dot(V)

        # Which way is better?
        # 1. Return, (policy, value)
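        # For reference (an illustrative note, not part of the committed
        # code), each assignment above is the Bellman backup
        #     Q[aa, s] = R[aa][s] + discount * sum_{s'} P[aa][s, s'] * V[s'],
        # with the new layout storing one row per action (A x S) rather than
        # one column per action (S x A).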
@@ -1151,7 +1180,7 @@ class PolicyIteration(MDP):
        if (type(V0) in (int, float)) and (V0 == 0):
            policy_V = zeros(self.S)
        else:
            if (type(V0) in (ndarray, matrix)) and (V0.shape == (self.S, 1)):
            if (type(V0) in (ndarray,)) and (V0.shape == (self.S, 1)):
                policy_V = V0
            else:
                raise ValueError("PyMDPtoolbox: V0 vector/array type not "
@@ -1337,7 +1366,7 @@ class PolicyIterationModified(PolicyIteration):
        self.epsilon = epsilon

        if discount == 1:
            self.V = matrix(zeros((self.S, 1)))
            self.V = zeros((self.S, 1))
        else:
            # min(min()) is not right
            self.V = 1 / (1 - discount) * self.R.min() * ones((self.S, 1))
@@ -1491,13 +1520,13 @@ class QLearning(MDP):
        self.time = time()

        # initial state choice
        s = randint(0, self.S - 1)
        s = randint(0, self.S)

        for n in range(1, self.max_iter + 1):

            # Reinitialisation of trajectories every 100 transitions
            if ((n % 100) == 0):
                s = randint(0, self.S - 1)
                s = randint(0, self.S)

            # Action choice : greedy with increasing probability
            # probability 1-(1/log(n+2)) can be changed
@@ -1506,7 +1535,7 @@ class QLearning(MDP):
                # optimal_action = self.Q[s, :].max()
                a = self.Q[s, :].argmax()
            else:
                a = randint(0, self.A - 1)
                a = randint(0, self.A)

            # Simulating next state s_new and reward associated to <s,s_new,a>
            p_s_new = random()
@@ -1619,7 +1648,7 @@ class RelativeValueIteration(MDP):
        self.epsilon = epsilon
        self.discount = 1

        self.V = matrix(zeros((self.S, 1)))
        self.V = zeros((self.S, 1))
        self.gain = 0  # self.U[self.S]

        self.average_reward = None