tictactoe.py 5.21 KB
Newer Older
1 2
# -*- coding: utf-8 -*-

Steven Cordwell's avatar
Steven Cordwell committed
3
import numpy as np
4
from scipy.sparse import dok_matrix
5

Steven Cordwell's avatar
Steven Cordwell committed
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
from mdptoolbox import mdp

ACTIONS = 9
STATES = 3**ACTIONS
PLAYER = 1
OPPONENT = 2
WINS = ([1, 1, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 1, 1],
        [1, 0, 0, 1, 0, 0, 1, 0, 0],
        [0, 1, 0, 0, 1, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 1, 0, 0, 1],
        [1, 0, 0, 0, 1, 0, 0, 0, 1],
        [0, 0, 1, 0, 1, 0, 1, 0, 0])

# The valid number of cells belonging to either the player or the opponent:
# (player, opponent)
OWNED_CELLS = ((0, 0),
               (1, 1),
               (2, 2),
               (3, 3),
               (4, 4),
               (0, 1),
               (1, 2),
               (2, 3),
               (3, 4))

def convertIndexToTuple(state):
    """"""
    return(tuple(int(x) for x in np.base_repr(state, 3, 9)[-9::]))

def convertTupleToIndex(state):
    """"""
    return(int("".join(str(x) for x in state), 3))

def getLegalActions(state):
    """"""
    return(tuple(x for x in range(ACTIONS) if state[x] == 0))

def getTransitionAndRewardArrays():
    """"""
47 48 49
    P = [dok_matrix((STATES, STATES)) for a in range(ACTIONS)]
    #R = spdok((STATES, ACTIONS))
    R = np.zeros((STATES, ACTIONS))
Steven Cordwell's avatar
Steven Cordwell committed
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
    # Naive approach, iterate through all possible combinations
    for a in range(ACTIONS):
        for s in range(STATES):
            state = convertIndexToTuple(s)
            if not isValid(state):
                # There are no defined moves from an invalid state, so
                # transition probabilities cannot be calculated. However,
                # P must be a square stochastic matrix, so assign a
                # probability of one to the invalid state transitioning
                # back to itself.
                P[a][s, s] = 1
                # Reward is 0
            else:
                s1, p, r = getTransitionProbabilities(state, a)
                P[a][s, s1] = p
                R[s, a] = r
        P[a] = P[a].tocsr()
67
    #R = R.tolil()
Steven Cordwell's avatar
Steven Cordwell committed
68 69 70 71 72 73 74 75 76 77
    return(P, R)

def getTransitionProbabilities(state, action):
    """
    Parameters
    ----------
    state : tuple
        The state
    action : int
        The action
Steven Cordwell's avatar
Steven Cordwell committed
78
    
Steven Cordwell's avatar
Steven Cordwell committed
79 80 81 82
    Returns
    -------
    s1, p, r : tuple of two lists and an int
        s1 are the next states, p are the probabilities, and r is the reward
Steven Cordwell's avatar
Steven Cordwell committed
83
    
Steven Cordwell's avatar
Steven Cordwell committed
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
    """
    #assert isValid(state)
    assert 0 <= action < ACTIONS
    if not isLegal(state, action):
        # If the action is illegal, then transition back to the same state but
        # incur a high negative reward
        s1 = [convertTupleToIndex(state)]
        return(s1, [1], -10)
    # Update the state with the action
    state = list(state)
    state[action] = PLAYER
    if isWon(state, PLAYER):
        # If the player's action is a winning move then transition to the
        # winning state and receive a reward of 1.
        s1 = [convertTupleToIndex(state)]
        return(s1, [1], 1)
    elif isDraw(state):
        s1 = [convertTupleToIndex(state)]
        return(s1, [1], 0)
    # Now we search through the opponents moves, and calculate transition
    # probabilities based on maximising the opponents chance of winning..
    s1 = []
    p = []
    legal_a = getLegalActions(state)
    for a in legal_a:
        state[a] = OPPONENT
        # If the opponent is going to win, we assume that the winning move will
        # be chosen:
        if isWon(state, OPPONENT):
            s1 = [convertTupleToIndex(state)]
            return(s1, [1], -1)
        elif isDraw(state):
            s1 = [convertTupleToIndex(state)]
            return(s1, [1], 0)
        # Otherwise we assume the opponent will select a move with uniform
        # probability across potential moves:
        s1.append(convertTupleToIndex(state))
        p.append(1.0 / len(legal_a))
        state[a] = 0
    # During non-terminal play states the reward is 0.
    return(s1, p, 0)

def getReward(state, action):
    """"""
    if not isLegal(state, action):
        return -100
    state = list(state)
    state[action] = PLAYER
    if isWon(state, PLAYER):
        return 1
    elif isWon(state, OPPONENT):
        return -1
Steven Cordwell's avatar
Steven Cordwell committed
136
    else:
Steven Cordwell's avatar
Steven Cordwell committed
137
        return 0
Steven Cordwell's avatar
Steven Cordwell committed
138

Steven Cordwell's avatar
Steven Cordwell committed
139 140 141 142 143 144 145
def isDraw(state):
    """"""
    try:
        state.index(0)
        return False
    except ValueError:
        return True
Steven Cordwell's avatar
Steven Cordwell committed
146

Steven Cordwell's avatar
Steven Cordwell committed
147
def isLegal(state, action):
148
    """"""
Steven Cordwell's avatar
Steven Cordwell committed
149 150 151
    if state[action] == 0:
        return True
    else:
152
        return False
Steven Cordwell's avatar
Steven Cordwell committed
153 154 155

def isWon(state, who):
    """Test if a tic-tac-toe game has been won.
156
    
Steven Cordwell's avatar
Steven Cordwell committed
157 158
    Assumes that the board is in a legal state.
    Will test if the value 1 is in any winning combination.
159
    
Steven Cordwell's avatar
Steven Cordwell committed
160 161 162 163 164 165
    """
    for w in WINS:
        S = sum(1 if (w[k] == state[k] == who) else 0
                for k in range(ACTIONS))
        if S == 3:
            # We have a win
Steven Cordwell's avatar
Steven Cordwell committed
166
            return True
Steven Cordwell's avatar
Steven Cordwell committed
167 168 169 170 171 172 173 174 175 176 177 178 179
    # There were no wins so return False
    return False

def isValid(state):
    """"""
    # S1 is the sum of the player's cells
    S1 = sum(1 if x == PLAYER else 0 for x in state)
    # S2 is the sum of the opponent's cells
    S2 = sum(1 if x == OPPONENT else 0 for x in state)
    if (S1, S2) in OWNED_CELLS:
        return True
    else:
        return False
180 181

if __name__ == "__main__":
Steven Cordwell's avatar
Steven Cordwell committed
182 183 184
    P, R = getTransitionAndRewardArrays()
    ttt = mdp.ValueIteration(P, R, 1)
    print(ttt.policy)