tictactoe.py 5.17 KB
Newer Older
1 2
# -*- coding: utf-8 -*-

Steven Cordwell's avatar
Steven Cordwell committed
3 4
import numpy as np
from scipy.sparse import dok_matrix as spdok
5

Steven Cordwell's avatar
Steven Cordwell committed
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
from mdptoolbox import mdp

ACTIONS = 9
STATES = 3**ACTIONS
PLAYER = 1
OPPONENT = 2
WINS = ([1, 1, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 1, 1],
        [1, 0, 0, 1, 0, 0, 1, 0, 0],
        [0, 1, 0, 0, 1, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 1, 0, 0, 1],
        [1, 0, 0, 0, 1, 0, 0, 0, 1],
        [0, 0, 1, 0, 1, 0, 1, 0, 0])

# The valid number of cells belonging to either the player or the opponent:
# (player, opponent)
OWNED_CELLS = ((0, 0),
               (1, 1),
               (2, 2),
               (3, 3),
               (4, 4),
               (0, 1),
               (1, 2),
               (2, 3),
               (3, 4))

def convertIndexToTuple(state):
    """"""
    return(tuple(int(x) for x in np.base_repr(state, 3, 9)[-9::]))

def convertTupleToIndex(state):
    """"""
    return(int("".join(str(x) for x in state), 3))

def getLegalActions(state):
    """"""
    return(tuple(x for x in range(ACTIONS) if state[x] == 0))

def getTransitionAndRewardArrays():
    """"""
    P = [spdok((STATES, STATES)) for a in range(ACTIONS)]
    R = spdok((STATES, ACTIONS))
    # Naive approach, iterate through all possible combinations
    for a in range(ACTIONS):
        for s in range(STATES):
            state = convertIndexToTuple(s)
            if not isValid(state):
                # There are no defined moves from an invalid state, so
                # transition probabilities cannot be calculated. However,
                # P must be a square stochastic matrix, so assign a
                # probability of one to the invalid state transitioning
                # back to itself.
                P[a][s, s] = 1
                # Reward is 0
            else:
                s1, p, r = getTransitionProbabilities(state, a)
                P[a][s, s1] = p
                R[s, a] = r
        P[a] = P[a].tocsr()
    R = R.tocsc()
    return(P, R)

def getTransitionProbabilities(state, action):
    """
    Parameters
    ----------
    state : tuple
        The state
    action : int
        The action
Steven Cordwell's avatar
Steven Cordwell committed
77
    
Steven Cordwell's avatar
Steven Cordwell committed
78 79 80 81
    Returns
    -------
    s1, p, r : tuple of two lists and an int
        s1 are the next states, p are the probabilities, and r is the reward
Steven Cordwell's avatar
Steven Cordwell committed
82
    
Steven Cordwell's avatar
Steven Cordwell committed
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
    """
    #assert isValid(state)
    assert 0 <= action < ACTIONS
    if not isLegal(state, action):
        # If the action is illegal, then transition back to the same state but
        # incur a high negative reward
        s1 = [convertTupleToIndex(state)]
        return(s1, [1], -10)
    # Update the state with the action
    state = list(state)
    state[action] = PLAYER
    if isWon(state, PLAYER):
        # If the player's action is a winning move then transition to the
        # winning state and receive a reward of 1.
        s1 = [convertTupleToIndex(state)]
        return(s1, [1], 1)
    elif isDraw(state):
        s1 = [convertTupleToIndex(state)]
        return(s1, [1], 0)
    # Now we search through the opponents moves, and calculate transition
    # probabilities based on maximising the opponents chance of winning..
    s1 = []
    p = []
    legal_a = getLegalActions(state)
    for a in legal_a:
        state[a] = OPPONENT
        # If the opponent is going to win, we assume that the winning move will
        # be chosen:
        if isWon(state, OPPONENT):
            s1 = [convertTupleToIndex(state)]
            return(s1, [1], -1)
        elif isDraw(state):
            s1 = [convertTupleToIndex(state)]
            return(s1, [1], 0)
        # Otherwise we assume the opponent will select a move with uniform
        # probability across potential moves:
        s1.append(convertTupleToIndex(state))
        p.append(1.0 / len(legal_a))
        state[a] = 0
    # During non-terminal play states the reward is 0.
    return(s1, p, 0)

def getReward(state, action):
    """"""
    if not isLegal(state, action):
        return -100
    state = list(state)
    state[action] = PLAYER
    if isWon(state, PLAYER):
        return 1
    elif isWon(state, OPPONENT):
        return -1
Steven Cordwell's avatar
Steven Cordwell committed
135
    else:
Steven Cordwell's avatar
Steven Cordwell committed
136
        return 0
Steven Cordwell's avatar
Steven Cordwell committed
137

Steven Cordwell's avatar
Steven Cordwell committed
138 139 140 141 142 143 144
def isDraw(state):
    """"""
    try:
        state.index(0)
        return False
    except ValueError:
        return True
Steven Cordwell's avatar
Steven Cordwell committed
145

Steven Cordwell's avatar
Steven Cordwell committed
146
def isLegal(state, action):
147
    """"""
Steven Cordwell's avatar
Steven Cordwell committed
148 149 150
    if state[action] == 0:
        return True
    else:
151
        return False
Steven Cordwell's avatar
Steven Cordwell committed
152 153 154

def isWon(state, who):
    """Test if a tic-tac-toe game has been won.
155
    
Steven Cordwell's avatar
Steven Cordwell committed
156 157
    Assumes that the board is in a legal state.
    Will test if the value 1 is in any winning combination.
158
    
Steven Cordwell's avatar
Steven Cordwell committed
159 160 161 162 163 164
    """
    for w in WINS:
        S = sum(1 if (w[k] == state[k] == who) else 0
                for k in range(ACTIONS))
        if S == 3:
            # We have a win
Steven Cordwell's avatar
Steven Cordwell committed
165
            return True
Steven Cordwell's avatar
Steven Cordwell committed
166 167 168 169 170 171 172 173 174 175 176 177 178
    # There were no wins so return False
    return False

def isValid(state):
    """"""
    # S1 is the sum of the player's cells
    S1 = sum(1 if x == PLAYER else 0 for x in state)
    # S2 is the sum of the opponent's cells
    S2 = sum(1 if x == OPPONENT else 0 for x in state)
    if (S1, S2) in OWNED_CELLS:
        return True
    else:
        return False
179 180

if __name__ == "__main__":
Steven Cordwell's avatar
Steven Cordwell committed
181 182 183
    P, R = getTransitionAndRewardArrays()
    ttt = mdp.ValueIteration(P, R, 1)
    print(ttt.policy)