Learn practical skills, build real-world projects, and advance your career
import gym
import random

# Seed Python's `random` module so the explore/exploit draws made with
# random.uniform() in the training loop are repeatable.
# NOTE(review): this does not obviously seed the environment itself or
# action_space.sample() — confirm if full reproducibility is required.
random.seed(1234)

# Environment IDs are versioned and new versions keep getting released;
# if "-v3" doesn't work, try "-v2" or "-v4".
# `.env` takes the environment held inside the object gym.make() returns
# (presumably to drop the episode step limit — confirm for your gym version).
streets = gym.make("Taxi-v3").env
streets.render()  # Print the current grid.
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
# Build one specific, known state to inspect rather than a random one.
# Per the Gym Taxi documentation the arguments are
# (taxi_row, taxi_col, passenger_location, destination): here the taxi is at
# row 2, column 3, with passenger location 2 and destination 0
# (presumably the "Y" and "R" squares — verify against the Taxi docs).
initial_state = streets.encode(2, 3, 2, 0)

# Force the environment into that state by assigning its state attribute.
streets.s = initial_state

# Print the grid again, now reflecting the state set above.
streets.render()
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
import numpy as np

# Q-table: a 2D array with one row per possible state and one column per
# possible action in the virtual space, with every entry initialized to 0.
q_table = np.zeros([streets.observation_space.n, streets.action_space.n])

# Hyperparameters read by the training loop.
learning_rate = 0.1  # Weight given to the new estimate in each Q-value update.
discount_factor = 0.6  # Weight given to the best Q-value of the next state.
exploration = 0.1  # Probability of taking a random action instead of the greedy one.
epochs = 10000  # Number of training episodes to run.

# Train the Q-table with tabular Q-learning: one episode per epoch, each
# starting from a freshly reset environment and running until the environment
# reports that the episode is over.
for taxi_run in range(epochs):
    # Gym < 0.26 returns just the observation from reset(); Gym >= 0.26 and
    # Gymnasium return an (observation, info) tuple. Accept both so the state
    # is always a plain index into q_table.
    reset_result = streets.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False

    while not done:
        # Epsilon-greedy selection: explore with probability `exploration`,
        # otherwise exploit the best action known so far.
        random_value = random.uniform(0, 1)
        if random_value < exploration:
            action = streets.action_space.sample()  # Explore a random action
        else:
            action = np.argmax(q_table[state])  # Use the action with the highest q-value

        # Old API: (obs, reward, done, info).
        # New API: (obs, reward, terminated, truncated, info).
        step_result = streets.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, info = step_result

        # Q-learning update: blend the previous estimate with the observed
        # reward plus the discounted best value reachable from the next state.
        prev_q = q_table[state, action]
        next_max_q = np.max(q_table[next_state])
        new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
        q_table[state, action] = new_q

        state = next_state
        
        
# Display the learned Q-values for every action from the hand-built initial
# state; the largest entry is the action the greedy policy would choose.
# NOTE(review): per the Gym Taxi docs the columns are presumably
# south, north, east, west, pickup, dropoff — confirm for your gym version.
q_table[initial_state]  
array([-2.40090669, -2.41412198, -2.41767969, -2.3639511 , -6.84836069,
       -8.62169302])
# Same lookup for a second state: encode(1, 0, 2, 0) keeps the last two
# arguments used for initial_state and changes the first two to 1 and 0
# (taxi row and column, per the Gym Taxi docs' argument order).
q_table[streets.encode(1,0,2,0)]  
array([-2.12208981, -2.23981204, -2.25062334, -2.22939021, -7.50948405,
       -7.91650559])