MOUNTAIN CAR
import gym
import numpy as np

# Mountain Car environment setup
epsilon = 0.1
env = gym.make('MountainCar-v0')
total_episodes = 10000
max_steps = 100
alpha = 0.85
gamma = 0.95

# Discretize the continuous observation space (position, velocity) into bins
num_bins = 20
obs_low = env.observation_space.low
obs_high = env.observation_space.high
bin_width = (obs_high - obs_low) / num_bins

def discretize(obs):
    # Map a continuous observation to a pair of bin indices
    idx = ((obs - obs_low) / bin_width).astype(int)
    return tuple(np.clip(idx, 0, num_bins - 1))

# Initialize Q-table: one entry per (position bin, velocity bin, action)
num_actions = env.action_space.n
Q = np.zeros((num_bins, num_bins, num_actions))

# SARSA algorithm
for episode in range(total_episodes):
    state = discretize(env.reset())
    action = env.action_space.sample()
    episode_reward = 0
    for step in range(max_steps):
        # Take the action and observe the next state and reward
        next_obs, reward, done, _ = env.step(action)
        next_state = discretize(next_obs)
        episode_reward += reward
        # Choose the next action using epsilon-greedy policy
        next_action = env.action_space.sample() if np.random.rand() < epsilon else np.argmax(Q[next_state])
        # Update Q-value using SARSA update rule
        Q[state][action] = Q[state][action] + alpha * (reward + gamma * Q[next_state][next_action] - Q[state][action])
        state = next_state
        action = next_action
        if done:
            break
    print('episode: ' + str(episode) + ' reward: ' + str(episode_reward))

# Print the learned Q-values
print("Learned Q-values:")
print(Q)
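After training, a quick sanity check is to roll out a single purely greedy episode with the learned Q-table. This is only a minimal sketch that continues from the script above and reuses its names (env, Q, max_steps, and the discretize helper); it is not part of the original notes.

# Roll out one greedy episode (continues from the Mountain Car script above)
state = discretize(env.reset())
greedy_reward = 0
for _ in range(max_steps):
    action = np.argmax(Q[state])  # always exploit the learned values
    obs, reward, done, _ = env.step(action)
    state = discretize(obs)
    greedy_reward += reward
    if done:
        break
print("Greedy episode reward:", greedy_reward)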
FROZEN LAKE
import gym
import numpy as np
# Frozen Lake environment setup
epsilon = 0.9
env = gym.make('FrozenLake-v1')
total_episodes = 10000
max_steps = 100
alpha = 0.85
gamma = 0.95
# Initialize Q-table
num_states = env.observation_space.n
num_actions = env.action_space.n
Q = np.zeros((num_states, num_actions))
# SARSA algorithm
for episode in range(total_episodes):
    state = env.reset()
    action = env.action_space.sample()
    for step in range(max_steps):
        # Take an action and observe the next state and reward
        next_state, reward, done, _ = env.step(action)
        # Choose the next action using epsilon-greedy policy
        next_action = env.action_space.sample() if np.random.rand() < epsilon else np.argmax(Q[next_state])
        # Update Q-value using SARSA update rule
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * Q[next_state, next_action] - Q[state, action])
        state = next_state
        action = next_action
        if done:
            break
# Print the optimal Q-values
print("Optimal Q-values:")
print(Q)
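The Frozen Lake script above trains the Q-table but never measures it. One simple way to evaluate it is to count how often the greedy policy reaches the goal; since the environment's reward is 1 only on reaching the goal, the mean terminal reward over evaluation episodes is the success rate. A minimal sketch, continuing from the script above and reusing its env, Q, and max_steps:

# Evaluate the learned policy as a success rate (continues from the script above)
num_eval_episodes = 100
successes = 0
for _ in range(num_eval_episodes):
    state = env.reset()
    for _ in range(max_steps):
        action = np.argmax(Q[state])
        state, reward, done, _ = env.step(action)
        if done:
            successes += reward  # reward is 1.0 only when the goal is reached
            break
print("Success rate:", successes / num_eval_episodes)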
TAXI
import gym
import numpy as np
# Taxi environment setup
env = gym.make('Taxi-v3')
total_episodes = 10000
max_steps = 100
alpha = 0.1
gamma = 0.6
epsilon = 0.1
# Initialize Q-table
num_states = env.observation_space.n
num_actions = env.action_space.n
Q = np.zeros((num_states, num_actions))
# Q-learning algorithm
for episode in range(total_episodes):
    state = env.reset()
    for step in range(max_steps):
        # Choose an action using epsilon-greedy policy
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state])
        # Take the chosen action and observe the next state, reward, and done flag
        next_state, reward, done, _ = env.step(action)
        # Update Q-value using the Q-learning update rule
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])
        state = next_state
        if done:
            break

# Evaluate the learned policy
total_rewards = 0
num_eval_episodes = 10
for _ in range(num_eval_episodes):
    state = env.reset()
    for _ in range(max_steps):
        action = np.argmax(Q[state])
        state, reward, done, _ = env.step(action)
        total_rewards += reward
        if done:
            break
average_reward = total_rewards / num_eval_episodes
# Print the average reward
print("Average reward:", average_reward)
CARTPOLE
import gym
import numpy as np
# CartPole environment setup
env = gym.make('CartPole-v1')
total_episodes = 1000
max_steps = 200
alpha = 0.1
gamma = 0.9
epsilon = 0.1
# Discretize the continuous observation space into bins
# (the cart and pole velocity components are unbounded, so they are clipped to a finite range)
num_bins = 10
obs_low = np.array([-4.8, -4.0, -0.418, -4.0])
obs_high = np.array([4.8, 4.0, 0.418, 4.0])
bin_width = (obs_high - obs_low) / num_bins

def discretize(obs):
    # Map a continuous observation to a tuple of bin indices
    idx = ((np.clip(obs, obs_low, obs_high) - obs_low) / bin_width).astype(int)
    return tuple(np.clip(idx, 0, num_bins - 1))

# Initialize Q-table: one entry per (position, velocity, angle, angular velocity, action)
num_actions = env.action_space.n
Q = np.zeros((num_bins, num_bins, num_bins, num_bins, num_actions))

# Q-learning algorithm
for episode in range(total_episodes):
    state = discretize(env.reset())
    for step in range(max_steps):
        # Choose an action using epsilon-greedy policy
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state])
        # Take the chosen action and observe the next state, reward, and done flag
        next_obs, reward, done, _ = env.step(action)
        next_state = discretize(next_obs)
        # Update Q-value using the Q-learning update rule
        Q[state][action] = Q[state][action] + alpha * (reward + gamma * np.max(Q[next_state]) - Q[state][action])
        state = next_state
        if done:
            break

# Evaluate the learned policy
total_rewards = 0
num_eval_episodes = 10
for _ in range(num_eval_episodes):
    state = discretize(env.reset())
    for _ in range(max_steps):
        action = np.argmax(Q[state])
        obs, reward, done, _ = env.step(action)
        state = discretize(obs)
        total_rewards += reward
        if done:
            break
average_reward = total_rewards / num_eval_episodes
# Print the average reward
print("Average reward:", average_reward)
10-ARMED BANDIT PROBLEM
import numpy as np
import matplotlib.pyplot as plt
# Define the number of arms and the number of episodes
num_arms = 10
num_episodes = 1000
# Define the epsilon values to test
epsilons = [0, 0.1, 0.01]
# Define the true reward distribution for each arm
reward_means = np.random.normal(loc=0, scale=1, size=num_arms)
# Initialize the estimated reward distribution for each arm, one row per epsilon value
estimated_means = np.zeros((len(epsilons), num_arms))
# Initialize the number of times each arm has been pulled, one row per epsilon value
num_pulls = np.zeros((len(epsilons), num_arms))

# Define the epsilon-greedy action selection function
def epsilon_greedy(epsilon, estimates):
    if np.random.uniform() < epsilon:
        # Choose a random arm
        action = np.random.choice(num_arms)
    else:
        # Choose the arm with the highest estimated mean reward
        action = np.argmax(estimates)
    return action

# Initialize arrays to store the rewards and average rewards for each episode
rewards = np.zeros((len(epsilons), num_episodes))
avg_rewards = np.zeros((len(epsilons), num_episodes))

# Loop over the episodes
for i in range(num_episodes):
    # Loop over the epsilon values (each keeps its own estimates and pull counts)
    for j, epsilon in enumerate(epsilons):
        # Choose an action using the epsilon-greedy method
        action = epsilon_greedy(epsilon, estimated_means[j])
        # Pull the arm and observe the reward
        reward = np.random.normal(loc=reward_means[action], scale=1)
        # Update the estimated mean reward for the chosen arm (sample-average update)
        num_pulls[j, action] += 1
        estimated_means[j, action] += (reward - estimated_means[j, action]) / num_pulls[j, action]
        # Store the reward and the running average reward
        rewards[j, i] = reward
        avg_rewards[j, i] = np.mean(rewards[j, :i + 1])

# Plot the average rewards for each epsilon value
for j, epsilon in enumerate(epsilons):
    plt.plot(avg_rewards[j, :], label='epsilon = ' + str(epsilon))
plt.xlabel('Episode')
plt.ylabel('Average Reward')
plt.legend()
plt.show()
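A quick way to see what each epsilon-greedy run actually learned is to compare its estimated means against the true arm means. This short check continues from the script above and reuses reward_means, estimated_means, and epsilons; it is not part of the original notes.

# Compare learned estimates against the true best arm (continues from the script above)
best_arm = np.argmax(reward_means)
print("True best arm:", best_arm, "with mean reward", reward_means[best_arm])
for j, epsilon in enumerate(epsilons):
    print("epsilon =", epsilon, "-> greedy choice:", np.argmax(estimated_means[j]))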