import numpy as np
Create an array with 10 elements, filling each element with a random value drawn from a standard normal distribution.
from numpy.random import default_rng, randint, binomial
rng = default_rng()
vals = rng.standard_normal(10)
# Quick check of the legacy sampling helpers used below.
# randint's upper bound is exclusive, so use 10 to cover all ten actions.
chosen_int = randint(0, 10)
choice = binomial(1, 0.1)  # single Bernoulli draw with success probability 0.1
choice
Create a Bandit class that instantiates the true-value array and returns a noisy reward for a chosen action.
class Bandit(object):
    def __init__(self, reward_spread=1.0):
        self.reward_spread = reward_spread
        self.rng = default_rng()
        # True action values q*(a), one per arm, drawn from a standard normal.
        self.true_q_array = self.rng.standard_normal(10)

    def q_star(self, a):
        """This function computes the true value of an action."""
        return self.true_q_array[a]

    def get_reward(self, a):
        """Sample a reward: normal around q*(a) with std reward_spread."""
        q_s = self.q_star(a)
        return self.rng.normal(q_s, self.reward_spread)
b = Bandit()
print("Value Array = {0} \n".format(b.true_q_array))
print("Reward = {0}".format(b.get_reward(1)))
Create an Agent that maintains a value estimate for each action and chooses actions based on those estimates.
class Agent(object):
    def __init__(self, greedyness=0.1):
        self.rew_dict = dict()    # running sum of rewards per action
        self.tries_dict = dict()  # number of times each action was taken
        self.value_dict = dict()  # current value estimate per action
        self.state = "choose random"
        # greedyness is the exploration probability (epsilon):
        # higher values mean the agent picks a random action more often.
        self.greedyness = greedyness

    def update_valuef(self, a, rew):
        """Update the value estimate as the sample average of all rewards."""
        a = int(a)
        rew = float(rew)
        if a in self.rew_dict:
            s1 = self.rew_dict[a]
            self.rew_dict[a] = s1 + rew
            t = self.tries_dict[a]
            self.tries_dict[a] = t + 1
            self.value_dict[a] = (s1 + rew) / (t + 1)
        else:
            self.rew_dict[a] = rew
            self.tries_dict[a] = 1
            self.value_dict[a] = rew
    def update_valuef_alt(self, a, rew):
        """For each action in the dictionary, update the value using the
        formula that the new estimate is the old estimate
        + step_size * (current_reward - old_estimate), with step_size = 1/n."""
        a = int(a)
        rew = float(rew)
        if a in self.value_dict:
            v1 = self.value_dict[a]
            tries = self.tries_dict[a] + 1
            self.tries_dict[a] = tries  # record this try so the step size keeps shrinking
            self.value_dict[a] = v1 + (1.0 / tries) * (rew - v1)
        else:
            self.value_dict[a] = rew
            self.tries_dict[a] = 1
    def get_valuef(self, a):
        return self.value_dict[a]

    def find_max_value_action(self):
        """Return the action with the highest current value estimate."""
        action_max = None
        value_max = -float("inf")
        for key, val in self.value_dict.items():
            if val > value_max:
                action_max = key
                value_max = val
        return action_max
    def choose_action(self):
        """Epsilon-greedy action selection via a two-state machine.
        The greedy/random draw made here sets the state used on the
        *next* call."""
        be_greedy = binomial(1, 1.0 - self.greedyness)
        if self.state == "choose random":
            # upper bound is exclusive, so 10 covers actions 0-9
            chosen_action = randint(0, 10)
        else:  # "choose best"
            chosen_action = self.find_max_value_action()
        self.state = "choose best" if be_greedy else "choose random"
        return chosen_action
A = Agent()
B = Agent()
action = 1
reward = 0.5
A.update_valuef(action, reward)
B.update_valuef_alt(action, reward)
print("Value of action using regular way {0} is {1}".format(action,A.get_valuef(action)))
print("Action of highest value is {0}".format(A.find_max_value_action()))
print("Value of action using smart way {0} is {1}".format(action, B.get_valuef(action)))
def get_best_action(chosen_method):
    if chosen_method not in ["regular", "smart"]:
        raise ValueError("chosen_method must be regular or smart.")
    a1 = Agent(0.5)
    b1 = Bandit(0.5)
    # Try every action once so each has an initial value estimate.
    for action in range(10):
        reward = b1.get_reward(action)
        if chosen_method == "regular":
            a1.update_valuef(action, reward)
        else:
            a1.update_valuef_alt(action, reward)
    # Then let the agent choose actions for 1000 steps.
    for _ in range(1000):
        action = a1.choose_action()
        reward = b1.get_reward(action)
        if chosen_method == "regular":
            a1.update_valuef(action, reward)
        else:
            a1.update_valuef_alt(action, reward)
    best_action = a1.find_max_value_action()
    return {"best_action": best_action, "best_value": a1.get_valuef(best_action)}
get_best_action("regular")
get_best_action("smart")
# Start a Bandit and an Agent
b1 = Bandit(0.5)
a1 = Agent(0.5)
print("Value Array = {0} \n".format(b1.true_q_array))
# Start by trying every action once
for action in range(10):
    reward = b1.get_reward(action)
    a1.update_valuef_alt(action, reward)
    print("Action = {0}, Reward = {1}".format(action, reward))
best_action = a1.find_max_value_action()
best_value = a1.get_valuef(best_action)
print("Action with maximum reward = {0}, max reward = {1}".format(best_action, best_value))
print("Agent is now in action")
for i in range(100):
    action = a1.choose_action()
    reward = b1.get_reward(action)
    # Report the current estimate of the best action, not just the chosen one.
    best_value = a1.get_valuef(a1.find_max_value_action())
    print("Action = {0}, Reward = {1}".format(action, reward))
    print("Value of best action = {0}".format(best_value))
    a1.update_valuef_alt(action, reward)
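As a closing check (an added sketch using the a1 and b1 instances above), compare the agent's learned favourite with the arm that actually has the highest true value:
# Did exploration find the truly best arm?
learned_best = a1.find_max_value_action()
true_best = int(np.argmax(b1.true_q_array))
print("Agent's best action = {0}, estimated value = {1:.3f}".format(
    learned_best, a1.get_valuef(learned_best)))
print("Truly best action = {0}, true value = {1:.3f}".format(
    true_best, b1.q_star(true_best)))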