In [4]:
import numpy as np

Create an array of 10 elements, filling each with a random value drawn from a standard normal distribution.

In [5]:
from numpy.random import default_rng, randint, binomial
rng = default_rng()
vals = rng.standard_normal(10)  # 10 draws from a standard normal
chosen_int = randint(0, 10)     # random integer in [0, 10), a valid index into vals
choice = binomial(1, 0.1)       # single Bernoulli draw: 1 with probability 0.1
choice
Out[5]:
0
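
The binomial(1, p) call above is a single Bernoulli draw; the agent later uses exactly this kind of coin flip to decide between exploring and exploiting. A quick check that the draw behaves as expected (my addition, a minimal sketch not part of the original run):

In [ ]:
import numpy as np
from numpy.random import binomial
# The mean of many Bernoulli(0.1) draws should be close to 0.1.
draws = [binomial(1, 0.1) for _ in range(10_000)]
print(np.mean(draws))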

Create a Bandit class that holds the array of true action values and returns a noisy reward for a chosen action.

In [6]:
class Bandit(object):
    def __init__(self, reward_spread=1.0):
        self.reward_spread = reward_spread
        self.rng = default_rng()
        # True value q*(a) for each of the 10 arms.
        self.true_q_array = self.rng.standard_normal(10)

    def q_star(self, a):
        """Return the true value of action a."""
        return self.true_q_array[a]

    def get_reward(self, a):
        # Reward is the true value plus Gaussian noise of width reward_spread.
        return self.rng.normal(self.q_star(a), self.reward_spread)

b = Bandit()
print("Value Array = {0} \n".format(b.true_q_array))

print("Reward = {0}".format(b.get_reward(1)))
Value Array = [-0.09662544 -0.48230226 -0.64380762 -1.61421347  0.20998613  0.88967097
  3.14720021  0.25110337 -1.41516975 -1.952631  ] 

Reward = -0.6403866789534078
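
Because each reward is just the true value plus zero-mean Gaussian noise, averaging many rewards for one arm should recover that arm's q*(a). A minimal sketch of this check (my addition, using an assumed fresh Bandit):

In [ ]:
import numpy as np
b_check = Bandit(reward_spread=0.5)
samples = [b_check.get_reward(3) for _ in range(5_000)]
# The empirical mean should be close to the true value of arm 3.
print(b_check.q_star(3), np.mean(samples))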

Create an Agent that estimates a value function over actions and uses it to choose the next action.

In [11]:
class Agent(object):
    def __init__(self, greedyness=0.1):
        self.rew_dict = dict()    # running sum of rewards per action
        self.tries_dict = dict()  # number of times each action was tried
        self.value_dict = dict()  # current value estimate per action
        self.state = "choose random"
        # Despite the name, greedyness is the exploration probability (epsilon):
        # larger values make the agent pick a random action more often.
        self.greedyness = greedyness
    
    def update_valuef(self, a, rew):
        """Sample-average update kept as an explicit running sum and count."""
        a = int(a)
        rew = float(rew)
        if a in self.rew_dict:
            s1 = self.rew_dict[a]
            self.rew_dict[a] = s1+rew
            t = self.tries_dict[a]
            self.tries_dict[a] = t+1
            self.value_dict[a] = (s1+rew)/(t+1)
        else:
            self.rew_dict[a] = rew
            self.tries_dict[a] = 1
            self.value_dict[a] = rew
    
    def update_valuef_alt(self, a, rew):
        """Incremental update: new estimate = old estimate
        + step_size * (current_reward - old_estimate), with step size 1/n."""
        a = int(a)
        rew = float(rew)
        if a in self.value_dict:
            v1 = self.value_dict[a]
            tries = self.tries_dict[a] + 1
            # Persist the new count so the step size keeps shrinking.
            self.tries_dict[a] = tries
            self.value_dict[a] = v1 + (1 / tries) * (rew - v1)
        else:
            self.value_dict[a] = rew
            self.tries_dict[a] = 1
    
    def get_valuef(self,a):
        return self.value_dict[a]
    
    def find_max_value_action(self):
        # Return the action with the highest current value estimate.
        action_max = None
        value_max = -float("inf")
        for key, val in self.value_dict.items():
            if val > value_max:
                action_max = key
                value_max = val
        return action_max
    
    def choose_action(self):
        # Bernoulli draw: with probability 1 - greedyness, act greedily next time.
        be_greedy = binomial(1, 1.0 - self.greedyness)
        if self.state == "choose random":
            chosen_action = randint(0, 10)  # uniform over actions 0..9
        else:  # "choose best"
            chosen_action = self.find_max_value_action()
        # Note: the draw decides how the *next* action will be chosen.
        self.state = "choose best" if be_greedy else "choose random"
        return chosen_action
            
A = Agent()
B = Agent()
action = 1
reward = 0.5
A.update_valuef(action, reward)
B.update_valuef_alt(action, reward)
print("Value of action using regular way {0} is {1}".format(action,A.get_valuef(action)))
print("Action of highest value is {0}".format(A.find_max_value_action()))
print("Value of action using smart way {0} is {1}".format(action, B.get_valuef(action)))
Value of action using regular way 1 is 0.5
Action of highest value is 1
Value of action using smart way 1 is 0.5
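
With step size 1/n the incremental rule is algebraically the same running average that the sum-and-count rule computes, so feeding both agents one reward stream should yield identical estimates. A quick check (my addition, a sketch rather than part of the original run):

In [ ]:
import numpy as np
check_a, check_b = Agent(), Agent()
stream = [0.5, 1.5, -0.25, 2.0]
for r in stream:
    check_a.update_valuef(2, r)      # sum-and-count form
    check_b.update_valuef_alt(2, r)  # incremental form
# All three numbers should agree.
print(check_a.get_valuef(2), check_b.get_valuef(2), np.mean(stream))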
In [12]:
def get_best_action(chosen_method):
    if chosen_method not in ["regular", "smart"]:
        raise ValueError("chosen_method must be regular or smart.")
    a1 = Agent(0.5)
    b1 = Bandit(0.5)
    # Seed the estimates by trying every action once.
    for action in range(10):
        reward = b1.get_reward(action)
        if chosen_method == "regular":
            a1.update_valuef(action, reward)
        else:
            a1.update_valuef_alt(action, reward)

    # Then let the agent act for 1000 steps.
    for _ in range(1000):
        action = a1.choose_action()
        reward = b1.get_reward(action)
        if chosen_method == "regular":
            a1.update_valuef(action, reward)
        else:
            a1.update_valuef_alt(action, reward)

    best_action = a1.find_max_value_action()
    return {"best_action": best_action, "best_value": a1.get_valuef(best_action)}
            
In [13]:
get_best_action("regular")
Out[13]:
{'best_action': 7, 'best_value': 2.6063271680378097}
In [14]:
get_best_action("smart")
Out[14]:
{'best_action': 0, 'best_value': 2.0155285439746375}
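
Note that each call to get_best_action builds a fresh Bandit with newly drawn true values, so the two results above come from different bandits and are not directly comparable. A hypothetical variant (my sketch, not in the original) that compares both update rules against one shared Bandit:

In [ ]:
def compare_on_shared_bandit(n_steps=1000):
    bandit = Bandit(0.5)  # one set of true values for both methods
    results = {}
    for method in ("regular", "smart"):
        agent = Agent(0.5)
        update = agent.update_valuef if method == "regular" else agent.update_valuef_alt
        for action in range(10):  # seed: try every arm once
            update(action, bandit.get_reward(action))
        for _ in range(n_steps):  # then act epsilon-greedily
            action = agent.choose_action()
            update(action, bandit.get_reward(action))
        results[method] = agent.find_max_value_action()
    return results

compare_on_shared_bandit()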
In [15]:
# Start a Bandit and an Agent
b1 = Bandit(0.5)
a1 = Agent(0.5)
print("Value Array = {0} \n".format(b1.true_q_array))
# Start by trying every action once
for action in range(10):
    reward = b1.get_reward(action)
    a1.update_valuef_alt(action, reward)
    print("Action = {0}, Reward = {1}".format(action, reward))

best_action = a1.find_max_value_action()
best_value = a1.get_valuef(best_action)
print("Action with maximum reward = {0}, max reward = {1}".format(best_action, best_value))

print("Agent is now in action")
for i in range(100):
    action = a1.choose_action()
    reward = b1.get_reward(action)
    # Estimate for the chosen action before this reward is folded in.
    current_estimate = a1.get_valuef(action)
    print("Action = {0}, Reward = {1}".format(action, reward))
    print("Current estimate for this action = {0}".format(current_estimate))
    a1.update_valuef_alt(action, reward)

Value Array = [ 0.5983849   0.47837459 -0.68461583 -0.3327361  -1.18192293 -1.43728722
  1.13477297  1.16289645 -0.68624463 -1.92146909] 

Action = 0, Reward = 0.47940390729307014
Action = 1, Reward = 0.5761878865095773
Action = 2, Reward = -0.971098624156757
Action = 3, Reward = -0.33524184598182316
Action = 4, Reward = -1.5358905944259447
Action = 5, Reward = -1.341798718378165
Action = 6, Reward = 1.4573532643715148
Action = 7, Reward = 1.1341991029786118
Action = 8, Reward = -0.4650687749822804
Action = 9, Reward = -1.7955172455067647
Action with maximum reward = 6, max reward = 1.4573532643715148
Agent is now in action
Action = 0, Reward = 0.9313913928754249
Current estimate for this action = 0.47940390729307014
Action = 4, Reward = -1.2294365202445303
Current estimate for this action = -1.5358905944259447
Action = 6, Reward = 0.9006019470605549
Current estimate for this action = 1.4573532643715148
Action = 6, Reward = 1.443355411002833
Current estimate for this action = 1.1789776057160348
Action = 6, Reward = 1.0486088944055938
Current estimate for this action = 1.311166508359434
Action = 0, Reward = 0.4632900150326469
Current estimate for this action = 0.7053976500842476
...
[remaining 94 iterations truncated: the agent settles mostly on actions 6 and 7, the two arms with the highest true values]
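
Finally, a short post-run check (my addition, a sketch using the a1 and b1 objects from the run above) comparing the agent's learned estimates with the bandit's true values for every arm it has tried:

In [ ]:
# For each tried action: learned estimate vs. true value q*(a).
for a in sorted(a1.value_dict):
    print(a, round(a1.get_valuef(a), 3), round(b1.q_star(a), 3))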