import numpy as np
import matplotlib.pyplot as plt
from agent import Agent
from bandit import Bandit
# Create a bandit with n arms/levers.
number_of_arms = 10
bandit = Bandit(number_of_arms)

# Fill the true action values q*(a), one per action a. These are intrinsic
# to the bandit: q*(a) is the true (expected) value of action a.
bandit.fill_true_action_values(0, 1)
# bandit.fill_true_action_values_uniform(-1, 1)
q_star = bandit.get_action_values()
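# A minimal sketch of what fill_true_action_values(0, 1) presumably does
# (Bandit's implementation is not shown here): draw one true value per arm
# from a Gaussian, as in the classic 10-armed testbed:
#     self.q_star = np.random.normal(loc=0, scale=1, size=self.num_arms)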
# Fill the reward of each action, sampled around its true value.
bandit.fill_reward_values(q_star)
# bandit.fill_reward_values_uniform(q_star)
rewards = bandit.get_reward_values()
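# fill_reward_values(q_star) presumably draws one noisy reward per arm
# around its true value, e.g. (the attribute name is an assumption):
#     self.rewards = np.random.normal(loc=q_star, scale=1)
# Calling it again inside the loop below yields fresh samples each step.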
### Prototype: a single epsilon-greedy run.
# Parameters, defined once and shared by the agent and the loop below.
epsilon = 0.1
steps = 20000
agent = Agent(num_actions=number_of_arms, epsilon=epsilon, steps=steps)

# ndarrays holding the agent's Q estimates and per-action counts.
Q_estimates = agent.get_Q_estimates()
action_count = agent.get_action_count()
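# Both presumably start at zero (Q_estimates ~ np.zeros(num_actions),
# action_count ~ np.zeros(num_actions, dtype=int)), so the agent begins
# with no preference among the arms.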
# Initialize the actions. Before any learning, every action is a candidate.
candidate_action = agent.compute_index_candidate_action(Q_estimates)
# 'action' holds the indices where Q(a) is maximal.
action = agent.compute_argmax_actions(Q_estimates)
# print(f"Actions such that Q is max = {action}")
# Vector of cumulative rewards per action, updated each step.
cumulative_rewards_received = agent.get_cumulative_rewards()
### Prototype
# Buffers for the results (epsilon_list is used by the sweep commented
# out below).
average_reward = []
epsilon_list = []
# Counters for how often the agent exploited vs. explored.
exploring = agent.get_exploring_count()
exploiting = agent.get_exploit_count()
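# A minimal sketch of the epsilon-greedy step that action_method
# presumably performs each iteration (the internal names are assumptions,
# since Agent's implementation is not shown here):
#     if np.random.random() < epsilon:      # explore
#         a = np.random.randint(num_actions)
#     else:                                 # exploit the current estimates
#         a = np.argmax(Q_estimates)
#     action_count[a] += 1
#     # Incremental sample-average update: Q_{n+1} = Q_n + (R_n - Q_n) / n
#     Q_estimates[a] += (rewards[a] - Q_estimates[a]) / action_count[a]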
for i in range(steps):
    # Redraw the rewards so each step sees a fresh noisy sample per arm.
    bandit.fill_reward_values(q_star)
    rewards = bandit.get_reward_values()
    agent.action_method("epsilon-greedy", epsilon, number_of_arms, rewards)
print(f"Q = {agent.get_Q_estimates()}")
print(f"q = {q_star}")
print(f"-> (Average) Cumulative reward = {agent.get_cumulative_rewards()/steps}")
print(f"-> action count = {agent.get_action_count()}")
print(f"-> Exploited {agent.get_exploit_count()} and explored {agent.get_exploring_count()}")
print(f"-> Average reward = {np.sum(agent.get_cumulative_rewards())/steps:.3f}\n")
average_reward.append(np.sum(agent.get_cumulative_rewards())/steps)
print(f"(Average reward, epsilon) = {np.sum(agent.get_cumulative_rewards())/steps:.3f}, {epsilon}")
# Prototype analysis: sweep epsilon and plot average reward vs. epsilon.
# (If this sweep is run, clear average_reward first, since the single run
# above already appended one entry to it.)
# exploring = agent.get_exploring_count()
# exploiting = agent.get_exploit_count()
# for epsilon in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
#     epsilon_list.append(epsilon)
#     # Reset the per-run statistics before each epsilon. Note that the
#     # Q estimates themselves are not reset, so each run after the first
#     # starts from the previous run's estimates.
#     agent.set_zero_exploit_count()
#     agent.set_zero_explore_count()
#     agent.set_zero_cumulative_rewards()
#     agent.set_zero_action_count()
#     for i in range(steps):
#         bandit.fill_reward_values(q_star)
#         rewards = bandit.get_reward_values()
#         agent.action_method("epsilon-greedy", epsilon, number_of_arms, rewards)
#     print(f"Q = {agent.get_Q_estimates()}")
#     print(f"-> Cumulative reward per action / steps = {agent.get_cumulative_rewards()/steps}")
#     print(f"-> Action count = {agent.get_action_count()}")
#     print(f"-> Exploited {agent.get_exploit_count()} times and explored {agent.get_exploring_count()} times")
#     print(f"-> Average reward = {np.sum(agent.get_cumulative_rewards())/steps:.3f}\n")
#     average_reward.append(np.sum(agent.get_cumulative_rewards())/steps)
#     print(f"(Average reward, epsilon) = {np.sum(agent.get_cumulative_rewards())/steps:.3f}, {epsilon}")
# plt.plot(epsilon_list, average_reward)
# plt.xlabel("Epsilon")
# plt.ylabel("Average reward")
# plt.show()
#########