-
-
Notifications
You must be signed in to change notification settings - Fork 46.2k
/
Copy pathbasic_string.py
208 lines (180 loc) · 7.84 KB
/
basic_string.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
"""
Simple multithreaded algorithm to show how the 4 phases of a genetic algorithm works
(Evaluation, Selection, Crossover and Mutation)
https://en.wikipedia.org/wiki/Genetic_algorithm
Author: D4rkia
"""
from __future__ import annotations
import random
# Maximum size of the population. Bigger could be faster but is more memory expensive.
N_POPULATION = 200
# Number of elements selected in every generation of evolution. The selection takes
# place from best to worst of that generation and must be smaller than N_POPULATION.
N_SELECTED = 50
# Probability that an element of a generation can mutate, changing one of its genes.
# This will guarantee that all genes will be used during evolution.
MUTATION_PROBABILITY = 0.4
# Just a seed to improve randomness required by the algorithm.
random.seed(random.randint(0, 1000))
def evaluate(item: str, main_target: str) -> tuple[str, float]:
"""
Evaluate how similar the item is with the target by just
counting each char in the right position
>>> evaluate("Helxo Worlx", "Hello World")
('Helxo Worlx', 9.0)
"""
score = len([g for position, g in enumerate(item) if g == main_target[position]])
return (item, float(score))
def crossover(parent_1: str, parent_2: str) -> tuple[str, str]:
"""
Slice and combine two strings at a random point.
>>> random.seed(42)
>>> crossover("123456", "abcdef")
('12345f', 'abcde6')
"""
random_slice = random.randint(0, len(parent_1) - 1)
child_1 = parent_1[:random_slice] + parent_2[random_slice:]
child_2 = parent_2[:random_slice] + parent_1[random_slice:]
return (child_1, child_2)
def mutate(child: str, genes: list[str]) -> str:
"""
Mutate a random gene of a child with another one from the list.
>>> random.seed(123)
>>> mutate("123456", list("ABCDEF"))
'12345A'
"""
child_list = list(child)
if random.uniform(0, 1) < MUTATION_PROBABILITY:
child_list[random.randint(0, len(child)) - 1] = random.choice(genes)
return "".join(child_list)
# Select, crossover and mutate a new population.
def select(
parent_1: tuple[str, float],
population_score: list[tuple[str, float]],
genes: list[str],
) -> list[str]:
"""
Select the second parent and generate new population
>>> random.seed(42)
>>> parent_1 = ("123456", 8.0)
>>> population_score = [("abcdef", 4.0), ("ghijkl", 5.0), ("mnopqr", 7.0)]
>>> genes = list("ABCDEF")
>>> child_n = int(min(parent_1[1] + 1, 10))
>>> population = []
>>> for _ in range(child_n):
... parent_2 = population_score[random.randrange(len(population_score))][0]
... child_1, child_2 = crossover(parent_1[0], parent_2)
... population.extend((mutate(child_1, genes), mutate(child_2, genes)))
>>> len(population) == (int(parent_1[1]) + 1) * 2
True
"""
pop = []
# Generate more children proportionally to the fitness score.
child_n = int(parent_1[1] * 100) + 1
child_n = 10 if child_n >= 10 else child_n
for _ in range(child_n):
parent_2 = population_score[random.randint(0, N_SELECTED)][0]
child_1, child_2 = crossover(parent_1[0], parent_2)
# Append new string to the population list.
pop.append(mutate(child_1, genes))
pop.append(mutate(child_2, genes))
return pop
def basic(target: str, genes: list[str], debug: bool = True) -> tuple[int, int, str]:
"""
Verify that the target contains no genes besides the ones inside genes variable.
>>> from string import ascii_lowercase
>>> basic("doctest", ascii_lowercase, debug=False)[2]
'doctest'
>>> genes = list(ascii_lowercase)
>>> genes.remove("e")
>>> basic("test", genes)
Traceback (most recent call last):
...
ValueError: ['e'] is not in genes list, evolution cannot converge
>>> genes.remove("s")
>>> basic("test", genes)
Traceback (most recent call last):
...
ValueError: ['e', 's'] is not in genes list, evolution cannot converge
>>> genes.remove("t")
>>> basic("test", genes)
Traceback (most recent call last):
...
ValueError: ['e', 's', 't'] is not in genes list, evolution cannot converge
"""
# Verify if N_POPULATION is bigger than N_SELECTED
if N_POPULATION < N_SELECTED:
msg = f"{N_POPULATION} must be bigger than {N_SELECTED}"
raise ValueError(msg)
# Verify that the target contains no genes besides the ones inside genes variable.
not_in_genes_list = sorted({c for c in target if c not in genes})
if not_in_genes_list:
msg = f"{not_in_genes_list} is not in genes list, evolution cannot converge"
raise ValueError(msg)
# Generate random starting population.
population = []
for _ in range(N_POPULATION):
population.append("".join([random.choice(genes) for i in range(len(target))]))
# Just some logs to know what the algorithms is doing.
generation, total_population = 0, 0
# This loop will end when we find a perfect match for our target.
while True:
generation += 1
total_population += len(population)
# Random population created. Now it's time to evaluate.
# Adding a bit of concurrency can make everything faster,
#
# import concurrent.futures
# population_score: list[tuple[str, float]] = []
# with concurrent.futures.ThreadPoolExecutor(
# max_workers=NUM_WORKERS) as executor:
# futures = {executor.submit(evaluate, item) for item in population}
# concurrent.futures.wait(futures)
# population_score = [item.result() for item in futures]
#
# but with a simple algorithm like this, it will probably be slower.
# We just need to call evaluate for every item inside the population.
population_score = [evaluate(item, target) for item in population]
# Check if there is a matching evolution.
population_score = sorted(population_score, key=lambda x: x[1], reverse=True)
if population_score[0][0] == target:
return (generation, total_population, population_score[0][0])
# Print the best result every 10 generation.
# Just to know that the algorithm is working.
if debug and generation % 10 == 0:
print(
f"\nGeneration: {generation}"
f"\nTotal Population:{total_population}"
f"\nBest score: {population_score[0][1]}"
f"\nBest string: {population_score[0][0]}"
)
# Flush the old population, keeping some of the best evolutions.
# Keeping this avoid regression of evolution.
population_best = population[: int(N_POPULATION / 3)]
population.clear()
population.extend(population_best)
# Normalize population score to be between 0 and 1.
population_score = [
(item, score / len(target)) for item, score in population_score
]
# This is selection
for i in range(N_SELECTED):
population.extend(select(population_score[int(i)], population_score, genes))
# Check if the population has already reached the maximum value and if so,
# break the cycle. If this check is disabled, the algorithm will take
# forever to compute large strings, but will also calculate small strings in
# a far fewer generations.
if len(population) > N_POPULATION:
break
if __name__ == "__main__":
target_str = (
"This is a genetic algorithm to evaluate, combine, evolve, and mutate a string!"
)
genes_list = list(
" ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklm"
"nopqrstuvwxyz.,;!?+-*#@^'èéòà€ù=)(&%$£/\\"
)
generation, population, target = basic(target_str, genes_list)
print(
f"\nGeneration: {generation}\nTotal Population: {population}\nTarget: {target}"
)