<!doctype html>
<meta charset="utf-8">
<style>
body {
overflow-x: hidden;
}
.scroll-down {
width: 80px;
height: 40px;
right: 10px;
bottom: 10px;
position: absolute;
font-family: "Roboto","Helvetica Neue",Helvetica,Arial,sans-serif;
font-size: 12px;
font-weight: 300;
color: #FFFFFF;
opacity: 0;
-webkit-transition: opacity 2s ease-in;
-moz-transition: opacity 2s ease-in;
-o-transition: opacity 2s ease-in;
-ms-transition: opacity 2s ease-in;
transition: opacity 2s ease-in;
}
.scroll-down span {
margin-top: 5px;
position: absolute;
left: 50%;
transform: translate(-100%, 0) rotate(45deg);
transform-origin: 100% 100%;
height: 2px;
width: 10px;
background: #FFFFFF;
}
.scroll-down span:nth-of-type(2) {
transform-origin: 0 100%;
transform: translate(0, 0) rotate(-45deg);
}
.spinner {
position: absolute;
height: 160px;
width: 160px;
-webkit-animation: rotation .6s infinite linear;
-moz-animation: rotation .6s infinite linear;
-o-animation: rotation .6s infinite linear;
animation: rotation .6s infinite linear;
border-left: 6px solid rgba(0, 174, 239, .15);
border-right: 6px solid rgba(0, 174, 239, .15);
border-bottom: 6px solid rgba(0, 174, 239, .15);
border-top: 6px solid rgba(0, 174, 239, .8);
border-radius: 100%;
top: calc(50% - 100px);
left: calc(50% - 80px);
right: auto;
bottom: auto;
}
@-webkit-keyframes rotation {
from {
-webkit-transform: rotate(0deg);
}
to {
-webkit-transform: rotate(359deg);
}
}
@keyframes rotation {
from {
transform: rotate(0deg);
}
to {
transform: rotate(359deg);
}
}
.transparent {
opacity: 0;
}
figcaption {
padding: 0.5em;
color: rgba(0, 0, 0, 0.6);
font-size: 12px;
line-height: 1.5em;
text-align: left;
}
dt-article figcaption {
padding: 0.5em;
color: rgba(0, 0, 0, 0.6);
font-size: 12px;
line-height: 1.5em;
text-align: left;
}
dt-article figcaption a {
color: rgba(0, 0, 0, 0.6);
}
dt-article figcaption b {
font-weight: 600;
color: rgba(0, 0, 0, 1.0);
}
*.unselectable {
-moz-user-select: -moz-none;
-khtml-user-select: none;
-webkit-user-select: none;
-o-user-select: none;
user-select: none;
}
*.svgunselectable {
-moz-user-select: -moz-none;
-khtml-user-select: none;
-webkit-user-select: none;
-o-user-select: none;
user-select: none;
background: none;
pointer-events: none;
}
</style>
<head>
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta http-equiv="X-UA-Compatible" content="ie=edge" />
<meta name="theme-color" content="#ffffff" />
<!-- SEO -->
<meta property="og:title" content="Learning vision processing for assistive displays through self-attention agents" />
<meta property="og:type" content="article" />
<meta property="og:description" content="Compact self-attention visual representations learnt through an evolutionary process." />
<meta property="og:image" content="https://ruizserra.github.io/self-attention-assistive-displays/assets/png/TVCG-main-figure.png" />
<meta property="og:url" content="https://ruizserra.github.io/self-attention-assistive-displays" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Learning vision processing for assistive displays through self-attention agents" />
<meta name="twitter:description" content="Compact self-attention visual representations learnt through an evolutionary process." />
<meta property="og:site_name" content="Jaime Ruiz Serra" />
<meta name="twitter:image" content="https://ruizserra.github.io/self-attention-assistive-displays/assets/png/TVCG-main-figure.png" />
</head>
<link rel="stylesheet" href="css/katex.min.css">
<!--<script src="lib/jquery-1.12.4.min.js"></script>
<script src="lib/mobile-detect.min.js"></script>-->
<script src="lib/template.v1.js"></script>
<script type="text/front-matter">
title: "Learning vision processing for assistive displays through self-attention agents"
description: "Compact self-attention visual representations learnt through an evolutionary process."
</script>
<body>
<div id="no_javascript_warning">
<h3>This page requires JavaScript. Please enable it for this webpage.</h3>
</div>
<script>
document.getElementById("no_javascript_warning").style.display = "none";
</script>
<dt-article id="dtbody">
<dt-byline class="l-page transparent"></dt-byline>
<h1>Learning vision processing for assistive displays through self-attention agents</h1>
<p></p>
<dt-byline class="l-page" id="authors_section" hidden>
<div class="byline">
<div class="authors">
<div class="author">
<a class="name" href="http://jaime.rs/">Jaime Ruiz-Serra</a>
<a class="affiliation" href="https://www.swinburne.edu.au/research">Swinburne University</a>
</div>
<div class="author">
<a class="name" href="#">Jack White</a>
<a class="affiliation" href="https://www.swinburne.edu.au/research">Swinburne University</a>
</div>
<div class="author">
<a class="name" href="https://www.swinburne.edu.au/research/our-research/access-our-research/find-a-researcher-or-supervisor/researcher-profile/?id=spetrie">Stephen Petrie</a>
<a class="affiliation" href="https://www.swinburne.edu.au/research/our-research/access-our-research/find-a-researcher-or-supervisor/researcher-profile/?id=spetrie">Swinburne University</a>
</div>
<div class="author">
<a class="name" href="https://www.swinburne.edu.au/research/our-research/access-our-research/find-a-researcher-or-supervisor/researcher-profile/?id=tkam">Tatiana Kameneva</a>
<a class="affiliation" href="https://www.swinburne.edu.au/research/our-research/access-our-research/find-a-researcher-or-supervisor/researcher-profile/?id=tkam">Swinburne University</a>
</div>
<div class="author">
<a class="name" href="https://www.swinburne.edu.au/research/our-research/access-our-research/find-a-researcher-or-supervisor/researcher-profile/?id=cdmccarthy">Chris McCarthy</a>
<a class="affiliation" href="https://www.swinburne.edu.au/research/our-research/access-our-research/find-a-researcher-or-supervisor/researcher-profile/?id=cdmccarthy">Swinburne University</a>
</div>
</div>
<div class="date">
<div class="month">February</div>
<div class="year">2024</div>
</div>
<div class="date">
<div class="month">ACM-TOMM<dt-fn>ACM Transactions on Multimedia Computing, Communications, and Applications</dt-fn></div>
<div class="year" style="color: #FF6C00;"><a href="https://doi.org/10.1145/3650111" target="_blank">paper</a></div>
</div>
</div>
</dt-byline>
<h2>Problem statement</h2>
<div style="text-align: center;">
<img src="assets/png/TVCG-pipeline-1.png" style="margin: 0; width: 100%;" ></img>
<figcaption style="text-align: left; padding-top: 0;">
<span style="color: #00F">Assistive vision</span> consists of a camera that captures the real world, with
images processed by a video processing unit (VPU), converting them into scene
representations that can be rendered in assistive displays of different kinds.
We <span style="color: #FF9002">train a self-attention network in a RL context</span> to select important parts of
images for 3D navigation. Once trained, the SA network can be <span style="color: #C159B2">deployed</span>
to the visual prostheses’ VPU to perform the vision processing.
</figcaption>
</div>
<p>With the goal of simplifying visual representations of scenes
for navigation by selecting relevant features, we build upon
the work of Tang et al. <dt-cite key="Tang2020"></dt-cite>,
adapting the deep reinforcement learning (DRL) agent they introduced so that it can be trained in a 3D navigation simulation environment. We
propose several methods to enhance the selected features,
and adapt the vision processing pipeline to present the resulting representations through different display modalities,
highlighting the method’s versatility. In the resulting visualisations, task-relevant features are enhanced and
irrelevant ones removed, effectively increasing the signal-to-noise ratio.</p>
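<p>To make the selection mechanism concrete, the patch-importance computation at the core of the agent
(a single self-attention head over non-overlapping image patches, following Tang et al. <dt-cite key="Tang2020"></dt-cite>)
can be sketched in a few lines of NumPy. The patch size, key dimensionality and random weights below are
illustrative placeholders rather than the values used in our experiments:</p>
<pre>
import numpy as np

def patch_importance(frame, patch=7, d_k=4, seed=0):
    """Score image patches with a single self-attention head.

    frame: (H, W, C) array. Returns one importance score per
    non-overlapping patch of size patch x patch.
    """
    H, W, C = frame.shape
    rows, cols = H // patch, W // patch
    # Flatten each non-overlapping patch into a feature vector.
    patches = (frame[:rows * patch, :cols * patch]
               .astype(np.float32)
               .reshape(rows, patch, cols, patch, C)
               .transpose(0, 2, 1, 3, 4)
               .reshape(rows * cols, patch * patch * C))
    rng = np.random.default_rng(seed)        # stand-in for the evolved weights
    d_in = patches.shape[1]
    W_q = rng.normal(size=(d_in, d_k)) / np.sqrt(d_in)
    W_k = rng.normal(size=(d_in, d_k)) / np.sqrt(d_in)
    logits = (patches @ W_q) @ (patches @ W_k).T / np.sqrt(d_k)
    att = np.exp(logits - logits.max(axis=1, keepdims=True))
    att /= att.sum(axis=1, keepdims=True)    # row-wise softmax
    return att.sum(axis=0)                   # per-patch importance scores

def top_k_patches(scores, k=10):
    """Indices of the K most important patches, most important first."""
    return np.argsort(scores)[::-1][:k]
</pre>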
<hr>
<h2>Training in simulation</h2>
<p>The agents are trained in DeepMind Lab <dt-cite key="Beattie2016"></dt-cite>
"NavMaze" simulation environments with RGB-D observations (or variations thereof)
and an action space of size 3.</p>
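<p>As a rough sketch, an environment of this kind can be instantiated through the DeepMind Lab Python API as
shown below. The level name, observation format and image resolution follow the DeepMind Lab documentation,
while the mapping from the 3 discrete actions to the native action vector is an illustrative assumption,
not necessarily the one used in our experiments:</p>
<pre>
import numpy as np
import deepmind_lab

def make_native_action(env, discrete_action):
    """Map {0: turn left, 1: turn right, 2: move forward} (assumed) to a native action."""
    names = [spec['name'] for spec in env.action_spec()]
    native = np.zeros(len(names), dtype=np.intc)
    if discrete_action == 0:
        native[names.index('LOOK_LEFT_RIGHT_PIXELS_PER_FRAME')] = -20
    elif discrete_action == 1:
        native[names.index('LOOK_LEFT_RIGHT_PIXELS_PER_FRAME')] = 20
    else:
        native[names.index('MOVE_BACK_FORWARD')] = 1
    return native

env = deepmind_lab.Lab(
    'nav_maze_static_01',              # "NavMazeStatic01"
    ['RGBD_INTERLEAVED'],              # RGB-D observations
    config={'width': '84', 'height': '84', 'fps': '30'})
env.reset()
obs = env.observations()['RGBD_INTERLEAVED']                # (H, W, 4) array
reward = env.step(make_native_action(env, 2), num_steps=4)  # repeat action for 4 frames
</pre>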
<div style="text-align: center;">
<video class="b-lazy" src="assets/mp4/d2_10_0_overlay.mp4" type="video/mp4" autoplay muted playsinline loop style="margin: 0; width: 100%;" ></video>
<figcaption style="text-align: left; padding-top: 0;">
The self-attention models are trained in a reinforcement learning context by means of neuroevolution.
During training, the LSTM controller component of the network makes all decisions based solely on the
locations of the top <i>K</i> most important image patches. This figure shows agent <i>d2</i> navigating environment
<i>NavMazeStatic01</i>. A minimal sketch of this patch-location interface is given below the figure.
</div>
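<p>The interface between the attention module and the controller can be sketched as follows: the ranked patch
indices are converted to normalised patch-centre coordinates, which form the only input to an LSTM cell whose
hidden state is mapped to one of the 3 actions. The gate layout and dimensions below are illustrative, and in
our setting the weights are supplied by the evolutionary search rather than learned by backpropagation:</p>
<pre>
import numpy as np

def patch_centres(indices, rows, cols):
    """Normalised (x, y) centres of the selected patches, in importance order."""
    r, c = np.divmod(np.asarray(indices), cols)
    return np.stack([(c + 0.5) / cols, (r + 0.5) / rows], axis=1).ravel()

def _sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def lstm_step(x, h, c, W, b):
    """One step of a standard LSTM cell; the four gates are stacked in W and b."""
    z = W @ np.concatenate([x, h]) + b
    i, f, o, g = np.split(z, 4)
    c = _sigmoid(f) * c + _sigmoid(i) * np.tanh(g)
    h = _sigmoid(o) * np.tanh(c)
    return h, c

def act(h, W_out, b_out):
    """Map the controller hidden state to one of the 3 discrete actions."""
    return int(np.argmax(W_out @ h + b_out))
</pre>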
<p></p>
<div style="text-align: center;">
<img src="assets/png/d2_reward_vs_iteration.png" style="margin: 0; width: 100%;" ></img>
<figcaption style="text-align: left; padding-top: 0;">
The agents can learn to navigate the environment effectively with fewer than
100 million training observations (~200 iterations × 64 population members/iteration ×
8 episodes/member × 900 observations/episode ≈ 92 million observations), taking ~3 h of wall time on our infrastructure.
This figure shows agent <i>d2</i> learning in environment <i>NavMazeStatic01</i>.
</div>
<p></p>
<div style="text-align: center;">
<img src="assets/png/training-components.png" style="margin: 0; width: 100%;" ></img>
<figcaption style="text-align: left; padding-top: 0;">
To make the training process more scalable and marginally faster, we completely decoupled the CMA-ES
population from the training task queue. Task requests, each comprising a population member
identifier and that member’s agent parameters, are placed in a
queue and processed by compute workers on a FIFO basis. This makes the training
more flexible and better suited to distributed computing. A sketch of this task-queue pattern is given below the figure.
</figcaption>
</div>
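<p>This pattern can be illustrated with a local <code>multiprocessing</code> sketch. The real training runs on
distributed HPC workers and differs in detail (queue backend, serialisation, fault handling); the episode
roll-out below is a placeholder:</p>
<pre>
import multiprocessing as mp
import numpy as np

def run_episode(params):
    """Placeholder: build the agent from `params` and roll it out in the simulator."""
    return float(np.random.random())           # stand-in for the episode return

def worker(task_q, result_q, n_episodes=8):
    """Pop (member_id, params) tasks in FIFO order and report the mean episode return."""
    while True:
        member_id, params = task_q.get()       # blocks until a task is available
        if member_id is None:                  # poison pill: shut the worker down
            break
        returns = [run_episode(params) for _ in range(n_episodes)]
        result_q.put((member_id, float(np.mean(returns))))

if __name__ == '__main__':
    task_q, result_q = mp.Queue(), mp.Queue()
    workers = [mp.Process(target=worker, args=(task_q, result_q)) for _ in range(4)]
    for w in workers:
        w.start()
    # One CMA-ES iteration: enqueue the whole population, then gather the fitnesses.
    population = {i: np.random.randn(100) for i in range(64)}   # illustrative parameters
    for member_id, params in population.items():
        task_q.put((member_id, params))
    fitness = dict(result_q.get() for _ in population)          # member_id -> fitness
    for _ in workers:
        task_q.put((None, None))
    for w in workers:
        w.join()
</pre>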
<hr>
<h2>Vision processing in real-world scenes</h2>
<div style="text-align: center;">
<img src="assets/png/TVCG-main-figure.png" style="margin: 0; width: 100%;" ></img>
<figcaption style="text-align: left; padding-top: 0;">
The representations learnt in simulation translate to the real world.
Hyperparameters can be adjusted in real time in the final application: for example,
in this figure, <i>K</i>=10 patches are selected during training, whereas <i>K</i>=80
patches are selected in the real-world image. A snippet illustrating this runtime choice of <i>K</i> is given below the figure.
</figcaption>
</div>
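<p>Because the top-<i>K</i> selection is applied post hoc to the importance scores, <i>K</i> is simply a
slicing parameter and can be changed at inference time without retraining. Reusing the functions from the
earlier sketch, with a dummy frame:</p>
<pre>
import numpy as np
# Reuses patch_importance() and top_k_patches() from the earlier sketch.
frame = np.random.randint(0, 256, size=(84, 84, 4), dtype=np.uint8)   # dummy RGB-D frame
scores = patch_importance(frame)
train_patches  = top_k_patches(scores, k=10)    # K used during training
deploy_patches = top_k_patches(scores, k=80)    # K chosen at deployment time
</pre>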
<p>Below we show different feature retrieval methods applied to real-world RGB-D video; a schematic sketch of the four output modes follows the videos.</p>
<h3>Importance ranking</h3>
<div style="text-align: center;">
<video class="b-lazy" src="assets/mp4/C4star_50_ranking.mp4" type="video/mp4" autoplay muted playsinline loop style="margin: 0; width: 100%;" ></video>
<figcaption style="text-align: left; padding-top: 0;">
Each patch’s brightness is based on its importance ranking. Agent C4*, showing K=50 patches.
</figcaption>
</div>
<h3>Masked luminance</h3>
<div style="text-align: center;">
<video class="b-lazy" src="assets/mp4/C4star_50_masked_intensity.mp4" type="video/mp4" autoplay muted playsinline loop style="margin: 0; width: 100%;" ></video>
<figcaption style="text-align: left; padding-top: 0;">
Luminance (greyscale) masked with selected patches. Agent C4*, showing K=50 patches.
</figcaption>
</div>
<h3>Masked depth</h3>
<div style="text-align: center;">
<video class="b-lazy" src="assets/mp4/C4star_50_masked_depth.mp4" type="video/mp4" autoplay muted playsinline loop style="margin: 0; width: 100%;" ></video>
<figcaption style="text-align: left; padding-top: 0;">
Depth channel (disparity values) masked with selected patches. Agent C4*, showing K=50 patches.
</figcaption>
</div>
<h3>Weighted depth</h3>
<div style="text-align: center;">
<video class="b-lazy" src="assets/mp4/C4star_weighted_depth.mp4" type="video/mp4" autoplay muted playsinline loop style="margin: 0; width: 100%;" ></video>
<figcaption style="text-align: left; padding-top: 0;">
Depth at each patch location is scaled by that patch’s importance value. Agent C4*, showing all patches.
</figcaption>
</div>
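<p>Schematically, the four feature retrieval methods above can be read as different combinations of the
selected-patch mask with the luminance and depth channels. The sketch below is an illustrative reading of
these output modes, not the exact implementation used to produce the videos:</p>
<pre>
import numpy as np

def patch_mask(shape, indices, patch, cols, values=None):
    """Binary (or weighted) mask with one block per selected patch."""
    mask = np.zeros(shape[:2], dtype=np.float32)
    for rank, idx in enumerate(np.asarray(indices)):
        r, c = divmod(int(idx), cols)
        v = 1.0 if values is None else float(values[rank])
        mask[r * patch:(r + 1) * patch, c * patch:(c + 1) * patch] = v
    return mask

def render(frame_rgbd, scores, k, patch, cols, mode):
    """The four output modes shown above (illustrative implementation)."""
    luma  = frame_rgbd[..., :3].mean(axis=-1) / 255.0     # greyscale intensity
    depth = frame_rgbd[..., 3] / 255.0                    # disparity-like channel
    order = np.argsort(scores)[::-1][:k]                  # top-K patches by importance
    if mode == 'importance_ranking':
        ranks = 1.0 - np.arange(k) / k                    # brightness by rank
        return patch_mask(frame_rgbd.shape, order, patch, cols, values=ranks)
    if mode == 'masked_luminance':
        return luma * patch_mask(frame_rgbd.shape, order, patch, cols)
    if mode == 'masked_depth':
        return depth * patch_mask(frame_rgbd.shape, order, patch, cols)
    # 'weighted_depth': depth at every patch scaled by that patch's importance.
    weights = scores / scores.max()
    return depth * patch_mask(frame_rgbd.shape, np.arange(scores.size), patch, cols, values=weights)
</pre>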
<hr>
<h2>Display modalities</h2>
<h3>Simulated Phosphene Visualisation</h3>
<div style="text-align: left;">
<img src="assets/png/TVCG-SA-output-SPV.png" style="margin: 0; width: 100%;" ></img>
<figcaption style="text-align: left; padding-top: 0;">
SPV of different output modes (refer to Figure 5 in the paper). An illustrative sketch of the phosphene rendering is given below.
</figcaption>
</div>
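<p>A common way to produce an SPV, and a reasonable reading of the renderings above, is to sample the processed
intensity image at a grid of electrode locations and draw each phosphene as a Gaussian blob whose brightness
follows the sampled value. The grid size and blob width below are illustrative placeholders, not the
parameters used in the paper:</p>
<pre>
import numpy as np

def simulate_phosphenes(image, grid=(32, 32), sigma=1.5):
    """Render a simulated phosphene view of a processed intensity image in [0, 1]."""
    H, W = image.shape
    gy, gx = grid
    ys = np.linspace(0, H - 1, gy).astype(int)
    xs = np.linspace(0, W - 1, gx).astype(int)
    samples = image[np.ix_(ys, xs)]                # (gy, gx) electrode activations
    out = np.zeros((H, W), dtype=np.float32)
    yy, xx = np.mgrid[0:H, 0:W]
    for i, y in enumerate(ys):
        for j, x in enumerate(xs):
            if samples[i, j] > 0:
                blob = np.exp(-((yy - y) ** 2 + (xx - x) ** 2) / (2 * sigma ** 2))
                out += samples[i, j] * blob
    return np.clip(out, 0.0, 1.0)
</pre>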
<!--
<div style="text-align: center;">
<video class="b-lazy" src="assets/mp4/TODO.mp4" type="video/mp4" autoplay muted playsinline loop style="margin: 0; width: 100%;" ></video>
<figcaption style="text-align: left; padding-top: 0;">
Simulated Phosphene Visualisation (SPV)
</figcaption>
</div> -->
<!-- <p></p>
### vOICe
<dt-cite key="Meijer1993"></dt-cite> -->
</dt-article>
<dt-appendix>
<h3>Acknowledgements</h3>
<p>The template for this supporting materials site is from <a href="https://github.com/attentionagent/attentionagent.github.io">Tang et al.</a></p>
<p>The experiments in this work were performed on Swinburne University's <a href="https://supercomputing.swin.edu.au/ozstar/">OzStar high-performance computing system</a>.</p>
<h3 id="citation">Citation</h3>
<p>For attribution in academic contexts, please cite this work as:</p>
<pre class="citation short">Jaime Ruiz-Serra and Jack White and Stephen Petrie and Tatiana Kameneva and Chris McCarthy,
Learning vision processing for assistive displays through self-attention agents, 2022.</pre>
<p>BibTeX citation</p>
<pre class="citation long">@article{Ruiz-Serra2021,
author = {Ruiz-Serra, Jaime and
White, Jack and
Petrie, Stephen and
Kameneva, Tatiana and
McCarthy, Chris},
title = {Learning vision processing for assistive displays through self-attention agents},
eprint = {},
url = {},
note = "\url{http://ruizserra.github.io/self-attention-assistive-displays}",
year = {2022}
}</pre>
<h3>Open Source Code</h3>
<p>Code to reproduce the results in this work is TBD.</p>
<h3>Reuse</h3>
<p>Diagrams and text are licensed under Creative Commons Attribution <a href="https://creativecommons.org/licenses/by/4.0/">CC-BY 4.0</a> with the <a href="http://github.com/ruizserra/self-attention-assistive-displays/assets">source available on GitHub</a>, unless noted otherwise. The figures that have been reused from other sources don’t fall under this license and can be recognised by the citations in their captions.</p>
</dt-appendix>
</body>
<script type="text/bibliography">
@inproceedings{Tang2020,
title = {Neuroevolution of Self-Interpretable Agents},
url = {http://arxiv.org/abs/2003.08165},
doi = {10.1145/3377930.3389847},
abstract = {Inattentional blindness is the psychological phenomenon that causes one to miss things in plain sight. It is a consequence of the selective attention in perception that lets us remain focused on important parts of our world without distraction from irrelevant details. Motivated by selective attention, we study the properties of artificial agents that perceive the world through the lens of a self-attention bottleneck. By constraining access to only a small fraction of the visual input, we show that their policies are directly interpretable in pixel space. We find neuroevolution ideal for training self-attention architectures for vision-based reinforcement learning ({RL}) tasks, allowing us to incorporate modules that can include discrete, non-differentiable operations which are useful for our agent. We argue that self-attention has similar properties as indirect encoding, in the sense that large implicit weight matrices are generated from a small number of key-query parameters, thus enabling our agent to solve challenging vision based tasks with at least 1000x fewer parameters than existing methods. Since our agent attends to only task critical visual hints, they are able to generalize to environments where task irrelevant elements are modified while conventional methods fail. Videos of our results and source code available at https://attentionagent.github.io/},
pages = {414--424},
booktitle = {GECCO'20: Proceedings of the 2020 Genetic and Evolutionary Computation Conference},
author = {Tang, Yujin and Nguyen, Duong and Ha, David},
urldate = {2020-05-01},
date = {2020},
keywords = {},
}
@article{Beattie2016,
title = {DeepMind Lab},
author = {Beattie, Charles and Leibo, Joel Z. and Teplyashin, Denis and Ward, Tom and Wainwright, Marcus and Küttler, Heinrich and Lefrancq, Andrew and Green, Simon and Valdés, Víctor and Sadik, Amir and Schrittwieser, Julian and Anderson, Keith and York, Sarah and Cant, Max and Cain, Adam and Bolton, Adrian and Gaffney, Stephen and King, Helen and Hassabis, Demis and Legg, Shane and Petersen, Stig},
date = {2016},
url = {http://arxiv.org/abs/1612.03801},
abstract = {DeepMind Lab is a first-person 3D game platform designed for research and development of general artificial intelligence and machine learning systems. DeepMind Lab can be used to study how autonomous artificial agents may learn complex tasks in large, partially observed, and visually diverse worlds. DeepMind Lab has a simple and flexible API enabling creative task-designs and novel AI-designs to be explored and quickly iterated upon. It is powered by a fast and widely recognised game engine, and tailored for effective use by the research community.},
}
@article{Meijer1993,
title = {An Experimental System for Auditory Image Representations},
journaltitle = {IEEE Transactions on Biomedical Engineering},
volume = {39},
number = {2},
author = {Meijer, Peter},
date = {1992},
url = {https://www.seeingwithsound.com/voicebme.html},
}
</script>
<script src="lib/blazy.js"></script>
<script>
// blazy code
var bLazy = new Blazy({
success: function(){
updateCounter();
}
});
// not needed, only here to illustrate amount of loaded images
var imageLoaded = 0;
function updateCounter() {
imageLoaded++;
console.log("blazy image loaded: "+imageLoaded);
}
</script>