<!doctype html>
<meta charset="utf-8">
<style>
body {
overflow-x: hidden;
}
.scroll-down {
width: 80px;
height: 40px;
right: 10px;
bottom: 10px;
position: absolute;
font-family: "Roboto","Helvetica Neue",Helvetica,Arial,sans-serif;
font-size: 12px;
font-weight: 300;
color: #FFFFFF;
opacity: 0;
-webkit-transition: opacity 2s ease-in;
-moz-transition: opacity 2s ease-in;
-o-transition: opacity 2s ease-in;
-ms-transition: opacity 2s ease-in;
transition: opacity 2s ease-in;
}
.scroll-down span {
margin-top: 5px;
position: absolute;
left: 50%;
transform: translate(-100%, 0) rotate(45deg);
transform-origin: 100% 100%;
height: 2px;
width: 10px;
background: #FFFFFF;
}
.scroll-down span:nth-of-type(2) {
transform-origin: 0 100%;
transform: translate(0, 0) rotate(-45deg);
}
.spinner {
position: absolute;
height: 160px;
width: 160px;
-webkit-animation: rotation .6s infinite linear;
-moz-animation: rotation .6s infinite linear;
-o-animation: rotation .6s infinite linear;
animation: rotation .6s infinite linear;
border-left: 6px solid rgba(0, 174, 239, .15);
border-right: 6px solid rgba(0, 174, 239, .15);
border-bottom: 6px solid rgba(0, 174, 239, .15);
border-top: 6px solid rgba(0, 174, 239, .8);
border-radius: 100%;
top: calc(50% - 100px);
left: calc(50% - 80px);
right: auto;
bottom: auto;
}
@-webkit-keyframes rotation {
from {
-webkit-transform: rotate(0deg);
}
to {
-webkit-transform: rotate(359deg);
}
}
@keyframes rotation {
from {
transform: rotate(0deg);
}
to {
transform: rotate(359deg);
}
}
.transparent {
opacity: 0;
}
figcaption {
padding: 0.5em;
color: rgba(0, 0, 0, 0.6);
font-size: 12px;
line-height: 1.5em;
text-align: left;
}
dt-article figcaption {
padding: 0.5em;
color: rgba(0, 0, 0, 0.6);
font-size: 12px;
line-height: 1.5em;
text-align: left;
}
dt-article figcaption a {
color: rgba(0, 0, 0, 0.6);
}
dt-article figcaption b {
font-weight: 600;
color: rgba(0, 0, 0, 1.0);
}
*.unselectable {
-moz-user-select: -moz-none;
-khtml-user-select: none;
-webkit-user-select: none;
-o-user-select: none;
user-select: none;
}
*.svgunselectable {
-moz-user-select: -moz-none;
-khtml-user-select: none;
-webkit-user-select: none;
-o-user-select: none;
user-select: none;
background: none;
pointer-events: none;
}
</style>
<head>
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta http-equiv="X-UA-Compatible" content="ie=edge" />
<meta name="theme-color" content="#ffffff" />
<!-- SEO -->
<meta property="og:title" content="Learning vision processing for assistive displays through self-attention agents" />
<meta property="og:type" content="article" />
<meta property="og:description" content="Compact self-attention visual representations learnt through an evolutionary process." />
<meta property="og:image" content="https://ruizserra.github.io/self-attention-assistive-displays/assets/png/TVCG-main-figure.png" />
<meta property="og:url" content="https://ruizserra.github.io/self-attention-assistive-displays" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Learning vision processing for assistive displays through self-attention agents" />
<meta name="twitter:description" content="Compact self-attention visual representations learnt through an evolutionary process." />
<meta property="og:site_name" content="Jaime Ruiz Serra" />
<meta name="twitter:image" content="https://ruizserra.github.io/self-attention-assistive-displays/assets/png/TVCG-main-figure.png" />
</head>
<link rel="stylesheet" href="css/katex.min.css">
<!--<script src="lib/jquery-1.12.4.min.js"></script>
<script src="lib/mobile-detect.min.js"></script>-->
<script src="lib/template.v1.js"></script>
<script type="text/front-matter">
title: "Learning vision processing for assistive displays through self-attention agents"
description: "Compact self-attention visual representations learnt through an evolutionary process."
</script>
<body>
<div id="no_javascript_warning">
<h3>This page requires JavaScript. Please enable it for this webpage.</h3>
</div>
<script>
document.getElementById("no_javascript_warning").style.display = "none";
</script>
<dt-article id="dtbody">
<dt-byline class="l-page transparent"></dt-byline>
<h1>Learning vision processing for assistive displays through self-attention agents</h1>
<p></p>
<dt-byline class="l-page" id="authors_section" hidden>
<div class="byline">
<div class="authors">
<div class="author">
<a class="name" href="http://jaime.rs/">Jaime Ruiz-Serra</a>
<a class="affiliation" href="https://www.swinburne.edu.au/research">Swinburne University</a>
</div>
<div class="author">
<a class="name" href="#">Jack White</a>
<a class="affiliation" href="https://www.swinburne.edu.au/research">Swinburne University</a>
</div>
<div class="author">
<a class="name" href="https://www.swinburne.edu.au/research/our-research/access-our-research/find-a-researcher-or-supervisor/researcher-profile/?id=spetrie">Stephen Petrie</a>
<a class="affiliation" href="https://www.swinburne.edu.au/research/our-research/access-our-research/find-a-researcher-or-supervisor/researcher-profile/?id=spetrie">Swinburne University</a>
</div>
<div class="author">
<a class="name" href="https://www.swinburne.edu.au/research/our-research/access-our-research/find-a-researcher-or-supervisor/researcher-profile/?id=tkam">Tatiana Kameneva</a>
<a class="affiliation" href="https://www.swinburne.edu.au/research/our-research/access-our-research/find-a-researcher-or-supervisor/researcher-profile/?id=tkam">Swinburne University</a>
</div>
<div class="author">
<a class="name" href="https://www.swinburne.edu.au/research/our-research/access-our-research/find-a-researcher-or-supervisor/researcher-profile/?id=cdmccarthy">Chris McCarthy</a>
<a class="affiliation" href="https://www.swinburne.edu.au/research/our-research/access-our-research/find-a-researcher-or-supervisor/researcher-profile/?id=cdmccarthy">Swinburne University</a>
</div>
</div>
<div class="date">
<div class="month">February</div>
<div class="year">2024</div>
</div>
<div class="date">
<div class="month">ACM-TOMM<dt-fn>ACM Transactions on Multimedia Computing, Communications, and Applications</dt-fn></div>
<div class="year" style="color: #FF6C00;"><a href="https://doi.org/10.1145/3650111" target="_blank">paper</a></div>
</div>
</div>
</dt-byline>
<h2>Problem statement</h2>
<div style="text-align: center;">
<img src="assets/png/TVCG-pipeline-1.png" style="margin: 0; width: 100%;" ></img>
<figcaption style="text-align: left; padding-top: 0;">
<span style="color: #00F">Assistive vision</span> consists of a camera that captures the real world, with
images processed by a video processing unit (VPU), converting them into scene
representations that can be rendered in assistive displays of different kinds.
We <span style="color: #FF9002">train a self-attention network in a RL context</span> to select important parts of
images for 3D navigation. Once trained, the SA network can be <span style="color: #C159B2">deployed</span>
to the visual prostheses’ VPU to perform the vision processing.
</figcaption>
</div>
<p>With the goal of simplifying visual representations of scenes
for navigation by selecting relevant features, we build upon
the work of Tang et al. <dt-cite key="Tang2020"></dt-cite>,
adapting the deep reinforcement learning (DRL) agent they introduced so that it can be trained in a 3D navigation simulation environment. We
propose several methods to enhance the selected features,
and adapt the vision processing pipeline to present the resulting representations through different display modalities,
highlighting the method’s versatility. In the resulting visualisations, task-relevant features are enhanced and
irrelevant ones removed, effectively increasing the signal-to-noise ratio.</p>
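<p>To make the selection mechanism concrete, the patch-importance computation at the core of the agent
(a single self-attention head over non-overlapping image patches, following Tang et al. <dt-cite key="Tang2020"></dt-cite>)
can be sketched in a few lines of NumPy. The patch size, key dimensionality and random weights below are
illustrative placeholders rather than the values used in our experiments:</p>
<pre>
import numpy as np

def patch_importance(frame, patch=7, d_k=4, seed=0):
    """Score image patches with a single self-attention head.

    frame: (H, W, C) array. Returns one importance score per
    non-overlapping patch of size patch x patch.
    """
    H, W, C = frame.shape
    rows, cols = H // patch, W // patch
    # Flatten each non-overlapping patch into a feature vector.
    patches = (frame[:rows * patch, :cols * patch]
               .astype(np.float32)
               .reshape(rows, patch, cols, patch, C)
               .transpose(0, 2, 1, 3, 4)
               .reshape(rows * cols, patch * patch * C))
    rng = np.random.default_rng(seed)        # stand-in for the evolved weights
    d_in = patches.shape[1]
    W_q = rng.normal(size=(d_in, d_k)) / np.sqrt(d_in)
    W_k = rng.normal(size=(d_in, d_k)) / np.sqrt(d_in)
    logits = (patches @ W_q) @ (patches @ W_k).T / np.sqrt(d_k)
    att = np.exp(logits - logits.max(axis=1, keepdims=True))
    att /= att.sum(axis=1, keepdims=True)    # row-wise softmax
    return att.sum(axis=0)                   # per-patch importance scores

def top_k_patches(scores, k=10):
    """Indices of the K most important patches, most important first."""
    return np.argsort(scores)[::-1][:k]
</pre>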
<hr>
<h2>Training in simulation</h2>
<p>The agents are trained in DeepMind Lab <dt-cite key="Beattie2016"></dt-cite>
"NavMaze" simulation environments with RGB-D observations (or variations thereof)
and an action space of size 3.</p>
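<p>As a rough sketch, an environment of this kind can be instantiated through the DeepMind Lab Python API as
shown below. The level name, observation format and image resolution follow the DeepMind Lab documentation,
while the mapping from the 3 discrete actions to the native action vector is an illustrative assumption,
not necessarily the one used in our experiments:</p>
<pre>
import numpy as np
import deepmind_lab

def make_native_action(env, discrete_action):
    """Map {0: turn left, 1: turn right, 2: move forward} (assumed) to a native action."""
    names = [spec['name'] for spec in env.action_spec()]
    native = np.zeros(len(names), dtype=np.intc)
    if discrete_action == 0:
        native[names.index('LOOK_LEFT_RIGHT_PIXELS_PER_FRAME')] = -20
    elif discrete_action == 1:
        native[names.index('LOOK_LEFT_RIGHT_PIXELS_PER_FRAME')] = 20
    else:
        native[names.index('MOVE_BACK_FORWARD')] = 1
    return native

env = deepmind_lab.Lab(
    'nav_maze_static_01',              # "NavMazeStatic01"
    ['RGBD_INTERLEAVED'],              # RGB-D observations
    config={'width': '84', 'height': '84', 'fps': '30'})
env.reset()
obs = env.observations()['RGBD_INTERLEAVED']                # (H, W, 4) array
reward = env.step(make_native_action(env, 2), num_steps=4)  # repeat action for 4 frames
</pre>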
<div style="text-align: center;">
<video class="b-lazy" src="assets/mp4/d2_10_0_overlay.mp4" type="video/mp4" autoplay muted playsinline loop style="margin: 0; width: 100%;" ></video>
<figcaption style="text-align: left; padding-top: 0;">
The self-attention models are trained in a reinforcement learning context by means of neuroevolution.
During training, the LSTM controller component of the network makes all decisions based solely on the
locations of the top <i>K</i> most important image patches. This figure shows agent <i>d2</i> navigating environment
<i>NavMazeStatic01</i>. A minimal sketch of this patch-location interface is given below the figure.
</div>
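<p>The interface between the attention module and the controller can be sketched as follows: the ranked patch
indices are converted to normalised patch-centre coordinates, which form the only input to an LSTM cell whose
hidden state is mapped to one of the 3 actions. The gate layout and dimensions below are illustrative, and in
our setting the weights are supplied by the evolutionary search rather than learned by backpropagation:</p>
<pre>
import numpy as np

def patch_centres(indices, rows, cols):
    """Normalised (x, y) centres of the selected patches, in importance order."""
    r, c = np.divmod(np.asarray(indices), cols)
    return np.stack([(c + 0.5) / cols, (r + 0.5) / rows], axis=1).ravel()

def _sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def lstm_step(x, h, c, W, b):
    """One step of a standard LSTM cell; the four gates are stacked in W and b."""
    z = W @ np.concatenate([x, h]) + b
    i, f, o, g = np.split(z, 4)
    c = _sigmoid(f) * c + _sigmoid(i) * np.tanh(g)
    h = _sigmoid(o) * np.tanh(c)
    return h, c

def act(h, W_out, b_out):
    """Map the controller hidden state to one of the 3 discrete actions."""
    return int(np.argmax(W_out @ h + b_out))
</pre>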
<p></p>
<div style="text-align: center;">
<img src="assets/png/d2_reward_vs_iteration.png" style="margin: 0; width: 100%;" ></img>
<figcaption style="text-align: left; padding-top: 0;">
The agents can learn to navigate the environment effectively with fewer than
100 million training observations (~200 iterations × 64 population members/iteration ×
8 episodes/member × 900 observations/episode ≈ 92 million observations), taking ~3 h of wall time on our infrastructure.
This figure shows agent <i>d2</i> learning in environment <i>NavMazeStatic01</i>.
</div>
<p></p>
<div style="text-align: center;">
<img src="assets/png/training-components.png" style="margin: 0; width: 100%;" ></img>
<figcaption style="text-align: left; padding-top: 0;">
To make the training process more scalable and marginally faster, we completely decoupled the CMA-ES
population from the training task queue. Task requests, each comprising a population member
identifier and that member’s agent parameters, are placed in a
queue and processed by compute workers on a FIFO basis. This makes the training
more flexible and better suited to distributed computing. A sketch of this task-queue pattern is given below the figure.
</figcaption>
</div>
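<p>This pattern can be illustrated with a local <code>multiprocessing</code> sketch. The real training runs on
distributed HPC workers and differs in detail (queue backend, serialisation, fault handling); the episode
roll-out below is a placeholder:</p>
<pre>
import multiprocessing as mp
import numpy as np

def run_episode(params):
    """Placeholder: build the agent from `params` and roll it out in the simulator."""
    return float(np.random.random())           # stand-in for the episode return

def worker(task_q, result_q, n_episodes=8):
    """Pop (member_id, params) tasks in FIFO order and report the mean episode return."""
    while True:
        member_id, params = task_q.get()       # blocks until a task is available
        if member_id is None:                  # poison pill: shut the worker down
            break
        returns = [run_episode(params) for _ in range(n_episodes)]
        result_q.put((member_id, float(np.mean(returns))))

if __name__ == '__main__':
    task_q, result_q = mp.Queue(), mp.Queue()
    workers = [mp.Process(target=worker, args=(task_q, result_q)) for _ in range(4)]
    for w in workers:
        w.start()
    # One CMA-ES iteration: enqueue the whole population, then gather the fitnesses.
    population = {i: np.random.randn(100) for i in range(64)}   # illustrative parameters
    for member_id, params in population.items():
        task_q.put((member_id, params))
    fitness = dict(result_q.get() for _ in population)          # member_id -> fitness
    for _ in workers:
        task_q.put((None, None))
    for w in workers:
        w.join()
</pre>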
<hr>
<h2>Vision processing in real-world scenes</h2>
<div style="text-align: center;">
<img src="assets/png/TVCG-main-figure.png" style="margin: 0; width: 100%;" ></img>
<figcaption style="text-align: left; padding-top: 0;">
The representations learnt in simulation translate to the real world.
Hyperparameters can be adjusted in real time in the final application: for example,
in this figure, <i>K</i>=10 patches are selected during training, whereas <i>K</i>=80
patches are selected in the real-world image. A snippet illustrating this runtime choice of <i>K</i> is given below the figure.
</figcaption>
</div>
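<p>Because the top-<i>K</i> selection is applied post hoc to the importance scores, <i>K</i> is simply a
slicing parameter and can be changed at inference time without retraining. Reusing the functions from the
earlier sketch, with a dummy frame:</p>
<pre>
import numpy as np
# Reuses patch_importance() and top_k_patches() from the earlier sketch.
frame = np.random.randint(0, 256, size=(84, 84, 4), dtype=np.uint8)   # dummy RGB-D frame
scores = patch_importance(frame)
train_patches  = top_k_patches(scores, k=10)    # K used during training
deploy_patches = top_k_patches(scores, k=80)    # K chosen at deployment time
</pre>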
<p>Below we show different feature retrieval methods applied to real-world RGB-D video; a schematic sketch of the four output modes follows the videos.</p>
<h3>Importance ranking</h3>
<div style="text-align: center;">
<video class="b-lazy" src="assets/mp4/C4star_50_ranking.mp4" type="video/mp4" autoplay muted playsinline loop style="margin: 0; width: 100%;" ></video>
<figcaption style="text-align: left; padding-top: 0;">
Each patch’s brightness is based on its importance ranking. Agent C4*, showing K=50 patches.
</figcaption>
</div>
<h3>Masked luminance</h3>
<div style="text-align: center;">
<video class="b-lazy" src="assets/mp4/C4star_50_masked_intensity.mp4" type="video/mp4" autoplay muted playsinline loop style="margin: 0; width: 100%;" ></video>
<figcaption style="text-align: left; padding-top: 0;">
Luminance (greyscale) masked with selected patches. Agent C4*, showing K=50 patches.
</figcaption>
</div>
<h3>Masked depth</h3>
<div style="text-align: center;">
<video class="b-lazy" src="assets/mp4/C4star_50_masked_depth.mp4" type="video/mp4" autoplay muted playsinline loop style="margin: 0; width: 100%;" ></video>
<figcaption style="text-align: left; padding-top: 0;">
Depth channel (disparity values) masked with selected patches. Agent C4*, showing K=50 patches.
</figcaption>
</div>
<h3>Weighted depth</h3>
<div style="text-align: center;">
<video class="b-lazy" src="assets/mp4/C4star_weighted_depth.mp4" type="video/mp4" autoplay muted playsinline loop style="margin: 0; width: 100%;" ></video>
<figcaption style="text-align: left; padding-top: 0;">
Depth at each patch location is scaled by that patch’s importance value. Agent C4*, showing all patches.
</figcaption>
</div>
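<p>Schematically, the four feature retrieval methods above can be read as different combinations of the
selected-patch mask with the luminance and depth channels. The sketch below is an illustrative reading of
these output modes, not the exact implementation used to produce the videos:</p>
<pre>
import numpy as np

def patch_mask(shape, indices, patch, cols, values=None):
    """Binary (or weighted) mask with one block per selected patch."""
    mask = np.zeros(shape[:2], dtype=np.float32)
    for rank, idx in enumerate(np.asarray(indices)):
        r, c = divmod(int(idx), cols)
        v = 1.0 if values is None else float(values[rank])
        mask[r * patch:(r + 1) * patch, c * patch:(c + 1) * patch] = v
    return mask

def render(frame_rgbd, scores, k, patch, cols, mode):
    """The four output modes shown above (illustrative implementation)."""
    luma  = frame_rgbd[..., :3].mean(axis=-1) / 255.0     # greyscale intensity
    depth = frame_rgbd[..., 3] / 255.0                    # disparity-like channel
    order = np.argsort(scores)[::-1][:k]                  # top-K patches by importance
    if mode == 'importance_ranking':
        ranks = 1.0 - np.arange(k) / k                    # brightness by rank
        return patch_mask(frame_rgbd.shape, order, patch, cols, values=ranks)
    if mode == 'masked_luminance':
        return luma * patch_mask(frame_rgbd.shape, order, patch, cols)
    if mode == 'masked_depth':
        return depth * patch_mask(frame_rgbd.shape, order, patch, cols)
    # 'weighted_depth': depth at every patch scaled by that patch's importance.
    weights = scores / scores.max()
    return depth * patch_mask(frame_rgbd.shape, np.arange(scores.size), patch, cols, values=weights)
</pre>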
<hr>
<h2>Display modalities</h2>
<h3>Simulated Phosphene Visualisation</h3>
<div style="text-align: left;">
<img src="assets/png/TVCG-SA-output-SPV.png" style="margin: 0; width: 100%;" ></img>
<figcaption style="text-align: left; padding-top: 0;">
SPV of different output modes (refer to Figure 5 in the paper). An illustrative sketch of the phosphene rendering is given below.
</figcaption>
</div>
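<p>A common way to produce an SPV, and a reasonable reading of the renderings above, is to sample the processed
intensity image at a grid of electrode locations and draw each phosphene as a Gaussian blob whose brightness
follows the sampled value. The grid size and blob width below are illustrative placeholders, not the
parameters used in the paper:</p>
<pre>
import numpy as np

def simulate_phosphenes(image, grid=(32, 32), sigma=1.5):
    """Render a simulated phosphene view of a processed intensity image in [0, 1]."""
    H, W = image.shape
    gy, gx = grid
    ys = np.linspace(0, H - 1, gy).astype(int)
    xs = np.linspace(0, W - 1, gx).astype(int)
    samples = image[np.ix_(ys, xs)]                # (gy, gx) electrode activations
    out = np.zeros((H, W), dtype=np.float32)
    yy, xx = np.mgrid[0:H, 0:W]
    for i, y in enumerate(ys):
        for j, x in enumerate(xs):
            if samples[i, j] > 0:
                blob = np.exp(-((yy - y) ** 2 + (xx - x) ** 2) / (2 * sigma ** 2))
                out += samples[i, j] * blob
    return np.clip(out, 0.0, 1.0)
</pre>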
<!--
<div style="text-align: center;">
<video class="b-lazy" src="assets/mp4/TODO.mp4" type="video/mp4" autoplay muted playsinline loop style="margin: 0; width: 100%;" ></video>
<figcaption style="text-align: left; padding-top: 0;">
Simulated Phosphene Visualisation (SPV)
</figcaption>
</div> -->
<!-- <p></p>
### vOICe
<dt-cite key="Meijer1993"></dt-cite> -->
</dt-article>
<dt-appendix>
<h3>Acknowledgements</h3>
<p>The template for this supporting materials site is from <a href="https://github.com/attentionagent/attentionagent.github.io">Tang et al.</a></p>
<p>The experiments in this work were performed on Swinburne University's <a href="https://supercomputing.swin.edu.au/ozstar/">OzStar high-performance computing system</a>.</p>
<h3 id="citation">Citation</h3>
<p>For attribution in academic contexts, please cite this work as:</p>
<pre class="citation short">Jaime Ruiz-Serra and Jack White and Stephen Petrie and Tatiana Kameneva and Chris McCarthy,
Learning vision processing for assistive displays through self-attention agents, 2022.</pre>
<p>BibTeX citation</p>
<pre class="citation long">@article{Ruiz-Serra2021,
author = {Ruiz-Serra, Jaime and
White, Jack and
Petrie, Stephen and
Kameneva, Tatiana and
McCarthy, Chris},
title = {Learning vision processing for assistive displays through self-attention agents},
eprint = {},
url = {},
note = "\url{http://ruizserra.github.io/self-attention-assistive-displays}",
year = {2022}
}</pre>
<h3>Open Source Code</h3>
<p>Code to reproduce the results in this work is TBD.</p>
<h3>Reuse</h3>
<p>Diagrams and text are licensed under Creative Commons Attribution <a href="https://creativecommons.org/licenses/by/4.0/">CC-BY 4.0</a> with the <a href="http://github.com/ruizserra/self-attention-assistive-displays/assets">source available on GitHub</a>, unless noted otherwise. The figures that have been reused from other sources don’t fall under this license and can be recognised by the citations in their captions.</p>
</dt-appendix>
</body>
<script type="text/bibliography">
@inproceedings{Tang2020,
title = {Neuroevolution of Self-Interpretable Agents},
url = {http://arxiv.org/abs/2003.08165},
doi = {10.1145/3377930.3389847},
abstract = {Inattentional blindness is the psychological phenomenon that causes one to miss things in plain sight. It is a consequence of the selective attention in perception that lets us remain focused on important parts of our world without distraction from irrelevant details. Motivated by selective attention, we study the properties of artificial agents that perceive the world through the lens of a self-attention bottleneck. By constraining access to only a small fraction of the visual input, we show that their policies are directly interpretable in pixel space. We find neuroevolution ideal for training self-attention architectures for vision-based reinforcement learning ({RL}) tasks, allowing us to incorporate modules that can include discrete, non-differentiable operations which are useful for our agent. We argue that self-attention has similar properties as indirect encoding, in the sense that large implicit weight matrices are generated from a small number of key-query parameters, thus enabling our agent to solve challenging vision based tasks with at least 1000x fewer parameters than existing methods. Since our agent attends to only task critical visual hints, they are able to generalize to environments where task irrelevant elements are modified while conventional methods fail. Videos of our results and source code available at https://attentionagent.github.io/},
pages = {414--424},
booktitle = {GECCO'20: Proceedings of the 2020 Genetic and Evolutionary Computation Conference},
author = {Tang, Yujin and Nguyen, Duong and Ha, David},
urldate = {2020-05-01},
date = {2020},
keywords = {},
}
@article{Beattie2016,
title = {DeepMind Lab},
author = {Beattie, Charles and Leibo, Joel Z. and Teplyashin, Denis and Ward, Tom and Wainwright, Marcus and Küttler, Heinrich and Lefrancq, Andrew and Green, Simon and Valdés, Víctor and Sadik, Amir and Schrittwieser, Julian and Anderson, Keith and York, Sarah and Cant, Max and Cain, Adam and Bolton, Adrian and Gaffney, Stephen and King, Helen and Hassabis, Demis and Legg, Shane and Petersen, Stig},
date = {2016},
url = {http://arxiv.org/abs/1612.03801},
abstract = {DeepMind Lab is a first-person 3D game platform designed for research and development of general artificial intelligence and machine learning systems. DeepMind Lab can be used to study how autonomous artificial agents may learn complex tasks in large, partially observed, and visually diverse worlds. DeepMind Lab has a simple and flexible API enabling creative task-designs and novel AI-designs to be explored and quickly iterated upon. It is powered by a fast and widely recognised game engine, and tailored for effective use by the research community.},
}
@article{Meijer1993,
title = {An Experimental System for Auditory Image Representations},
journaltitle = {IEEE Transactions on Biomedical Engineering},
volume = {39},
number = {2},
author = {Meijer, Peter},
date = {1992},
url = {https://www.seeingwithsound.com/voicebme.html},
}
</script>
<script src="lib/blazy.js"></script>
<script>
// blazy code
var bLazy = new Blazy({
success: function(){
updateCounter();
}
});
// not needed, only here to illustrate amount of loaded images
var imageLoaded = 0;
function updateCounter() {
imageLoaded++;
console.log("blazy image loaded: "+imageLoaded);
}
</script>