%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% ELIFE ARTICLE TEMPLATE
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% PREAMBLE
\documentclass[9pt,lineno]{elife}
% Use the onehalfspacing option for 1.5 line spacing
% Use the doublespacing option for 2.0 line spacing
% Please note that these options may affect formatting.
% Additionally, the use of the \newcommand function should be limited.
\usepackage{algorithm2e}
\usepackage{mathtools}
\usepackage{color, colortbl}
\usepackage{array} % for defining a new column type
\usepackage{varwidth} %for the varwidth minipage environment
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% ARTICLE SETUP
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{TRex, a fast multi-animal tracking system with markerless identification, and 2D estimation of posture and visual fields}
\author[1,2,3*]{Tristan Walter}
\author[1,2,3*]{Iain D Couzin}
\affil[1]{Max Planck Institute of Animal Behavior, Germany}
\affil[2]{Centre for the Advanced Study of Collective Behaviour, University of Konstanz, Germany}
\affil[3]{Department of Biology, University of Konstanz, Germany}
\corr{[email protected]}{TW}
\corr{[email protected]}{IDC}
%\presentadd[\authfn{1}]{Department of Collective Behaviour, Max Planck Institute of Animal Behavior, D-78457 Konstanz, Germany}
%\presentadd[\authfn{2}]{Centre for the Advanced Study of Collective Behaviour, University of Konstanz, Universitätsstraße 10, D-78457 Konstanz, Germany}
%\presentadd[\authfn{3}]{Department of Biology, University of Konstanz, Universitätsstraße 10, D-78457 Konstanz, Germany}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% ARTICLE START
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\figref}[1]{\textit{\textbf{\ref{#1}}}}
\newcommand{\vidref}[1]{\textit{\textbf{\ref{#1}}}}
\newcommand{\tableref}[1]{\textit{\textbf{\ref{tab:#1}}}\xspace}
\newcommand{\videoref}[1]{video~\textit{\textbf{\ref{#1}}}}
%\DeclarePairedDelimiterX\set[1]\lbrace\rbrace{\def\given{\;\delimsize\vert\;}#1}
\NewDocumentCommand{\up}{som}{%
\IfBooleanTF{#1}
{\upext{#3}}
{#3\IfNoValueTF{#2}{\mathord}{#2}\uparrow}%
}
\NewDocumentCommand{\upext}{m}{%
\mleft.\kern-\nulldelimiterspace#1\mright\uparrow
}
\DeclarePairedDelimiterX{\given}[1]{(}{)}{%
\ifnum\currentgrouptype=16 \else\begingroup\fi
\activatebar#1
\ifnum\currentgrouptype=16 \else\endgroup\fi
}
\DeclarePairedDelimiterX{\givenset}[1]{\{}{\}}{%
\ifnum\currentgrouptype=16 \else\begingroup\fi
\activatebar#1
\ifnum\currentgrouptype=16 \else\endgroup\fi
}
\newcommand{\idtracker}{\protect\path{ idtracker.ai}}
\newcommand{\TRex}{\protect\path{TRex}}
\newcommand{\TGrabs}{\protect\path{TGrabs}}
\definecolor{Gray}{gray}{0.9}
\newcommand{\innermid}{\nonscript\;\vert\nonscript\;} % \delimsize
\newcommand{\activatebar}{%
\begingroup\lccode`\~=`\|
\lowercase{\endgroup\let~}\innermid
\mathcode`|=\string"8000
}
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\newcommand{\expnumber}[2]{{#1}\mathrm{e}{#2}}
\DeclarePairedDelimiter{\nint}\lfloor\rceil
\newcommand{\argmax}[1]{\underset{#1}{\operatorname{arg}\,\operatorname{max}}\;}
\newcommand*\mean[1]{\bar{#1}}
\newcommand{\Tau}{\mathcal{T}}
\newcommand{\direction}[1]{\overrightarrow{#1}\;}
\DeclareMathOperator*{\median}{median}
\DeclareMathOperator{\atantwo}{atan2}
\renewcommand{\thefigure}{Figure~\arabic{figure}}
\captionsetup*[figure]{name={\hspace{-2.5pt}},font={color=eLifeDarkBlue,small},skip=\smallskipamount,justification=justified}
\captionsetup*[table]{name={\hspace{-2.5pt}},font={color=eLifeDarkBlue,small},margin=0pt,indention=0cm,justification=justified}
\renewcommand{\thetable}{Table~\arabic{table}}
\newcommand{\changemade}[1]{#1}
%\renewcommand{\changemade}[1]{{\color{blue}#1}}
\makeatletter\newcommand\newtag[2]{#1\def\@currentlabel{#1\hspace{-2pt}}\label{#2}}\makeatother%
\makeatletter
\newcommand*{\inlineequation}[2][]{%
\begingroup
% Put \refstepcounter at the beginning, because
% package `hyperref' sets the anchor here.
\refstepcounter{equation}%
\ifx\\#1\\%
\else
\label{#1}%
\fi
% prevent line breaks inside equation
\relpenalty=10000 %
\binoppenalty=10000 %
\ensuremath{%
% \displaystyle % larger fractions, ...
#2%
}%
~\@eqnnum
\endgroup
}
\makeatother
\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% INTRODUCTION AND ABSTRACT
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\maketitle
\begin{abstract}
Automated visual tracking of animals is rapidly becoming an indispensable tool for the study of behavior. It offers a quantitative methodology by which organisms' sensing and decision-making can be studied in a wide range of ecological contexts. Despite this, existing solutions tend to be challenging to deploy in practice, especially when considering long and/or high-resolution video\changemade{-}streams. Here, we present TRex, a fast and easy-to-use solution for tracking a large number of individuals simultaneously \changemade{using background-subtraction}, with real-time (60Hz) tracking performance for up to approximately 256 individuals, which also estimates 2D \changemade{visual fields, outlines, and head/rear positions of bilateral animals}, both in open and closed-loop contexts. Additionally, TRex offers highly accurate, deep-learning-based visual identification of up to approximately 100 unmarked individuals, where it is between 2.5 and 46.7 times faster, and requires 2 to 10 times less memory, than comparable software (with relative performance increasing for more organisms\changemade{/}longer videos) and provides interactive data-exploration within an intuitive, platform-independent graphical user interface.
\end{abstract}
\section{Introduction}
Tracking multiple moving animals (and multiple objects, generally) is important in various fields of research such as behavioral studies, ecophysiology, biomechanics, and neuroscience (\cite{dell2014automated}). Many tracking algorithms have been proposed in recent years (\cite{ohayon2013automated}, \cite{fukunaga2015grouptracker}, \cite{burgos2012social}, \cite{rasch2016closing}), often limited to/only tested with a particular organism (\cite{hewitt2018novel}, \cite{branson2009high}) or type of organism (e.g. protists, \cite{pennekamp2015bemovi}; fly larvae and worms, \cite{risse2017fimtrack}). Relatively few have been tested with a range of organisms and scenarios (\cite{idtracker}, \cite{sridhar2019tracktor}, \cite{rodriguez2018toxtrac}). Furthermore, many existing tools offer only a specialized set of features, struggle with very long or high-resolution ($\ge$ 4K) videos, or simply take too long to yield results. Existing fast algorithms are often severely limited with respect to the number of individuals that can be tracked simultaneously; for example xyTracker (\cite{rasch2016closing}) allows for real-time tracking at 40Hz while accurately maintaining identities, and thus is suitable for closed-loop experimentation (experiments where stimulus presentation can depend on the real-time behaviors of the individuals, e.g. \cite{bath2014flymad}, \cite{brembs2000operant}, \cite{bianco2015visuomotor}), but is limited to tracking only 5 individuals simultaneously. ToxTrac (\cite{rodriguez2018toxtrac}), a software comparable to xyTracker in its set of features, is limited to 20 individuals and relatively low frame-rates ($\leq$25fps). Others, while implementing a wide range of features and offering high-performance tracking, are costly and thus limited in access (\cite{noldus2001ethovision}). Perhaps with the exception of proprietary software, one major problem at present is the severe fragmentation of features across the various software solutions. For example, experimentalists must typically construct work-flows from many individual tools: One tool might be responsible for estimating the animals' positions, another for estimating their posture, another for reconstructing visual fields (which in turn probably also estimates animal posture, but does not export it in any way), and one for keeping identities -- correcting results of other tools post-hoc. It can take a very long time to make them all work effectively together, adding what is often considerable overhead to behavioral studies.
\TRex{}, the software released with this publication (available at \href{https://trex.run}{trex.run} under an Open-Source license), has been designed to address these problems, and thus to provide a powerful, fast and easy to use tool that will be of use in a wide range of behavioral studies. It allows users to track moving objects/animals, as long as there is a way to separate them from the background (e.g. static backgrounds, custom masks, as discussed below). In addition to the positions of individuals, our software provides other per-individual metrics such as body shape and, if applicable, head-/tail-position. This is achieved using a basic posture analysis, which works out of the box for most organisms, and, if required, can be easily adapted for others. Posture information, which includes the body center-line, can be useful for detecting e.g. courtship displays and other behaviors that might not otherwise be obvious from mere positional data. Additionally, with the visual sense often being one of the most important modalities to consider in behavioral research, we include the capability for users to obtain a computational reconstruction of the visual fields of all individuals (\citealt{strandburg2013visual}, \citealt{rosenthal2015revealing}). This not only reveals which individuals are visible from an individual's point-of-view, as well as the distance to them, but also which parts of others' bodies are visible.
Included in the software package is a task-specific tool, \TGrabs{}, that is employed to pre-process existing video files and which allows users to record directly from cameras capable of live-streaming to a computer (with extensible support from generic webcams to high-end machine vision cameras). It supports most of the above-mentioned tracking features (positions, posture, visual field) and provides access to results immediately while continuing to record/process. This not only saves time, since tracking results are available immediately after the trial, but makes closed-loop support possible for large groups of individuals ($\leq$ 128 individuals). \TRex{} and \TGrabs{} are written in \verb!C++! but, as part of our closed-loop support, we are providing a \verb!Python!-based general scripting interface which can be fully customized by the user without the need to recompile or relaunch. This interface allows for compatibility with external programs (e.g. for closed-loop stimulus-presentation) and other custom extensions.
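
To give a concrete impression of what such a script can look like, the following minimal sketch implements a simple closed-loop rule in \verb!Python! (trigger an external stimulus whenever any individual enters a region of interest). All function and field names used here (\verb!update!, \verb!position!, \verb!trigger_stimulus!) are placeholders chosen for illustration and are not part of the actual \TRex{} scripting interface, which is documented online.
\begin{verbatim}
# Hypothetical closed-loop callback; all names are illustrative only
# and do not correspond to the actual TRex scripting API.
import numpy as np

ROI_CENTER = np.array([0.5, 0.5])  # normalized arena coordinates (assumed)
ROI_RADIUS = 0.1

def update(frame, individuals):
    """Assumed to be called once per tracked frame."""
    positions = np.array([ind["position"] for ind in individuals])
    inside = np.linalg.norm(positions - ROI_CENTER, axis=1) < ROI_RADIUS
    if inside.any():
        trigger_stimulus(np.flatnonzero(inside))

def trigger_stimulus(ids):
    # placeholder: forward the event to external stimulus software
    print("stimulus for individuals:", list(ids))
\end{verbatim}
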
The fast tracking described above employs information about the kinematics of each organism in order to try to maintain their identities. This is very fast and useful in many scenarios, e.g. where general assessments about group properties (group centroid, alignment of individuals, density, etc.) are to be made. However, when making conclusions about \textit{individuals} instead, maintaining identities perfectly throughout the video is a critical requirement. Every tracking method inevitably makes mistakes, which, for small groups of two or three individuals or short videos, can be corrected manually -- at the expense of spending much more time on analysis, which rapidly becomes prohibitive as the number of individuals to be tracked increases. To make matters worse, when multiple individuals stay out of view of the camera for too long (such as if individuals move out of frame, under a shelter, or occlude one another) there is no way to know who is whom once they re-emerge. With no baseline truth available (e.g. using physical tags as in \cite{alarcon2018automated}, \cite{nagy2013context}; or marker-less methods as in \cite{idtracker}, \cite{idtrackerai}, \cite{rasch2016closing}), these mistakes can not be corrected and accumulate over time, until eventually all identities are fully shuffled. To solve this problem (and without the need to mark, or add physical tags to individuals), \TRex{} can, at the cost of spending more time on analysis (and thus not during live-tracking), automatically learn the identity of up to approximately 100 unmarked individuals based on their visual appearance. This machine-learning based approach, herein termed \textit{visual identification}, provides an independent source of information on the identity of individuals, which is used to detect and correct potential tracking mistakes without the need for human supervision.
\changemade{In this paper, we evaluate the most important functions of our software} in terms of speed and reliability using a wide range of experimental systems, including termites, fruit flies, locusts and multiple species of schooling fish (although we stress that our software is not limited to such species).
Specifically regarding the visual identification of unmarked individuals in groups, \idtracker{} is currently state-of-the-art, yielding high-accuracy (>99\% in most cases) in maintaining consistent identity assignments across entire videos (\cite{idtrackerai}). Similarly to \TRex{}, this is achieved by training an artificial neural network to visually differentiate between individuals, and using identity predictions from this network to avoid/correct tracking mistakes. Both approaches work without human supervision, and are limited to approximately 100 individuals. Given that \idtracker{} is the only currently available tool with visual identification for such large groups of individuals, and also because of the quality of results, we will use it as a benchmark for our visual identification system. Results will be compared in terms of both accuracy and computation speed, showing \TRex{}' ability to achieve the same high level of accuracy but typically at far higher speeds, and with a much reduced memory requirement.
\TRex{} is platform-independent and runs on all major operating systems (Linux, Windows, macOS), and offers complete batch processing support, allowing users to efficiently process entire sets of videos without requiring human intervention. All parameters can be accessed either through settings files, from within the graphical user interface (or \textit{GUI}), or using the command-line. The user interface supports off-site access using a built-in web-server (although it is recommended to only use this from within a secure VPN environment). Available parameters are explained in the documentation directly as part of the GUI and on an external website (see below). Results can be exported to independent data-containers (\texttt{NPZ}, or \texttt{CSV} \changemade{for plain-text type data}) for further analyses in software of the user's choosing. We will not go into detail regarding the many GUI functions since, albeit of great utility to the researcher, they are only the means to easily apply the features presented herein. Some examples will be given in the main text and appendix, but a comprehensive collection of all of them, as well as detailed documentation, is available in the up-to-date online-documentation which can be found at \href{https://trex.run/docs}{trex.run/docs}.
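
As a brief example of downstream analysis, exported \texttt{NPZ} containers can be read directly with \path{numpy}. Note that the file name and array keys used below are assumptions for illustration only; the actual exported fields are configurable and described in the online documentation.
\begin{verbatim}
# Minimal sketch: reading exported per-individual data with numpy.
# File name and array keys are assumptions; the exported fields
# depend on the chosen output options (see online documentation).
import numpy as np

data = np.load("video_fish0.npz")   # one container per individual (assumed)
print(sorted(data.files))           # list all exported arrays

x, y = data["X"], data["Y"]         # per-frame positions (assumed keys)
speed = np.hypot(np.diff(x), np.diff(y))
print("median speed (px/frame):", np.median(speed))
\end{verbatim}
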
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% FEATURES
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[!hb]
\begin{fullwidth}
\includegraphics[width=1.0\linewidth]{figures/software-overview.pdf}
\captionsetup{margin=0pt,calcmargin={0pt,-4.5cm}}
\caption{Videos are typically processed in four main stages, illustrated here each with a list of prominent features. Some of them are accessible from both \TRex{} and \TGrabs{}, while others are software specific (as shown at the very top). (a) The video is either recorded directly with our software (\TGrabs{}), or converted from a pre-recorded video file. Live-tracking enables users to perform closed-loop experiments, for which a virtual testing environment is provided. (b) Videos can be tracked and parameters adjusted with visual feedback. Various exploration and data presentation features are provided and customized data streams can be exported for use in external software. (c) After successful tracking, automatic visual identification can, optionally, be used to refine results. An artificial neural network is trained to recognize individuals, helping to automatically correct potential tracking mistakes. In the last stage, many graphical tools are available to users of \TRex{}, a selection of which is listed in (d).}
\label{fig:software_overview}
\videosupp{\changemade{This video shows an overview of the typical chronology of operations when using our software. Starting with the raw video, segmentation using \TGrabs{} (\figref{fig:software_overview}a) is the first and only step that is not optional. Tracking (\figref{fig:software_overview}b) and posture estimation (both also available for live-tracking in \TGrabs{}) are usually performed in that order, but can be partly parallelized (e.g. performing posture estimation in parallel for all individuals). Visual identification (\figref{fig:software_overview}c) is only available in \TRex{} due to relatively long processing times. All clips from this composite video have been recorded directly in \TRex{}. \url{https://youtu.be/g9EOi7FZHM0}}}
\end{fullwidth}
\end{figure}
\section{Results}\label{sec:methods_evaluation}
\changemade{Our software package consists of two task-specific tools, \TGrabs{} and \TRex{}, with different specializations. \TGrabs{} is primarily designed to connect to cameras and to be very fast. It employs the same program code as \TRex{} to achieve real-time online tracking, such as could be employed for closed-loop experiments (the user can launch \TGrabs{} from the opening dialog of \TRex{}). However, its focus on speed comes at the cost of not having access to the rich graphical user interface or more sophisticated (and thus slower) processing steps, such as deep-learning based identification, that \TRex{} provides. \TRex{} focusses on the more time-consuming tasks, as well as visual data exploration, re-tracking existing results -- but sometimes it simply functions as an easier-to-use graphical interface for tracking and adjusting parameters. Together they provide a wide range of capabilities to the user and are often used in sequence as part of the same work-flow. Typically, such a sequence can be summarized in four stages (see also \figref{fig:pipeline_overview} for a flow diagram):}
%The workflow for using our software is straightforward and can be summarized in four stages:
\begin{enumerate}
\item \textbf{Segmentation} in \TGrabs{}. When recording a video or converting a previously recorded file (e.g. MP4, AVI), it is segmented into background and foreground-objects (\verb!blobs!), the latter typically being the entities to be tracked (the general principle is illustrated in the sketch after this list). Results are saved to a custom, non-proprietary video format (\verb!PV!) (\figref{fig:software_overview}a).
\item \textbf{Tracking} the video, either directly in \TGrabs{}, or \changemade{in \TRex{} after pre-processing,} with access to customizable visualizations and the ability to change tracking parameters on-the-fly. Here, we will describe two types of data available within \TRex{}, 2D posture- and visual-field estimation, as well as real-time applications of such data (\figref{fig:software_overview}b).
\item \changemade{\textbf{Automatic identity correction} (\figref{fig:software_overview}c), a way of utilizing the power of a trained neural network to perform visual identification of individuals, is available in \TRex{} only.} This step may not be necessary in many cases, but it is the only way to guarantee consistent identities throughout the video. It is also the \changemade{most processing-heavy (and thus usually the most time-consuming)} step, as well as the only one involving machine learning. All previously collected posture- and other tracking-related data are utilized in this step, placing it late in a typical workflow.
\item Data visualization is a critical component of any research project, especially for unfamiliar datasets, but manually crafting one for every new experiment can be very time-consuming. Thus, \TRex{} offers a universal, highly customizable way to make all collected data available for interactive \textbf{exploration} (\figref{fig:software_overview}d) -- allowing users to change many display options and to record video clips for external playback. Tracking parameters can be adjusted on the fly (many with visual feedback) -- important, e.g., when preparing closed-loop feedback experiments with a new species or setup.
\end{enumerate}
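
The segmentation step referenced in item 1 can be illustrated by a generic background-subtraction sketch. This is not \TGrabs{}' implementation (which offers several thresholding and masking modes), but it captures the underlying principle: subtract a static background image, threshold the difference, and extract connected foreground regions as blobs. The file names and threshold value below are arbitrary assumptions.
\begin{verbatim}
# Generic background-subtraction sketch (not TGrabs' actual code):
# difference against a static background, threshold, extract blobs.
import cv2

background = cv2.imread("background.png", cv2.IMREAD_GRAYSCALE)  # assumed
capture = cv2.VideoCapture("video.mp4")

while True:
    ok, frame = capture.read()
    if not ok:
        break
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    diff = cv2.absdiff(gray, background)
    _, mask = cv2.threshold(diff, 25, 255, cv2.THRESH_BINARY)  # arbitrary
    n, labels, stats, centroids = cv2.connectedComponentsWithStats(mask)
    # component 0 is the background; 1..n-1 are candidate blobs
    blobs = [centroids[i] for i in range(1, n)
             if stats[i, cv2.CC_STAT_AREA] > 10]
capture.release()
\end{verbatim}
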
\begin{figure}[h]
\centering
\includegraphics[width=1\linewidth]{figures/pipeline_overview.pdf}
\caption{An overview of the interconnection between \TRex{}, \TGrabs{} and their data in- and output formats, with titles on the left corresponding to the stages in \figref{fig:software_overview}. Starting at the top of the figure, video is either streamed to \TGrabs{} from a file or directly from a compatible camera. At this stage, preprocessed data are saved to a \textit{.pv} file which can be read by \TRex{} later on. Thanks to its integration with parts of the \TRex{} code, \TGrabs{} can also perform online tracking for limited numbers of individuals, and save results to a \textit{.results} file (that can be opened by \TRex{}) along with individual tracking data saved to \protect\path{numpy} data-containers (\textit{.npz}) \changemade{or standard CSV files}, which can be used for analysis in third-party applications. If required, videos recorded directly using \TGrabs{} can also be streamed to a \textit{.mp4} video file which can be viewed in commonly available video players like \protect\path{VLC}.}
\label{fig:pipeline_overview}
\end{figure}
%\begin{featurebox}
%\caption{Compatibility with other segmentation software}
%\label{box:background-sub-compat}
%\medskip
%\end{featurebox}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% EVALUATION
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\section{Results}
Below we assess the performance \changemade{of our software} regarding the four properties that are most important when using it (or in fact any tracking software) in practice: (i) the time it takes to perform tracking, (ii) the time it takes to perform automatic identity correction, (iii) the peak memory consumption when correcting identities (since this is where memory consumption is maximal), and (iv) the accuracy of the produced trajectories after visual identification.
%(i) The accuracy of the produced trajectories in terms of keeping identities (ii) the time it took to produce results and (iii) the memory-consumption of the process.
While accuracy is an important metric and specific to identification tasks, time and memory are typically of considerable practical importance for all tasks. For example, tracking speed may be the difference between only being able to run a few trials and producing more reliable results with a much larger number of trials. In addition, tracking speed can make a major difference as the number of individuals increases. Furthermore, memory constraints can be prohibitive, making tracking over long video sequences and/or for a large number of individuals extremely time-consuming, or even impossible, for the user.
\begin{table}[t]
% Use "S" column identifier to align on decimal point
\begin{tabular}{l l l l l l r}
\toprule
ID & species & common name & {\# ind.} & fps (Hz) & duration & size ($\mathrm{px}^2$) \\
\midrule
\newtag{ 0 }{vid:reversals3m_1024_dotbot_20181025_105202.stitched} & \textit{Leucaspius delineatus} & sunbleak & 1024 & 40 & 8min20s & $3866\times 4048$\\
\newtag{ 1 }{vid:reversals3m_512_dotbot_20191111_165201.stitched} & \textit{Leucaspius delineatus} & sunbleak & 512 & 50 & 6min40s & $3866\times 4140$\\
\newtag{ 2 }{vid:reversals3m_512_dotbot_20190122_155201.stitched} & \textit{Leucaspius delineatus} & sunbleak & 512 & 60 & 5min59s & $3866\times 4048$\\
\newtag{ 3 }{vid:reversals3m_256_dotbot_20191122_154201.stitched} & \textit{Leucaspius delineatus} & sunbleak & 256 & 50 & 6min40s & $3866\times 4140$\\
\newtag{ 4 }{vid:reversals3m_256_dotbot_20181214_151202.stitched} & \textit{Leucaspius delineatus} & sunbleak & 256 & 60 & 5min59s & $3866\times 4048$\\
\newtag{ 5 }{vid:reversals3m_128_dotbot_20181211_153201.stitched} & \textit{Leucaspius delineatus} & sunbleak & 128 & 60 & 6min & $3866\times 4048$\\
\newtag{ 6 }{vid:reversals3m_128_dotbot_20190116_135201.stitched} & \textit{Leucaspius delineatus} & sunbleak & 128 & 60 & 5min59s & $3866\times 4048$\\
\newtag{ 7 }{vid:video_example_100fish_1min} & \textit{Danio rerio} & zebrafish & 100 & 32 & 1min & $3584\times 3500$\\
\newtag{ 8 }{vid:flies_N59} & \textit{Drosophila melanogaster} & fruit-fly & 59 & 51 & 10min & $2306\times 2306$\\
\newtag{ 9 }{vid:15locusts1h} & \textit{Schistocerca gregaria} & locust & 15 & 25 & 1h0min & $1880\times 1881$\\
\newtag{ 10 }{vid:N05HHS2019-10S-V1} & \textit{Constrictotermes cyphergaster} & termite & 10 & 100 & 10min5s & $1920\times 1080$\\
\newtag{ 11 }{vid:group_3} & \textit{Danio rerio} & zebrafish & 10 & 32 & 10min10s & $3712\times 3712$\\
\newtag{ 12 }{vid:group_2} & \textit{Danio rerio} & zebrafish & 10 & 32 & 10min3s & $3712\times 3712$\\
\newtag{ 13 }{vid:group_1} & \textit{Danio rerio} & zebrafish & 10 & 32 & 10min3s & $3712\times 3712$\\
\newtag{ 14 }{vid:guppy_8_t46_d1_20191207_102508} & \textit{Poecilia reticulata} & guppy & 8 & 30 & 3h15min22s & $3008\times 3008$\\
\newtag{ 15 }{vid:guppy_8_t36_d15_20191212_085800} & \textit{Poecilia reticulata} & guppy & 8 & 25 & 1h12min & $3008\times 3008$\\
\newtag{ 16 }{vid:guppy_8_t20_d1_20190512_115801} & \textit{Poecilia reticulata} & guppy & 8 & 35 & 3h18min13s & $3008\times 3008$\\
\newtag{ 17 }{vid:singleguppy_f2_d9} & \textit{Poecilia reticulata} & guppy & 1 & 140 & 1h9min32s & $1312\times 1312$\\
\bottomrule
\end{tabular}
\medskip
\caption{\label{tab:videos}A list of the videos used in this paper as part of the evaluation of \TRex{}, along with the species of animals in the videos and their common names, as well as other video-specific properties. Videos are given an incremental ID to make references in the following text more efficient, and are sorted by the number of individuals in the video. Individual quantities are given accurately, except for the videos with more than 100 individuals, where the exact number may be slightly more or less. These videos have been analysed using \TRex{}' dynamic analysis mode that supports unknown quantities of animals.}
\tabledata{\changemade{Videos \vidref{vid:video_example_100fish_1min} and \vidref{vid:flies_N59}, as well as \vidref{vid:group_1}-\vidref{vid:group_3}, are available as part of the original \texttt{idtracker} paper (\cite{idtracker}). Many of the videos are part of as yet unpublished data: guppy videos have been recorded by A. Albi, and videos with sunbleak (\textit{Leucaspius delineatus}) have been recorded by D. Bath. The termite video has been kindly provided by H. Hugo and the locust video by F. Oberhauser. Due to the size of some of these videos (>150GB per video), they can only be made available upon specific request. Raw versions of these videos (some trimmed), as well as full preprocessed versions, are available as part of the dataset published alongside this paper \cite{walter2020dataset}.}}
\end{table}
In all of our tests we used a relatively modest computer system, which could be described as a mid-range consumer or gaming PC:
\begin{enumerate} [label=\textnormal{$\bullet$}]
\item \label{ref:hardware_recommend} Intel Core i9-7900X CPU
\item NVIDIA Geforce 1080 Ti
\item 64GB RAM
\item NVMe PCIe x4 hard-drive
\item Debian bullseye (\href{https://www.debian.org/devel/debian-installer/}{debian.org})
\end{enumerate}
As can be seen in the following sections (memory consumption, processing speeds, etc.), using a high-end system is not necessary to run \TRex{} and, anecdotally, we did not observe noticeable improvements when using a solid-state drive versus a normal hard drive. A video card (presently an NVIDIA card due to the requirements of TensorFlow) is recommended for tasks involving visual identification, as such computations will take much longer without it -- however, it is not required. We decided to employ this system because it includes a relatively cheap, compatible graphics card, and to ensure that we had an easy way to produce direct comparisons with \idtracker{} -- which, according to their website, requires large amounts of RAM (32-128GB, \href{https://idtrackerai.readthedocs.io/en/latest/how_to_install.html}{idtrackerai online documentation}) and a fast solid-state drive.
\tableref{videos} shows the entire set of videos used in this paper, which have been obtained from multiple sources (credited under the table) and span a wide range of different organisms, demonstrating \TRex{}' ability to track anything as long as it moves occasionally. Videos involving a large number (>100) of individuals are all the same species of fish since these were the only organisms we had available in such quantities. However, this is not to say that only fish could be tracked efficiently in these quantities. We used the full dataset with up to 1024 individuals in one video (\videoref{vid:reversals3m_1024_dotbot_20181025_105202.stitched}) to evaluate raw tracking speed without visual identification and identity corrections (next sub-section). However, since such numbers of individuals exceed the capacity of the neural network used for automatic identity corrections (compare also \cite{idtrackerai} who used a similar network), we only used a subset of these videos (videos \vidref{vid:video_example_100fish_1min} through \vidref{vid:guppy_8_t20_d1_20190512_115801}) to look specifically into the quality of our visual identification in terms of keeping identities and its memory consumption.
\subsection{Tracking: Speed and Accuracy}
In evaluating the \nameref{sec:tracking} portion of \TRex{}, the main focus lies on processing speed, while accuracy in terms of keeping identities is of secondary importance. Tracking is required in all other parts of the software, making it an attractive target for extensive optimization. Especially with regard to closed-loop and live-tracking situations, there may be no room to lose even a millisecond between frames without risking dropped frames. We therefore designed \TRex{} to support the simultaneous tracking of many ($\geq$256) individuals \textit{quickly} and to achieve reasonable \textit{accuracy} for up to 100 individuals -- the two claims we investigate in the following.
Trials were run without posture/visual-field estimation enabled, where tracking generally, and consistently, reaches speeds faster than real-time (processing times of 1.5-40\% of the video duration, 25-100Hz) even for a relatively large number of individuals (77-94.77\% for up to 256 individuals, see \tableref{absolute_speeds_no_posture}). Videos with more individuals (>500) were still tracked within reasonable time of 235\% to 358\% of the video duration. As would be expected from these results, we found that combining tracking and recording in a single step generally leads to higher processing speeds. The only situation where this was not the case was a video with 1024 individuals, which suggests that live-tracking (in \TGrabs{}) handles cases with many individuals slightly worse than offline tracking (in \TRex{}). Otherwise, 5\% to 35\% shorter total processing times were measured (14.55\% on average, see \tableref{timings}), compared to running \TGrabs{} separately and then tracking in \TRex{}. These percentage differences, in most cases, reflect the ratio between the video duration and the time it takes to track it, suggesting that most time is spent -- by far -- on the conversion of videos. This additional cost can be avoided in practice when using \TGrabs{} to record videos, by directly writing to a custom format recognized by \TRex{}, and/or using its live-tracking ability to export tracking data immediately after the recording is stopped.
We also investigated trials that were run with posture estimation \textit{enabled} and we found that real-time speed could be achieved for videos with $\leq$ 128 individuals (see column "tracking" in \tableref{timings}). Tracking speed, when posture estimation is enabled, depends more strongly on the size of individuals in the image.
Generally, tracking software becomes slower as the number of individuals to be tracked increases, as a result of an exponentially growing number of combinations to consider during matching. \changemade{\TRex{} uses a novel tree-based algorithm by default (see \nameref{sec:tracking}), but circumvents problematic situations by falling back on the \textit{Hungarian method} (also known as the \textit{Kuhn–Munkres algorithm}, \cite{kuhn1955hungarian})} when necessary. Comparing our mixed approach (see \nameref{sec:tracking}) to purely using the Hungarian method shows that, while both perform similarly for few individuals, the Hungarian method is easily outperformed by our algorithm for larger groups of individuals (as can be seen in \figref{fig:approx_accurate}). This might be due to custom optimizations regarding local cliques of individuals, whereby we ignore objects that are too far away, and to our optimized pre-sorting. The Hungarian method has the advantage of not leading to combinatorial explosions in some situations -- and thus has a lower \textit{maximum} complexity, while proving to be less optimal in the \textit{average} case. For further details, see the appendix: \nameref{sec:matching_graph}.
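
For reference, the assignment problem solved by the Hungarian method can be sketched with off-the-shelf tooling. The example below uses \path{scipy} purely as an illustration of the principle, not of \TRex{}' own implementation (which is written in \verb!C++! and uses the tree-based algorithm by default); the coordinates are made up.
\begin{verbatim}
# Sketch of frame-to-frame matching via the Hungarian method using
# scipy (illustrative only; not TRex' own implementation).
import numpy as np
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist

predicted = np.array([[10., 12.], [40., 41.], [80., 79.]])  # expected
detected  = np.array([[11., 13.], [79., 80.], [41., 40.]])  # current blobs

cost = cdist(predicted, detected)          # pairwise Euclidean distances
rows, cols = linear_sum_assignment(cost)   # minimizes the total distance
for i, j in zip(rows, cols):
    print(f"individual {i} -> blob {j} ({cost[i, j]:.2f} px)")
\end{verbatim}
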
\label{sec:evaluation_accuracy}
In addition to speed, we also tested the accuracy of our tracking method, with regards to the consistency of identity assignments, comparing its results to the manually reviewed data (the methodology of which is described in the next section). In order to avoid counting follow-up errors as "new" errors, we divided each trajectory in the uncorrected data into "uninterrupted" segments of frames, instead of simply comparing whole trajectories. A segment is interrupted when an individual is lost (for any of the reasons given in \nameref{sec:segments}) and starts again when it is reassigned to another object later on. We term these (re-)assignments \textit{decisions} here. Each segment of every individual can be uniquely assigned to a similar/identical segment in the baseline data and its identity. Following one trajectory in the uncorrected data, we can detect these wrong decisions by checking whether the baseline identity associated with one segment of that trajectory changes in the next. We found that roughly 80\% of such decisions made by the tree-based matching were correct, even with relatively high numbers of individuals (100). For trajectories where no manually reviewed data were available, we used automatically corrected trajectories as a base for our comparison -- we evaluate the accuracy of these automatically corrected trajectories in the following section. Even though we did not investigate accuracy in situations with more than 100 individuals, we suspect similar results since the property with the strongest influence on tracking accuracy -- individual density -- is limited physically and most of the investigated species school tightly in either case.
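
The decision-based metric described above can be summarized in a short sketch, assuming that the baseline identity associated with each uninterrupted segment of one uncorrected trajectory has already been determined (all names and values are illustrative):
\begin{verbatim}
# Sketch of the decision-based accuracy metric: a (re-)assignment is
# correct if the baseline identity does not change from one segment
# of the uncorrected trajectory to the next.
def decision_accuracy(segment_baseline_ids):
    """Baseline identity per segment, in temporal order."""
    pairs = list(zip(segment_baseline_ids[:-1], segment_baseline_ids[1:]))
    if not pairs:
        return 1.0
    correct = sum(1 for a, b in pairs if a == b)
    return correct / len(pairs)

# example: 4 of 5 re-assignments keep the same baseline identity
print(decision_accuracy([3, 3, 3, 7, 7, 7]))  # -> 0.8
\end{verbatim}
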
\begin{table}
% Use "S" column identifier to align on decimal point
\begin{tabular}{l l l l r}
\toprule
video & {\# ind.} & N (\TRex{} trials) & \% similar individuals & $\diameter$ final uniqueness \\
\midrule
\vidref{vid:video_example_100fish_1min} & $100$ & $5$ & $99.8346\pm 0.5265$ & $0.9758\pm 0.0018$ \\
\vidref{vid:flies_N59} & $59$ & $5$ & $98.6885\pm 2.1145$ & $0.9356\pm 0.0358$ \\
\vidref{vid:group_1} & $10$ & $5$ & $99.9902\pm 0.3737$ & $0.9812\pm 0.0013$ \\
\vidref{vid:group_3} & $10$ & $5$ & $99.9212\pm 1.1208$ & $0.9461\pm 0.0039$ \\
\vidref{vid:group_2} & $10$ & $5$ & $99.9546\pm 0.8573$ & $0.9698\pm 0.0024$ \\
\vidref{vid:guppy_8_t46_d1_20191207_102508} & $8$ & $5$ & $98.8359\pm 5.8136$ & $0.9192\pm 0.0077$ \\
\vidref{vid:guppy_8_t36_d15_20191212_085800} & $8$ & $5$ & $99.2246\pm 4.4486$ & $0.9576\pm 0.0023$ \\
\vidref{vid:guppy_8_t20_d1_20190512_115801} & $8$ & $5$ & $99.7704\pm 2.1994$ & $0.9481\pm 0.0025$ \\
\bottomrule
\end{tabular}
\medskip
\caption{\label{tab:recognition_acc}Evaluating comparability of the automatic visual identification between \idtracker{} and \TRex{}. Columns show various video properties, as well as the associated uniqueness score (see \autoref{box:uniqueness_score}) and a similarity metric. Similarity (\textit{\% similar individuals}) is calculated by comparing the positions for each identity exported by both tools, choosing the closest matches overall and counting the ones that are differently assigned per frame. An individual is classified as "wrong" in that frame if the Euclidean distance between the matched solutions from \idtracker{} and \TRex{} exceeds 1\% of the video width. The column "\% similar individuals" shows percentage values, where a value of $99\%$ would indicate that, on average, 1\% of the individuals are assigned differently. To demonstrate how uniqueness corresponds to the quality of results, the last column shows the average uniqueness achieved across trials.}
\tabledata{\changemade{This file contains all X and Y positions for each trial and each software combined into one very large table. This data is also available in different formats in \cite{walter2020dataset}.}}
\tabledata{\changemade{Assignments between identities from multiple solutions, as calculated by a bipartite-graph matching algorithm. For each permutation of trials from \TRex{} and \idtracker{} for the same video, the algorithm sought to match the trajectories of the same physical individuals in both trials by finding the ones with the smallest mean Euclidean distance per frame between them. Available at \url{http://dx.doi.org/10.17617/3.4y}, as \protect\path{T2_source_data.zip}.}}
\end{table}
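
The similarity metric reported in \tableref{recognition_acc} can be sketched as follows, assuming per-frame position arrays for each identity from both tools. This is an illustration of the procedure described in the caption, not the original analysis script, and it assumes that every identity has valid positions in at least some frames.
\begin{verbatim}
# Sketch of the similarity metric: match identities between two
# solutions by mean per-frame distance (bipartite matching), then
# count individuals whose matched positions differ by more than 1%
# of the video width in a given frame. Illustrative only.
import numpy as np
from scipy.optimize import linear_sum_assignment

def percent_similar(pos_a, pos_b, video_width):
    """pos_a, pos_b: arrays of shape (frames, individuals, 2)."""
    diff = pos_a[:, :, None, :] - pos_b[:, None, :, :]
    mean_dist = np.nanmean(np.linalg.norm(diff, axis=-1), axis=0)
    rows, cols = linear_sum_assignment(mean_dist)  # best global matching
    per_frame = np.linalg.norm(pos_a[:, rows] - pos_b[:, cols], axis=-1)
    wrong = per_frame > 0.01 * video_width
    return 100.0 * (1.0 - np.mean(wrong))
\end{verbatim}
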
\subsection{Visual Identification: Accuracy} \label{sec:maintaining_identities}
Since the goal of using visual identification is to generate consistent identity assignments, we evaluated the accuracy of our method in this regard. As a benchmark, we compare it to manually reviewed datasets as well as results from \idtracker{} for the same set of videos (where possible). In order to validate trajectories exported by either software, we manually reviewed multiple videos with the help of a tool within \TRex{} that allows the user to view each crossing and correct possible mistakes in place. Assignments were deemed incorrect, and subsequently corrected by the reviewer, if the centroid of a given individual was not contained within the object it was assigned to (e.g. the individual was not part of the correct object). Double assignments per object are impossible due to the nature of the tracking method. Individuals were also forcibly assigned to the correct objects if they were visible but not detected by the tracking algorithm. After manual corrections had been applied, "clean" trajectories were exported -- providing a per-frame baseline truth for the respective videos. A complete table of reviewed videos, and the percentage of reviewed frames per video, can be found in \tableref{reviewed_crossings}. For longer videos (>1h) we relied entirely on a comparison between results from \idtracker{} and \TRex{}. Their paper (\cite{idtrackerai}) suggests a very high accuracy of over 99.9\% correctly identified individual images for most videos, which should suffice for most relevant applications and provide a good baseline truth. As long as both tools produce sufficiently similar trajectories, we therefore know they have found the correct solution.
A direct comparison between \TRex{} and \idtracker{} was not possible for videos \vidref{vid:15locusts1h} and \vidref{vid:N05HHS2019-10S-V1}, where \idtracker{} frequently exceeded hardware memory-limits and caused the application to be terminated, or did not produce usable results within multiple days of run-time. However, we were able to successfully analyse these videos with \TRex{} and evaluate its performance by comparing to manually reviewed trajectories (see below in \nameref{sec:maintaining_identities}). Due to the stochastic nature of machine learning, and thus the inherent possibility of obtaining different results in each run, as well as other potential factors influencing processing time and memory consumption, both \TRex{} and \idtracker{} have been executed repeatedly (5x \TRex{}, 3x \idtracker{}).
\begin{table}
\begin{tabular}{l l | l l | l l}
\toprule
\multicolumn{2}{c|}{video metrics} & \multicolumn{2}{c|}{review stats} & \multicolumn{2}{c}{\% correct} \\
\midrule
video & \textbf{{\# ind.}} & reviewed (\%) & of that interpolated (\%) & \TRex{} & \idtracker{} \\
\midrule
\vidref{vid:video_example_100fish_1min} & 100 & 100.0 & $ 0.23 $ & $ 99.97 \pm 0.013 $ &
$ 98.95 \pm 0.146 $ \\
\vidref{vid:flies_N59} & 59 & 100.0 & $ 0.15 $ & $ 99.68 \pm 0.533 $ &
$ 99.94 \pm 0.0 $ \\
\vidref{vid:15locusts1h} & 15 & 22.2 & $ 8.44 $ & $ 95.12 \pm 6.077 $ &
N/A \\
\vidref{vid:N05HHS2019-10S-V1} & 10 & 100.0 & $ 1.21 $ & $ 99.7 \pm 0.088 $ &
N/A \\
\vidref{vid:group_1} & 10 & 100.0 & $ 0.27 $ & $ 99.98 \pm 0.0 $ &
$ 99.96 \pm 0.0 $ \\
\vidref{vid:group_2} & 10 & 100.0 & $ 0.59 $ & $ 99.94 \pm 0.006 $ &
$ 99.63 \pm 0.0 $ \\
\vidref{vid:group_3} & 10 & 100.0 & $ 0.5 $ & $ 99.89 \pm 0.009 $ &
$ 99.34 \pm 0.002 $ \\
\bottomrule
\end{tabular}
\medskip
\caption{\label{tab:reviewed_crossings} Results of the human validation for a subset of videos. Validation was performed by going through all problematic situations (e.g. individuals lost) and correcting mistakes manually, creating a fully corrected dataset for the given videos. This dataset may still have missing frames for some individuals, if they could not be detected in certain frames (as indicated by "of that interpolated"). This was usually a very low percentage of all frames, except for \videoref{vid:15locusts1h}, where individuals tended to rest on top of each other -- and were thus not tracked -- for extended periods of time. This baseline dataset was compared to all other results obtained using the automatic visual identification by \TRex{} ($N=5$) and \idtracker{} ($N=3$) to estimate correctness. We were not able to track videos \vidref{vid:15locusts1h} and \vidref{vid:N05HHS2019-10S-V1} with \idtracker{}, which is why correctness values are not available.}
\tabledata{\changemade{A table of positions for each individual of each manually approved and corrected trial.}}
\end{table}
The trajectories exported by both \idtracker{} and \TRex{} were very similar throughout (see \tableref{recognition_acc}). While occasional disagreements happened, similarity scores were higher than \changemade{98\% in all and higher than 99\% in most cases} (i.e. less than 1\% of individuals have been differently assigned in each frame on average). Most difficulties that \textit{did} occur were, after manual review, attributable to situations where multiple individuals cross over excessively within a short time-span. In each case that was manually reviewed, identities switched back to the correct individuals -- even after temporary disagreement. We found that both solutions occasionally experienced these same problems, which often occur when individuals repeatedly come in and out of view in quick succession (e.g. overlapping with other individuals). Disagreements were expected for videos with many such situations, due to the different ways in which the two algorithms deal with them: \idtracker{} assigns identities only based on the network output. In many cases, individuals continue to partly overlap even while already being tracked, which results in visual artifacts that can lead to unstable predictions by the network, causing \idtracker{'s} approach to fail. Comparing results from both \idtracker{} and \TRex{} to manually reviewed data (see \tableref{reviewed_crossings}) shows that both solutions consistently provide high accuracy results of above 99.5\% for most videos, but that \TRex{} is slightly more accurate in all cases while also having a better overall frame coverage per individual (99.65\% versus \idtracker{'s} 97.93\%, where 100\% would mean that all individuals are tracked in every frame; not shown). This suggests that the splitting algorithm (see appendix, \nameref{box:splitting-algorithm}) is working to \TRex{}' advantage here.
\begin{figure}[h]
\centering
%\captionsetup[subfigure]{justification=centering}
%\begin{subfigure}[b]{0.9\textwidth}
% \centering
% \includegraphics[width=\textwidth]{activations15locusts1h.pdf}
% \caption{Locusts from \videoref{vid:15locusts1h} with 15 tagged individuals (N: 5101, 7942, 9974) -- the only video with physical tags. The network activates more strongly in regions close to the tag, as well as the bottom right corner.
% }
% \label{fig:activate_locusts}
% \end{subfigure}
% \begin{subfigure}[b]{0.9\textwidth}
% \centering
% \includegraphics[width=\textwidth]{activationsguppy_8_t36_d15_20191212_085800.pdf}
% \caption{Guppies from \videoref{vid:guppy_8_t36_d15_20191212_085800} (N: 46378, 34733, 34745). Activations are less focussed and less consistent across individuals.}
% \label{fig:activate_guppies}
% \end{subfigure}
% \begin{subfigure}[b]{0.9\textwidth}
% \centering\includegraphics[width=\textwidth]{activationsflies_N59.pdf}
% \caption{Flies from \videoref{vid:flies_N59} (N: 993, 1986, 993). Activations are not similar between individuals and show various "hotspots" across the entire body.}
% \label{fig:activate_flies}
% \end{subfigure}
% \begin{subfigure}[b]{0.9\textwidth}
% \centering\includegraphics[width=\textwidth]{activationsN05HHS2019-10S-V1.pdf}
% \caption{Termites from \videoref{vid:N05HHS2019-10S-V1} (N: 27097, 31135, 22746). Here, the connections between body-segments show strong activations -- in contrast to very weak ones in other parts of the body. %Anecdotally, but as can be seen here for the first two individuals, activations seem to be strong especially close to connections of body-segments.
% }
% \label{fig:activate_termites}
% \end{subfigure}
\includegraphics[width=\textwidth]{figures/fig_activations.pdf}
\caption{\label{fig:network_activations}Activation differences for images of randomly selected individuals from four videos, next to a median image of the respective individual -- which hides thin extremities, such as legs in (a) and (c). The captions in (a-d) detail the species per group and number of samples per individual. Colors represent the relative activation differences, with hotter colors suggesting bigger magnitudes, which are computed by performing a forward-pass through the network up to the last convolutional layer (using \href{https://github.com/philipperemy/keract}{keract}). The outputs for each identity are averaged and stretched back to the original image size by cropping and scaling according to the network architecture. Differences shown here are calculated per cluster of pixels corresponding to each filter, comparing average activations for images from the individual's class to activations for images from other classes.}
\figdata{\changemade{Code, as well as images/weights needed to produce this figure (see README).}}
\end{figure}
Additionally, while \TRex{} could successfully track individuals in all videos without tags, we were interested to see the effect of tags (in this case QR tags attached to locusts, see \figref{fig:network_activations}a) on network training. In \figref{fig:network_activations} we visualise differences in network activation, depending on the visual features available for the network to learn from, which are different between species (or due to physically added tags, as mentioned above). The "hot" regions indicate larger between-class differences for that specific pixel (values are the result of activation in the last convolutional layer of the trained network, see figure legend). Differences are computed separately within each group and are not directly comparable between trials/species in value. However, the distribution of values -- reflecting the network's reactivity to specific parts of the image -- is. Results show that the most apparent differences are found for the stationary parts of the body (not in absolute terms, but following normalization, as shown in \figref{fig:datasets_comparison}c), which makes sense seeing as (i) this part is the easiest to learn, due to it being in exactly the same position every time, (ii) larger individuals stretch further into the corners of a cropped image, making the bottom right of each image a source of valuable information (especially in \figref{fig:network_activations}a/\figref{fig:network_activations}b), and (iii) details that often occur in the head-region (like the distance between the eyes) can also play a role here. "Hot" regions in the bottom right corner of the activation images (e.g. in \figref{fig:network_activations}d) suggest that the network also reacts to pixels that are explicitly \textit{not} part of the focal individual but belong to other individuals -- likely this corresponds to the network making use of size/shape differences between them.
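
A simplified version of this procedure can be sketched with \href{https://github.com/philipperemy/keract}{keract} as shown below. Here, \verb!model!, \verb!images!, \verb!labels! and the layer name are assumed inputs, and the full analysis behind \figref{fig:network_activations} additionally crops and rescales the activation maps back to image coordinates (see the figure legend).
\begin{verbatim}
# Simplified sketch of per-class activation differences using keract.
# `model`, `images`, `labels` and the layer name are assumed inputs;
# the actual figure additionally crops/scales maps back to image size.
import numpy as np
from keract import get_activations

def activation_difference(model, images, labels, layer, focal_class):
    acts = get_activations(model, images, layer_names=[layer])[layer]
    focal = acts[labels == focal_class].mean(axis=0)   # mean for focal id
    others = acts[labels != focal_class].mean(axis=0)  # mean for the rest
    return np.abs(focal - others)                      # per-filter difference
\end{verbatim}
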
\begin{table}
% Use "S" column identifier to align on decimal point
\begin{tabular}{l l l l l r}
\toprule
video & \# ind. & length & max. consec. & TRex memory (GB) & idtracker.ai memory (GB) \\
\midrule
\vidref{vid:group_2} & $10$ & $10\mathrm{min}$ & $26.03\mathrm{s}$ & $ \diameter\ 4.88\pm 0.23, \max 6.31$ & $ \diameter\ 8.23\pm 0.99, \max 28.85$ \\
\vidref{vid:group_1} & $10$ & $10\mathrm{min}$ & $36.94\mathrm{s}$ & $ \diameter\ 4.27\pm 0.12, \max 4.79$ & $ \diameter\ 7.83\pm 1.05, \max 29.43$ \\
\vidref{vid:group_3} & $10$ & $10\mathrm{min}$ & $28.75\mathrm{s}$ & $ \diameter\ 4.37\pm 0.32, \max 5.49$ & $ \diameter\ 6.53\pm 4.29, \max 29.32$ \\
\vidref{vid:video_example_100fish_1min} & $100$ & $1\mathrm{min}$ & $5.97\mathrm{s}$ & $ \diameter\ 9.4\pm 0.47, \max 13.45$ & $ \diameter\ 15.27\pm 1.05, \max 24.39$ \\
\vidref{vid:guppy_8_t36_d15_20191212_085800} & $8$ & $72\mathrm{min}$ & $79.4\mathrm{s}$ & $ \diameter\ 5.6\pm 0.22, \max 8.41$ & $ \diameter\ 35.2\pm 4.51, \max 91.26$ \\
\vidref{vid:N05HHS2019-10S-V1} & $10$ & $10\mathrm{min}$ & $1.91\mathrm{s}$ & $ \diameter\ 6.94\pm 0.27, \max 10.71$& N/A \\
\vidref{vid:15locusts1h} & $15$ & $60\mathrm{min}$ & $7.64\mathrm{s}$ & $ \diameter\ 13.81\pm 0.53, \max 16.99$& N/A \\
\vidref{vid:flies_N59} & $59$ & $10\mathrm{min}$ & $102.35\mathrm{s}$ & $ \diameter\ 12.4\pm 0.56, \max 17.41$ & $ \diameter\ 35.3\pm 0.92, \max 50.26$ \\
\vidref{vid:guppy_8_t46_d1_20191207_102508} & $8$ & $195\mathrm{min}$ & $145.77\mathrm{s}$ & $ \diameter\ 12.44\pm 0.8, \max 21.99$ & $ \diameter\ 35.08\pm 4.08, \max 98.04$ \\
\vidref{vid:guppy_8_t20_d1_20190512_115801} & $8$ & $198\mathrm{min}$ & $322.57\mathrm{s}$ & $ \diameter\ 16.15\pm 1.6, \max 28.62$ & $ \diameter\ 49.24\pm 8.21, \max 115.37$ \\
\bottomrule
\end{tabular}
\medskip
\caption{\label{tab:memory_table}Both \TRex{} and \idtracker{} analysed the same set of videos, while continuously logging their memory consumption using an external tool. Rows have been sorted by $\mathrm{video\_length} * \mathrm{\#individuals}$, which seems to be a good predictor for the memory consumption of both solutions. \idtracker{} has mixed mean values, which, at low individual densities, are similar to \TRex{}' results. Mean values can be misleading here, since more time spent in low-memory states skews results. The maximum, however, is more reliable since it marks the memory that is necessary to run the system. Here, \idtracker{} reaches significantly higher values (almost always more than double those of \TRex{}).}
\tabledata{\changemade{Data from log files for all trials as a single table, where each row is one sample. The total memory of each sample is calculated as $\mathrm{SWAP} + \mathrm{PRIVATE} + \mathrm{SHARED}$. Each row indicates at which exact time, by which software, and as part of which trial it was taken.}}
\end{table}
\begin{figure}
\centering
\includegraphics[width=0.9\linewidth]{figures/memory_consumption.pdf}
\caption{The maximum memory consumption of \TRex{} and \idtracker{} when tracking a subset of all videos (the same videos as in \tableref{recognition_acc}). Results are plotted as a function of video length (min) multiplied by the number of individuals. We have to emphasize here that, for the videos in the upper length regions of multiple hours (\vidref{vid:guppy_8_t20_d1_20190512_115801}, \vidref{vid:guppy_8_t46_d1_20191207_102508}), we had to set \idtracker{} to store segmentation information on disk -- as opposed to in RAM. This uses less memory, but is also slower. For the video with flies we tried both options and also settled on on-disk storage, since otherwise the system ran out of memory. Even then, the curve still accelerates much faster for \idtracker{}, ultimately leading to problems with most computer systems. To minimize the impact that hardware compatibility has on research, we implemented switches limiting memory usage while always trying to maximize performance given the available data. \TRex{} can be used on modern laptops and normal consumer hardware at slightly lower speeds, but without any \textit{fatal} issues.}
\label{fig:memory_per_video_length}
\figdata{\changemade{Each data-point from \figref{fig:memory_per_video_length} as plotted, indexed by video and software used.}}
\end{figure}
\begin{figure}[h]
\centering
\includegraphics[width=0.9\linewidth]{figures/raw_posture_moments.pdf}
\caption{Convergence behavior of the network training for three different normalization methods. This shows the maximum achievable validation accuracy after 100 epochs for 100 individuals (\videoref{vid:video_example_100fish_1min}), when sub-sampling the number of examples per individual. Tests were performed using a manually corrected training dataset to generate the images in three different ways, using the same, independent script (see \figref{fig:datasets_comparison}): Using no normalization (blue), using normalization based on image moments (green, similar to \idtracker{}), and using posture information (red, as in \TRex{}). Higher numbers of samples per individual result in higher maximum accuracy overall, but -- unlike the other methods -- posture-normalized runs already reach an accuracy above the 90\% mark for $\geq$75 samples. This property can help significantly in situations with more crossings, when longer global segments are harder to find.}
\label{fig:maximum_val_acc_per_samples}
\figdata{\changemade{Raw data-points as plotted in \figref{fig:maximum_val_acc_per_samples}.}}
\end{figure}
As would be expected, distinct patterns can be recognized in the resulting activations after training as soon as physical tags are attached to individuals (as in \figref{fig:network_activations}a). %\figref{fig:activate_locusts}).
While other parts of the image are still heavily activated (probably exploiting size/shape differences between individuals), tags always account for a large share of where activations concentrate. The network evidently makes use of the additional information provided by the experimenter wherever it is available. This suggests that, while certainly not necessary, adding tags is unlikely to hurt training accuracy and may even improve it in difficult cases, since it allows the network to exploit any available source of inter-individual variation.
\subsection{Visual Identification: Memory Consumption}
In order to generate comparable results for both tested software solutions, the same external script was used to measure the shared, private and swap memory of \idtracker{} and \TRex{}. There are a number of ways to determine the memory usage of a process. For automation purposes we decided to use a tool called \href{https://github.com/jeetsukumaran/Syrupy}{syrupy}, which can automatically start a specified command and log information about it. We modified it slightly to obtain separate, more accurate measurements of swap, shared and private memory using \href{http://www.pixelbeat.org/scripts/ps_mem.py}{ps\_mem}.
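To illustrate how such logs can be aggregated into the values reported here, the short sketch below computes the total memory of each sample as $\mathrm{SWAP} + \mathrm{PRIVATE} + \mathrm{SHARED}$ and reduces the samples to per-trial mean and maximum values. This is a minimal example for orientation only -- the file name and column names are placeholders and depend on how the (modified) logging script writes its output.
\begin{verbatim}
# Sketch of the log aggregation; file and column names are illustrative
# and depend on how the (modified) logging script writes its output.
import pandas as pd

log = pd.read_csv("memory_log.csv")   # one row per memory sample
# total memory per sample, as used throughout this section
log["total_mb"] = log["swap"] + log["private"] + log["shared"]

# mean and maximum memory per software/video combination
summary = log.groupby(["software", "video"])["total_mb"].agg(["mean", "max"])
print(summary)
\end{verbatim}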
As expected, differences in memory consumption are especially prominent for long videos (4-7x lower maximum memory for \TRex{}), and for videos with many individuals (2-3x lower). Since we already experienced significant problems tracking a long video (>3h) of only 8 individuals with \idtracker{}, we did not attempt to further study its behavior in long videos with many individuals. However, we would expect \idtracker{'s} memory usage to increase even more rapidly than is visible in \figref{fig:memory_per_video_length}, since it retains a lot of image data (segmentation/pixels) in memory and we already had to "allow" it to offload data to the hard-disk in our efforts to make it work for Videos \vidref{vid:flies_N59}, \vidref{vid:guppy_8_t46_d1_20191207_102508} and \vidref{vid:guppy_8_t20_d1_20190512_115801} (which slows down analysis). The maximum memory consumption across all trials was on average 5.01$\pm$2.54 times higher for \idtracker{}, ranging from 1.81 to 10.85 times the maximum memory consumption of \TRex{} for the same video.
Overall memory consumption of \TRex{} also includes posture data, which contributes substantially to RAM usage. Especially with longer videos, disabling posture can lower the hardware requirements for running our software. If posture is to be retained, the user can still (more modestly) reduce memory requirements by changing the outline re-sampling scale (1 by default), which adjusts the outline resolution between sub- and super-pixel accuracy. While analysis will be faster -- and memory consumption lower -- when posture is disabled (limited only by the matching algorithm, see \figref{fig:approx_accurate}), users of visual identification might experience a decrease in training accuracy or speed (see \figref{fig:maximum_val_acc_per_samples}).
\begin{table}[!h]
% Use "S" column identifier to align on decimal point
\begin{tabular}{l l l l l l || l | r}
\toprule
video & {\# ind.} & length & sample & \TGrabs{} (min) & \TRex{} (min) & ours (min) & \idtracker{} (min) \\
\midrule
\vidref{vid:video_example_100fish_1min} & 100 & $ 1 \mathrm{min} $ & $ 1.61 \mathrm{s} $ & $ 2.03 \pm 0.02 $ & $ 74.62 \pm 6.75 $ & $ 76.65 $ &
$ 392.22 \pm 119.43 $ \\
\vidref{vid:flies_N59} & 59 & $ 10 \mathrm{min} $ & $ 19.46 \mathrm{s} $ & $ 9.28 \pm 0.08 $ & $ 96.7 \pm 4.45 $ & $ 105.98 $ &
$ 4953.82 \pm 115.92 $ \\
\vidref{vid:15locusts1h} & 15 & $ 60 \mathrm{min} $ & $ 33.81 \mathrm{s} $ & $ 13.17 \pm 0.12 $ & $ 101.5 \pm 1.85 $ & $ 114.67 $ &
N/A \\
\vidref{vid:group_3} & 10 & $ 10 \mathrm{min} $ & $ 12.31 \mathrm{s} $ & $ 8.8 \pm 0.12 $ & $ 21.42 \pm 2.45 $ & $ 30.22 $ &
$ 127.43 \pm 57.02 $ \\
\vidref{vid:group_2} & 10 & $ 10 \mathrm{min} $ & $ 10.0 \mathrm{s} $ & $ 8.65 \pm 0.07 $ & $ 23.37 \pm 3.83 $ & $ 32.02 $ &
$ 82.28 \pm 3.83 $ \\
\vidref{vid:group_1} & 10 & $ 10 \mathrm{min} $ & $ 36.91 \mathrm{s} $ & $ 8.65 \pm 0.07 $ & $ 12.47 \pm 1.27 $ & $ 21.12 $ &
$ 79.42 \pm 4.52 $ \\
\vidref{vid:N05HHS2019-10S-V1} & 10 & $ 10 \mathrm{min} $ & $ 16.22 \mathrm{s} $ & $ 4.43 \pm 0.05 $ & $ 35.05 \pm 1.45 $ & $ 39.48 $ &
N/A \\
\vidref{vid:guppy_8_t46_d1_20191207_102508} & 8 & $ 195 \mathrm{min} $ & $ 67.97 \mathrm{s} $ & $ 109.97 \pm 2.05 $ & $ 70.48 \pm 3.67 $ & $ 180.45 $ &
$ 707.0 \pm 27.55 $ \\
\vidref{vid:guppy_8_t36_d15_20191212_085800} & 8 & $ 72 \mathrm{min} $ & $ 79.36 \mathrm{s} $ & $ 32.1 \pm 0.42 $ & $ 30.77 \pm 6.28 $ & $ 62.87 $ &
$ 291.42 \pm 16.83 $ \\
\vidref{vid:guppy_8_t20_d1_20190512_115801} & 8 & $ 198 \mathrm{min} $ & $ 134.07 \mathrm{s} $ & $ 133.1 \pm 2.28 $ & $ 68.85 \pm 13.12 $ & $ 201.95 $ &
$ 1493.83 \pm 27.75 $ \\
\bottomrule
\end{tabular}
\medskip
%\tabledata{\changemade{}
\caption{\label{tab:recognition_timings}Evaluating time-cost for automatic identity correction -- comparing to results from \idtracker{}. Timings consist of preprocessing time in \TGrabs{} plus network training in \TRex{}, which are shown separately as well as combined (\textit{ours (min)}, $N=5$). The time it takes to analyse videos strongly depends on the number of individuals and on how many usable samples per individual the initial segment provides. The length of the video factors in as well, as does the stochasticity of the gradient descent (training). \idtracker{} timings ($N=3$) contain the whole tracking and training process from start to finish, using its \texttt{terminal\_mode} (v3). Parameters were manually adjusted per video and setting, to the best of our abilities, spending at most one hour per configuration. For videos \vidref{vid:guppy_8_t20_d1_20190512_115801} and \vidref{vid:guppy_8_t46_d1_20191207_102508} we had to set \idtracker{} to store segmentation information on disk (as opposed to in RAM) to prevent the program from being terminated for running out of memory.}
\tabledata{\changemade{Preprocessed log files (see also \protect\path{notebooks.zip} in \cite{walter2020dataset}) in a table format. The total processing time (s) of each trial is indexed by video and software used -- \TGrabs{} for conversion and \TRex{} and \idtracker{} for visual identification. This data is also used in \tableref{timings}.}}
\end{table}
\subsection{Visual Identification: Processing Time}
Automatically correcting the trajectories (to produce consistent identity assignments) means that additional time is spent on the training and application of a network, specifically for the video in question. Visual identification builds on some of the other methods described in this paper (tracking and posture estimation), naturally making it by far the most complex and time-consuming process in \TRex{} -- we thus evaluated how much time is spent on the entire sequence of all required processes. For each run of \TRex{} and \idtracker{}, we saved precise timing information from start to finish. Since \idtracker{} reads videos \textit{directly} and preprocesses them again each run, we used the same starting conditions with our software for a direct comparison:
A trial starts by converting/preprocessing a video in \TGrabs{} and then immediately opening it in \TRex{}, where automatic identity corrections were applied. \TRex{} terminated automatically after satisfying a correctness criterion (high uniqueness value) according to equation \eqref{eq:gooduniqueness}. It then exported trajectories, as well as validation data (similar to \idtracker{}), concluding the trial. The sum of time spent within \TGrabs{} and \TRex{} gives the total amount of time for that trial. For the purpose of this test it would not have been fair to compare only \TRex{} processing times to \idtracker{}, but it is important to emphasize that conversion could be skipped entirely by using \TGrabs{} to record videos directly from a camera instead of opening an existing video file.
%Conversion times correlated strongly with the total video-length (in frames) and not the number of individuals, suggesting conversion was only constrained by video-decoding/reading speeds and not by (pre-)processing.
\changemade{In \tableref{recognition_timings} we can see that video length and processing times (in \TRex{}) did not correlate directly. Indeed, a 10 minute video (\videoref{vid:flies_N59}) took significantly longer than one that was 72 minutes long (\videoref{vid:guppy_8_t36_d15_20191212_085800}). The reason for this, initially counterintuitive, result is that the process of learning identities requires sufficiently long video sequences: longer samples have a higher likelihood of capturing more of the total possible intra-individual variance, which helps the algorithm to represent each individual's appearance more comprehensively. Longer videos naturally provide more material for the algorithm to choose from and, simply due to their length, have a higher probability of containing at least one higher-quality segment that allows higher uniqueness-regimes to be reached more quickly (see \nameref{sec:training_quality} and \nameref{sec:recognition_stopping}). Thus, it is important to use sufficiently long video sequences for visual identification, and longer sequences can lead to better results -- both in terms of quality and processing time.}
%a sample that, due to its length, likely contains more of the total variance
%conversion times in \TGrabs{} often overtook processing times in \TRex{} with increasing video-length if the number of individuals remained the same -- suggesting that
%In \tableref{recognition_timings} we can see that video length and processing times did not correlate directly. \changemade{Conversion times correlated with the total video-length (in frames) and not the number of individuals, suggesting conversion was only constrained by video-decoding/reading speeds and not by (pre-)processing. Indeed, conversion times in \TGrabs{} often overtook processing times in \TRex{} with increasing video-length if the number of individuals remained the same. Furthermore, the time it took to track and correct a video was shorter when the initial segment (column "sample" in the table) was longer (and as such likely capturing more visual intra-individual variation). Longer videos often provide more material for the algorithm to choose from and (simply due to their length) have a higher probability of producing at least one such higher-quality segment. Starting with a sample containing most of the variance, likely produces higher uniqueness-scores (\nameref{sec:training_quality}) early on and prompt the algorithm to terminate early. Perhaps counter-intuitively, using longer video-sequences instead of shorter ones should, in terms of time and accuracy, can thus be expected to produce better results and more quickly.}
Compared to \idtracker{}, \TRex{} (conversion + visual identification) shows both considerably lower computation times ($2.57\times$ to $46.74\times$ faster for the same video), as well as lower variance in the timings ($79\%$ lower for the same video on average).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% LIMITATIONS, DISCUSSION
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Discussion}
\changemade{We have designed \TRex{} to be a versatile and fast program that can enable researchers to track} animals (and other mobile objects) in a wide range of situations. It maintains the identities of up to 100 un-tagged individuals and produces corrected tracks, along with posture estimation\changemade{, visual-field reconstruction, and other features that enable the quantitative study of animal behavior.} Even videos that cannot be tracked by other solutions, such as videos with over 500 animals, can now be tracked within the same day of recording.
While all options are available from the command-line and a screen is not required, \TRex{} offers a rich, yet straightforward to use, interface to local as well as remote users. Accompanied by integrated documentation for all parameters, each stating purpose, type and value ranges, as well as a comprehensive online documentation, \changemade{new users are provided with all the information required for a quick adoption of our software.} Especially to the benefit of new users, we evaluated the parameter space \changemade{using videos of diverse species} (fish, termites, locusts) and determined which parameters work best in most use-cases to set their default values.
\begin{figure}[h]
%\begin{fullwidth}
\includegraphics[width=1.0\linewidth]{figures/trex_screenshot.pdf}
%\captionsetup{margin=0pt,calcmargin={0pt,-4.5cm}}
\caption{An overview of \TRex{'} main interface, which is part of the documentation at \href{https://trex.run/docs}{trex.run/docs}. Interface elements are sorted into categories in the four corners of the screen (labelled here in black). The omni-box in the bottom-left corner allows users to change parameters on-the-fly, helped by live auto-completion and documentation for all settings. Only some of the many available features are displayed here. Generally, interface elements can be toggled on or off using the bottom-left display options or moved out of the way with the cursor. Users can customize the tinting of objects (e.g. deriving it from their speed) to generate interesting effects, which can also be recorded for use in presentations. Additionally, all exportable metrics (such as border-distance, size, x/y, etc.) can be shown as an animated graph for a number of selected objects. Keyboard shortcuts are available for select features such as loading, saving, and terminating the program. Remote access is supported and offers the same graphical user interface, e.g. in case the software is executed without an application window (for batch processing purposes).}
\label{fig:trex_screenshot}
%\end{fullwidth}
\end{figure}
The interface is structured into groups (see \figref{fig:trex_screenshot}), categorized by the typical use-case:
\begin{enumerate}
\item The main menu, containing options for loading/saving, options for the timeline and reanalysis of parts of the video
\item Timeline and current video playback information
\item Information about the selected individual
\item Display options and an interactive "omni-box" for viewing and changing parameters
\item General status information about \TRex{} and the \texttt{Python} integration
\end{enumerate}
The tracking accuracy of \TRex{} is at the current state of the art, while typically being $2.57\times$ to $46.74\times$ faster than comparable software and having lower hardware requirements -- \changemade{especially} regarding RAM. In addition to visual identification and tracking, it provides a rich assortment of additional data, including body posture, visual fields, and other kinematic as well as group-related information (such as derivatives of position, border and mean neighbor distance, group compactness, etc.) -- even in live-tracking and closed-loop situations.
Raw tracking (without visual identification) still achieved roughly 80\% accuracy per decision (as compared to >99\% with visual identification). We have found that real-time performance can be achieved, even on relatively modest hardware, for all numbers of individuals $\leq$256 without posture estimation ($\leq$128 with posture estimation). More than 256 individuals can be tracked as well, remarkably still delivering frame-rates of about 10-25 frames per second using the same settings.
Not only do the increased processing speeds benefit researchers, but the contributions we provide to data exploration should not be underestimated either -- merely making data such as visual fields and live-heatmaps more easily accessible right out-of-the-box has the potential to reveal features of group- and individual-level behaviour which have not been visible before. \TRex{} makes information on multiple timescales of events available simultaneously, and sometimes this is the only way to detect interesting properties (e.g. trail formation in termites).
%\subsection{Future extensions}
Since the software is already actively used within the Max Planck Institute of Animal Behavior, reported issues have been taken into consideration during development. However, certain theoretical, as well as practically observed, limitations remain:
\begin{itemize}
\item Posture: While almost all shapes can be detected correctly (by adjusting parameters), some shapes -- especially round ones -- are hard to interpret in terms of "tail" or "head". In these cases only the alternative image-alignment method (moments) can be used, which introduces some limitations, e.g. visual fields cannot be calculated.
\item Tracking: If the wrong direction of movement is assumed, predictions may end up far away from the object's actual position. Objects are then "lost" for a fixed amount of time (a parameter). This can be mitigated by shortening that time-period, though doing so leads to different problems when the software does not wait long enough for individuals to reappear.
\item General: Barely visible individuals have to be tracked with the help of deep learning (e.g. using \cite{Cae+17}) and a custom-made mask per video frame, prepared in an external program of the user's choosing.
\item Visual identification: All individuals have to be \textit{visible} and \textit{separate} at the same time, at least once, for identification to work at all. Visual identification, e.g. with very high densities of individuals, can thus be very difficult. This is a hard restriction for any software of this kind, since finding consecutive global segments is the underlying principle for the successful recognition of individuals.
\end{itemize}
We will continue updating the software, increasingly addressing the above issues (and likely others), as well as potentially adding new features. During development we noticed a couple of areas where improvements could be made, both theoretical and practical in nature. Specifically, incremental improvements in analysis speed could be made regarding visual identification by using the trained network more sporadically -- e.g. it is not necessary to predict every image of very long consecutive segments, since, even with fewer samples, prediction values are likely to converge to a certain value early on. A likely more potent change would be an improved "uniqueness" algorithm, which, during the accumulation phase, is better at predicting which consecutive segment will improve training results the most. This could be done, for example, by taking into account the variation between images of the same individual. Other planned extensions include:
\begin{itemize}
\item (Feature): We want to provide a more general plugin interface, allowing users to work with the data in live-mode and apply their own filters -- including, specifically, the ability to write a plugin that detects different species and annotates them in the video.
\item (Crossing solver): Additional method optimized for splitting overlapping, solid-color objects. The current method, simply using a threshold, is effective for many species but often produces large holes when splitting objects consisting of largely the same color.
\end{itemize}
To obtain the most up-to-date version of \TRex{}, please download it at \href{https://trex.run}{trex.run} or update your existing installation according to our instructions listed on \href{https://trex.run/docs/install.html}{trex.run/docs/install.html}.
\section{Materials \& Methods}
\changemade{In the following sections we describe the methods implemented in \TRex{} and \TGrabs{}, as well as their most important features in a typical order of operations (see \figref{fig:pipeline_overview} for a flow diagram), starting out with a raw video. We will then describe how trajectories are obtained and end with the most technically involved features.}
% \newcolumntype{M}{>{\begin{varwidth}{4cm}}l<{\end{varwidth}}}
% \begin{table}[h]
% % Use "S" column identifier to align on decimal point
% \begin{tabular}{p{2cm}|p{4cm}|p{3.5cm}|p{3cm}}%|p{2cm}}
% \toprule
% Resource & Designation & Source or reference & Identifiers \\ %& Additional information \\
% \midrule
% Syrupy & Measuring memory consumption during the runtime of a process & \protect\href{https://github.com/jeetsukumaran/Syrupy}{github/jeetsukumaran} \\
% ps\_mem & Adding additional information to Syrupy output & \href{http://www.pixelbeat.org/scripts/ps_mem.py}{pixelbeat.org} \\
% Jupyter Lab & Analysis & \href{https://github.com/jupyterlab/jupyterlab}{github/jupyterlab} & \protect\path{RRID:SCR_018315}\\
% Python & Analysis & \href{https://python.org}{python.org} & \protect\path{RRID:SCR_008394} \\
% Debian & Operating system & &\protect\path{RRID:SCR_006638}\\
% \bottomrule
% \end{tabular}
% \medskip
% %\tabledata{\changemade{}
% \caption{Do I need this}
% \end{table}
\subsection{Segmentation}
When an image is first received from a camera (or a video file), the objects of interest potentially present in the frame must be \changemade{found} and cropped out. Several technologies are available to separate the foreground from the background (segmentation). Various machine learning algorithms are frequently used to great effect, even for the most complex environments (\citealt{hughey2018challenges}, \citealt{robie2017machine}, \citealt{francisco2019low}). These more advanced approaches are typically beneficial for the analysis of field-data or organisms that are very hard to see in video (e.g. very transparent or low contrast objects/animals in the scene). \changemade{In these situations, where integrated methods might not suffice, it is possible to segment objects from the background using external, e.g. deep-learning based, tools (see next paragraph).} However, for most laboratory experiments, simpler (and also much faster), classical image-processing methods yield satisfactory results. \changemade{Thus, we provide as a generically-useful capability \emph{background-subtraction}, which is the default method by which objects are segmented. This can be used immediately in experiments where the background is relatively static. Backgrounds are generated automatically by uniformly sampling images from the source video(s) -- different modes are available (min/max, mode and mean) for the user to choose from. More advanced image-processing techniques like luminance equalization (which is useful when lighting varies between images), image undistortion, and brightness/contrast adjustments are available in \TGrabs{} and can enhance segmentation results -- but come at the cost of slightly increased processing time.} Importantly, since many behavioral studies rely on $\ge$ 4K resolution videos, we heavily utilize the GPU (if available) to speed up most of the image-processing, allowing \TRex{} to scale well with increasing image resolution.
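The sketch below illustrates the basic idea behind this default mode -- uniformly sampling frames to estimate a (here: mean) background, subtracting it from a frame of interest and thresholding the difference -- using plain \verb!OpenCV! in \verb!Python!. It is a simplified stand-in rather than \TGrabs{'} actual, GPU-accelerated implementation; the file name, the number of samples and the threshold value are arbitrary placeholders.
\begin{verbatim}
# Minimal sketch of background-subtraction segmentation; not TGrabs'
# actual implementation. File name, sample count and threshold are
# placeholders.
import cv2
import numpy as np

cap = cv2.VideoCapture("video.mp4")
n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

# uniformly sample frames and average them to estimate a static background
samples = []
for idx in np.linspace(0, n_frames - 1, num=50, dtype=int):
    cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
    ok, frame = cap.read()
    if ok:
        samples.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY).astype(np.float32))
background = np.mean(samples, axis=0).astype(np.uint8)

# segment one frame: absolute difference to the background, then threshold
cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
ok, frame = cap.read()
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
diff = cv2.absdiff(gray, background)
_, mask = cv2.threshold(diff, 25, 255, cv2.THRESH_BINARY)

# connected components correspond to candidate objects
n, labels, stats, centroids = cv2.connectedComponentsWithStats(mask)
\end{verbatim}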
\changemade{\TGrabs{} can generally find any object in the video stream, and subsequently pass it on to the tracking algorithm (next section),} as long as either (i) the background is relatively static while the objects move at least occasionally, (ii) the objects/animals of interest have enough contrast to the background, or (iii) the user provides an additional binary mask per frame which is used to separate the objects \changemade{of interest} from the background, the typical means of doing this being deep-learning based segmentation (e.g. \citealt{Cae+17}). These masks are expected to be in a video-format themselves and to correspond 1:1 in length and dimensions to the video that is to be analyzed. They are expected to be binary, marking individuals in white and background in black. Of course, these binary videos could be \changemade{used} on their own, but would not retain any grey-scale information \changemade{of the objects}. This approach is useful in many situations, but generally whenever individuals are very hard to detect visually and need to be recognized by different software (e.g. a machine-learning-based \changemade{segmentation} like \citealt{Man+18b}); individual frames can then be connected into trajectories using our software as a second step.
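To clarify the expected mask format, the following sketch writes such a binary video: one mask frame per input frame, identical dimensions, individuals in white on black. The segmentation itself is only stubbed here with a plain threshold and would, in practice, be replaced by the output of the external (e.g. deep-learning based) tool; codec and file names are placeholders, and a lossless codec should be preferred so frames remain strictly binary.
\begin{verbatim}
# Sketch of producing an external binary mask video (same length and
# dimensions as the original; white = individual, black = background).
import cv2
import numpy as np

cap = cv2.VideoCapture("video.mp4")
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

# FFV1 is a lossless codec; availability depends on the OpenCV build
out = cv2.VideoWriter("mask.avi", cv2.VideoWriter_fourcc(*"FFV1"),
                      fps, (w, h), isColor=False)
while True:
    ok, frame = cap.read()
    if not ok:
        break
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    # stand-in for an external segmentation model producing a boolean mask
    mask = gray > 127
    out.write(mask.astype(np.uint8) * 255)   # 255 = individual, 0 = background
out.release()
\end{verbatim}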
The detected objects are saved to a custom non-proprietary compressed file format (Preprocessed Video or \protect\path{PV}, see appendix \nameref{sec:pv_files}), that stores only the most essential information from the original video stream: the objects and their pixel positions and values. This format is optimized for quick random index access by the tracking \changemade{algorithm (see next section)} and stores other meta-information (like frame timings) utilized during playback or analysis. When recording videos directly from a camera, they can also be streamed to an additional and independent MP4 container format (plus information establishing the mapping between \protect\path{PV} and MP4 video frames).
\subsection{Tracking} \label{sec:tracking}
Once animals (or, more generally, "objects", as they will be termed henceforth) have been successfully segmented from the background, we can either use the live-tracking feature in \TGrabs{} or open a pre-processed file in \TRex{} to generate the trajectories of these objects. This process uses information regarding an object's movement (i.e. its kinematics) to follow it across frames, estimating future positions based on previous velocity and angular speed. It will be referred to as "tracking" in the following text, and is a required step in all workflows.
Note that this approach alone is very fast, but, as will be shown, is subject to error with respect to maintaining individual identities. If that is required, there is a further step, outlined in \nameref{sec:visual_recognition} below, which can be applied at the cost of processing speed. First, however, we will discuss the general basis of tracking, which is common to approaches that do, and do not, require identities to be maintained with high-fidelity. Tracking can occur for two distinct categories, which are handled slightly differently by our software:
\begin{enumerate}
\item there is a known number of objects
\item there is an unknown number of objects
\end{enumerate}
The first case assumes that the number of tracked objects in a frame cannot exceed a certain expected number of objects (\changemade{calculated} automatically\changemade{,} or set by the user). This allows the algorithm to make stronger assumptions, for example regarding noise, where otherwise "valid" objects (conforming to size expectations) are ignored due to their positioning in the scene (e.g. too far away from previously lost individuals). In the second case, new objects may be generated until all viable objects in a frame are assigned. While being more susceptible to noise, this is useful for tracking a large number of objects, where counting objects may not be possible, or where there is a highly variable number of objects to be tracked.
For a given video, our algorithm processes every frame sequentially, extending existing trajectories (if possible) for each of the objects found in the current frame. Every object can only be assigned to one trajectory, but some objects may not be assigned to any trajectory (e.g. in case the number of objects exceeds the allowed number of individuals) and some trajectories might not be assigned to any object (e.g. while objects are out of view). To estimate object identities across frames we use an approach akin to the popular Kalman filter \citep{kalman1960new} which makes predictions based on multiple noisy data streams (here, positional history and posture information).
In the initial frame, objects are simply assigned from top-left to bottom-right. In all other frames, assignments are made based on probabilities (see appendix \nameref{sec:matching_graph}) calculated for every combination of object and trajectory. These probabilities represent the degree to which the program believes that "it makes sense" to extend an existing trajectory with an object in the current frame, given its position and speed. Our tracking algorithm only considers assignments with probabilities larger than a certain threshold, generally constrained to a certain proximity around an object assigned in the previous frame.
Matching a set of objects in one frame with a set of objects in the next frame is a typical assignment problem, which can be solved in polynomial time (e.g. using the Hungarian method, \citealt{kuhn1955hungarian}). However, we found that, in practice, the computational complexity of the Hungarian method can constrain analysis speed to such a degree that we decided to implement a custom algorithm, which we term tree-based matching, and which has a better \textit{average-case} performance (see evaluation), even while having a comparatively bad \textit{worst-case} complexity. Our algorithm constructs a tree of all possible object/trajectory combinations in the frame and tries to find a compatible set of choices (such that no objects/trajectories are assigned twice), maximizing the sum of probabilities amongst these choices (described in detail in the appendix \nameref{sec:matching_graph}). Situations where a large number of objects are in close proximity of one another are problematic, since the number of possible sets of choices grows exponentially. These situations are avoided by using a mixed approach: tree-based matching is used most of the time, but as soon as the combinatorial complexity of a given situation becomes too great, our software falls back on the Hungarian method. If videos are known to be problematic throughout (e.g. with >100 individuals consistently very close to each other), the user may choose to use an approximate method instead (described in the appendix \autoref{sec:matching_graph}), which simply iterates through all objects and assigns each to the trajectory for which it has the highest probability, without considering whether another object has an even higher probability for that trajectory. Since the approximate method does not consider all possible combinations, it is "wrong" in this sense and more sensitive to parameter choice -- which is why it is not recommended unless strictly necessary. It does, however, scale much better with very large numbers of objects and produces results that are good enough to be useful in very large groups (see \tableref{decisions}). %The requirement being well-chosen parameters, such as maximum speed, to reduce the number of possible mistakes/choices per individual as much as possible.
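The following sketch contrasts the optimal assignment (using the Hungarian method, here via \verb!scipy!) with the approximate first-come-first-serve strategy described above. The score function is a deliberately simplified stand-in -- decaying linearly with the distance to a trajectory's predicted position and zero beyond a maximum travel distance -- whereas the probabilities actually used by \TRex{} are defined in the appendix (\nameref{sec:matching_graph}); the tree-based algorithm itself is not reproduced here.
\begin{verbatim}
# Illustrative comparison of optimal vs. approximate matching; the score
# function is a simplified stand-in for the probabilities defined in the
# appendix.
import numpy as np
from scipy.optimize import linear_sum_assignment

def score_matrix(predicted, detected, max_dist=50.0):
    """Rows: trajectories (predicted positions), columns: detected objects."""
    d = np.linalg.norm(predicted[:, None, :] - detected[None, :, :], axis=-1)
    return np.clip(1.0 - d / max_dist, 0.0, None)   # zero beyond max_dist

def match_optimal(S):
    # Hungarian method: maximizes the sum of scores over all combinations
    rows, cols = linear_sum_assignment(-S)
    return [(r, c) for r, c in zip(rows, cols) if S[r, c] > 0]

def match_greedy(S):
    # approximate method: each object takes the best still-available
    # trajectory and earlier decisions are never revisited
    taken, pairs = set(), []
    for c in range(S.shape[1]):
        free = [(S[r, c], r) for r in range(S.shape[0]) if r not in taken]
        if not free:
            continue
        s, r = max(free)
        if s > 0:
            taken.add(r)
            pairs.append((r, c))
    return pairs
\end{verbatim}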
% that it may occasionally spike in terms of the time it takes to analyse a frame --, it performs better in almost all cases. Since problems may occur with a large number of objects in close proximity of one another, where computational complexity grows exponentially, we employ a mixed approach: Generally, \TRex{} uses the tree-based algorithm, but circumvents problematic situations by falling back on using the Hungarian method when necessary. It also offers the option to use an \textit{approximate} matching algorithm for an entire video, in case the video is especially problematic throughout. This approximate algorithm is not mathematically \textit{correct}, in that it works based on a first-come-first-serve principle: It iterates through all objects and assigns each object to the trajectory for which it has the highest probability. This is of course wrong, seeing as it does not consider all possible combinations, but scales significantly better for very large numbers of objects and produces results good enough for it to be useful in very large groups (see evaluation \nameref{sec:evaluation_accuracy}).
%Since matching is a global optimization, choosing sub-optimally on an object level can still result in an overall greater probability sum. While being "wrong" mathematically speaking, and not yielding a significant improvement in performance for a few objects, combinatorically it still scales significantly better for very large numbers of objects and produces good enough results to be useful in very large groups.
Situations where objects/individuals touch, partly overlap, or even completely overlap are an issue that all tracking solutions have to deal with in some way. The first problem is the \textit{detection} of such an overlap/crossing, the second is its \textit{resolution}. \idtracker{}, for example, deals only with the first problem: It trains a neural network to detect crossings and essentially ignores the involved individuals until the problem is resolved by movement of the individuals themselves. However, such an image-based approach can never be fully independent of the species or even of the specific video (it has to be retrained for each experiment), while also being time-costly to use. In some cases the size of objects might indicate that they contain multiple overlapping individuals, while other cases might not allow for such an easy distinction -- e.g. when sexually dimorphic animals (or multiple species) are present at the same time. We propose a method, similar to that of \verb!xyTracker!, which uses the objects' movement history to detect overlaps. If there are fewer objects in a region than would be expected based on previous frames, an attempt is made to split the biggest objects in that area. The size of that area is estimated using the maximal speed objects are allowed to travel per frame (a parameter, see documentation \protect\path{track_max_speed}). This, of course, requires relatively good predictions or, alternatively, high frame-rates relative to the objects' movement speeds (which are likely necessary anyway to observe behavior at the appropriate time-scales).
By default, objects suspected to contain overlapping individuals are split by thresholding their background-difference image (see appendix \autoref{box:splitting-algorithm}), continuously increasing the threshold until the expected number (or more) of similarly sized objects is found. Greyscale values and, more generally, the shading of three-dimensional objects and animals often produce a natural gradient (see for example \figref{fig:datasets_comparison}), making this process surprisingly effective for many of the species we tested. Even when there is almost no visible gradient and thresholding produces holes inside objects, objects are still successfully separated with this approach, and missing pixels from inside the objects can be regenerated afterwards. The algorithm fails, however, if the remaining objects are too small or too different in size, in which case the overlapping objects will not be assigned to any trajectory until all involved objects are found again separately in a later frame.
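A condensed sketch of this splitting idea is given below: the threshold applied to the object's background-difference image is raised step-wise until at least the expected number of sufficiently large connected components appears. Step size, minimum area and the similar-size check of the actual algorithm (appendix \autoref{box:splitting-algorithm}) are simplified or omitted here.
\begin{verbatim}
# Simplified sketch of threshold-raising to split a suspected merged
# object; the production algorithm additionally checks for similar sizes
# and regenerates missing pixels.
import cv2
import numpy as np

def try_split(diff_patch, expected, min_area=10, start=20, step=5):
    """diff_patch: background-difference image (uint8) of the merged object."""
    for t in range(start, 255, step):
        _, binary = cv2.threshold(diff_patch, t, 255, cv2.THRESH_BINARY)
        n, labels, stats, _ = cv2.connectedComponentsWithStats(binary)
        areas = stats[1:, cv2.CC_STAT_AREA]       # skip the background label
        if np.count_nonzero(areas >= min_area) >= expected:
            return t, labels    # threshold at which enough parts separate
    return None, None           # failed: leave the merged object unassigned
\end{verbatim}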
After an object is assigned to a specific trajectory, two kinds of data (posture and visual-fields) are calculated and made available to the user, which will each be described in one of the following subsections. In the last subsection, we outline how these can be utilized in real-time tracking situations.
\subsubsection{Posture Analysis}
Groups of animals are often modeled as systems of simple particles (\citealt{inada2002order}, \citealt{cavagna2010empirical}, \citealt{perez2011collective}), a reasonable simplification which helps to formalize/predict behavior. However, intricate behaviors, like courtship displays, can only be fully observed once the body shape and orientation are considered (e.g. using tools such as DeepPoseKit, \citealt{graving2019deepposekit}, LEAP \cite{pereira2019fast}/SLEAP \cite{Pereira2020.08.31.276246}, and DeepLabCut, \citealt{mathis2018deeplabcut}). \TRex{} does not track individual body parts apart from the head and tail (where applicable), but even the included simple and fast 2D posture estimator already allows for deductions to be made about how an animal is positioned in space, bent and oriented -- crucial e.g. when trying to estimate the position of eyes/antennae as part of an analysis, where this is required (e.g. \citealt{strandburg2013visual}, \citealt{rosenthal2015revealing}). \changemade{When detailed tracking of all extremities is required, \TRex{} offers an option that allows it to interface with third-party software like DeepPoseKit (\citealt{graving2019deepposekit}), SLEAP (\citealt{Pereira2020.08.31.276246}), or DeepLabCut (\citealt{mathis2018deeplabcut}). This option (\protect\path{output_image_per_tracklet}), when set to true, exports cropped and (optionally) normalised videos per individual that can be imported directly into these tools -- where they might perform better than the raw video. Normalisation, for example, can make it easier for machine-learning algorithms in these tools to learn where body-parts are likely to be (see \figref{fig:maximum_val_acc_per_samples}) and may even reduce the number of clicks required during annotation.}
In \TRex{}, the 2D posture of an animal consists of (i) an outline around the outer edge of a blob, (ii) a center-line (or midline for short) that curves with the body and (iii) positions on the outline that represent the front and rear of the animal (typically head and tail). Our only assumptions here are that the animal is bilateral with a mirror-axis through its center and that it has a beginning and an end, and that the camera-view is roughly perpendicular to this axis. This is true for most animals, but may not hold e.g. for jellyfish (with radial symmetry) or animals with different symmetries (e.g. radiolaria (protozoa) with spherical symmetry). Still, as long as the animal is not exactly circular from the perspective of the camera, the midline will follow its longest axis and a posture can be estimated successfully. The algorithm implemented in our software is run for every (cropped out) image of an individual and processes it as follows:
i. A tree-based approach follows edge pixels around an object in a clock-wise manner. Drawing the line \emph{around} pixels, as implemented here, instead of through their centers, as done in comparable approaches, helps with very small objects (e.g. one single pixel would still be represented as a valid outline, instead of a single point).
ii. The pointiest end of the outline is assumed, by default, to be either the tail or the head (based on curvature and the area between the outline points in question). The assignment of head vs. tail can be set by the user, seeing as some animals might have "pointier" heads than tails (e.g. termite workers, one of the examples we employ). Posture data coming directly from an image can be very noisy, which is why the program offers options to simplify outline shapes using an Elliptical Fourier Transform (EFT, see \citealt{iwata2015genomic}, \citealt{kuhl1982elliptic}) or smoothing via a simple weighted average across points of the curve (inspired by common subdivision techniques, see \citealt{warren2001subdivision}). The EFT allows the user to set the desired level of approximation detail (via the number of elliptic fourier descriptors, EFDs) and thus make the outline "rounder" and less jittery. Using an EFT with just two descriptors is equivalent to fitting an ellipse to the animal's shape (as, for example, \verb!xyTracker! does), which is the simplest supported representation of an animal's body.
iii. The reference-point chosen in (ii) marks the start for the midline-algorithm. It walks both left and right from this point, always trying to move approximately the same distance on the outline (with limited wiggle-room), while at the same time minimizing the distance from the left to the right point. This works well for most shapes and also automatically yields distances between a midline point and its corresponding two points on the outline, estimating thickness of this object's body at this point.
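The following is a heavily simplified sketch of step (iii): starting at the chosen tip, points are paired by walking the same number of steps along the outline in both directions; each pair's midpoint becomes a midline point and the distance between the paired points a local thickness estimate. The limited "wiggle-room" \TRex{} allows when choosing the next point on either side is omitted here.
\begin{verbatim}
# Simplified midline sketch: pair outline points walked equally far from
# the tip in both directions; midpoints form the midline, pair distances
# estimate local thickness.
import numpy as np

def simple_midline(outline, tip_index):
    """outline: (N, 2) array of points ordered along the closed contour."""
    n = len(outline)
    midline, thickness = [], []
    for i in range(n // 2 + 1):
        left = outline[(tip_index + i) % n]    # walk one way around the outline
        right = outline[(tip_index - i) % n]   # ... and the other way
        midline.append((left + right) / 2.0)             # center-line point
        thickness.append(np.linalg.norm(left - right))   # local body width
    return np.asarray(midline), np.asarray(thickness)
\end{verbatim}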
Compared to the tracking itself, posture estimation is a time-consuming process and can be disabled. It is, however, required to estimate -- and subsequently normalize -- an animal's orientation in space (e.g. required later in \nameref{sec:visual_recognition}), or to reconstruct their visual field as described in the following sub-section.
\begin{figure}
\centering
\includegraphics[width=\textwidth]{figures/screenshot_visual_field.jpg}
\caption{Visual field estimate of the individual in the center (zoomed in, the individuals are approximately 2-3cm long, \videoref{vid:guppy_8_t36_d15_20191212_085800}). Right (blue) and left (orange) fields of view intersect in the binocular region (pink). Most individuals can be seen directly by the focal individual (1, green), which has a wide field of view of $260^\circ$ per eye. Individual 3 on the top-left is not detected by the focal individual directly and not part of its first-order visual field. However, second-order intersections (visualized by grey lines here) are also saved and accessible through a separate layer in the exported data.}
\label{fig:occlusion}
\videosupp{\changemade{A clip from \videoref{vid:guppy_8_t36_d15_20191212_085800}, showing \TRex{'} visual-field estimation for Individual 1. \url{https://youtu.be/yEO_3lpZIzU}}}
\end{figure}
\subsubsection{Reconstructing 2D Visual Fields}
Visual input is an important modality for many species (e.g. fish \citealt{strandburg2013visual}, \citealt{bilotta2001zebrafish} and humans \citealt{colavita1974human}). Due to its importance in widely used model organisms like zebrafish (\emph{Danio rerio}), we decided to include the capability to conduct a 2-dimensional reconstruction of each individual's visual field as part of the software. The requirements for this are successful posture estimation and that individuals are viewed from above, as is usually the case in laboratory studies.
The algorithm makes use of the fact that outlines have already been calculated during posture estimation. Eye positions are estimated to be equidistant from the "snout" and are spaced apart depending on the thickness of the body at that point (the distance is based on a ratio, relative to body-size, which can be adjusted by the user). Eye orientation is also adjustable, which influences the size of the stereoscopic part of the visual field. We then use ray-casting to intersect rays from each of the eyes with all other individuals as well as the focal individual itself (self-occlusion). Individuals not detected in the current frame are approximated using the last available posture. Data are organized as a multi-layered 1D-image of fixed size for each frame, with each image representing angles from $-180^{\circ}$ to $180^{\circ}$ for the given frame (a minimal sketch of this layout follows the list below). Simulating a limited field-of-view would thus be as simple as cropping parts of these images off the left and right sides. The different layers per pixel encode:
\begin{enumerate}
\item identity of the occluder
\item distance to the occluder
\item body-part that was hit (distance from the head on the outline in percent)
\end{enumerate}
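The sketch below illustrates only this data layout (the ray-casting that produces the hits is omitted): per eye and frame, a fixed-width, multi-layered 1D array whose columns correspond to viewing angles between $-180^{\circ}$ and $180^{\circ}$, keeping only the closest hit per column. The angular resolution and field names are illustrative and do not correspond to the names used in \TRex{'} exported data.
\begin{verbatim}
# Illustration of the multi-layered 1D visual-field layout; resolution and
# field names are chosen for this sketch only.
import numpy as np

WIDTH = 512   # angular resolution (columns); value chosen for illustration

def empty_visual_field():
    return {
        "identity":  np.full(WIDTH, -1, dtype=np.int32),        # -1: nothing seen
        "distance":  np.full(WIDTH, np.inf, dtype=np.float32),
        "body_part": np.full(WIDTH, np.nan, dtype=np.float32),  # % along outline
    }

def register_hit(field, angle_deg, identity, distance, body_part):
    # map an angle in [-180, 180) degrees to a column index
    col = int((angle_deg + 180.0) / 360.0 * WIDTH) % WIDTH
    if distance < field["distance"][col]:   # keep only the closest occluder
        field["identity"][col] = identity
        field["distance"][col] = distance
        field["body_part"][col] = body_part
\end{verbatim}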
While individuals viewed from above on a computer screen look 2-dimensional, one major disadvantage of any 2D approach is, of course, that it is merely a projection of the 3D scene. Any visual field estimator has to assume that, from an individual's perspective, other individuals act as occluders in all instances (see \figref{fig:occlusion}). This may only be partly true in the real world, depending on the experimental design, as other individuals may be able to move slightly below, or above, the focal individual's line-of-sight, revealing otherwise occluded conspecifics behind them. We therefore support multiple occlusion-layers, allowing second-order and $N$th-order occlusions to be calculated for each individual. %\emph{Utilizing individual size differences in consecutive frames (e.g. because of diving) might even yield a proxy for approximating occlusions during post-processing.}
% This also holds true for a commonly used model-organism in behavioral ecology: zebrafish larvae (\emph{Danio rerio}), who, up to a certain age, due to anatomical restrictions do not possess other sensing abilities (\emph{cite}).
\subsubsection{Realtime Tracking Option for Closed-Loop Experiments}
Live tracking is supported, as an option to the user, during the recording, or conversion, of a video in \TGrabs{}. When closed-loop feedback is enabled, \TGrabs{} focusses on maintaining stable recording frame-rates and may not track recorded frames if tracking takes too long. This is done to ensure that the recorded file can later be tracked again in full/with higher accuracy (thus no information is lost) if required, and to help the closed-loop feedback to stay synchronized with real-world events.
During development we worked with a mid-range gaming computer and Basler cameras at $90$fps and $2048^2$px resolution, where drawbacks did not occur. \changemade{Running the program on hardware with specifications below our recommendations (see \nameref{ref:hardware_recommend}), however, may affect frame-rates as described below.}
\TRex{} loads a prepared \verb!Python! script, handing down an array of data per individual in every frame. Which data fields are generated and sent to the script is selected by the script itself (a purely hypothetical script skeleton is sketched after the list below). Available fields are:
\begin{itemize}[label=\textnormal{$\bullet$}]
\item Position
\item Midline information
\item Visual field
\end{itemize}
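The skeleton below gives an idea of what such a script might look like. It is purely hypothetical: the callback name, the way fields are requested and the field names are illustrative only and do not reflect \TRex{'} actual scripting interface -- please refer to the documentation at \href{https://trex.run/docs}{trex.run/docs} for the real entry points.
\begin{verbatim}
# Hypothetical closed-loop script skeleton; names and structure are
# illustrative only and do not reflect TRex' actual scripting interface.
import numpy as np

# hypothetical: the script declares which fields it wants to receive
REQUESTED_FIELDS = ["position", "midline"]

def on_frame(individuals):
    """Hypothetical per-frame callback; `individuals` maps identity -> fields."""
    positions = np.array([ind["position"] for ind in individuals.values()])
    if len(positions) < 2:
        return
    centroid = positions.mean(axis=0)
    spread = np.linalg.norm(positions - centroid, axis=1).mean()
    if spread > 100.0:           # example condition (pixels): group dispersed
        trigger_stimulus()

def trigger_stimulus():
    # talk to external hardware/programs here (serial port, socket, ...)
    pass                         # kept as a stub in this sketch
\end{verbatim}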
If the script (\changemade{or any other part of the recording process}) takes too long to execute \changemade{in one} frame, \changemade{consecutive frames may be} dropped until a stable frame-rate can be achieved. This scales well for all computer-systems, \changemade{but results in fragmented tracking data, causing worse identity assignment, and reduces the number of frames and quality of data available for closed-loop feedback. However, since even untracked frames are saved to disk, these inaccuracies can be fixed in \TRex{} later. Alternatively, if live-tracking is enabled but closed-loop feedback is disabled, the program maintains detected objects in memory and tracks them in an asynchronous thread (potentially introducing wait time after the recording stops).} When the program terminates, the tracked individual's data are exported -- along with a \verb!results! file that can be loaded by the \verb!tracker! at a later time.
In order to make this interface easy to use for prototyping and to debug experiments, the script may be changed during its run-time and will be reloaded if necessary. Errors in the \verb!Python! code lead to a temporary pause of the closed-loop part of the program (not the recording) until all errors have been fixed.
Additionally, thanks to \verb!Python! being a fully-featured scripting language, it is also possible to call and send information to other programs during real-time tracking. Communication with other external programs may be necessary whenever easy-to-use \verb!Python! interfaces are not available for e.g. hardware being used by the experimenter.
%\textit{the following is basically a lie, until i can actually get back to the lab:}
%Closed-loop has been tested with up to 30 (mock) individuals present at the same time. With posture (and thus also visual field) being a very costly process, more individuals can be tracked at higher speeds with it disabled.
\subsection{Automatic Visual Identification Based on Machine Learning} \label{sec:visual_recognition}
Tracking, when it is based only on an individual's positional history, can be very accurate under good circumstances and is currently the fastest way to analyse video recordings or to perform closed-loop experiments. However, such tracking methods simply do not have access to enough information to ensure that identities are maintained for the entire duration of most trials -- small mistakes can and will happen. There are cases, e.g. when studying polarity (based only on short trajectory segments), or other general group-level assessments, where this is acceptable and identities do not have to be maintained perfectly. However, consistent identities are required in many individual-level assessments, and with no baseline truth available to correct mistakes, errors start accumulating until eventually all identities are fully shuffled. Even a hypothetical, \emph{perfect} tracking algorithm would not be able to yield correct results in all situations, as multiple individuals might go out of view at the same time (e.g. hiding under cover or being occluded by other animals). There is no way to tell who is whom once they re-emerge.
The only way to solve this problem is by providing an independent source of information from which to infer identity of individuals, which is of course a principle we make use of all the time in our everyday lives: Facial identification of con-specifics is something that \changemade{is easy for most humans}, to an extent where we sometimes recognize face-like features where there aren't any. Our natural tendency to find patterns enables us to train experts on recognizing differences between animals, even when they belong to a completely different taxonomic order. Tracking individuals is a demanding task, especially with large numbers of moving animals (\citealt{liu2009effect} shows humans to be effective for up to 4 objects). Human observers are able to solve simple memory recall tasks for 39 objects at only 92\% correct (see \citealt{humphrey1992recognizing}), where the presented objects do not even have to be identified individually (just classified as old/new) and contain more inherent variation than most con-specific animals would. Even with this being true, human observers are still the most efficient solution in some cases (e.g. for long-lived animals in complex habitats). Enhancing visual inter-individual differences by attaching physical tags is an effective way to make the task easier and more straight-forward to automate. RFID tags are useful in many situations, but are also limited since individuals have to be in very close proximity to a sensor in order to be detected \citep{bonter2011applications}. Attaching \changemade{fiducial markers (such as QR codes)} to animals allows for a very large number \changemade{(thousands) of individuals to be uniquely identified at the same time (see \citealt{Gernat1433}, \citealt{Wild2020.05.06.076943}, \citealt{mersch2013tracking}, \citealt{crall2015beetag}) -- and over a much greater distance than RFID tags.} Generating codes can also be automated, generating tags with optimal visual inter-marker distances \citep{garrido2016generation}, making it feasible to identify a large number of individuals with minimal tracking mistakes.
While physical tagging is often an effective method by which to identify individuals, it requires animals to be caught and manipulated, which can be difficult \citep{mersch2013tracking} and is subject to the physical limitations of the respective system. Tags have to be large enough for a program to recognize them in a video stream. Even worse, especially with increased relative tag-size, the animal's behavior may be affected by the presence of the tag \changemade{or during its application (\citealt{DENNIS20081939}, \citealt{pankiw2003effect}, \citealt{SOCKMAN2001205}),} and there might be no way for experimenters to know whether this occurred \changemade{(except with considerable effort, see \citealt{switzer2016bombus})}. In addition, for some animals, like fish and termites, attachment of tags that are effective for discriminating among a large number of individuals can be problematic, or impossible.
Recognizing such issues, \citealt{idtracker} first proposed an algorithm termed \textit{idtracker}, generalizing the process of pattern recognition for a range of different species. Training an expert program to tell individuals apart, by detecting slight differences in the patterning of their bodies, allows identities to be corrected without any human involvement. Even while being limited to about 15 individuals per group, this was a very promising approach. It was much improved upon only a few years later by the same group with their software \idtracker{} \citep{idtrackerai}, implementing a paradigm shift from explicit, hard-coded, color-difference detection to more general machine learning methods -- increasing the supported group size by an order of magnitude.
%It works in stages (or \textit{protocols}) and adapts to problem complexity by skipping later steps if the estimated training quality is deemed good enough. Stages otherwise build on previous progress, continually improving results. First, individuals are tracked by (i) detecting objects of interest and (ii) following them in the next frame by finding objects overlapping with the pixels from the last frame. In order to ensure that individuals do not merge, a secondary network is trained to distinguish between crossing and singular individuals. After individuals have been tracked, a part of the video is selected where all individuals are visible and separated from each other. This \textit{global segment} marks a starting point for the following training procedure, ensuring individual sequences to be unobstructed by crossings or other visibility issues. While a \emph{global segment} spans a certain (short) range of frames, lengths of associated segments per individual may vary: for each individual, frames extending before and after the global segment can be assumed to be correctly assigned as well, until the individual "disappears" (e.g. overlaps with another individual, moves too fast, etc.).
%Using the first set of generated samples, training commences until certain stopping criteria are fulfilled (in their paper referenced as \emph{protocol 1}). If the training quality is deemed to be good enough, training may stop here. This is the case, if (i) no two individuals are predicted to be of the same identity and (ii) the predicted probabilities per individual are certain enough. If these conditions do \textit{not} hold, other global segments have to be added to the training dataset (\emph{protocol 2}). This procedure extends the dataset step by step, until at least $99.95\%$ of images in global segments have been accumulated. The last protocol, \emph{protocol 3}, trains the first half of the network (convolutional layers) separately from the rest and then starts iterating \emph{protocol 2} again - this time only training the classification (dense) part of the network (see \figref{fig:software_overview}c).
%\subsection{Visual identification in \TRex{}}% \label{sec:preparation}
We employ a method for visual identification in \TRex{} that is similar to the one used in \idtracker{}, where a neural network is trained to visually recognize individuals and is used to correct tracking mistakes automatically, without human intervention -- the network layout (see \figref{fig:software_overview}c) is almost the same as well (differing only by the addition of a pre-processing layer and using 2D- instead of 1D-dropout layers). However, in \TRex{}, processing speed and chances of success are improved (the former being greatly improved) by (i) minimizing the variance landscape of the problem and (ii) exploring the landscape to our best ability, optimally covering all poses and lighting-conditions an individual can be in, as well as (iii) shortening the training duration by significantly altering the training process -- e.g. choosing new samples more adaptively and using different stopping-criteria (accuracy, as well as speed, are part of the later evaluation).
While \nameref{sec:tracking} already \textit{tries} to consistently follow the same individual within each trajectory, there is no way to ensure or check the validity of this process without an independent source of identity information. Generating this source of information, based on the visual appearance of individuals, is what the algorithm for visual identification, described in the following subsections, aims to achieve. Stated simply, the goal of automatic visual identification is to obtain reliable predictions of the identities of all (or most) objects in each frame. Assuming these predictions are of sufficient quality, they can be used to detect and correct potential mistakes made during \nameref{sec:tracking} by looking for identity switches within trajectories. Ensuring that predicted identities within trajectories are consistent, by proxy, also ensures that each trajectory is consistently associated with a single, real individual. In the following, before describing the four stages of the algorithm, we point out key aspects of how tracking and image data are processed, how we addressed points (i)-(iii) above, and which features ultimately improved performance compared to other solutions.
\subsubsection{Preparing Tracking-Data} \label{sec:segments}
Visual identification starts out only with the trajectories that the \nameref{sec:tracking} provides.
Tracking, on its own, is already an improvement over other solutions, especially since (unlike e.g. \idtracker{}) \TRex{} makes an effort to separate overlapping objects (see the \nameref{box:splitting-algorithm}) and is thus able to keep track of individuals for longer (see \figref{fig:segment_lengths}). Here, we -- quite conservatively -- assume that, after every problematic situation (defined in the list below), the assignments made by our tracking algorithm are wrong. Whenever a problematic situation is encountered as part of a trajectory, we split the trajectory at that point. This way, the trajectories of all individuals in a video become an assortment of trajectory snippets (termed "segments" from here on), each of which is clear of problematic situations, and for each of which the goal is to find the correct identity ("correct" meaning that identities are consistently assigned to the same \textit{real} individual throughout the video). Situations are considered "problematic", and cause the trajectory to be split, when:
\begin{itemize}[label=\textnormal{$\bullet$}]
\item \textbf{The individual has been lost for at least one frame.} For example when individuals are moving unexpectedly fast, are occluded by other individuals/the environment, or simply not present anymore (e.g. eaten).
\item \textbf{Uncertainty of assignment was too high ($>50\%$)} e.g. due to very high movement speeds or extreme variation in size between frames. With simpler tracking tasks in mind, these segments are kept as \emph{connected} tracks, but regarded as separate ones here.
\item \textbf{Timestamps suggest skipped frames.} Missing frames in the video may cause wrong assignments and are thus treated as if the individuals have been lost. This distinction can only be made if accurate frame timings are available (when recording using \TGrabs{} or provided alongside the video files in separate \protect\path{npz} files).
\end{itemize}
Unless one of the above conditions becomes true, a segment is assumed to be consecutive and connected; that is, throughout the whole segment, no mistakes have been made that lead to identities being switched. Frames where all individuals are currently within one such segment at the same time will henceforth be termed \emph{global segments}.
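To make the segmentation rule concrete, the following Python sketch shows one possible way to split per-individual trajectories at problematic situations and to derive global segments from the result. The data layout, the names and the uncertainty threshold are illustrative assumptions, not \TRex{'} actual implementation.
\begin{verbatim}
# Sketch: split per-individual trajectories into "segments" free of
# problematic situations, then intersect them across individuals to
# obtain global segments. Not the actual TRex code.
from typing import NamedTuple

class FrameRecord(NamedTuple):   # hypothetical per-frame record
    frame: int
    lost: bool           # individual not found in this frame
    uncertainty: float   # assignment uncertainty (0..1)
    frame_skipped: bool  # timestamps suggest missing video frames

def split_into_segments(records, max_uncertainty=0.5):
    """Return a list of (first_frame, last_frame) tuples."""
    segments, start, prev = [], None, None
    for r in records:
        problematic = r.lost or r.uncertainty > max_uncertainty \
                      or r.frame_skipped
        gap = prev is not None and r.frame != prev + 1
        if problematic or gap:
            if start is not None:
                segments.append((start, prev))
            start = None if problematic else r.frame
        elif start is None:
            start = r.frame
        prev = r.frame
    if start is not None:
        segments.append((start, prev))
    return segments

def global_segments(per_individual_segments, first_frame, last_frame):
    """Frame ranges in which *all* individuals are inside a segment."""
    def covered(frame, segments):
        return any(a <= frame <= b for a, b in segments)
    frames = [f for f in range(first_frame, last_frame + 1)
              if all(covered(f, s) for s in per_individual_segments)]
    ranges, start = [], None
    for i, f in enumerate(frames):
        if start is None:
            start = f
        if i + 1 == len(frames) or frames[i + 1] != f + 1:
            ranges.append((start, f))
            start = None
    return ranges
\end{verbatim}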
Since we know that there are no problematic situations inside each per-individual segment, and thus also not across individuals within the range of a global segment, we can choose any global segment as a basis for an initial, arbitrary assignment of identities to trajectories. One of the most important steps of the identification algorithm then becomes deciding which global segment is the best starting point for the training. If a mistake is made here, consecutive predictions for other segments will fail and/or produce unreliable results in general. %The next sub-section describes the process of estimating the quality of global segments, allowing the program to assign a priority to each one and order them accordingly.
Only a limited set of global segments is kept -- striking a balance between respecting user-given constraints and capturing as much of the variance as possible. In many of the videos used for evaluation, we found that only few segments had to be considered -- however, computation time is ultimately bounded by reducing the number of qualifying segments. At the same time, it is beneficial to avoid auto-correlation by incorporating samples from all sections of the video instead of sourcing them from only a small portion. To balance these concerns, global segments are binned by their middle frame into four bins (each quarter of the video being one bin) and the number of segments inside each bin is then reduced. To that end, we sort the segments within each bin by their "quality" -- a combination of two factors:
\begin{enumerate}
\item To capture as much as possible of the variation due to an individual's own movement, as well as of the background that it moves across, a "good" segment is one in which all individuals move as much as possible and travel as large a distance as possible. Thus, we derive a per-individual \textit{spatial coverage descriptor} for the given segment by (virtually) dissecting the arena into a grid of equally sized, rectangular "cells" (their exact dimensions depending on the aspect ratio of the video). Each time an individual's center-point moves from one cell to the next, a counter is incremented for that individual. To avoid situations where, for example, all individuals but one are moving, we use only the lowest per-individual spatial coverage value to represent a given segment.
\item It is beneficial to have more examples for the network to learn from. Thus, as a second sorting criterion, we use the average number of samples per individual.
\end{enumerate}
After being sorted according to these two metrics, the list of segments per bin is reduced, according to a user-defined variable (4 by default), leaving only the most viable options per quarter of video.
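A condensed sketch of this ranking, in Python, is given below; the grid resolution and the assumed data layout (a dictionary of center-points per individual) are illustrative choices, not \TRex{'} internals.
\begin{verbatim}
# Sketch of the bin-wise segment ranking: spatial coverage (grid cells
# visited) first, average samples per individual second.
def cells_visited(points, arena_w, arena_h, grid=10):
    """Count transitions of a center-point between grid cells."""
    cw, ch = arena_w / grid, arena_h / grid
    count, prev = 0, None
    for x, y in points:
        cell = (int(x // cw), int(y // ch))
        if prev is not None and cell != prev:
            count += 1
        prev = cell
    return count

def segment_quality(segment, arena_w, arena_h):
    centers = segment["centers"]            # {individual: [(x, y), ...]}
    coverage = min(cells_visited(p, arena_w, arena_h)
                   for p in centers.values())
    avg_samples = sum(len(p) for p in centers.values()) / len(centers)
    return (coverage, avg_samples)

def reduce_global_segments(segments, video_length, per_bin=4,
                           arena_w=1024, arena_h=1024):
    bins = [[] for _ in range(4)]           # one bin per video quarter
    for seg in segments:
        middle = (seg["start"] + seg["end"]) // 2
        bins[min(3, middle * 4 // video_length)].append(seg)
    kept = []
    for b in bins:
        b.sort(key=lambda s: segment_quality(s, arena_w, arena_h),
               reverse=True)
        kept.extend(b[:per_bin])            # keep the best 4 per bin
    return kept
\end{verbatim}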
The number of visited cells may, at first, appear to be essentially equivalent to a spatially normalized \textit{distance travelled} (as used in \idtracker{}). In edge cases, where individuals never stop or never move, both metrics are indeed very similar. However, one can imagine an individual continuously moving around in the same corner of the arena, which would be counted as an equally good segment for that individual as if it had traversed the whole arena (thus capturing all variable environmental factors). In most cases, using such highly restricted movement for training is problematic, and worse than using a shorter segment of the individual moving diagonally through the entire space, since the latter captures more of the variation in background, lighting conditions and the animal's movement.
\subsubsection{Minimizing the Variance Landscape by Normalizing Samples} \label{sec:posture_normalization}
A big strength of machine learning approaches is their resistance to noise in the data. Generally, any machine learning method will likely still converge -- even with noisy data. Eliminating unnecessary noise and degrees of freedom in the dataset, however, will typically help the network to converge much more quickly: tasks that are easier to solve will of course also be solved more accurately within similar or smaller timescales. This is because the optimizer does not have to consider various parts of the possible parameter-space during training or, put differently, because the overall parameter-space is shrunk to the smallest possible size without losing important information. The simplest such optimization, included in most tracking and visual identification approaches, is to segment out the objects and center the individuals in the cropped-out images. This means that (i) the network does not have to consider the whole image, (ii) it only needs to consider one individual at a time and (iii) the corners of the image can most likely be neglected.
\begin{figure}[t]
\centering
\includegraphics[width=\textwidth]{figures/fig_normalization.pdf}
\caption{Comparison of different normalization methods. Images all stem from the same video and belong to the same identity. The video has previously been automatically corrected using the visual identification. Each object visible here consists of $N$ images $M_i, i\in[0,N]$ that have been accumulated into a single image using $\min_{i\in [0,N]}M_i$, with $\min$ being the element-wise minimum across images. The columns represent the same samples from the same frames, but normalized in three different ways: In (a), images have not been normalized at all. Images in (b) have been normalized by aligning the objects along their main axis (calculated using \textit{image-moments}), which only gives the axis within 0 to 180 degrees. In (c), all images have been aligned using posture information generated during the tracking process. As the images become more and more recognizable to \textit{us} from left to right, the same applies to a network trying to tell identities apart: Reducing noise in the data speeds up the learning process.}
\label{fig:datasets_comparison}
\end{figure}
Further improving on this, approaches like \idtracker{} align all objects along their most-elongated axis, essentially removing global orientation as a degree of freedom. The orientation of an arbitrary object can be calculated e.g. using an approach often referred to as image-moments \citep{hu1962visual}, yielding an angle within $[0,180)^\circ$. Of course, this means that
\begin{enumerate}
\item circular objects have a random (noisy) orientation
\item elongated objects (e.g. fish) can be either head-first or flipped by $180^\circ$ and there is no way to discriminate between those two cases (see second row, \figref{fig:datasets_comparison})
\item a C-shaped body deformation, for example, results in a slightly bent axis, meaning that the head will not be in exactly the same position as with a straight posture of the animal.
\end{enumerate}
Each of these issues adds to what the network has to learn to account for, widening the parameter-space to be searched and increasing computation time. However, barring the first point, each problem can be tackled using the already available posture information. Knowing the positions of head and tail, as well as points along the individual's center-line, each individual's head can be locked roughly into a single position. This leaves room only for the rear end to move, reducing variation in the data to a minimum (see \figref{fig:datasets_comparison}). In addition to faster convergence, this also results in better generalization right from the start, even with a smaller number of samples per individual (see \figref{fig:maximum_val_acc_per_samples}). \changemade{For further discussion of highly deformable bodies, such as those of rodents, please see the Appendix (\nameref{sec:deformable_bodies}).}
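A minimal sketch of such a posture-based normalization, using OpenCV, is shown below. It assumes that the head position and a heading angle are available from posture estimation; the output size, the target head position and the sign convention of the rotation are illustrative assumptions.
\begin{verbatim}
# Sketch: rotate and translate a cropped image so that the head lands on
# a fixed position and the body axis points in a fixed direction.
import cv2

def normalize_sample(image, head_xy, heading_deg, out_size=80,
                     head_target=(0.6, 0.5)):
    # rotate around the head so all individuals face the same direction
    # (the sign of the angle depends on how the heading is defined)
    M = cv2.getRotationMatrix2D(tuple(map(float, head_xy)),
                                heading_deg, 1.0)
    # shift the head onto its target location in the output image
    M[0, 2] += head_target[0] * out_size - head_xy[0]
    M[1, 2] += head_target[1] * out_size - head_xy[1]
    return cv2.warpAffine(image, M, (out_size, out_size),
                          flags=cv2.INTER_LINEAR, borderValue=0)
\end{verbatim}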
\subsubsection{Guiding the Training Process} \label{sec:training_quality}
Per batch, the stochastic gradient descent is directed by the local accuracy (the fraction of correct predictions within the batch), which is a simple and commonly used metric that requires no prior knowledge of where the samples within a batch come from. This has the desirable consequence that no knowledge about the temporal arrangement of images is necessary in order to train and, more importantly, to apply the network later on.
In order to achieve accurate results quickly across batches, while at the same time making it possible to point the user to potentially problematic sequences within the video, we devised a metric that can be used to estimate local as well as global training quality: we term it \textit{uniqueness}. It combines information about all objects within a frame, following the principle of non-duplication: images of individuals within the same frame are required to be assigned different identities by the network's predictions.
\begin{featurebox}
\caption{Calculating uniqueness for a frame}
\label{box:uniqueness_score}
\begin{algorithm}[H]
\DontPrintSemicolon
\KwData{
frame $x$
}
\KwResult{Uniqueness score for frame $x$}
uids = map\{\}\;
$\hat{p}\given{i|b}$ is the probability of blob $b$ to be identity $i$\;
$f(x)$ returns a list of the tracked objects in frame $x$\;
$E(v) = \left(1 + \exp(-\pi)\right) / \left(1 + \exp(-\pi v)\right)$ is a shift of roughly $+0.5$ and non-linear scaling of values $0\leq v\leq 1$\;
\;
\ForEach{object $b \in f(x)$}{
$\mathrm{maxid} = \argmax{i} \hat{p}\given{i|b}$ with $i \in \mathrm{identities}$\;
\eIf{maxid $\in$ uids}{
$\mathrm{uids}[\mathrm{maxid}] = \max(\mathrm{uids}[\mathrm{maxid}], \hat{p}(\mathrm{maxid}, b))$
}{
$\mathrm{uids}[\mathrm{maxid}] = \hat{p}(\mathrm{maxid}, b)$
}
}
\Return{$|f(x)|^{-1}|\mathrm{uids}| * E\left(|\mathrm{uids}|^{-1} \left(\sum_{i \in \mathrm{uids}} \mathrm{uids}[i]\right)\right)$}\;
\caption{The algorithm used to calculate the uniqueness score for an individual frame. Probabilities $\hat{p}\given{i|b}$ are predictions by the pre-trained network. During the accumulation these predictions will gradually improve proportional to the global training quality. Multiplying the unique fraction $|f(x)|^{-1}|\mathrm{uids}|$ (the number of uniquely predicted identities divided by the number of objects) by the (scaled) mean probability deals with cases of low accuracy, where individuals switch every frame (but uniquely).}
\end{algorithm}
\end{featurebox}
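A direct transcription of Box \ref{box:uniqueness_score} into Python reads as follows, with the unique fraction written explicitly as the number of uniquely predicted identities divided by the number of tracked objects in the frame:
\begin{verbatim}
# Python transcription of the uniqueness score for a single frame.
# prob_vectors: one predicted probability vector per tracked object.
import numpy as np

def E(v):
    """Shift of roughly +0.5 and non-linear scaling of 0 <= v <= 1."""
    return (1.0 + np.exp(-np.pi)) / (1.0 + np.exp(-np.pi * v))

def uniqueness(prob_vectors):
    uids = {}
    for p in prob_vectors:
        maxid = int(np.argmax(p))
        uids[maxid] = max(uids.get(maxid, 0.0), float(p[maxid]))
    unique_fraction = len(uids) / len(prob_vectors)
    mean_prob = sum(uids.values()) / len(uids)
    return unique_fraction * E(mean_prob)
\end{verbatim}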
The program generates image data for evenly spaced frames across the entire video. All images of tracked individuals within the selected frames are, after every epoch of the training, passed on to the network. It returns a vector of probabilities $p_{ij}$ for each image $i$ to be identity $j\in\{1,\dots,N\}$, with $N$ being the number of individuals. Based on these probabilities, uniqueness can be calculated as in Box \ref{box:uniqueness_score}, evenly covering the entire video. The magnitude of this probability vector per image is taken into account, rewarding strong predictions of $\max_j \left\{ p_{ij} \right\}=1$ and penalizing weak predictions of $\max_j \left\{ p_{ij} \right\} <1$.
Uniqueness is not integrated as part of the loss function, but is instead used as a global gradient, evaluated before and after each training unit in order to detect global improvements. Based on the average uniqueness calculated before and after a training unit, we can determine whether to stop the training, or whether training on the current segment made our results worse (e.g. due to faulty data). If uniqueness is consistently high throughout the video, then training has been successful and we may terminate early. Otherwise, valleys in the uniqueness curve indicate bad generalization and thus currently missing information regarding some of the individuals. In order to detect problematic sections of the video we search for values below $1-\frac{0.5}{N}$, meaning that the section potentially contains new information we should be adding to our training data. Using accuracy per batch, and uniqueness to determine global progress, we get the best of both worlds: a context-free prediction method that is trained on global segments that are strategically selected by utilizing local context information.
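As a small illustration, the check for problematic sections can be expressed as follows (frame indices and the container type are arbitrary):
\begin{verbatim}
# Flag frames whose uniqueness falls below 1 - 0.5/N, i.e. sections that
# likely contain information missing from the training data.
def problematic_frames(uniqueness_per_frame, n_identities):
    threshold = 1.0 - 0.5 / n_identities
    return [f for f, u in sorted(uniqueness_per_frame.items())
            if u < threshold]
\end{verbatim}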
The closest example of such a procedure in \idtracker{} is the termination criterion after \textit{protocol 1}, which states that individual segments have to be consistent and certain enough in all global segments in order to stop iterating. While this seems to be similar at first, the way accuracy is calculated and the terminology here are quite different: (i) Every metric in \idtracker{'s} final assessment after \textit{protocol 1} is calculated at segment-level, not utilizing per-frame information. \textit{Uniqueness} works per-frame, not per segment, and considers individual frames to be entirely independent from each other. It can be considered a much stronger constraint set upon the network's predictive ability, seeing as it essentially counts the number of times mistakes are estimated to have happened within single frames. Averaging only happens \textit{afterwards}. (ii) The terminology of identities being unique is used in \idtracker{} only once, after \textit{protocol 1}, and essentially as a binary value, not recognizing its potential as a descendable gradient. Images are simply added until a certain percentage of images has been reached, at which point accumulation is terminated. (iii) Testing uniqueness is much faster than testing network accuracy across segments, seeing as the same images are tested over and over again (meaning they can be cached) and the testing dataset can be much smaller due to its locality. \textit{Uniqueness} thus provides a stronger gradient estimation, while at the same time being more local (meaning it can be used independently of whether images are part of global segments), as well as more manageable in terms of speed and memory size.
In the next four sections, we describe the training phases of our algorithm (1-3), and how the successfully trained network can be used to automatically correct trajectories based on its predictions (4).
\subsubsection{1. The Initial Training Unit}
All global segments are considered and sorted by the criteria listed in \nameref{sec:accumulation_quality_criteria} below. The most suitable segment (the first in that sorted set) is used as the initial dataset for the network. Images are split into a training and a validation set (4:1 ratio). Efforts are made to equalize the sample sizes per class/identity beforehand, but there is always a trade-off between similar sample sizes (encouraging unbiased priors) and having as many samples as possible available for the network to learn from. Thus, in order to alleviate some of the severity of dealing with imbalanced datasets, performance during training iterations is evaluated using a categorical focal loss function \citep{lin2017focal}. Focal loss down-weights classes that are already reliably predicted by the network and in turn emphasizes neglected classes. An Adam optimizer \citep{kingma2014adam} is used to traverse the loss landscape towards the global (or at least a local) minimum.
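For reference, the categorical focal loss takes the form $-\alpha (1-p_t)^{\gamma} \log(p_t)$ for the probability $p_t$ assigned to the true class. A NumPy sketch is given below; the parameter values are the commonly used defaults from \citep{lin2017focal}, not necessarily the values used in \TRex{}.
\begin{verbatim}
# NumPy sketch of a categorical focal loss; gamma down-weights classes
# that are already predicted reliably. Parameter values are the commonly
# used defaults, not necessarily those used by TRex.
import numpy as np

def categorical_focal_loss(y_true, y_pred, gamma=2.0, alpha=0.25,
                           eps=1e-7):
    """y_true: one-hot (samples x classes); y_pred: softmax output."""
    y_pred = np.clip(y_pred, eps, 1.0 - eps)
    cross_entropy = -y_true * np.log(y_pred)
    weight = alpha * np.power(1.0 - y_pred, gamma)
    return float(np.sum(weight * cross_entropy, axis=-1).mean())
\end{verbatim}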
The network layout used for the classification in \TRex{} (see \figref{fig:software_overview}c) is a typical Convolutional Neural Network (CNN). The concepts of "convolutional" and "downsampling" layers, as well as the back-propagation used during training, are not new. They were introduced in \cite{fukushima1988neocognitron}, inspired originally by the work of Hubel and Wiesel on cats and rhesus monkeys (\citealt{hubel1959receptive}, \citealt{hubel1963receptive}, \citealt{wiesel1966spatial}), describing receptive fields and their hierarchical structure in the visual cortex. Soon afterward, in \cite{lecun1989backpropagation}, CNNs, in combination with back-propagation, were already successfully used to recognize handwritten ZIP codes -- for the first time, the learning process was fully automated; a critical step towards making their application practical, and the reason they are popular today.
The network architecture used in our software is similar to the identification module of the network in \cite{idtrackerai} and is, as in most typical CNNs, (reverse-)pyramid-like. However, key differences between \TRex{'} and \idtracker{'s} procedures lie in the way that training data are prepared (see previous sections) and how further segments are accumulated and evaluated (see next section). Furthermore, contrary to \idtracker{'s} approach, images in \TRex{} are augmented (during training) before being passed on to the network. While this augmentation is relatively simple (a random shift of the image in x-direction), it can help to account for positional noise introduced e.g. by the posture estimation or the video itself when the network is used for predictions later on \citep{perez2017effectiveness}. We do not flip or rotate the image in this step, since this would defeat the purpose of using orientation normalization in the first place (as in \nameref{sec:posture_normalization}, see \figref{fig:datasets_comparison}). Here, in fact, normalization of object orientation (during training and predictions) can be seen as a superior alternative to data augmentation.
The input data for \TRex{'} network is a single, cropped grayscale image of an individual (see \figref{fig:software_overview}c). This image is first passed through a "lambda" layer (blue) that normalizes the pixel values, dividing them by half the value limit ($255 / 2 = 127.5$) and subtracting $1$ -- moving them into the range $[-1,1]$. From then on, sections are a combination of convolutional layers (kernel sizes of 16, 64 and 100 pixels), each followed by a 2D (2x2) max-pooling and a 2D spatial dropout layer (with a rate of 0.25). Within each of these blocks the input data is reduced further, focusing it down to the information deemed important. Towards the end, the data are flattened and flow into a densely connected layer (100 units), followed by an output layer with exactly as many units as the number of classes. The output is a vector with values between $0$ and $1$, which, due to softmax-activation, sum to $1$.
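A \texttt{tf.keras} sketch of such an architecture is given below. The input resolution, kernel sizes, activation functions and the interpretation of 16/64/100 as the number of filters per block are assumptions based on the description above, not a verbatim copy of the network shipped with \TRex{}.
\begin{verbatim}
# Sketch of the described CNN: normalization, three convolutional blocks
# (each with max-pooling and spatial dropout), then dense layers.
import tensorflow as tf
from tensorflow.keras import layers

def build_identification_network(n_classes, input_size=80):
    inputs = tf.keras.Input(shape=(input_size, input_size, 1))
    # normalize pixel values from [0, 255] into [-1, 1]
    x = layers.Lambda(lambda t: t / 127.5 - 1.0)(inputs)
    for filters in (16, 64, 100):
        x = layers.Conv2D(filters, kernel_size=5, padding="same",
                          activation="relu")(x)
        x = layers.MaxPooling2D(pool_size=(2, 2))(x)
        x = layers.SpatialDropout2D(rate=0.25)(x)
    x = layers.Flatten()(x)
    x = layers.Dense(100, activation="relu")(x)
    outputs = layers.Dense(n_classes, activation="softmax")(x)
    return tf.keras.Model(inputs, outputs)
\end{verbatim}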
Training commences by performing a stochastic gradient descent (using the Adam optimizer, see \citealt{kingma2014adam}), which iteratively minimizes the error between network predictions and previously known associations of images with identities -- the original assignments within the initial frame segment. The optimizer's behavior in the last five epochs is continuously observed and training is terminated immediately if one of the following criteria is met:
\begin{itemize}[label=\textnormal{$\bullet$}]
\item the maximum number of iterations is reached (150 by default, but can be set by the user)
\item a plateau is achieved at a high per-class accuracy
\item overfitting/overly optimizing for the training data at the loss of generality
\item no further improvements can be made (due to the accuracy within the current training data already being $1$)
\end{itemize}
The initial training unit is also by far the most important, as it determines the predicted identities within further segments that are to be added. Obtaining high-quality training results here thus outweighs the risk of overfitting, and the algorithm has to be relatively conservative regarding termination criteria. Later iterations, however, are only meant to extend an already existing dataset and thus (with computation speed in mind) allow for additional termination criteria to be added (see the sketch after the following list):
\begin{itemize}[label=\textnormal{$\bullet$}]
\item plateauing at/circling around a certain \protect\path{val_loss} level
\item plateauing around a certain uniqueness level
\end{itemize}
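The sketch below condenses these termination checks into a single function. The five-epoch window follows the description above; the individual thresholds are illustrative assumptions, not \TRex{'} default values.
\begin{verbatim}
# Sketch of the termination checks applied to the last five epochs.
# history: list of {'acc': ..., 'val_acc': ...} entries, one per epoch.
def should_stop(history, max_epochs=150, plateau_eps=1e-3,
                high_accuracy=0.97, overfit_margin=0.1):
    if len(history) >= max_epochs:
        return True                      # iteration limit reached
    if len(history) < 5:
        return False
    accs = [h["acc"] for h in history[-5:]]
    val_accs = [h["val_acc"] for h in history[-5:]]
    if max(accs) - min(accs) < plateau_eps and \
            min(val_accs) >= high_accuracy:
        return True                      # plateau at high accuracy
    if min(accs) - max(val_accs) > overfit_margin:
        return True                      # training >> validation accuracy
    if min(accs) >= 1.0:
        return True                      # nothing left to learn
    return False
\end{verbatim}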
\subsubsection{2. Accumulation of Additional Segments and Stopping-Criteria}
If necessary, initial training results can be improved by adding more samples to the active dataset. This could be done manually by the user, always trying to select the most promising segment next, but requiring such manual work is not acceptable for high-throughput processing. Instead, in order to translate this idea into features that can be calculated automatically, the following set of metrics is re-generated per (yet inactive) segment after each successful step:
\begin{enumerate} \label{sec:accumulation_quality_criteria}
\item Average uniqueness index (rounded to an integer percentage in 5\% steps)
\item Minimal distance to regions that have previously been trained on (rounded to the next power of two), larger is better as it potentially includes samples more different from the already known ones
\item Minimum \textit{cells visited} per individual (larger is better for the same reason as 2)
\item Minimum average samples per individual (larger is better)
\item Whether its image data has already been generated before (mostly for saving memory)
\item The uniqueness value is smaller than $U_{prev}^2$ after 5 steps, with $U_{prev}$ being the best uniqueness value previous to the current accumulation step
\end{enumerate}
With the help of these values, the segment list is sorted and the best segment is selected to be considered next. Adding a segment to the set of already active samples requires us to correct the identities inside it, potentially switching temporary identities so that they represent the same \textit{real} identities as in our previous data. This is done by predicting identities for the new samples using the network that has been trained on the old samples. Making mistakes here can lead to significant subsequent problems, so only plausible segments are added -- meaning only those samples are accepted for which the predicted IDs are \textit{unique} within each unobstructed sequence of frames for every temporary identity. If multiple temporary individuals are predicted to be the same real identity, the segment is saved for later and the search continues.
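In essence, a candidate segment passes this plausibility check only if the (averaged) predictions map each of its temporary identities onto a distinct real identity, as in the following sketch (the data layout is hypothetical):
\begin{verbatim}
# Accept a candidate segment only if the predicted identities are unique
# across its temporary identities (tracks).
import numpy as np

def plausible_assignment(predictions_per_track):
    """predictions_per_track: {track_id: [probability vectors, ...]}"""
    assigned = {}
    for track_id, vectors in predictions_per_track.items():
        mean_prob = np.mean(vectors, axis=0)
        predicted = int(np.argmax(mean_prob))
        if predicted in assigned.values():
            return None        # two tracks claim the same real identity
        assigned[track_id] = predicted
    return assigned            # {track_id: predicted real identity}
\end{verbatim}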
If multiple additional segments are found, the program tries to actively improve local uniqueness valleys by adding samples first from regions with comparatively \textit{low} accuracy predictions. Seeing as low-accuracy regions will also most likely fail to produce unique predictions, it is important to emphasize here that this is generally not a problem for the algorithm: failed segments are simply ignored and can be inserted back into the queue later. Smoothing the uniqueness curve also ensures that regions close to valleys are preferred, making the algorithm follow the valley walls upwards in both directions.
Finishing a training unit does not necessarily mean that it was successful. Only network states that improve upon the results from previous units are considered and saved. Any training result -- except the initial one -- may be rejected after training if the uniqueness score has neither improved globally nor at least remained within 99\% of the previous best value. This ensures stability of the process, even with tracking errors present (which can be corrected for later on, see next section). If a segment is rejected, the network is restored to the best recorded state.
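Condensed into code, this acceptance rule amounts to:
\begin{verbatim}
# A training unit is kept only if global uniqueness improved, or at
# least stayed within 99% of the best value recorded so far.
def accept_training_unit(uniqueness_after, best_uniqueness):
    return uniqueness_after >= 0.99 * best_uniqueness
\end{verbatim}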
Each new segment is always combined with regularly sampled data from previous steps, ensuring that identities do not switch back and forth between steps due to uncertain predictions. If switching did occur, the uniqueness and accuracy values could never reach high-value regimes -- leading to the training unit being discarded as a result. The contribution of each previously added segment $R$ is limited by sub-sampling it with a step size of $\ceil{|R_S| / ( \mathrm{samples\_max} * |R| / N )}$, i.e. only every $n$-th image is kept, with $N$ being the total number of frames in global segments for this individual and $\mathrm{samples\_max}$ a constant that is calculated from image size and memory constraints (1GB by default). $R_S$ is the actual \textit{usable} number of images in segment $R$. This limitation is an attempt not to bias the priors of the network, by sub-sampling segments according to their contribution to the total number of frames in global segments.
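A worked example of this sub-sampling step is shown below; the value chosen for $\mathrm{samples\_max}$ is purely illustrative.
\begin{verbatim}
# Worked example: keep only every n-th image of a previously added
# segment R. samples_max is an arbitrary illustrative value.
import math

def step_size(usable_images, segment_length, total_global_frames,
              samples_max=10000):
    per_class_budget = samples_max * segment_length / total_global_frames
    return max(1, math.ceil(usable_images / per_class_budget))

# a segment of 2000 frames with 1800 usable images, out of 20000
# global-segment frames in total for this individual:
print(step_size(1800, 2000, 20000))   # -> 2, i.e. keep every 2nd image
\end{verbatim}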
Training is considered to be globally successful as soon as either (i) the accumulated per-individual gaps between sampled regions amount to less than 25\% of the video length for all individuals, or (ii) uniqueness has reached a value higher than \changemade{
\inlineequation[eq:gooduniqueness]{1-\frac{0.5}{N_{\mathrm{id}}}} }
so that almost all detected identities are present exactly once per frame. Otherwise, training will be continued as described above with additional segments -- each time extending the percentage of images seen by the network further.
Training accuracy/consistency could potentially be further improved by letting the program add an arbitrary number of additional segments; however, we found this not to be necessary in any of our test-cases. Users can set a custom limit if required for their specific cases.
\subsubsection{3. The Final Training Unit}
After the accumulation phase, one last training step is performed. In previous steps, validation data has been kept strictly separate from the training set to get a better gauge of how generalizable the results are to unseen parts of the video. \changemade{This is especially important during early training units, since "overfitting" is much more likely to occur in smaller datasets and we still potentially need to add samples from different parts of the video. Now that we are not going to extend our training dataset anymore, maintaining generalizability is no longer the main} objective -- so why not use \textit{all} of the available data? The entire dataset is simply merged and sub-sampled again, according to the memory strategy used. Network training is started, with a maximum of $\max\{ 3; \mathrm{max\_epochs} * 0.25 \}$ iterations (max\_epochs is 150 by default). During this training, the same stopping-criteria apply as during the initial step.
\changemade{Even if we tolerate the risk of potentially overfitting on the training data, there is still a way to detect} overfitting if it occurs: \changemade{Only training steps that lead to improvements in mean uniqueness across the video} are saved. \changemade{Often, if prediction results become worse (e.g. due to overfitting), multiple individuals in a single frame are predicted to be the same identity -- precisely the problem which our uniqueness metric was designed to detect.}
\changemade{F}or some videos, this is the step where most progress is made (e.g. \videoref{vid:15locusts1h}). The reason is that this is the first time that all of the training data from all segments is considered at once (instead of mostly the current segment plus fewer samples from previously accepted segments), and samples from all parts of the video \changemade{have} an equal likelihood of being used in training after possible reduction due to memory-constraints.
\subsubsection{4. Assigning Identities Based on Network Predictions}
After the network has been successfully trained, all parts of the video which were not part of the training are packaged together and the network calculates predictive probabilities for each image of each individual to be any of the available identities. The vectors returned by the network are then averaged per consecutive segment per individual. The average probability vectors of all overlapping segments are weighed against each other -- usually forcing assignment to the most likely identity (ID) for each segment, given that no other segments have similar probabilities. A segment, in this context, simply refers to a number of consecutive frames of one individual for which the tracker is fairly sure that \textit{no} mix-ups occurred. We also implemented a way to detect tracking mistakes, which is described at the end of this section.
If an assignment is ambiguous, meaning that multiple segments $S_j, \dots, S_M$ overlapping in time have the same maximum probability index $\argmax{i\in[0,N]} \left\{ P\given{i|S_j} \right\}$ (for the segment to belong to a certain identity $i$), a decision has to be made. Assignments are deferred if the ratio
$$ R_\mathrm{max} = \max\left\{
\frac{P\given{i | S_j}}{P\given{i | S_k}}, \forall S_{j\not= k}\in \mathrm{\ overlapping\ segments} \right\} $$
between any two maximal probabilities is \textit{larger than} $0.6$ for said $i$ ($R_\mathrm{max}$ is inverted if it is greater than $1$). In such a case, we rely on the general purpose tracking algorithm to pick a sensible option -- other identities might even be successfully assigned (using network predictions) in following frames, which is a complexity we do not have to deal with here. In case all ratios are \textit{below} $0.6$, when the best choices per identity are not too ambiguous, the following steps are performed to resolve remaining conflicts:
\begin{enumerate}
\item count the number of samples $N_{me}$ in the current segment, and the number of samples $N_{he}$ in the other segment that this segment is compared to
\item calculate average probability vectors $P_{me}$ and $P_{he}$
\item if $S(P_{me}, N_{me}) \ge S(P_{he}, N_{he})$, then assign the ID in question to the current segment, otherwise to the other segment, where:
\begin{equation}
\begin{split}
\mathrm{norm}(x) = \frac{x}{N_{me} + N_{he}},\ &
\mathrm{sig}(x) = \left(1 + e^{2\pi(0.5-x)}\right)^{-1} \\
S(p,x) = \mathrm{sig}(p) &+ \mathrm{sig}(\mathrm{norm}(x)) .
\end{split}
\end{equation}
\end{enumerate}
This procedure prefers segments with larger numbers of samples over segments with fewer samples, ensuring that identities are not switched around randomly whenever a short segment (e.g. of noisy data) is predicted to be the given identity for a few frames -- at least as long as a better alternative is available. The non-linearity in $S(p,x)$ exaggerates differences between lower values and dampens differences between higher values: For example, the quality of a segment with $4000$ samples is barely different from a segment with $5000$ samples; however, there is likely to be a significant quality difference between segments with $10$ and $100$ samples.
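The comparison itself can be written compactly as follows; here, $p$ stands for the averaged probability of the identity in question within the respective segment, which is an interpretation of the probability vectors above.
\begin{verbatim}
# Tie-breaking between two overlapping segments competing for the same
# identity, using S(p, x) = sig(p) + sig(norm(x)) as defined above.
import math

def sig(x):
    return 1.0 / (1.0 + math.exp(2.0 * math.pi * (0.5 - x)))

def S(p, n_samples, n_total):
    return sig(p) + sig(n_samples / n_total)

def assign_to_current(p_me, n_me, p_he, n_he):
    total = n_me + n_he
    return S(p_me, n_me, total) >= S(p_he, n_he, total)
\end{verbatim}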
In case something goes wrong during the tracking, e.g. an individual is switched with another individual without the program knowing that this might have happened, the training might still be successful (for example, if that particular segment has not been used for training). In such cases, the program tries to correct for identity switches mid-segment by calculating a running-window median identity throughout the whole segment. If the predicted identity switches for a significant length of time, the segment is split up at the point of the first change within the window (before identities are assigned to segments) and the two parts are handled as separate segments from then on.
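A simple version of such a running-window median check might look like this (the window size is an illustrative assumption):
\begin{verbatim}
# Detect a sustained identity switch within a segment by comparing a
# running-window median of predicted identities to the initial median.
import statistics

def find_switch(predicted_ids, window=25):
    """Return the index of the first sustained change, or None."""
    if len(predicted_ids) < window:
        return None
    reference = statistics.median_low(predicted_ids[:window])
    for i in range(len(predicted_ids) - window + 1):
        if statistics.median_low(predicted_ids[i:i + window]) != reference:
            return i            # split the segment at this point
    return None
\end{verbatim}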
\section{Software and Licenses}
\TRex{} is published under the GNU GPLv3 license (see \href{https://choosealicense.com/licenses/gpl-3.0/}{here} for permissions granted by GPLv3). All of the code has been written by the first author of this paper (a few individual lines of code from other sources have been marked inside the code). While none of the following libraries are distributed alongside \TRex{} (they have to be provided separately), they are used by it: OpenCV (\href{https://opencv.org/about/}{opencv.org}) is a core library, used for all kinds of image manipulation. GLFW (\href{https://www.glfw.org}{glfw.org}) helps with opening application windows and maintaining graphics contexts, while DearImGui (\href{https://github.com/ocornut/imgui}{github.com/ocornut/imgui}) provides some more abstractions regarding graphics. \texttt{pybind11} (\cite{pybind11}) is used for Python integration within the C++ environment. miniLZO (\href{http://www.oberhumer.com/opensource/lzo/\#minilzo}{oberhumer.com/opensource/lzo}) is used for compression of PV frames. Optional bindings are available for the FFMPEG (\href{http://ffmpeg.org}{ffmpeg.org}) and libpng libraries, if present. GNU Libmicrohttpd (\href{https://www.gnu.org/software/libmicrohttpd/}{gnu.org/software/libmicrohttpd}), if available, can be used for an HTTP interface to the software, but is non-essential.
\section{Acknowledgments}
We thank A. Albi, F. Nowak, H. Hugo, D. E. Bath, F. Oberhauser, H. Naik, J. Graving, and I. Etheredge for their insights, for providing videos, for comments on the manuscript, for testing the software and for frequent coffee breaks during development. The development of this software would not have been possible without them. \changemade{We thank D. Mink and M. Groettrup for providing additional video material of mice. We thank the reviewers and editors for their constructive and useful comments and suggestions.} IDC acknowledges support from the NSF (IOS-1355061), the Office of Naval Research grant (ONR, N00014-19-1-2556), the Struktur- und Innovationsfunds f\"{u}r die Forschung of the State of Baden-W\"{u}rttemberg, the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under Germany's Excellence Strategy--EXC 2117-422037984, and the Max Planck Society.
\bibliography{elife-sample}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% APPENDICES
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\appendix
\setcounter{table}{0}
%\textit{\textbf{Appendix \arabic{appendix}