diff --git a/.gitignore b/.gitignore index 47be7db0..61907613 100644 --- a/.gitignore +++ b/.gitignore @@ -191,5 +191,12 @@ research_bench/profile_dbs_old **/author_data/* **/paper_data/* **/reference_proposal_data/* +**/paper_bench/ +**/paper_data_/ +**/reference_proposal_data_/ +**/oodbench/ +**/iclrbench/ +**/author_data_/ +**/manifest.json !**/.gitkeep diff --git a/configs/param.yaml b/configs/param.yaml index 94bef813..46f13243 100644 --- a/configs/param.yaml +++ b/configs/param.yaml @@ -9,5 +9,5 @@ temperature: 0.6 top_p: null write_proposal_strategy: default max_env_run_num: 1 -proposal_num: 2 +proposal_num: 1 use_rag: True diff --git a/research_bench/Profile.json b/research_bench/Profile.json new file mode 100644 index 00000000..61807710 --- /dev/null +++ b/research_bench/Profile.json @@ -0,0 +1,2611 @@ +{ + "6f34668a-8c53-4a5d-8317-66dd8b3ce24f": { + "pk": "6f34668a-8c53-4a5d-8317-66dd8b3ce24f", + "name": "Andrew Lizarraga", + "bio": "I am a researcher dedicated to advancing the fields of generative modeling and reinforcement learning, with a particular focus on planning and decision-making. My recent work has led to the development of the Latent Plan Transformer (LPT), a novel model that effectively connects trajectory generation with long-term returns through latent variable inference. This approach allows for nuanced credit assignments and improved decision-making from sub-optimal trajectories, demonstrating the potential of planning as inference.\n\nIn addition to LPT, I have explored the integration of world models with decision transformers, creating a framework that enhances contextual decision-making through anticipatory trajectory generation. This bidirectional enhancement loop not only accelerates learning but also improves sample efficiency and robustness across diverse benchmarks.\n\nMy research also extends to the realm of text-to-image diffusion models, where I have proposed a Bayesian approach to refine attention mechanisms. 
By designing custom priors, I aim to improve attribute-object alignment and enhance the interpretability of generative models. My work has consistently achieved state-of-the-art results, addressing long-standing challenges in the field and paving the way for more reliable and interpretable generative systems.\n\nThrough my research, I strive to bridge the gap between complex decision-making tasks and generative modeling, contributing to the development of more sophisticated and effective AI systems.", + "collaborators": [ + "Ying Nian Wu", + "Deqian Kong", + "E. H. Jiang", + "Zhi Zhang", + "Yasi Zhang", + "Dehong Xu", + "Minglu Zhao", + "Bo Pang", + "Jianwen Xie", + "Yuhao Huang", + "Sirui Xie", + "Dinghuai Zhang", + "Chenheng Xu", + "Siyan Zhao", + "Zhengjie Xu", + "Peiyu Yu", + "Yuer Tang", + "Shufan Li" + ], + "pub_titles": [ + "Latent Plan Transformer for Trajectory Abstraction: Planning as Latent Space Inference", + "DODT: Enhanced Online Decision Transformer Learning through Dreamer's Actor-Critic Trajectory Forecasting", + "Unlocking the Potential of Text-to-Image Diffusion with PAC-Bayesian Theory" + ], + "pub_abstracts": [ + "In tasks aiming for long-term returns, planning becomes essential. We study generative modeling for planning with datasets repurposed from offline reinforcement learning. Specifically, we identify temporal consistency in the absence of step-wise rewards as one key technical challenge. We introduce the Latent Plan Transformer (LPT), a novel model that leverages a latent variable to connect a Transformer-based trajectory generator and the final return. LPT can be learned with maximum likelihood estimation on trajectory-return pairs. In learning, posterior sampling of the latent variable naturally integrates sub-trajectories to form a consistent abstraction despite the finite context. At test time, the latent variable is inferred from an expected return before policy execution, realizing the idea of planning as inference. 
Our experiments demonstrate that LPT can discover improved decisions from sub-optimal trajectories, achieving competitive performance across several benchmarks, including Gym-Mujoco, Franka Kitchen, Maze2D, and Connect Four. It exhibits capabilities in nuanced credit assignments, trajectory stitching, and adaptation to environmental contingencies. These results validate that latent variable inference can be a strong alternative to step-wise reward prompting.", + "Advancements in reinforcement learning have led to the development of sophisticated models capable of learning complex decision-making tasks. However, efficiently integrating world models with decision transformers remains a challenge. In this paper, we introduce a novel approach that combines the Dreamer algorithm's ability to generate anticipatory trajectories with the adaptive learning strengths of the Online Decision Transformer. Our methodology enables parallel training where Dreamer-produced trajectories enhance the contextual decision-making of the transformer, creating a bidirectional enhancement loop. We empirically demonstrate the efficacy of our approach on a suite of challenging benchmarks, achieving notable improvements in sample efficiency and reward maximization over existing methods. Our results indicate that the proposed integrated framework not only accelerates learning but also showcases robustness in diverse and dynamic scenarios, marking a significant step forward in model-based reinforcement learning.", + "Text-to-image (T2I) diffusion models have revolutionized generative modeling by producing high-fidelity, diverse, and visually realistic images from textual prompts. Despite these advances, existing models struggle with complex prompts involving multiple objects and attributes, often misaligning modifiers with their corresponding nouns or neglecting certain elements. 
Recent attention-based methods have improved object inclusion and linguistic binding, but still face challenges such as attribute misbinding and a lack of robust generalization guarantees. Leveraging the PAC-Bayes framework, we propose a Bayesian approach that designs custom priors over attention distributions to enforce desirable properties, including divergence between objects, alignment between modifiers and their corresponding nouns, minimal attention to irrelevant tokens, and regularization for better generalization. Our approach treats the attention mechanism as an interpretable component, enabling fine-grained control and improved attribute-object alignment. We demonstrate the effectiveness of our method on standard benchmarks, achieving state-of-the-art results across multiple metrics. By integrating custom priors into the denoising process, our method enhances image quality and addresses long-standing challenges in T2I diffusion models, paving the way for more reliable and interpretable generative models." + ], + "domain": [ + "Reinforcement Learning", + "Generative Modeling", + "Text-to-Image", + "Machine Learning" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "5420a687-e30b-4b3a-b8a8-4e30555f0989": { + "pk": "5420a687-e30b-4b3a-b8a8-4e30555f0989", + "name": "Eric Hanchen Jiang", + "bio": "I am a researcher dedicated to advancing the field of reinforcement learning (RL), particularly in the context of lifelong learning and generative modeling. My recent work has focused on developing innovative algorithms that enhance the adaptability and efficiency of RL agents in dynamic environments. One of my key contributions is EPIC (Empirical PAC-Bayes that Improves Continuously), which leverages PAC-Bayes theory to create a shared policy distribution, enabling agents to rapidly adapt to new tasks while retaining valuable knowledge from previous experiences. 
\n\nIn addition, I have explored the integration of world models with decision transformers, proposing a novel approach that combines the strengths of the Dreamer algorithm with the adaptive learning capabilities of Online Decision Transformers. This work has demonstrated significant improvements in sample efficiency and reward maximization across challenging benchmarks.\n\nMy research also extends to generative modeling, where I have tackled the complexities of text-to-image diffusion models. By employing a Bayesian approach to design custom priors over attention distributions, I have enhanced attribute-object alignment and improved the overall quality of generated images.\n\nFurthermore, I introduced the Skill-Driven Skill Recombination Algorithm (SDSRA), which enhances the efficiency of achieving maximum entropy in RL tasks, outperforming traditional methods like Soft Actor-Critic. Through these contributions, I aim to push the boundaries of RL and generative modeling, making them more robust, interpretable, and applicable to real-world challenges.", + "collaborators": [ + "Zhi Zhang", + "Yasi Zhang", + "Andrew Lizarraga", + "Ying Nian Wu", + "Chris Chow", + "Yanchao Sun", + "Haochen Zhang", + "Han Liu", + "Furong Huang", + "Yuchen Cui", + "Oscar Hernan Madrid Padilla", + "Dinghuai Zhang", + "Chenheng Xu", + "Siyan Zhao", + "Zhengjie Xu", + "Peiyu Yu", + "Yuer Tang", + "Deqian Kong", + "Shufan Li" + ], + "pub_titles": [ + "Statistical Guarantees for Lifelong Reinforcement Learning using PAC-Bayesian Theory", + "DODT: Enhanced Online Decision Transformer Learning through Dreamer's Actor-Critic Trajectory Forecasting", + "Unlocking the Potential of Text-to-Image Diffusion with PAC-Bayesian Theory", + "SDSRA: A Skill-Driven Skill-Recombination Algorithm for Efficient Policy Learning" + ], + "pub_abstracts": [ + "Lifelong reinforcement learning (RL) has been developed as a paradigm for extending single-task RL to more realistic, dynamic settings. 
In lifelong RL, the\"life\"of an RL agent is modeled as a stream of tasks drawn from a task distribution. We propose EPIC (\\underline{E}mpirical \\underline{P}AC-Bayes that \\underline{I}mproves \\underline{C}ontinuously), a novel algorithm designed for lifelong RL using PAC-Bayes theory. EPIC learns a shared policy distribution, referred to as the \\textit{world policy}, which enables rapid adaptation to new tasks while retaining valuable knowledge from previous experiences. Our theoretical analysis establishes a relationship between the algorithm's generalization performance and the number of prior tasks preserved in memory. We also derive the sample complexity of EPIC in terms of RL regret. Extensive experiments on a variety of environments demonstrate that EPIC significantly outperforms existing methods in lifelong RL, offering both theoretical guarantees and practical efficacy through the use of the world policy.", + "Advancements in reinforcement learning have led to the development of sophisticated models capable of learning complex decision-making tasks. However, efficiently integrating world models with decision transformers remains a challenge. In this paper, we introduce a novel approach that combines the Dreamer algorithm's ability to generate anticipatory trajectories with the adaptive learning strengths of the Online Decision Transformer. Our methodology enables parallel training where Dreamer-produced trajectories enhance the contextual decision-making of the transformer, creating a bidirectional enhancement loop. We empirically demonstrate the efficacy of our approach on a suite of challenging benchmarks, achieving notable improvements in sample efficiency and reward maximization over existing methods. 
Our results indicate that the proposed integrated framework not only accelerates learning but also showcases robustness in diverse and dynamic scenarios, marking a significant step forward in model-based reinforcement learning.", + "Text-to-image (T2I) diffusion models have revolutionized generative modeling by producing high-fidelity, diverse, and visually realistic images from textual prompts. Despite these advances, existing models struggle with complex prompts involving multiple objects and attributes, often misaligning modifiers with their corresponding nouns or neglecting certain elements. Recent attention-based methods have improved object inclusion and linguistic binding, but still face challenges such as attribute misbinding and a lack of robust generalization guarantees. Leveraging the PAC-Bayes framework, we propose a Bayesian approach that designs custom priors over attention distributions to enforce desirable properties, including divergence between objects, alignment between modifiers and their corresponding nouns, minimal attention to irrelevant tokens, and regularization for better generalization. Our approach treats the attention mechanism as an interpretable component, enabling fine-grained control and improved attribute-object alignment. We demonstrate the effectiveness of our method on standard benchmarks, achieving state-of-the-art results across multiple metrics. By integrating custom priors into the denoising process, our method enhances image quality and addresses long-standing challenges in T2I diffusion models, paving the way for more reliable and interpretable generative models.", + "In this paper, we introduce a novel algorithm - the Skill-Driven Skill Recombination Algorithm (SDSRA) - an innovative framework that significantly enhances the efficiency of achieving maximum entropy in reinforcement learning tasks. 
We find that SDSRA achieves faster convergence compared to the traditional Soft Actor-Critic (SAC) algorithm and produces improved policies. By integrating skill-based strategies within the robust Actor-Critic framework, SDSRA demonstrates remarkable adaptability and performance across a wide array of complex and diverse benchmarks." + ], + "domain": [ + "Reinforcement Learning", + "Generative Modeling", + "PAC-Bayes", + "Decision Making" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "01dd5b8b-f5b7-4417-8c02-9c55a3827848": { + "pk": "01dd5b8b-f5b7-4417-8c02-9c55a3827848", + "name": "Yun Qi Li", + "bio": "As a researcher in the field of astrophysics, I am deeply engaged in leveraging machine learning techniques to address fundamental questions about the universe, particularly in the context of large-scale extragalactic surveys. My recent work focuses on improving photometric redshift (photo-z) estimation using Bayesian convolutional neural networks (BCNNs), which allow for accurate predictions and well-constrained uncertainties. By utilizing high-quality imaging data from the Hyper Suprime-Cam survey, I have demonstrated significant improvements in photo-z accuracy, particularly when using galaxy images over traditional photometry.\n\nIn addition to photo-z estimation, I am exploring the potential of generative models to advance our understanding of galaxy evolution. My research includes developing conditional denoising diffusion probabilistic models and conditional variational autoencoders to generate realistic galaxy images based on their redshifts. 
This work not only tests the capabilities of these models but also incorporates physics-motivated metrics to evaluate their performance, revealing insights that traditional human evaluations may overlook.\n\nI am also committed to fostering collaboration and innovation in the field by creating and sharing comprehensive datasets, such as GalaxiesML, which includes over 286,000 galaxy images and associated properties. This dataset is designed to support machine learning applications in astrophysics and is crucial for the next generation of surveys like Euclid and LSST. Through my research, I aim to bridge the gap between machine learning and astrophysics, unlocking new discoveries and enhancing our understanding of the cosmos.", + "collaborators": [ + "Evan Jones", + "Tuan Do", + "Kevin Alfaro", + "Bernie Boscoe", + "J. Singal", + "Zooey Nguyen" + ], + "pub_titles": [ + "Redshift Prediction with Images for Cosmology Using a Bayesian Convolutional Neural Network with Conformal Predictions", + "Using Galaxy Evolution as Source of Physics-Based Ground Truth for Generative Models", + "GalaxiesML: a dataset of galaxy images, photometry, redshifts, and structural parameters for machine learning" + ], + "pub_abstracts": [ + " In the emerging era of big data astrophysics, large-scale extragalactic surveys will soon provide high-quality imaging for billions of celestial objects to answer major questions in astrophysics such as the nature of dark matter and dark energy. Precision cosmology with surveys requires accurate photometric redshift (photo-z) estimation with well-constrained uncertainties as inputs for weak lensing models to measure cosmological parameters. Machine learning methods have shown promise in optimizing the information gained from galaxy images in photo-z estimation; however, many of these methods are limited in their ability to estimate accurate uncertainties. 
In this work, we present one of the first applications of Bayesian convolutional neural networks (BCNNs) for photo-z estimation and uncertainties. In addition, we use conformal mapping to calibrate the photo-z uncertainties to achieve good statistical coverage. We use the public GalaxiesML data set of \u223c300k galaxies from the Hyper Suprime-Cam survey containing five-band photometric images and known spectroscopic redshifts from 0 < z < 4. We find that the performance is much improved when using images compared to photometry, with the BCNN achieving 0.098 rms error, a standard outlier rate of 3.9%, a 3\u03c3 outlier rate of 4.5%, and a bias of 0.0007. The performance drops significantly beyond z > 1.5 due to the relative lack of training data beyond those redshifts. This investigation demonstrates the power of using images directly and we advocate that future photo-z analysis of large-scale surveys include galaxy images.", + "Generative models producing images have enormous potential to advance discoveries across scientific fields and require metrics capable of quantifying the high dimensional output. We propose that astrophysics data, such as galaxy images, can test generative models with additional physics-motivated ground truths in addition to human judgment. For example, galaxies in the Universe form and change over billions of years, following physical laws and relationships that are both easy to characterize and difficult to encode in generative models. We build a conditional denoising diffusion probabilistic model (DDPM) and a conditional variational autoencoder (CVAE) and test their ability to generate realistic galaxies conditioned on their redshifts (galaxy ages). This is one of the first studies to probe these generative models using physically motivated metrics. 
We find that both models produce comparable realistic galaxies based on human evaluation, but our physics-based metrics are better able to discern the strengths and weaknesses of the generative models. Overall, the DDPM model performs better than the CVAE on the majority of the physics-based metrics. Ultimately, if we can show that generative models can learn the physics of galaxy evolution, they have the potential to unlock new astrophysical discoveries.", + "We present a dataset built for machine learning applications consisting of galaxy photometry, images, spectroscopic redshifts, and structural properties. This dataset comprises 286,401 galaxy images and photometry from the Hyper-Suprime-Cam Survey PDR2 in five imaging filters ($g,r,i,z,y$) with spectroscopically confirmed redshifts as ground truth. Such a dataset is important for machine learning applications because it is uniform, consistent, and has minimal outliers but still contains a realistic range of signal-to-noise ratios. We make this dataset public to help spur development of machine learning methods for the next generation of surveys such as Euclid and LSST. The aim of GalaxiesML is to provide a robust dataset that can be used not only for astrophysics but also for machine learning, where image properties cannot be validated by the human eye and are instead governed by physical laws. We describe the challenges associated with putting together a dataset from publicly available archives, including outlier rejection, duplication, establishing ground truths, and sample selection. This is one of the largest public machine learning-ready training sets of its kind with redshifts ranging from 0.01 to 4. The redshift distribution of this sample peaks at redshift of 1.5 and falls off rapidly beyond redshift 2.5. 
We also include an example application of this dataset for redshift estimation, demonstrating that using images for redshift estimation produces more accurate results compared to using photometry alone. For example, the bias in redshift estimate is a factor of 10 lower when using images between redshift of 0.1 to 1.25 compared to photometry alone. Results from dataset such as this will help inform us on how to best make use of data from the next generation of galaxy surveys." + ], + "domain": [ + "Astrophysics", + "Machine Learning", + "Generative Models", + "Bayesian Inference" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "679004cb-e0dd-42a7-bca1-313c12c74bc4": { + "pk": "679004cb-e0dd-42a7-bca1-313c12c74bc4", + "name": "Bernie Boscoe", + "bio": "I am a researcher at the intersection of machine learning and astrophysics, dedicated to harnessing advanced computational techniques to tackle fundamental questions in cosmology. My recent work focuses on improving photometric redshift (photo-z) estimation, a critical task for interpreting data from large-scale extragalactic surveys. I have pioneered the application of Bayesian convolutional neural networks (BCNNs) for photo-z estimation, achieving significant improvements in accuracy and uncertainty quantification. \n\nI am particularly interested in the potential of generative models to advance our understanding of galaxy evolution. By developing conditional denoising diffusion probabilistic models and conditional variational autoencoders, I have explored how these models can generate realistic galaxy images while adhering to physical laws. 
My research emphasizes the importance of combining different ground truths, utilizing transfer learning to enhance model generalizability, and creating robust datasets like GalaxiesML to support machine learning applications in astrophysics.\n\nThrough my work, I aim to bridge the gap between machine learning and astrophysical data analysis, ensuring that our methodologies not only meet the scientific requirements of upcoming surveys like LSST but also contribute to the broader understanding of dark matter and dark energy. I am committed to fostering reproducible science practices and developing datasets that empower future research in this exciting field.", + "collaborators": [ + "Tuan Do", + "Evan Jones", + "Kevin Alfaro", + "J. Singal", + "Yunqi Li", + "Zooey Nguyen", + "E. Jones", + "Yujie Wan", + "Jonathan Soriano", + "Srinath Saikrishnan", + "Vikram Seenivasan", + "T. Do", + "Yunqiang Li", + "Christy Ma" + ], + "pub_titles": [ + "Redshift Prediction with Images for Cosmology Using a Bayesian Convolutional Neural Network with Conformal Predictions", + "Using Galaxy Evolution as Source of Physics-Based Ground Truth for Generative Models", + "Using different sources of ground truths and transfer learning to improve the generalization of photometric redshift estimation", + "GalaxiesML: a dataset of galaxy images, photometry, redshifts, and structural parameters for machine learning", + "Improving Photometric Redshift Estimation for Cosmology with LSST Using Bayesian Neural Networks", + "Photometric Redshifts for Cosmology: Improving Accuracy and Uncertainty Estimates Using Bayesian Neural Networks", + "Elements of effective machine learning datasets in astronomy" + ], + "pub_abstracts": [ + " In the emerging era of big data astrophysics, large-scale extragalactic surveys will soon provide high-quality imaging for billions of celestial objects to answer major questions in astrophysics such as the nature of dark matter and dark energy. 
Precision cosmology with surveys requires accurate photometric redshift (photo-z) estimation with well-constrained uncertainties as inputs for weak lensing models to measure cosmological parameters. Machine learning methods have shown promise in optimizing the information gained from galaxy images in photo-z estimation; however, many of these methods are limited in their ability to estimate accurate uncertainties. In this work, we present one of the first applications of Bayesian convolutional neural networks (BCNNs) for photo-z estimation and uncertainties. In addition, we use conformal mapping to calibrate the photo-z uncertainties to achieve good statistical coverage. We use the public GalaxiesML data set of \u223c300k galaxies from the Hyper Suprime-Cam survey containing five-band photometric images and known spectroscopic redshifts from 0 < z < 4. We find that the performance is much improved when using images compared to photometry, with the BCNN achieving 0.098 rms error, a standard outlier rate of 3.9%, a 3\u03c3 outlier rate of 4.5%, and a bias of 0.0007. The performance drops significantly beyond z > 1.5 due to the relative lack of training data beyond those redshifts. This investigation demonstrates the power of using images directly and we advocate that future photo-z analysis of large-scale surveys include galaxy images.", + "Generative models producing images have enormous potential to advance discoveries across scientific fields and require metrics capable of quantifying the high dimensional output. We propose that astrophysics data, such as galaxy images, can test generative models with additional physics-motivated ground truths in addition to human judgment. For example, galaxies in the Universe form and change over billions of years, following physical laws and relationships that are both easy to characterize and difficult to encode in generative models. 
We build a conditional denoising diffusion probabilistic model (DDPM) and a conditional variational autoencoder (CVAE) and test their ability to generate realistic galaxies conditioned on their redshifts (galaxy ages). This is one of the first studies to probe these generative models using physically motivated metrics. We find that both models produce comparable realistic galaxies based on human evaluation, but our physics-based metrics are better able to discern the strengths and weaknesses of the generative models. Overall, the DDPM model performs better than the CVAE on the majority of the physics-based metrics. Ultimately, if we can show that generative models can learn the physics of galaxy evolution, they have the potential to unlock new astrophysical discoveries.", + "In this work, we explore methods to improve galaxy redshift predictions by combining different ground truths. Traditional machine learning models rely on training sets with known spectroscopic redshifts, which are precise but only represent a limited sample of galaxies. To make redshift models more generalizable to the broader galaxy population, we investigate transfer learning and directly combining ground truth redshifts derived from photometry and spectroscopy. We use the COSMOS2020 survey to create a dataset, TransferZ, which includes photometric redshift estimates derived from up to 35 imaging filters using template fitting. This dataset spans a wider range of galaxy types and colors compared to spectroscopic samples, though its redshift estimates are less accurate. We first train a base neural network on TransferZ and then refine it using transfer learning on a dataset of galaxies with more precise spectroscopic redshifts (GalaxiesML). In addition, we train a neural network on a combined dataset of TransferZ and GalaxiesML. 
Both methods reduce bias by $\\sim$ 5x, RMS error by $\\sim$ 1.5x, and catastrophic outlier rates by 1.3x on GalaxiesML, compared to a baseline trained only on TransferZ. However, we also find a reduction in performance for RMS and bias when evaluated on TransferZ data. Overall, our results demonstrate these approaches can meet cosmological requirements.", + "We present a dataset built for machine learning applications consisting of galaxy photometry, images, spectroscopic redshifts, and structural properties. This dataset comprises 286,401 galaxy images and photometry from the Hyper-Suprime-Cam Survey PDR2 in five imaging filters ($g,r,i,z,y$) with spectroscopically confirmed redshifts as ground truth. Such a dataset is important for machine learning applications because it is uniform, consistent, and has minimal outliers but still contains a realistic range of signal-to-noise ratios. We make this dataset public to help spur development of machine learning methods for the next generation of surveys such as Euclid and LSST. The aim of GalaxiesML is to provide a robust dataset that can be used not only for astrophysics but also for machine learning, where image properties cannot be validated by the human eye and are instead governed by physical laws. We describe the challenges associated with putting together a dataset from publicly available archives, including outlier rejection, duplication, establishing ground truths, and sample selection. This is one of the largest public machine learning-ready training sets of its kind with redshifts ranging from 0.01 to 4. The redshift distribution of this sample peaks at redshift of 1.5 and falls off rapidly beyond redshift 2.5. We also include an example application of this dataset for redshift estimation, demonstrating that using images for redshift estimation produces more accurate results compared to using photometry alone. 
For example, the bias in redshift estimate is a factor of 10 lower when using images between redshift of 0.1 to 1.25 compared to photometry alone. Results from dataset such as this will help inform us on how to best make use of data from the next generation of galaxy surveys.", + "We present results exploring the role that probabilistic deep learning models can play in cosmology from large-scale astronomical surveys through photometric redshift (photo-z) estimation. Photo-z uncertainty estimates are critical for the science goals of upcoming large-scale surveys such as the Legacy Survey of Space and Time (LSST); however, common machine learning methods typically provide only point estimates and lack uncertainties on predictions. We turn to Bayesian neural networks (BNNs) as a promising way to provide accurate predictions of redshift values with uncertainty estimates. We have compiled a galaxy data set from the Hyper Suprime-Cam Survey with grizy photometry, which is designed to be a smaller-scale version of large surveys like LSST. We use this data set to investigate the performance of a neural network and a probabilistic BNN for photo-z estimation and evaluate their performance with respect to LSST photo-z science requirements. We also examine the utility of photo-z uncertainties as a means to reduce catastrophic outlier estimates. The BNN outputs the estimate in the form of a Gaussian probability distribution. We use the mean and standard deviation as the redshift estimate and uncertainty. We find that the BNN can produce accurate uncertainties. Using a coverage test, we find excellent agreement with expectation\u201467.2% of galaxies between 0 < 2.5 have 1\u03c3 uncertainties that cover the spectroscopic value. We also include a comparison to alternative machine learning models using the same data. 
We find the BNN meets two out of three of the LSST photo-z science requirements in the range 0 < z < 2.5.", + "We present results exploring the role that probabilistic deep learning models can play in cosmology from large scale astronomical surveys through estimating the distances to galaxies (redshifts) from photometry. Due to the massive scale of data coming from these new and upcoming sky surveys, machine learning techniques using galaxy photometry are increasingly adopted to predict galactic redshifts which are important for inferring cosmological parameters such as the nature of dark energy. Associated uncertainty estimates are also critical measurements, however, common machine learning methods typically provide only point estimates and lack uncertainty information as outputs. We turn to Bayesian neural networks (BNNs) as a promising way to provide accurate predictions of redshift values. We have compiled a new galaxy training dataset from the Hyper Suprime-Cam Survey, designed to mimic large surveys, but over a smaller portion of the sky. We evaluate the performance and accuracy of photometric redshift (photo-z) predictions from photometry using machine learning, astronomical and probabilistic metrics. We find that while the Bayesian neural network did not perform as well as non-Bayesian neural networks if evaluated solely by point estimate photo-z values, BNNs can provide uncertainty estimates that are necessary for cosmology", + "In this work, we identify elements of effective machine learning datasets in astronomy and present suggestions for their design and creation. Machine learning has become an increasingly important tool for analyzing and understanding the large-scale flood of data in astronomy. To take advantage of these tools, datasets are required for training and testing. However, building machine learning datasets for astronomy can be challenging. 
Astronomical data is collected from instruments built to explore science questions in a traditional fashion rather than to conduct machine learning. Thus, it is often the case that raw data, or even downstream processed data is not in a form amenable to machine learning. We explore the construction of machine learning datasets and we ask: what elements define effective machine learning datasets? We define effective machine learning datasets in astronomy to be formed with well-defined data points, structure, and metadata. We discuss why these elements are important for astronomical applications and ways to put them in practice. We posit that these qualities not only make the data suitable for machine learning, they also help to foster usable, reusable, and replicable science practices." + ], + "domain": [ + "Machine Learning", + "Astrophysics", + "Bayesian Neural Networks", + "Generative Models" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "46e85f58-2d76-4c46-b59c-51fd1b059560": { + "pk": "46e85f58-2d76-4c46-b59c-51fd1b059560", + "name": "Tuan Do", + "bio": "I am a researcher at the intersection of machine learning and astrophysics, dedicated to enhancing our understanding of the universe through advanced data analysis techniques. My recent work focuses on the critical task of photometric redshift (photo-z) estimation, which is essential for precision cosmology and understanding phenomena like dark matter and dark energy. I have pioneered the application of Bayesian convolutional neural networks (BCNNs) for photo-z estimation, achieving significant improvements in accuracy and uncertainty quantification.\n\nIn my research, I explore innovative methods to leverage large-scale datasets, such as the GalaxiesML dataset, which I developed to facilitate machine learning applications in astrophysics. 
This dataset, comprising over 286,000 galaxy images and photometric data, serves as a robust foundation for training models that can predict redshifts more effectively. I am particularly interested in generative models and their potential to simulate realistic galaxy images, using physics-based metrics to evaluate their performance.\n\nMy work also delves into transfer learning techniques to enhance redshift predictions by combining data from different sources, demonstrating that integrating diverse datasets can significantly reduce bias and improve accuracy. I am passionate about making my datasets publicly available to spur further research and development in the field, particularly as we prepare for the next generation of astronomical surveys. Ultimately, my goal is to harness the power of machine learning to unlock new insights into the cosmos and contribute to our understanding of fundamental astrophysical questions.", + "collaborators": [ + "Bernie Boscoe", + "Evan Jones", + "Yunqi Li", + "Kevin Alfaro", + "J. 
Singal", + "Zooey Nguyen", + "Jonathan Soriano", + "Srinath Saikrishnan", + "Vikram Seenivasan", + "Yujie Wan" + ], + "pub_titles": [ + "Redshift Prediction with Images for Cosmology Using a Bayesian Convolutional Neural Network with Conformal Predictions", + "Using Galaxy Evolution as Source of Physics-Based Ground Truth for Generative Models", + "Using different sources of ground truths and transfer learning to improve the generalization of photometric redshift estimation", + "GalaxiesML: a dataset of galaxy images, photometry, redshifts, and structural parameters for machine learning", + "Photometric Redshifts for Cosmology: Improving Accuracy and Uncertainty Estimates Using Bayesian Neural Networks" + ], + "pub_abstracts": [ + " In the emerging era of big data astrophysics, large-scale extragalactic surveys will soon provide high-quality imaging for billions of celestial objects to answer major questions in astrophysics such as the nature of dark matter and dark energy. Precision cosmology with surveys requires accurate photometric redshift (photo-z) estimation with well-constrained uncertainties as inputs for weak lensing models to measure cosmological parameters. Machine learning methods have shown promise in optimizing the information gained from galaxy images in photo-z estimation; however, many of these methods are limited in their ability to estimate accurate uncertainties. In this work, we present one of the first applications of Bayesian convolutional neural networks (BCNNs) for photo-z estimation and uncertainties. In addition, we use conformal mapping to calibrate the photo-z uncertainties to achieve good statistical coverage. We use the public GalaxiesML data set of \u223c300k galaxies from the Hyper Suprime-Cam survey containing five-band photometric images and known spectroscopic redshifts from 0 < z < 4. 
We find that the performance is much improved when using images compared to photometry, with the BCNN achieving 0.098 rms error, a standard outlier rate of 3.9%, a 3\u03c3 outlier rate of 4.5%, and a bias of 0.0007. The performance drops significantly beyond z > 1.5 due to the relative lack of training data beyond those redshifts. This investigation demonstrates the power of using images directly and we advocate that future photo-z analysis of large-scale surveys include galaxy images.", + "Generative models producing images have enormous potential to advance discoveries across scientific fields and require metrics capable of quantifying the high dimensional output. We propose that astrophysics data, such as galaxy images, can test generative models with additional physics-motivated ground truths in addition to human judgment. For example, galaxies in the Universe form and change over billions of years, following physical laws and relationships that are both easy to characterize and difficult to encode in generative models. We build a conditional denoising diffusion probabilistic model (DDPM) and a conditional variational autoencoder (CVAE) and test their ability to generate realistic galaxies conditioned on their redshifts (galaxy ages). This is one of the first studies to probe these generative models using physically motivated metrics. We find that both models produce comparable realistic galaxies based on human evaluation, but our physics-based metrics are better able to discern the strengths and weaknesses of the generative models. Overall, the DDPM model performs better than the CVAE on the majority of the physics-based metrics. Ultimately, if we can show that generative models can learn the physics of galaxy evolution, they have the potential to unlock new astrophysical discoveries.", + "In this work, we explore methods to improve galaxy redshift predictions by combining different ground truths. 
Traditional machine learning models rely on training sets with known spectroscopic redshifts, which are precise but only represent a limited sample of galaxies. To make redshift models more generalizable to the broader galaxy population, we investigate transfer learning and directly combining ground truth redshifts derived from photometry and spectroscopy. We use the COSMOS2020 survey to create a dataset, TransferZ, which includes photometric redshift estimates derived from up to 35 imaging filters using template fitting. This dataset spans a wider range of galaxy types and colors compared to spectroscopic samples, though its redshift estimates are less accurate. We first train a base neural network on TransferZ and then refine it using transfer learning on a dataset of galaxies with more precise spectroscopic redshifts (GalaxiesML). In addition, we train a neural network on a combined dataset of TransferZ and GalaxiesML. Both methods reduce bias by $\\sim$ 5x, RMS error by $\\sim$ 1.5x, and catastrophic outlier rates by 1.3x on GalaxiesML, compared to a baseline trained only on TransferZ. However, we also find a reduction in performance for RMS and bias when evaluated on TransferZ data. Overall, our results demonstrate these approaches can meet cosmological requirements.", + "We present a dataset built for machine learning applications consisting of galaxy photometry, images, spectroscopic redshifts, and structural properties. This dataset comprises 286,401 galaxy images and photometry from the Hyper-Suprime-Cam Survey PDR2 in five imaging filters ($g,r,i,z,y$) with spectroscopically confirmed redshifts as ground truth. Such a dataset is important for machine learning applications because it is uniform, consistent, and has minimal outliers but still contains a realistic range of signal-to-noise ratios. We make this dataset public to help spur development of machine learning methods for the next generation of surveys such as Euclid and LSST. 
The aim of GalaxiesML is to provide a robust dataset that can be used not only for astrophysics but also for machine learning, where image properties cannot be validated by the human eye and are instead governed by physical laws. We describe the challenges associated with putting together a dataset from publicly available archives, including outlier rejection, duplication, establishing ground truths, and sample selection. This is one of the largest public machine learning-ready training sets of its kind with redshifts ranging from 0.01 to 4. The redshift distribution of this sample peaks at redshift of 1.5 and falls off rapidly beyond redshift 2.5. We also include an example application of this dataset for redshift estimation, demonstrating that using images for redshift estimation produces more accurate results compared to using photometry alone. For example, the bias in redshift estimate is a factor of 10 lower when using images between redshift of 0.1 to 1.25 compared to photometry alone. Results from dataset such as this will help inform us on how to best make use of data from the next generation of galaxy surveys.", + "We present results exploring the role that probabilistic deep learning models can play in cosmology from large scale astronomical surveys through estimating the distances to galaxies (redshifts) from photometry. Due to the massive scale of data coming from these new and upcoming sky surveys, machine learning techniques using galaxy photometry are increasingly adopted to predict galactic redshifts which are important for inferring cosmological parameters such as the nature of dark energy. Associated uncertainty estimates are also critical measurements, however, common machine learning methods typically provide only point estimates and lack uncertainty information as outputs. We turn to Bayesian neural networks (BNNs) as a promising way to provide accurate predictions of redshift values. 
We have compiled a new galaxy training dataset from the Hyper Suprime-Cam Survey, designed to mimic large surveys, but over a smaller portion of the sky. We evaluate the performance and accuracy of photometric redshift (photo-z) predictions from photometry using machine learning, astronomical and probabilistic metrics. We find that while the Bayesian neural network did not perform as well as non-Bayesian neural networks if evaluated solely by point estimate photo-z values, BNNs can provide uncertainty estimates that are necessary for cosmology" + ], + "domain": [ + "Astrophysics", + "Machine Learning", + "Bayesian Neural Networks", + "Generative Models" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "bcc19da8-9033-4a13-8467-96e87ed513ad": { + "pk": "bcc19da8-9033-4a13-8467-96e87ed513ad", + "name": "Paxson Swierc", + "bio": "I am a researcher dedicated to the intersection of deep learning and astrophysics, particularly in the analysis of galaxy-scale gravitational lensing. My recent work addresses a critical challenge in this field: the scarcity of real lensing data for training deep learning algorithms. To tackle this, I have explored the potential of domain adaptation techniques to bridge the gap between simulated and real datasets. \n\nIn my latest study, I applied Domain Adversarial Neural Networks (DANN) and Maximum Mean Discrepancy (MMD) to estimate the Einstein radius in simulated gravitational lensing images. By training on a source domain of simulated lenses and testing on a target domain that mimics the noise conditions of the Dark Energy Survey, I demonstrated significant improvements in model performance. 
This work marks a pioneering application of domain adaptation for regression tasks in strong lensing imaging analysis, showcasing the promise of these techniques for analyzing future survey data.\n\nI am passionate about leveraging advanced machine learning methods to enhance our understanding of the universe, and I am excited about the potential of my research to contribute to the discovery of new astronomical phenomena.", + "collaborators": [ + "Megan Zhao", + "A. 'Ciprijanovi'c", + "Brian Nord" + ], + "pub_titles": [ + "Domain Adaptation for Measurements of Strong Gravitational Lenses" + ], + "pub_abstracts": [ + "Upcoming surveys are predicted to discover galaxy-scale strong lenses on the order of $10^5$, making deep learning methods necessary in lensing data analysis. Currently, there is insufficient real lensing data to train deep learning algorithms, but the alternative of training only on simulated data results in poor performance on real data. Domain Adaptation may be able to bridge the gap between simulated and real datasets. We utilize domain adaptation for the estimation of Einstein radius ($\\Theta_E$) in simulated galaxy-scale gravitational lensing images with different levels of observational realism. We evaluate two domain adaptation techniques - Domain Adversarial Neural Networks (DANN) and Maximum Mean Discrepancy (MMD). We train on a source domain of simulated lenses and apply it to a target domain of lenses simulated to emulate noise conditions in the Dark Energy Survey (DES). We show that both domain adaptation techniques can significantly improve the model performance on the more complex target domain dataset. This work is the first application of domain adaptation for a regression task in strong lensing imaging analysis. Our results show the potential of using domain adaptation to perform analysis of future survey data with a deep neural network trained on simulated data." 
+ ], + "domain": [ + "Deep Learning", + "Domain Adaptation", + "Gravitational Lensing", + "Computer Vision" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "9921c57b-f223-4d36-a98f-ed54b287d496": { + "pk": "9921c57b-f223-4d36-a98f-ed54b287d496", + "name": "Marcos Tamargo-Arizmendi", + "bio": "I am an astrophysicist specializing in strong gravitational lensing and the study of quasar host galaxies. My recent work has focused on the analysis of cluster-scale strong gravitational lenses, particularly through the COOL-LAMPS collaboration. I have developed parametric measurements of the Einstein-radius-enclosed total mass for 177 strong gravitational lenses, revealing significant correlations between the enclosed total mass, luminosity, and stellar mass. These findings provide a framework for validating strong lensing candidates in future imaging surveys, such as the Rubin/Legacy Survey of Space and Time (LSST).\n\nAdditionally, I have explored the intriguing realm of wide-separation lensed quasars (WSLQs), where I derived constraints on the properties of their host galaxies. My research indicates a mixture of star-forming and quiescent galaxies among these hosts, and I have investigated the co-evolution of active galactic nuclei (AGNs) and their host galaxies, finding minimal evolution in the black hole mass-stellar mass relation.\n\nOne of my notable discoveries is COOL J0335\u22121927, the highest redshift wide-separation lensed quasar known to date. This work involved constructing a parametric strong gravitational lens model and predicting time delays between the quasar images, which will ultimately allow for further studies of AGN variability and time delay measurements. 
My research aims to deepen our understanding of the interplay between massive galaxies and their central black holes, contributing to the broader field of cosmology and galaxy formation.", + "collaborators": [ + "Simon Mork", + "M. Gladders", + "K. Sharon", + "Aidan P. Cloonan", + "H. Dahle", + "Grace Wagner", + "Yunchong Zhang", + "K. Napier", + "Riley Rosener", + "Jamar L. Sullivan", + "Isaiah Escapa", + "Josh Garza", + "Natalie Malagon", + "Kunwanhui Niu", + "Raul Teixeira", + "Kabelo Tsiane", + "Megan Zhao", + "G. Khullar", + "N. Chicoine", + "Diego Garza", + "Rowen Glusman", + "K. Gozman", + "Gabriela Horwath", + "Benjamin C. Levine", + "Olina Liang", + "Michael N. Martinez", + "A. Masegian", + "Owen S. Matthews Acuna", + "Yue Pan", + "Isaac Sierra", + "Ezra O. Sukay", + "K. Tavangar", + "G. Mahler", + "Andrew Kisare", + "Marie Tagliavia", + "Daniel Mahronic", + "V. Manwadkar", + "Kaiya Merz", + "Jorge A. Sanchez", + "Daniel J. Kavin Stein", + "Ruoyang Tu", + "E. Zaborowski", + "M. Bayliss", + "Andi Kisare", + "M. Riley Owens", + "J. Rigby", + "Antony Stark", + "Erik Zaborowski" + ], + "pub_titles": [ + "COOL-LAMPS. VII. Quantifying Strong-lens Scaling Relations with 177 Cluster-scale Gravitational Lenses in DECaLS", + "COOL-LAMPS VIII: Known wide-separation lensed quasars and their host galaxies reveal a lack of evolution in $M_{\\rm{BH}}/M_\\star$ since $z\\sim 3$", + "COOL-LAMPS. Discovery of COOL J0335\u22121927, a Gravitationally Lensed Quasar at z = 3.27 with an Image Separation of 23.\u20333", + "COOL-LAMPS. V. Discovery of COOL J0335 \u2212 1927, a Gravitationally Lensed Quasar at z =3.27 with an Image Separation of 23 . 
\u2032\u2032 3" + ], + "pub_abstracts": [ + "We compute parametric measurements of the Einstein-radius-enclosed total mass for 177 cluster-scale strong gravitational lenses identified by the ChicagO Optically-selected Lenses Located At the Margins of Public Surveys (COOL-LAMPS) collaboration with lens redshifts ranging from $0.2 \\lessapprox z \\lessapprox 1.0$ using only two measured parameters in each lensing system: the Einstein radius, and the brightest-cluster-galaxy (BCG) redshift. We then constrain the Einstein-radius-enclosed luminosity and stellar mass by fitting parametric spectral energy distributions (SEDs) with aperture photometry from the Dark Energy Camera Legacy Survey (DECaLS) in the $g$, $r$, and $z$-band Dark Energy Camera (DECam) filters. We find that the BCG redshift, enclosed total mass, and enclosed luminosity are strongly correlated and well described by a planar relationship in 3D space. We also find that the enclosed total mass and stellar mass are correlated with a logarithmic slope of $0.443\\pm0.035$, and the enclosed total mass and stellar-to-total mass fraction are correlated with a logarithmic slope of $-0.563\\pm0.035$. The correlations described here can be used to validate strong lensing candidates in upcoming imaging surveys -- such as Rubin/Legacy Survey of Space and Time (LSST) -- in which an algorithmic treatment of lensing systems will be needed due to the sheer volume of data these surveys will produce.", + "Wide-separation lensed quasars (WSLQs) are a rare class of strongly lensed quasars, magnified by foreground massive galaxy clusters, with typically large magnifications of the multiple quasar images. They are a relatively unexplored opportunity for detailed study of quasar host galaxies. The current small sample of known WSLQs has a median redshift of $z\\approx 2.1$, larger than most other samples of quasar host galaxies studied to date. 
Here, we derive precise constraints on the properties of six WSLQs and their host galaxies, using parametric surface brightness fitting, measurements of quasar emission lines, and stellar population synthesis of host galaxies in six WSLQ systems. Our results, with significant uncertainty, indicate that these six hosts are a mixture of star-forming and quiescent galaxies. To probe for co-evolution between AGNs and host galaxies, we model the offset from the `local' ($z=0$) $M_{\\rm{BH}}\\unicode{x2013}M_\\star$ relation as a simple power-law in redshift. Accounting for selection effects, a WSLQ-based model for evolution in the $M_{\\rm{BH}}\\unicode{x2013}M_\\star$ relation has a power-law index of $\\gamma_M=-0.42\\pm0.31$, consistent with no evolution. Compared to several literature samples, which mostly probe unlensed quasars at $z<2$, the WSLQ sample shows less evolution from the local relation, at $\\sim 4\\sigma$. We find that selection affects and choices of $M_{\\rm{BH}}$ calibration are the most important systematics in these comparisons. Given that we resolve host galaxy flux confidently even from the ground in some instances, our work demonstrates that WSLQs and highly magnified AGNs are exceptional systems for future AGN$\\unicode{x2013}$host co-evolution studies.", + "We report the discovery of COOL J0335\u22121927, a quasar at z = 3.27 lensed into three images with a maximum separation of 23.\u20333 by a galaxy cluster at z = 0.4178. To date, this is the highest redshift wide-separation lensed quasar known. In addition, COOL J0335\u22121927 shows several strong intervening absorbers visible in the spectra of all three quasar images with varying equivalent widths. The quasar also shows mini-broad line absorption. We construct a parametric strong gravitational lens model using ground-based imaging, constrained by the redshift and positions of the quasar images as well as the positions of three other multiply imaged background galaxies. 
Using our best-fit lens model, we calculate the predicted time delays between the three quasar images to be \u0394t AB = 499\u2212146+141 (stat) and \u0394t AC = \u2212127\u221217+83 (stat) days. Folding in systematic uncertainties, the model-predicted time delays are within the ranges 240 < \u0394t AB < 700 and \u2212300 < \u0394t AC < \u221230. We also present g-band photometry from archival Dark Energy Camera Legacy Survey and Pan-STARRS imaging, and new multi-epoch observations obtained between 2022 September 18 UT and 2023 February 22 UT, which demonstrate significant variability in the quasar and will eventually enable the measurement of the time delay between the three quasar images. The currently available light curves are consistent with the model-predicted time delays. This is the fifth paper from the COOL-LAMPS collaboration.", + "We report the discovery of COOL J0335-1927, a quasar at z = 3.27 lensed into three images with a maximum separation of 23 . \u2032\u2032 3 by a galaxy cluster at z = 0.4178. We construct a parametric strong gravitational lens model using ground-based imaging, constrained by the redshift and positions of the quasar images as well as the positions of three other multiply-imaged background galaxies. Using our best-fit lens model, we calculate the predicted time delays between the three quasar images to be" + ], + "domain": [ + "Gravitational Lensing", + "Quasar Host Galaxies", + "Astrophysics", + "Cosmology" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "4d0a1a93-1dfb-4df8-a5ef-c0f799b10f58": { + "pk": "4d0a1a93-1dfb-4df8-a5ef-c0f799b10f58", + "name": "Brian D. Nord", + "bio": "I am a researcher dedicated to the intersection of deep learning and uncertainty quantification (UQ), particularly in scientific applications where understanding uncertainty is crucial. 
My recent work has focused on assessing the quality of aleatoric uncertainty estimates produced by various UQ methods, such as Deep Ensembles (DE) and Deep Evidential Regression (DER). I systematically investigate how these methods perform across different data dimensionalities, revealing critical insights into their calibration and accuracy, especially under high-noise conditions.\n\nAdditionally, I have explored the application of deep learning in modeling strong gravitational lenses, a task that is computationally intensive due to the complexity of data from modern cosmic surveys. My research demonstrates the effectiveness of combining Mean-Variance Estimators (MVEs) with unsupervised domain adaptation (UDA) to enhance the accuracy of predictions on real observational data. This work not only improves the performance of UQ methods but also paves the way for their application in future astronomical studies.\n\nThrough my research, I aim to bridge the gap between theoretical uncertainty quantification and practical applications, ensuring that the models we develop are robust, reliable, and capable of handling the complexities of real-world data.", + "collaborators": [ + "Aleksandra 'Ciprijanovi'c", + "Rebecca Nevin", + "Shrihan Agarwal" + ], + "pub_titles": [ + "DeepUQ: Assessing the Aleatoric Uncertainties from two Deep Learning Methods", + "Neural Network Prediction of Strong Lensing Systems with Domain Adaptation and Uncertainty Quantification" + ], + "pub_abstracts": [ + "Assessing the quality of aleatoric uncertainty estimates from uncertainty quantification (UQ) deep learning methods is important in scientific contexts, where uncertainty is physically meaningful and important to characterize and interpret exactly. We systematically compare aleatoric uncertainty measured by two UQ techniques, Deep Ensembles (DE) and Deep Evidential Regression (DER). 
Our method focuses on both zero-dimensional (0D) and two-dimensional (2D) data, to explore how the UQ methods function for different data dimensionalities. We investigate uncertainty injected on the input and output variables and include a method to propagate uncertainty in the case of input uncertainty so that we can compare the predicted aleatoric uncertainty to the known values. We experiment with three levels of noise. The aleatoric uncertainty predicted across all models and experiments scales with the injected noise level. However, the predicted uncertainty is miscalibrated to $\\rm{std}(\\sigma_{\\rm al})$ with the true uncertainty for half of the DE experiments and almost all of the DER experiments. The predicted uncertainty is the least accurate for both UQ methods for the 2D input uncertainty experiment and the high-noise level. While these results do not apply to more complex data, they highlight that further research on post-facto calibration for these methods would be beneficial, particularly for high-noise and high-dimensional settings.", + "Modeling strong gravitational lenses is computationally expensive for the complex data from modern and next-generation cosmic surveys. Deep learning has emerged as a promising approach for finding lenses and predicting lensing parameters, such as the Einstein radius. Mean-variance Estimators (MVEs) are a common approach for obtaining aleatoric (data) uncertainties from a neural network prediction. However, neural networks have not been demonstrated to perform well on out-of-domain target data successfully - e.g., when trained on simulated data and applied to real, observational data. In this work, we perform the first study of the efficacy of MVEs in combination with unsupervised domain adaptation (UDA) on strong lensing data. The source domain data is noiseless, and the target domain data has noise mimicking modern cosmology surveys. 
We find that adding UDA to MVE increases the accuracy on the target data by a factor of about two over an MVE model without UDA. Including UDA also permits much more well-calibrated aleatoric uncertainty predictions. Advancements in this approach may enable future applications of MVE models to real observational data." + ], + "domain": [ + "Uncertainty Quantification", + "Deep Learning", + "Domain Adaptation", + "Gravitational Lensing" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "3e8319e2-43d5-47ca-ad98-e68463a67415": { + "pk": "3e8319e2-43d5-47ca-ad98-e68463a67415", + "name": "Felix Richards", + "bio": "I am a researcher dedicated to advancing the field of astronomical image analysis through innovative machine learning techniques. My work primarily focuses on the segmentation and classification of low surface brightness (LSB) structures in large astronomical images, where I have developed novel methodologies to enhance the efficiency and accuracy of these processes.\n\nIn my recent publications, I introduced a gridded attention mechanism that significantly improves the efficiency of capturing global context in images while maintaining sensitivity to textural patterns. This approach has been validated on a new dataset of astronomical images, specifically targeting the segmentation of large contaminating dust clouds. Additionally, I created an online annotation tool that has facilitated the delineation of LSB structures around hundreds of galaxies, resulting in a comprehensive database that supports both quantitative analysis and machine learning training.\n\nI have also explored the integration of learned Gabor filters into convolutional neural networks to achieve orientation robustness, leading to the development of Learnable Convolutional Gabor Networks (LCGNs). 
This work demonstrates my commitment to addressing the unique challenges posed by astronomical imaging, particularly in handling the complexities of orientation and low surface brightness.\n\nMy overarching goal is to automate the cataloguing of astronomical objects, leveraging deep learning to tackle the increasing volume of data generated by modern imaging surveys. By synthesizing datasets and proposing efficient neural network architectures, I aim to push the boundaries of what is possible in astronomical image analysis, ultimately contributing to our understanding of the universe.", + "collaborators": [ + "A. Paiement", + "P. Duc", + "Xianghua Xie", + "Elisabeth Sola", + "Mathias Urbano", + "J. Klehammer", + "M. B'ilek", + "J. Cuillandre", + "S. Gwyn", + "A. McConnachie" + ], + "pub_titles": [ + "Multi-scale gridded Gabor attention for cirrus segmentation", + "Characterization of low surface brightness structures in annotated deep images", + "Learnable Gabor modulated complex-valued networks for orientation robustness", + "Classification and Segmentation of Galactic Structuresin Large Multi-spectral Images" + ], + "pub_abstracts": [ + "In this paper, we address the challenge of segmenting global contaminants in large images. The precise delineation of such structures requires ample global context alongside understanding of textural patterns. CNNs specialise in the latter, though their ability to generate global features is limited. Attention measures long range dependencies in images, capturing global context, though at a large computational cost. We propose a gridded attention mechanism to address this limitation, greatly increasing efficiency by processing multi-scale features into smaller tiles. We also enhance the attention mechanism for increased sensitivity to texture orientation, by measuring correlations across features dependent on different orientations, in addition to channel and positional attention. 
We present results on a new dataset of astronomical images, where the task is segmenting large contaminating dust clouds.", + "The characterization of Low Surface Brightness (LSB) stellar structures around galaxies such as tidal debris of on-going or past collisions is essential to constrain models of galactic evolution. Our goal is to obtain quantitative measurements of LSB structures identified in deep images of samples consisting of hundreds of galaxies. We developed an online annotation tool that enables contributors to delineate the shapes of diffuse extended stellar structures, as well as artefacts or foreground structures. All parameters are automatically stored in a database which may be queried to retrieve quantitative measurements. We annotated LSB structures around 352 nearby massive galaxies with deep images obtained with the CFHT as part of two large programs: MATLAS and UNIONS/CFIS. Each LSB structure was delineated and labeled according to its likely nature: stellar shells, streams associated to a disrupted satellite, tails formed in major mergers, ghost reflections or cirrus. From our database containing 8441 annotations, the area, size, median surface brightness and distance to the host of 228 structures were computed. The results confirm the fact that tidal structures defined as streams are thinner than tails, as expected by numerical simulations. In addition, tidal tails appear to exhibit a higher surface brightness than streams (by about 1 mag), which may be related to different survival times for the two types of collisional debris. We did not detect any tidal feature fainter than 27.5 mag.arcsec$^{-2}$, while the nominal surface brightness limits of our surveys range between 28.3 and 29 mag.arcsec$^{-2}$, a difference that needs to be taken into account when estimating the sensitivity of future surveys to identify LSB structures. 
Our annotation database of observed LSB structures may be used for quantitative analysis and as a training set for machine learning algorithms (abbreviated).", + "Robustness to transformation is desirable in many computer vision tasks, given that input data often exhibits pose variance within classes. While translation invariance and equivariance is a documented phenomenon of CNNs, sensitivity to other transformations is typically encouraged through data augmentation. We investigate the modulation of complex valued convolutional weights with learned Gabor filters to enable orientation robustness. With Gabor modulation, the designed network is able to generate orientation dependent features free of interpolation with a single set of rotation-governing parameters. Moreover, by learning rotation parameters alongside traditional convolutional weights, the representation space is not constrained and may adapt to the exact input transformation. We present Learnable Convolutional Gabor Networks (LCGNs), that are parameter-efficient and offer increased model complexity while keeping backpropagation simple. We demonstrate that learned Gabor modulation utilising an end-to-end complex architecture enables rotation invariance and equivariance on MNIST and a new dataset of simulated images of galactic cirri.", + "Extensive and exhaustive cataloguing of astronomical objects is imperative for studies seeking to understand mechanisms which drive the universe. Such cataloguing tasks can be tedious, time consuming and demand a high level of domain specific knowledge. Past astronomical imaging surveys have been catalogued through mostly manual effort. Imminent imaging surveys, however, will produce a magnitude of data that cannot be feasibly processed through manual cataloguing. Furthermore, these surveys will capture objects fainter than the night sky, termed low surface brightness objects, and at unprecedented spatial resolution owing to advancements in astronomical imaging. 
In this thesis, we investigate the use of deep learning to automate cataloguing processes, such as detection, classification and segmentation of objects. A common theme throughout this work is the adaptation of machine learning methods to challenges specific to the domain of low surface brightness imaging. We begin with creating an annotated dataset of structures in low surface brightness images. To facilitate supervised learning in neural networks, a dataset comprised of input and corresponding ground truth target labels is required. An online tool is presented, allowing astronomers to classify and draw over objects in large multi-spectral images. A dataset produced using the tool is then detailed, containing 227 low surface brightness images from the MATLAS survey and labels made by four annotators. We then present a method for synthesising images of galactic cirrus which appear similar to MATLAS images, allowing pretraining of neural networks. A method for integrating sensitivity to orientation in convolutional neural networks is then presented. Objects in astronomical images can present in any given orientation, and thus the ability for neural networks to handle rotations is desirable. We modify convolutional filters with sets of Gabor filters with different orientations. These orientations are learned alongside network parameters during backpropagation, allowing exact optimal orientations to be captured. The method is validated extensively on multiple datasets and use cases. We propose an attention based neural network architecture to process global contaminants in large images. Performing analysis of low surface brightness images requires plenty of contextual information and local textual patterns. As a result, a network for processing low surface brightness images should ideally be able to accommodate large high resolution images without compromising on either local or global features. 
We utilise attention to capture long range dependencies, and propose an efficient attention operator which significantly reduces computational cost, allowing the input of large images. We also use Gabor filters to build an attention mechanism to better capture long range orientational patterns. These techniques are validated on the task of cirrus segmentation in MATLAS images, and cloud segmentation on the SWIMSEG database, where state of the art performance is achieved. Following, cirrus segmentation in MATLAS images is further investigated, and a comprehensive study is performed on the task. We discuss challenges associated with cirrus segmentation and low surface brightness images in general, and present several techniques to accommodate them. A novel loss function is proposed to facilitate training of the segmentation model on probabilistic targets. Results are presented on the annotated MATLAS images, with extensive ablation studies and a final benchmark to test the limits of the detailed segmentation pipeline. Finally, we develop a pipeline for multi-class segmentation of galactic structures and surrounding contaminants. Techniques of previous chapters are combined with a popular instance segmentation architecture to create a neural network capable of segmenting localised objects and extended amorphous regions. The process of data preparation for training instance segmentation models is thoroughly detailed. The method is tested on segmentation of five object classes in MATLAS images. We find that unifying the tasks of galactic structure segmentation and contaminant segmentation improves model performance in comparison to isolating each task." 
+ ], + "domain": [ + "Computer Vision", + "Deep Learning", + "Image Segmentation", + "Astronomy" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "68f73e5d-ef54-4934-960a-a91770fa967a": { + "pk": "68f73e5d-ef54-4934-960a-a91770fa967a", + "name": "Adeline Paiement", + "bio": "I am a researcher dedicated to advancing the fields of computer vision, deep learning, and their applications in health and environmental sciences. My recent work has focused on developing innovative methodologies that leverage attention mechanisms and graph neural networks to enhance image analysis and prediction tasks. For instance, I introduced a gridded attention mechanism to efficiently segment global contaminants in large images, addressing the limitations of traditional CNNs in capturing global context.\n\nIn the realm of medical imaging, I designed AttentNet, an automated lung nodule detection framework that utilizes fully convolutional attention blocks to improve detection accuracy in 3D pulmonary CT scans. My research also extends to solar imaging, where I developed a U-Net based method for removing cloud shadows from solar images, enhancing the quality of solar structure detection.\n\nI am particularly interested in integrating domain knowledge into graph neural networks, as demonstrated in my work on estimating the potential energy of chemical systems. By incorporating knowledge of chemical bonds and physical quantities, I have shown how GNNs can achieve higher accuracy and generalization.\n\nAdditionally, I have explored the intersection of health and technology, proposing a symbolic behavior recognition approach to monitor kitchen activities and assess eating behaviors, which could provide valuable insights for clinicians. 
My work is driven by a commitment to not only push the boundaries of machine learning but also to apply these advancements to real-world challenges, from healthcare to environmental monitoring.", + "collaborators": [ + "Xianghua Xie", + "Majedaldein Almahasneh", + "Jay Morgan", + "J. Aboudarham", + "M. Mirmehdi", + "Felix Richards", + "P. Duc", + "Elisabeth Sola", + "C. Klinke", + "J. Jenkins", + "Faegheh Sardari", + "S. Hannuna", + "Kristina Yordanova", + "I. Craddock", + "Amal Chaoui", + "Jean Aboudarham", + "Xiang-Wen Xie", + "Mathias Urbano", + "J. Klehammer", + "M. B'ilek", + "J. Cuillandre", + "S. Gwyn", + "A. McConnachie", + "Y. Ourmi\u00e8res", + "J. Sommer", + "J. Verron", + "C. Ubelmann", + "H. Glotin", + "A. Pauly", + "M. Seisenberger", + "X. Bonnin", + "S. L\u00fcdtke", + "Samuel Whitehouse", + "Frank Kr\u00fcger", + "T. Kirste", + "Paul Stroe", + "Max Schr\u00f6der", + "E. Tonkin", + "Przemyslaw Woznowski", + "C. Olsson", + "Joseph Rafferty", + "T. Sztyler", + "L. Tao", + "T. Burghardt", + "D. Damen", + "A. Cooper", + "M. 
Camplani" + ], + "pub_titles": [ + "Multi-scale gridded Gabor attention for cirrus segmentation", + "AttentNet: Fully Convolutional 3D Attention for Lung Nodule Detection", + "Removing cloud shadows from ground-based solar imagery", + "Domain-informed graph neural networks: a quantum chemistry case study", + "Characterization of low surface brightness structures in annotated deep images", + "Adaptive Neighbourhoods for the Discovery of Adversarial Examples", + "Active Region Detection in Multi-spectral Solar Images", + "Physics-informed detection and segmentation of type II solar radio bursts", + "Learnable Gabor modulated complex-valued networks for orientation robustness", + "VI-Net: View-Invariant Quality of Human Movement Assessment", + "Analysing Cooking Behaviour in Home Settings: Towards Health Monitoring \u2020", + "VIMPNN: A physics informed neural network for estimating potential energies of out-of-equilibrium systems", + "Challenges in Annotation of useR Data for UbiquitOUs Systems: Results from the 1st ARDUOUS Workshop", + "Energy expenditure estimation using visual and inertial sensors" + ], + "pub_abstracts": [ + "In this paper, we address the challenge of segmenting global contaminants in large images. The precise delineation of such structures requires ample global context alongside understanding of textural patterns. CNNs specialise in the latter, though their ability to generate global features is limited. Attention measures long range dependencies in images, capturing global context, though at a large computational cost. We propose a gridded attention mechanism to address this limitation, greatly increasing efficiency by processing multi-scale features into smaller tiles. We also enhance the attention mechanism for increased sensitivity to texture orientation, by measuring correlations across features dependent on different orientations, in addition to channel and positional attention. 
We present results on a new dataset of astronomical images, where the task is segmenting large contaminating dust clouds.", + "Motivated by the increasing popularity of attention mechanisms, we observe that popular convolutional (conv.) attention models like Squeeze-and-Excite (SE) and Convolutional Block Attention Module (CBAM) rely on expensive multi-layer perception (MLP) layers. These MLP layers significantly increase computational complexity, making such models less applicable to 3D image contexts, where data dimensionality and computational costs are higher. In 3D medical imaging, such as 3D pulmonary CT scans, efficient processing is crucial due to the large data volume. Traditional 2D attention generalized to 3D increases the computational load, creating demand for more efficient attention mechanisms for 3D tasks. We investigate the possibility of incorporating fully convolutional (conv.) attention in 3D context. We present two 3D fully conv. attention blocks, demonstrating their effectiveness in 3D context. Using pulmonary CT scans for 3D lung nodule detection, we present AttentNet, an automated lung nodule detection framework from CT images, performing detection as an ensemble of two stages, candidate proposal and false positive (FP) reduction. We compare the proposed 3D attention blocks to popular 2D conv. attention methods generalized to 3D modules and to self-attention units. For the FP reduction stage, we also use a joint analysis approach to aggregate spatial information from different contextual levels. We use LUNA-16 lung nodule detection dataset to demonstrate the benefits of the proposed fully conv. attention blocks compared to baseline popular lung nodule detection methods when no attention is used. Our work does not aim at achieving state-of-the-art results in the lung nodule detection task, rather to demonstrate the benefits of incorporating fully conv. 
attention within a 3D context.", + "The study and prediction of space weather entails the analysis of solar images showing structures of the Sun\u2019s atmosphere. When imaged from the Earth\u2019s ground, images may be polluted by terrestrial clouds which hinder the detection of solar structures. We propose a new method to remove cloud shadows, based on a U-Net architecture, and compare classical supervision with conditional GAN. We evaluate our method on two different imaging modalities, using both real images and a new dataset of synthetic clouds. Quantitative assessments are obtained through image quality indices (RMSE, PSNR, SSIM, and FID). We demonstrate improved results with regards to the traditional cloud removal technique and a sparse coding baseline, on different cloud types and textures.", + "We explore different strategies to integrate prior domain knowledge into the design of graph neural networks (GNN). Our study is supported by a use-case of estimating the potential energy of chemical systems (molecules and crystals) represented as graphs. We integrate two elements of domain knowledge into the design of the GNN to constrain and regularise its learning, towards higher accuracy and generalisation. First, knowledge on the existence of different types of relations/graph edges (e.g. chemical bonds in our case study) between nodes of the graph is used to modulate their interactions. We formulate and compare two strategies, namely specialised message production and specialised update of internal states. Second, knowledge of the relevance of some physical quantities is used to constrain the learnt features towards a higher physical relevance using a simple multi-task learning (MTL) paradigm. We explore the potential of MTL to better capture the underlying mechanisms behind the studied phenomenon. 
We demonstrate the general applicability of our two knowledge integrations by applying them to three architectures that rely on different mechanisms to propagate information between nodes and to update node states. Our implementations are made publicly available. To support these experiments, we release three new datasets of out-of-equilibrium molecules and crystals of various complexities.", + "The characterization of Low Surface Brightness (LSB) stellar structures around galaxies such as tidal debris of on-going or past collisions is essential to constrain models of galactic evolution. Our goal is to obtain quantitative measurements of LSB structures identified in deep images of samples consisting of hundreds of galaxies. We developed an online annotation tool that enables contributors to delineate the shapes of diffuse extended stellar structures, as well as artefacts or foreground structures. All parameters are automatically stored in a database which may be queried to retrieve quantitative measurements. We annotated LSB structures around 352 nearby massive galaxies with deep images obtained with the CFHT as part of two large programs: MATLAS and UNIONS/CFIS. Each LSB structure was delineated and labeled according to its likely nature: stellar shells, streams associated to a disrupted satellite, tails formed in major mergers, ghost reflections or cirrus. From our database containing 8441 annotations, the area, size, median surface brightness and distance to the host of 228 structures were computed. The results confirm the fact that tidal structures defined as streams are thinner than tails, as expected by numerical simulations. In addition, tidal tails appear to exhibit a higher surface brightness than streams (by about 1 mag), which may be related to different survival times for the two types of collisional debris. 
We did not detect any tidal feature fainter than 27.5 mag.arcsec$^{-2}$, while the nominal surface brightness limits of our surveys range between 28.3 and 29 mag.arcsec$^{-2}$, a difference that needs to be taken into account when estimating the sensitivity of future surveys to identify LSB structures. Our annotation database of observed LSB structures may be used for quantitative analysis and as a training set for machine learning algorithms (abbreviated).", + "Deep Neural Networks (DNNs) have often supplied state-of-the-art results in pattern recognition tasks. Despite their advances, however, the existence of adversarial examples have caught the attention of the community. Many existing works have proposed methods for searching for adversarial examples within fixed-sized regions around training points. Our work complements and improves these existing approaches by adapting the size of these regions based on the problem complexity and data sampling density. This makes such approaches more appropriate for other types of data and may further improve adversarial training methods by increasing the region sizes without creating incorrect labels.", + "Precisely detecting solar Active Regions (AR) from multi-spectral images is a challenging task yet important in understanding solar activity and its influence on space weather. A main challenge comes from each modality capturing a different location of these 3D objects, as opposed to more traditional multi-spectral imaging scenarios where all image bands observe the same scene. We present a multi-task deep learning framework that exploits the dependencies between image bands to produce 3D AR detection where different image bands (and physical locations) each have their own set of results. 
We compare our detection method against baseline approaches for solar image analysis (multi-channel coronal hole detection, SPOCA for ARs (Verbeeck et al., 2013)) and a state-of-the-art deep learning method (Faster RCNN) and show enhanced performances in detecting ARs jointly from multiple bands.", + "Type II solar radio bursts have proven to be a useful tool for gaining insights into the behaviour of complex solar events and for forecasting and mitigating their damages on Earth. In this work, we detect and segment the occurrence of type II bursts in solar radio spectrograms, thereby facilitating the extraction of parameters needed to gain insight into solar events. We utilise prior knowledge of how type II bursts drift through frequencies over time to assist with these tasks of detection and segmentation. A new adaptive Region of Interest (ROI) is proposed, to constrain the search to regions that follow the burst curvature at a given frequency. It comes with an implicit data normalisation that reduces the variance of burst appearance in the data, hence simplifying the learning process from small datasets. We demonstrate the effectiveness of our methodology using a simple and popular HOG and logistic regression detector and basic segmentation based on voting and background subtraction. On a custom dataset representative of different levels of solar activity, at a wavelength range where no other detection algorithm currently operates, our adaptive ROI significantly improves over traditional sliding windows. In future work, it may be applied to other, state-of-the-art, machine learning algorithms.", + "Robustness to transformation is desirable in many computer vision tasks, given that input data often exhibits pose variance within classes. While translation invariance and equivariance is a documented phenomenon of CNNs, sensitivity to other transformations is typically encouraged through data augmentation. 
We investigate the modulation of complex valued convolutional weights with learned Gabor filters to enable orientation robustness. With Gabor modulation, the designed network is able to generate orientation dependent features free of interpolation with a single set of rotation-governing parameters. Moreover, by learning rotation parameters alongside traditional convolutional weights, the representation space is not constrained and may adapt to the exact input transformation. We present Learnable Convolutional Gabor Networks (LCGNs), that are parameter-efficient and offer increased model complexity while keeping backpropagation simple. We demonstrate that learned Gabor modulation utilising an end-to-end complex architecture enables rotation invariance and equivariance on MNIST and a new dataset of simulated images of galactic cirri.", + "We propose a view-invariant method towards the assessment of the quality of human movements which does not rely on skeleton data. Our end-to-end convolutional neural network consists of two stages, where at first a view-invariant trajectory descriptor for each body joint is generated from RGB images, and then the collection of trajectories for all joints are processed by an adapted, pre-trained 2D CNN (e.g. VGG-19 or ResNeXt-50) to learn the relationship amongst the different body parts and deliver a score for the movement quality. We release the only publicly-available, multi-view, non-skeleton, non-mocap, rehabilitation movement dataset (QMAR), and provide results for both cross-subject and cross-view scenarios on this dataset. We show that VI-Net achieves average rank correlation of 0.66 on cross-subject and 0.65 on unseen views when trained on only two views. We also evaluate the proposed method on the single-view rehabilitation dataset KIMORE and obtain 0.66 rank correlation against a baseline of 0.62.", + "Wellbeing is often affected by health-related conditions. 
Among them are nutrition-related health conditions, which can significantly decrease the quality of life. We envision a system that monitors the kitchen activities of patients and that based on the detected eating behaviour could provide clinicians with indicators for improving a patient\u2019s health. To be successful, such system has to reason about the person\u2019s actions and goals. To address this problem, we introduce a symbolic behaviour recognition approach, called Computational Causal Behaviour Models (CCBM). CCBM combines symbolic representation of person\u2019s behaviour with probabilistic inference to reason about one\u2019s actions, the type of meal being prepared, and its potential health impact. To evaluate the approach, we use a cooking dataset of unscripted kitchen activities, which contains data from various sensors in a real kitchen. The results show that the approach is able to reason about the person\u2019s cooking actions. It is also able to recognise the goal in terms of type of prepared meal and whether it is healthy. Furthermore, we compare CCBM to state-of-the-art approaches such as Hidden Markov Models (HMM) and decision trees (DT). The results show that our approach performs comparable to the HMM and DT when used for activity recognition. It outperformed the HMM for goal recognition of the type of meal with median accuracy of 1 compared to median accuracy of 0.12 when applying the HMM. Our approach also outperformed the HMM for recognising whether a meal is healthy with a median accuracy of 1 compared to median accuracy of 0.5 with the HMM.", + "Simulation of molecular and crystal systems enables insight into interesting chemical properties that benefit processes ranging from drug discovery to material synthesis. However these simulations can be computationally expensive and time consuming despite the approximations through Density Functional Theory (DFT). 
We propose the Valence Interaction Message Passing Neural Network (VIMPNN) to approximate DFT\u2019s ground-state energy calculations. VIMPNN integrates physics prior knowledge such as the existence of different interatomic bounds to estimate more accurate energies. Furthermore, while many previous machine learning methods consider only stable systems, our proposed method is demonstrated on unstable systems at different atomic distances. VIMPNN predictions can be used to determine the stable configurations of systems, i.e. stable distance for atoms \u2013 a necessary step for the future simulation of crystal growth for example. Our method is extensively evaluated on a augmented version of the QM9 dataset that includes unstable molecules, as well as a new dataset of infiniteand finite-size crystals, and is compared with the Message Passing Neural Network (MPNN). VIMPNN has comparable accuracy with DFT, while allowing for 5 orders of magnitude in computational speed up compared to DFT simulations, and produces more accurate and informative potential energy curves than MPNN for estimating stable configurations.", + "Labelling user data is a central part of the design and evaluation of pervasive systems that aim to support the user through situation-aware reasoning. It is essential both in designing and trainin ...", + "Deriving a person's energy expenditure accurately forms the foundation for tracking physical activity levels across many health and lifestyle monitoring tasks. In this study, the authors present a method for estimating calorific expenditure from combined visual and accelerometer sensors by way of an RGB-Depth camera and a wearable inertial sensor. The proposed individual-independent framework fuses information from both modalities which leads to improved estimates beyond the accuracy of single modality and manual metabolic equivalents of task (MET) lookup table based methods. 
For evaluation, the authors introduce a new dataset called SPHERE_RGBD\u2009 +\u2009 Inertial_calorie, for which visual and inertial data are simultaneously obtained with indirect calorimetry ground truth measurements based on gas exchange. Experiments show that the fusion of visual and inertial data reduces the estimation error by 8 and 18% compared with the use of visual only and inertial sensor only, respectively, and by 33% compared with a MET-based approach. The authors conclude from their results that the proposed approach is suitable for home monitoring in a controlled environment." + ], + "domain": [ + "Computer Vision", + "Deep Learning", + "Graph Neural Network", + "Image Processing" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "b043edd2-fc56-4e1a-8062-0da3548c8ec9": { + "pk": "b043edd2-fc56-4e1a-8062-0da3548c8ec9", + "name": "Xianghua Xie", + "bio": "I am a researcher dedicated to advancing machine learning techniques with a strong focus on privacy, efficiency, and real-world applications. My recent work in Federated Learning (FL) addresses critical privacy concerns, proposing a novel gradient leakage defense technique that secures model architectures while maintaining performance. I have also explored the intersection of machine learning and manufacturing, developing models to enhance the accuracy of surface texture predictions in steel production and improve vehicle detection using LiDAR data.\n\nMy research extends to the medical domain, where I have applied deep learning to nuclei detection in histopathology images, demonstrating the effectiveness of graph convolutional networks over traditional methods. 
Additionally, I have contributed to the development of interpretable models for predicting hospitalizations related to COVID-19, collaborating with medical experts to enhance the usability of recurrent neural networks.\n\nI am passionate about tackling complex challenges, such as defect detection in steel manufacturing and 3D geometry segmentation, by leveraging innovative machine learning approaches. My work emphasizes the importance of model efficiency and robustness, as evidenced by my contributions to model compression and acceleration techniques. Overall, I strive to bridge the gap between theoretical advancements and practical applications, ensuring that my research has a meaningful impact across various domains.", + "collaborators": [ + "Jingjing Deng", + "Lin Wu", + "Xiaoke Ma", + "Chen Hu", + "Deyin Liu", + "Bo Li", + "Hanchi Ren", + "A. Milne", + "Michael Edwards", + "F. Boussaid", + "Bennamoun", + "Chengwu Liang", + "J. Ma", + "Hongyang Chen", + "Y. Ye", + "Kayalvizhi Lakshmanan", + "Matt Roach", + "C. Giannetti", + "Shubham Bhoite", + "D. George", + "Tim Mortensen", + "Manduhu Manduhu", + "B. Heravi", + "S. Kariyawasam", + "Connor Clarkson", + "Sachin Bahade", + "Zengfa Dou", + "Hui Liu", + "Chubing Guo", + "L. Wu", + "Shiyin Tan", + "Xiaoxiong Zhong", + "Hassan Eshkiki", + "Benjamin Mora", + "Suraj Ramchand", + "Gavin Tsang", + "Duncan Cole", + "Ali Alqahtani", + "Mark W. Jones", + "Majedaldein Almahasneh", + "A. Paiement", + "J. 
Aboudarham" + ], + "pub_titles": [ + "Gradient Leakage Defense with Key-Lock Module for Federated Learning", + "Predicting Surface Texture in Steel Manufacturing at Speed", + "Image Template Matching via Dense and Consistent Contrastive Learning", + "Steel Surface Roughness Parameter Calculations Using Lasers and Machine Learning Models", + "A Robust Vehicle Detection Model for LiDAR Sensor Using Simulation Data and Transfer Learning Methods", + "Active Anchors", + "Cascaded Graph Convolution Approach for Nuclei Detection in Histopathology Images", + "Fully Connected Networks on a Diet With the Mediterranean Matrix Multiplication", + "RetainEXT: Enhancing Rare Event Detection and Improving Interpretability of Health Records using Temporal Neural Networks", + "GRNN: Generative Regression Neural Network\u2014A Data Leakage Attack for Federated Learning", + "Literature Review of Deep Network Compression", + "3D Interactive Segmentation With Semi-Implicit Representation and Active Learning" + ], + "pub_abstracts": [ + "Federated Learning (FL) is a widely adopted privacy-preserving machine learning approach where private data remains local, enabling secure computations and the exchange of local model gradients between local clients and third-party parameter servers. However, recent findings reveal that privacy may be compromised and sensitive information potentially recovered from shared gradients. In this study, we offer detailed analysis and a novel perspective on understanding the gradient leakage problem. These theoretical works lead to a new gradient leakage defense technique that secures arbitrary model architectures using a private key-lock module. Only the locked gradient is transmitted to the parameter server for global model aggregation. 
Our proposed learning method is resistant to gradient leakage attacks, and the key-lock module is designed and trained to ensure that, without the private information of the key-lock module: a) reconstructing private training data from the shared gradient is infeasible; and b) the global model's inference performance is significantly compromised. We discuss the theoretical underpinnings of why gradients can leak private information and provide theoretical proof of our method's effectiveness. We conducted extensive empirical evaluations with a total of forty-four models on several popular benchmarks, demonstrating the robustness of our proposed approach in both maintaining model performance and defending against gradient leakage attacks.", + "Control of the surface texture of steel strip during the galvanizing and temper rolling processes is essential to satisfy customer requirements and is conventionally measured post-production using a stylus. In-production laser reflection measurement is less consistent than physical measurement but enables real time adjustment of processing parameters to optimize product surface characteristics. We propose the use of machine learning to improve accuracy of the transformation from inline laser reflection measurements to a prediction of surface properties. In addition to accuracy, model evaluation speed is important for fast feedback control. The ROCKET model is one of the fastest state of the art models, however it can be sped up by utilizing a GPU. Our contribution is to implement the model in PyTorch for fast GPU kernel transforms and provide a soft version of the Proportion of Positive Values (PPV) nonlinear pooling function, allowing gradient flow. We perform timing and performance experiments comparing the implementations.", + "Image template matching refers to localizing a small query image as opposed to a large reference image map. 
The query image a.k.a template has to be screened across every equal-sized region in the reference map to perform inner-product at pixel-level and the resulting similarity indicates the template location. Due to the domain heterogeneity between template and reference images, the matching performance degrades under dramatic appearance changes. More severely, the asymmetric matching easily leads to over-fitting by suggesting excessively false positive regions. To these ends, we propose an effective template matching method based on contrastive learning to perform a dense and consistent InfoNCEloss during matching. This can increase the matching at finer details, and thus effectively regularizes network training to prevent over-fitting. Extensive experiments on the synthetic aperture radar (SAR) and optical datasets, i.e., SEN1-2 and OS datasets demonstrate that our proposed method outperforms state-of-the-art methods by a large margin.", + "Control of surface texture in strip steel is essential to meet customer requirements during galvanizing and temper rolling processes. Traditional methods rely on post-production stylus measurements, while on-line techniques offer non-contact and real-time measurements of the entire strip. However, ensuring accurate measurement is imperative for their effective utilization in the manufacturing pipeline. Moreover, accurate on-line measurements enable real-time adjustments of manufacturing processing parameters during production, ensuring consistent quality and the possibility of closed-loop control of the temper mill. In this study, we leverage state-of-the-art machine learning models to enhance the transformation of on-line measurements into significantly a more accurate Ra surface roughness metric. 
By comparing a selection of data-driven approaches, including both deep learning and non-deep learning methods, to the close-form transformation, we evaluate their potential for improving surface texture control in temper strip steel manufacturing.", + "Vehicle detection in parking areas provides the spatial and temporal utilisation of parking spaces. Parking observations are typically performed manually, limiting the temporal resolution due to the high labour cost. This paper uses simulated data and transfer learning to build a robust real-world model for vehicle detection and classification from single-beam LiDAR of a roadside parking scenario. The paper presents a synthetically augmented transfer learning approach for LiDAR-based vehicle detection and the implementation of synthetic LiDAR data. A synthetic augmented transfer learning method was used to supplement the small real-world data set and allow the development of data-handling techniques. In addition, adding the synthetically augmented transfer learning method increases the robustness and overall accuracy of the model. Experiments show that the method can be used for fast deployment of the model for vehicle detection using a LIDAR sensor.", + "Defect detection in steel manufacturing has achieved state-of-the-art results in both localisation and classification of various types of defects, however, this assumes very high-quality datasets that have been verified by domain experts. Labelling such data has become a time-consuming and interaction-heavy task with a great amount of user effort, this is due to variability in the defect characteristics and composite nature. We propose a new acquisition function based on the similarity of defects for refining labels over time by showing the user only the most required to be labelled. We also explore different ways in which to feed these new refinements back into the model to utilize the new knowledge in an effortful way. 
We achieve this with a graphical interface that provides additional information to the domain expert as the data gets refined, allowing for decision-making with uncertain areas of the steel.", + "Nuclei detection in histopathology images of cancerous tissue stained with conventional hematoxylin and eosin stain is a challenging task due to the complexity and diversity of cell data. Deep learning techniques have produced encouraging results in the field of nuclei detection, where the main emphasis is on classification and regressionbased methods. Recent research has demonstrated that regression-based techniques outperform classification. In this paper, we propose a classification model based on graph convolutions to classify nuclei, and similar models to detect nuclei using cascaded architecture. With nearly 29,000 annotated nuclei in a large dataset of cancer histology images, we evaluated the Convolutional Neural Network (CNN) and Graph Convolutional Networks (GCN) based approaches. Our findings demonstrate that graph convolutions perform better with a cascaded GCN architecture and are more stable than centre-of-pixel approach. We have compared our twofold evaluation quantitative results with CNN-based models such as Spatial Constrained Convolutional Neural Network (SC-CNN) and Centre-of-Pixel Convolutional Neural Network (CP-CNN). We used two different loss functions, binary cross-entropy and focal loss function, and also investigated the behaviour of CP-CNN and GCN models to observe the effectiveness of CNN and GCN operators. 
The compared quantitative F1 score of cascaded-GCN shows an improvement of 6% compared to state-of-the-art methods.", + "This article proposes the Mediterranean matrix multiplication, a new, simple and practical randomized algorithm that samples angles between the rows and columns of two matrices with sizes $m, n, $ and $p$ to approximate matrix multiplication in $O(k(mn+np+mp))$ steps, where $k$ is a constant only related to the precision desired. The number of instructions carried out is mainly bounded by bitwise operators, amenable to a simplified processing architecture and compressed matrix weights. Results show that the method is superior in size and number of operations to the standard approximation with signed matrices. Equally important, this article demonstrates a first application to machine learning inference by showing that weights of fully connected layers can be compressed between $30\\times $ and $100\\times $ with little to no loss in inference accuracy. The requirements for pure floating-point operations are also down as our algorithm relies mainly on simpler bitwise operators.", + "A recurring theme during the pandemic was the shortage of hospital beds. Despite all efforts, the healthcare system still faces 25 % of resource strain felt during the first peak of coronavirus. Digitisation of Electronic Healthcare Records (EHRs) and the pandemic have brought about many successful applications of Recurrent Neural Networks (RNNs) to predict patients' current and future states. Despite their strong per-formance, it remains a challenge for users to delve into the black box which has heavily influenced researchers to utilise more interpretable techniques such as ID-Convolutional neural networks. Others focus on using more interpretable machine learning techniques but only achieve high performance on a select subset of patients. 
By collaborating with medical experts and artificial intelligence scientists, our study improves on the REverse Time AttentIoN EX model, a feature and visit level attention network, for increased interpretability and usability of RNNs in predicting COVID-19-related hospitalisations. We achieved 82.40 % area under the receiver operating characteristic curve and showcased effective use of the REverse Time AttentIoN EXTension model and EHRs in understanding how individual medical codes contribute to hospitalisation risk prediction. This study provides a guideline for researchers aiming to design interpretable temporal neural networks using the power of RNNs and data mining techniques.", + "Data privacy has become an increasingly important issue in Machine Learning (ML), where many approaches have been developed to tackle this challenge, e.g., cryptography (Homomorphic Encryption (HE), Differential Privacy (DP)) and collaborative training (Secure Multi-Party Computation (MPC), Distributed Learning, and Federated Learning (FL)). These techniques have a particular focus on data encryption or secure local computation. They transfer the intermediate information to the third party to compute the final result. Gradient exchanging is commonly considered to be a secure way of training a robust model collaboratively in Deep Learning (DL). However, recent researches have demonstrated that sensitive information can be recovered from the shared gradient. Generative Adversarial Network (GAN), in particular, has shown to be effective in recovering such information. However, GAN based techniques require additional information, such as class labels that are generally unavailable for privacy-preserved learning. In this article, we show that, in the FL system, image-based privacy data can be easily recovered in full from the shared gradient only via our proposed Generative Regression Neural Network (GRNN). 
We formulate the attack to be a regression problem and optimize two branches of the generative model by minimizing the distance between gradients. We evaluate our method on several image classification tasks. The results illustrate that our proposed GRNN outperforms state-of-the-art methods with better stability, stronger robustness, and higher accuracy. It also has no convergence requirement to the global FL model. Moreover, we demonstrate information leakage using face re-identification. Some defense strategies are also discussed in this work.", + "Deep networks often possess a vast number of parameters, and their significant redundancy in parameterization has become a widely-recognized property. This presents significant challenges and restricts many deep learning applications, making the focus on reducing the complexity of models while maintaining their powerful performance. In this paper, we present an overview of popular methods and review recent works on compressing and accelerating deep neural networks. We consider not only pruning methods but also quantization methods, and low-rank factorization methods. This review also intends to clarify these major concepts, and highlights their characteristics, advantages, and shortcomings.", + "Segmenting complex 3D geometry is a challenging task due to rich structural details and complex appearance variations of target object. Shape representation and foreground-background delineation are two of the core components of segmentation. Explicit shape models, such as mesh based representations, suffer from poor handling of topological changes. On the other hand, implicit shape models, such as level-set based representations, have limited capacity for interactive manipulation. 
Fully automatic segmentation for separating foreground objects from background generally utilizes non-interoperable machine learning methods, which heavily rely on the off-line training dataset and are limited to the discrimination power of the chosen model. To address these issues, we propose a novel semi-implicit representation method, namely Non-Uniform Implicit B-spline Surface (NU-IBS), which adaptively distributes parametrically blended patches according to geometrical complexity. Then, a two-stage cascade classifier is introduced to carry out efficient foreground and background delineation, where a simplistic Na\u00efve-Bayesian model is trained for fast background elimination, followed by a stronger pseudo-3D Convolutional Neural Network (CNN) multi-scale classifier to precisely identify the foreground objects. A localized interactive and adaptive segmentation scheme is incorporated to boost the delineation accuracy by utilizing the information iteratively gained from user intervention. The segmentation result is obtained via deforming an NU-IBS according to the probabilistic interpretation of delineated regions, which also imposes a homogeneity constrain for individual segments. The proposed method is evaluated on a 3D cardiovascular Computed Tomography Angiography (CTA) image dataset and Brain Tumor Image Segmentation Benchmark 2015 (BraTS2015) 3D Magnetic Resonance Imaging (MRI) dataset." + ], + "domain": [ + "Federated Learning", + "Machine Learning", + "Computer Vision", + "Data Privacy" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "7554771d-3558-45b1-bca3-a5ad1d763759": { + "pk": "7554771d-3558-45b1-bca3-a5ad1d763759", + "name": "Elisabeth Sola", + "bio": "I am an astrophysicist with a keen interest in the structural dynamics of early-type galaxies (ETGs) and their globular cluster (GC) systems. 
My recent work has focused on understanding the intricate relationships between galaxy morphology, rotational support, and the effects of mergers. Through observational studies utilizing data from the MATLAS and NGVS surveys, I have explored how different merger types influence the characteristics of ETGs, revealing significant correlations between tidal features, kinematic structures, and metallicity.\n\nOne of my notable contributions is the development of a gridded attention mechanism for segmenting global contaminants in large astronomical images, which enhances the efficiency of capturing both local textural patterns and global context. This innovative approach has been applied to a new dataset of astronomical images, demonstrating its effectiveness in delineating complex structures like dust clouds.\n\nAdditionally, I have conducted extensive analyses of GC systems across various early-type galaxies, uncovering relationships between effective radii and galaxy properties. My research has also delved into the plane-of-satellite problem, where I employed spectroscopic follow-up observations to confirm dwarf galaxy candidates and investigate their stellar populations.\n\nI am passionate about leveraging advanced techniques and collaborative tools to enhance our understanding of galaxy evolution and the underlying physical processes that shape the universe. My work not only contributes to the field of astrophysics but also aims to inspire future research through the development of comprehensive databases and methodologies for analyzing low surface brightness structures.", + "collaborators": [ + "P. Duc", + "Sungsoon Lim", + "J. Cuillandre", + "P. Durrell", + "E. Emsellem", + "S. Gwyn", + "F. Marleau", + "M. B'ilek", + "Felix Richards", + "A. Paiement", + "E. Peng", + "P. Cot'e", + "L. Ferrarese", + "J. Roediger", + "Chengze Liu", + "C. Spengler", + "Laura V. Sales", + "J. Blakeslee", + "J. C. Mihos", + "T. Puzia", + "R. 
S'anchez-Janssen", + "Mathias Urbano", + "Xianghua Xie", + "A. Lanccon", + "Oliver Muller", + "Michal Bilek", + "O. Muller", + "N. Heesters", + "M. Pawlowski", + "M. Poulain", + "R. Habas", + "Rory Smith", + "S. Paudel", + "A. Lan\u00e7on", + "O. M\u00fcller", + "R\u00faben S\u00e1nchez-Janssen", + "J. Klehammer", + "A. McConnachie" + ], + "pub_titles": [ + "Multi-scale gridded Gabor attention for cirrus segmentation", + "The Spatial Distribution of Globular Cluster Systems in Early Type Galaxies: Estimation Procedure and Catalog of Properties for Globular Cluster Systems Observed with Deep Imaging Surveys", + "Why do different early-type-galaxies have different amounts of rotational support?", + "Dwarf galaxies in the MATLAS survey: The satellite system of NGC 474 under scrutiny with MUSE", + "The Next Generation Virgo Cluster Survey (NGVS). XXVII. The Size and Structure of Globular Cluster Systems and Their Connection to Dark Matter Halos", + "When and how did early-type galaxies outside of galaxy clusters lose their rotational support?", + "Characterization of low surface brightness structures in annotated deep images", + "Origin of the differences in rotational support among early-type galaxies: The case of galaxies outside clusters" + ], + "pub_abstracts": [ + "In this paper, we address the challenge of segmenting global contaminants in large images. The precise delineation of such structures requires ample global context alongside understanding of textural patterns. CNNs specialise in the latter, though their ability to generate global features is limited. Attention measures long range dependencies in images, capturing global context, though at a large computational cost. We propose a gridded attention mechanism to address this limitation, greatly increasing efficiency by processing multi-scale features into smaller tiles. 
We also enhance the attention mechanism for increased sensitivity to texture orientation, by measuring correlations across features dependent on different orientations, in addition to channel and positional attention. We present results on a new dataset of astronomical images, where the task is segmenting large contaminating dust clouds.", + "We present an analysis of the spatial distribution of globular cluster (GC) systems of 118 nearby early-type galaxies in the Next Generation Virgo Cluster Survey (NGVS) and Mass Assembly of early-Type GaLAxies with their fine Structures (MATLAS) survey programs, which both used MegaCam on the Canada-France-Hawaii Telescope. We describe the procedure used to select GC candidates and fit the spatial distributions of GCs to a two-dimensional S\\'ersic function, which provides effective radii (half number radii) and S\\'ersic indices, and estimate background contamination by adding a constant term to the S'ersic function. In cases where a neighboring galaxy affects the estimation of the GC spatial distribution in the target galaxy, we fit two 2D S\\'ersic functions, simultaneously. We also investigate the color distributions of GCs in our sample by using Gaussian Mixture Modeling. For GC systems with bimodal color distributions, we divide the GCs into blue and red subgroups and fit their respective spatial distributions with S\\'ersic functions. Finally, we measure the total number of GCs based on our fitted S\\'ersic function, and calculate the GC specific frequency.", + "Early-type galaxies (ETGs, i.e. elliptical and lenticular galaxies) differ in their amount of rotational support -- some are purely supported by velocity dispersion, while others show pronounced ordered rotation. Cosmological hydrodynamical simulations show that the progenitors of all ETGs were first rotating quickly, but then mergers decreased their rotational support. 
In the presented work, we studied this process using an observational archaeological approach. Namely, we inspected the correlations of 23 merger-sensitive characteristics of local ETGs with a parameter quantifying the rotational support. We used a volume-limited sample of local ETGs, that are not in galaxy clusters, from the MATLAS survey. We found, for example, that slowly rotating galaxies have tidal features and kinematically distinct components more often and have lower metallicities. We sought for mutual interpretation of the correlations among all 23 quantities, together with literature results on high-redshift massive galaxies. There seems to be only one interpretation possible: on average, ETGs lose their rotational support through multiple minor wet mergers happening at the redshifts above about two.", + "A recent study of the distribution of dwarf galaxies in the MATLAS sample in galaxy groups revealed an excess of flattened satellite structures, reminiscent of the co-rotating planes of dwarf galaxies discovered in the local Universe. If confirmed, this lends credence to the plane-of-satellite problem and further challenges the standard model of hierarchical structure formation. However, with only photometric data and no confirmation of the satellite membership the study could not address the plane-of-satellite problem in full detail . Here we present spectroscopic follow-up observations of one of the most promising planes-of-satellite candidates in the MATLAS survey, the satellite system of NGC\\,474. Employing MUSE at the VLT and full spectrum fitting we studied 13 dwarf galaxy candidates and confirmed nine to be members of the field around NGC\\,474. Measuring the stellar populations of all observed galaxies, we find that the MATLAS dwarfs have lower metallicities than the Local Group dwarfs at a given luminosity. Two dwarf galaxies may form a pair of satellites based on their close projection and common velocity. 
Within the virial radius, we do not find a significant plane-of-satellites, however, there is a sub-population of six dwarf galaxies which seem to be anti-correlated in phase-space. Due to the low number of dwarf galaxies, this signal may arise by chance. With over 2000 dwarf galaxy candidates found in the MATLAS survey this remains an intriguing data set to study the plane-of-satellites problem in a statistical fashion once more follow-up observations have been conducted.", + "We study the size and structure of globular cluster (GC) systems of 118 early-type galaxies from the NGVS, MATLAS, and ACSVCS surveys. Fitting S\u00e9rsic profiles, we investigate the relationship between effective radii of GC systems (R e,gc) and galaxy properties. GC systems are 2\u20134 times more extended than host galaxies across the entire stellar mass range of our sample (108.3 M \u2299 < M * < 1011.6 M \u2299). The relationship between R e,gc and galaxy stellar mass exhibits a characteristic \u201cknee\u201d at a stellar mass of M p \u2243 1010.8, similar to the galaxy R e \u2013stellar mass relationship. We present a new characterization of the traditional blue and red GC color subpopulations, describing them with respect to host galaxy (g\u2032\u2212i\u2032) color (\u0394gi): GCs with similar colors to their hosts have a \u201cred\u201d \u0394gi, and those significantly bluer GCs have a \u201cblue\u201d \u0394gi. The GC populations with red \u0394gi, even in dwarf galaxies, are twice as extended as the stars, suggesting that formation or survival mechanisms favor the outer regions. We find a tight correlation between R e,gc and the total number of GCs, with intrinsic scatter \u22720.1 dex spanning two and three orders of magnitude in size and number, respectively. This holds for both red and blue subpopulations, albeit with different slopes. 
Assuming that N GC,Total correlates with M 200, we find that the red GC systems have effective radii of roughly 1%\u20135% R 200, while the blue GC systems in massive galaxies can have sizes as large as \u223c10% R 200. Environmental dependence on R e,gc is also found, with lower-density environments exhibiting more extended GC systems at fixed mass.", + "Context. Early-type galaxies (ETGs) are divided into slow and fast rotators (FRs and SRs) according to the degree of ordered rotation of their stellar populations. Cosmological hydrodynamical simulations indicate that galaxies are formed as FRs before their rotational support decreases, usually because of mergers. Aims. We aimed to investigate this process observationally for galaxies outside of clusters. Methods. We made use of the fact that different merger types leave different traces that have different lifetimes. We statistically analyzed multiple characteristics of galaxies that are expected to be influenced by mergers: tidal features, kinematically distinct cores, stellar age, etc. They were taken from the MATLAS and ATLAS 3D databases. We identified through multilinear regression the quantities that, at a fixed mass and environmental density of the galaxy, significantly correlate with a measure of the ordered rotation of the galaxy, $\\lambda_{R_e}^N$. Results. We found a negative correlation of the rotational support with the occurrence of tidal disturbances and kinematic substructures and a positive correlation with metallicity and metallicity gradients. For massive galaxies, the rotational support correlates negatively with the abundance of alpha elements, and for the galaxies in low-density environments, it correlates negatively with the central photometric cuspiness. These and additional literature observational constraints are explained the easiest if the mergers that decreased the rotational support of ETGs were typically minor, wet and happening at z \u2248 2. 
They did not form the currently observed tidal features. The observed frequency of tidal features implies a merging rate of 0.07-0.2 per Gyr. This is insufficient for explaining the observed growth of radii of ETGs with redshift by", + "The characterization of Low Surface Brightness (LSB) stellar structures around galaxies such as tidal debris of on-going or past collisions is essential to constrain models of galactic evolution. Our goal is to obtain quantitative measurements of LSB structures identified in deep images of samples consisting of hundreds of galaxies. We developed an online annotation tool that enables contributors to delineate the shapes of diffuse extended stellar structures, as well as artefacts or foreground structures. All parameters are automatically stored in a database which may be queried to retrieve quantitative measurements. We annotated LSB structures around 352 nearby massive galaxies with deep images obtained with the CFHT as part of two large programs: MATLAS and UNIONS/CFIS. Each LSB structure was delineated and labeled according to its likely nature: stellar shells, streams associated to a disrupted satellite, tails formed in major mergers, ghost reflections or cirrus. From our database containing 8441 annotations, the area, size, median surface brightness and distance to the host of 228 structures were computed. The results confirm the fact that tidal structures defined as streams are thinner than tails, as expected by numerical simulations. In addition, tidal tails appear to exhibit a higher surface brightness than streams (by about 1 mag), which may be related to different survival times for the two types of collisional debris. We did not detect any tidal feature fainter than 27.5 mag.arcsec$^{-2}$, while the nominal surface brightness limits of our surveys range between 28.3 and 29 mag.arcsec$^{-2}$, a difference that needs to be taken into account when estimating the sensitivity of future surveys to identify LSB structures. 
Our annotation database of observed LSB structures may be used for quantitative analysis and as a training set for machine learning algorithms (abbreviated).", + "Context: Early-type galaxies (ETGs) are divided into slow and fast rotators (FRs and SRs) according to the degree of ordered rotation of their stellar populations. Cosmological hydrodynamical simulations indicate that galaxies form as FRs before their rotational support decreases, usually because of mergers. Aims: We aimed to investigate this process observationally for galaxies outside of clusters. Methods: We made use of the fact that different merger types leave different traces that have different lifetimes. We statistically analyzed multiple characteristics of galaxies that are expected to be influenced by mergers, such as tidal features, kinematically distinct cores, and stellar ages. They were taken from the MATLAS and ATLAS$^\\mathrm{3D}$ databases. Through multilinear regression we identified the quantities that, at a fixed mass and environmental density of the galaxy, significantly correlate with a measure of the ordered rotation of the galaxy, $\\lambda_{R_e}^N$. Results: We found a negative correlation of the rotational support with the occurrence of tidal disturbances and kinematic substructures, and a positive correlation with metallicity and metallicity gradients. For massive galaxies, the rotational support correlates negatively with the abundance of alpha elements, and for the galaxies in low-density environments, it correlates negatively with the central photometric cuspiness. These and additional literature observational constraints are explained the easiest if the mergers that decreased the rotational support of ETGs were typically minor, wet, and happening at $z\\approx 2$. They did not form the currently observed tidal features. The observed frequency of tidal features implies a merging rate of 0.07-0.2 per Gyr. 
This is insufficient to explain the observed growth of the radii of ETGs with redshift by mergers." + ], + "domain": [ + "Astronomy", + "Image Segmentation", + "Machine Learning", + "Galaxy Dynamics" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "2ed0796e-b45f-48a8-ba58-22096e303231": { + "pk": "2ed0796e-b45f-48a8-ba58-22096e303231", + "name": "Pierre-Alain Duc", + "bio": "I am an astrophysicist specializing in the study of galaxy interactions, star formation, and the complex dynamics of the intergalactic medium. My recent research has focused on the intricate processes occurring in environments like Stephan's Quintet and tidal dwarf galaxies, where I combine advanced imaging from the James Webb Space Telescope and Hubble Space Telescope with spectroscopy from the Atacama Large Millimeter Array. \n\nThrough my work, I have explored the turbulent multiphase intergalactic medium, revealing how massive clouds of gas interact and evolve under extreme conditions. I have also investigated the star formation rates and gas dynamics in tidal dwarf galaxies, demonstrating their unique behaviors compared to regular galaxies. My studies extend to dwarf galaxies beyond the Local Volume, where I analyze their stellar populations and metallicity relations, contributing to our understanding of galaxy formation and evolution across different environments.\n\nI am particularly interested in the role of tidal features and ram pressure stripping in shaping galaxy morphology and star formation activity. My findings highlight the diverse mechanisms driving these processes, from minor mergers to the effects of environmental density. I aim to bridge observational data with theoretical models, providing insights into the complex histories of galaxies and their interactions. 
As I continue my research, I look forward to leveraging new observational technologies to deepen our understanding of the universe's structure and evolution.", + "collaborators": [ + "Sungsoon Lim", + "J. Cuillandre", + "J. Fensch", + "F. Marleau", + "M. Poulain", + "R. Habas", + "P. Durrell", + "S. Gwyn", + "M. B'ilek", + "Rub'en S'anchez-Janssen", + "S. Paudel", + "R. S\u00e1nchez-Janssen", + "Elisabeth Sola", + "A. McConnachie", + "P. Appleton", + "K. Voggel", + "U. Lisenfeld", + "N. Heesters", + "F. Renaud", + "S. Brough", + "J. Carlin", + "N. E. Chisari", + "R. Gavazzi", + "R. Ibata", + "C. Laigle", + "M. Montes", + "J. Rom'an", + "A. Watkins", + "A. Wright", + "I. Yoon", + "J. Klehammer", + "R. Carlberg", + "O. Mueller", + "P. Guillard", + "B. Emonts", + "F. Boulanger", + "A. Togi", + "W. Reach", + "K. Alatalo", + "M. Cluver", + "T. Diaz Santos", + "S. Gallagher", + "P. Ogle", + "E. O\u2019Sullivan", + "C. Xu", + "Navyasree Kovakkuni", + "F. Lelli", + "M. Boquien", + "J. Braine", + "E. Brinks", + "V. Charmandaris", + "Francoise Combes", + "S. McGaugh", + "J. Mihos", + "M. Pawlowski", + "Y. Revaz", + "P. Weilbacher", + "O. Muller", + "Oliver M\u00fcller", + "D. N. Chhatkuli", + "Suk-Jin Yoon", + "R. Rakhi", + "Geethika Santhosh", + "Prajwel Joseph", + "K. George", + "S. Subramanian", + "Indulekha Kavila", + "J. Postma", + "P. C\u00f4t\u00e9", + "L. Cortese", + "S. Ghosh", + "A. Subramaniam", + "S. Tandon", + "J. Hutchings", + "P Samuel Wesley", + "Aditya Bharadwaj", + "Neeran Niroula", + "Cheng Cheng", + "Cong K. Xu", + "N. Tang", + "Y. Dai", + "J.-S. Huang", + "Chuan He", + "H. Feng", + "L. Guy", + "E. Bachelet", + "M. Banerji", + "F. Bauer", + "T. Collett", + "C. Conselice", + "S. Eggl", + "A. Ferguson", + "A. Fontana", + "C. Heymans", + "I. Hook", + "'Eric Aubourg", + "H. Aussel", + "J. Bosch", + "B. Carry", + "H. 
Hoekstra" + ], + "pub_titles": [ + "Multiphase Gas Interactions on Subarcsec Scales in the Shocked Intergalactic Medium of Stephan\u2019s Quintet with JWST and ALMA", + "Molecular and Ionized Gas in Tidal Dwarf Galaxies: The Spatially Resolved Star-Formation Relation", + "Radial velocities and stellar population properties of 56 MATLAS dwarf galaxies observed with MUSE", + "The creation of a massive UCD by tidal threshing from NGC\u00a0936", + "UVIT view of NGC 5291: Ongoing star formation in tidal dwarf galaxies at \u223c 0.35\u00a0kpc resolution", + "Deep H i Mapping of Stephan\u2019s Quintet and Its Neighborhood", + "Rubin-Euclid Derived Data Products: Initial Recommendations", + "Preparing for low surface brightness science with the Vera C. Rubin Observatory: characterisation of tidal features from mock images", + "From starburst to quenching: merger-driven evolution of the star formation regimes in a shell galaxy", + "The interacting pair of galaxies Arp 82: Integral field spectroscopy and numerical simulations", + "The Next Generation Virgo Cluster Survey. XXXIII. 
Stellar Population Gradients in the Virgo Cluster Core Globular Cluster System", + "When and how did early-type galaxies outside of galaxy clusters lose their rotational support?", + "Constraints on galaxy formation from the cosmic-far-infrared-background\u00a0\u2212\u00a0optical-imaging cross-correlation using Herschel and UNIONS", + "Characterization of low surface brightness structures in annotated deep images", + "Origin of the differences in rotational support among early-type galaxies: The case of galaxies outside clusters", + "Origin of the spectacular tidal shells of galaxy NGC474", + "HI observations of the MATLAS dwarf and ultra-diffuse galaxies", + "Structure and morphology of the MATLAS dwarf galaxies and their central nuclei", + "Ram Pressure Candidates in UNIONS" + ], + "pub_abstracts": [ + "We combine James Webb Space Telescope (JWST) and Hubble Space Telescope imaging with Atacama Large Millimeter Array CO(2\u20131) spectroscopy to study the highly turbulent multiphase intergalactic medium (IGM) in Stephan\u2019s Quintet on 25\u2013150 pc scales. Previous Spitzer observations revealed luminous H2 line cooling across a 45 kpc-long filament, created by a giant shock wave, following the collision with an intruder galaxy, NGC 7318b. We demonstrate that the Mid-Infrared Instrument/F1000W/F770W filters are dominated by 0\u20130 S(3) H2 and a combination of polycyclic aromatic hydrocarbon and 0\u20130 S(5) H2 emission. These observations reveal the dissipation of kinetic energy as massive clouds experience collisions, interactions, and likely destruction/recycling within different phases of the IGM. In 1 kpc-scaled structure, warm H2 was seen to form a triangular-shaped head and tail of compressed and stripped gas behind a narrow shell of cold H2. In another region, two cold molecular clumps with very different velocities are connected by an arrow-shaped stream of warm, probably shocked, H2 suggesting a cloud\u2013cloud collision is occurring. 
In both regions, a high warm-to-cold molecular gas fraction indicates that the cold clouds are being disrupted and converted into warm gas. We also map gas associated with an apparently forming dwarf galaxy. We suggest that the primary mechanism for exciting strong mid-IR H2 lines throughout Stephan\u2019s Quintet is through a fog of warm gas created by the shattering of denser cold molecular clouds and mixing/recycling in the post-shocked gas. A full picture of the diverse kinematics and excitation of the warm H2 will require future JWST mid-IR spectroscopy. The current observations reveal the rich variety of ways that different gas phases can interact with one another.", + "Tidal dwarf galaxies (TDGs) are low-mass objects that form within tidal and/or collisional debris ejected from more massive interacting galaxies. We use CO($1-0$) observations from ALMA and integral-field spectroscopy from MUSE to study molecular and ionized gas in three TDGs: two around the collisional galaxy NGC 5291 and one in the late-stage merger NGC 7252. The CO and H$\\alpha$ emission is more compact than the HI emission and displaced from the HI dynamical center, so these gas phases cannot be used to study the internal dynamics of TDGs. We use CO, HI, and H$\\alpha$ data to measure the surface densities of molecular gas ($\\Sigma_{\\rm mol}$), atomic gas ($\\Sigma_{\\rm atom}$) and star-formation rate ($\\Sigma_{\\rm SFR}$), respectively. We confirm that TDGs follow the same spatially integrated $\\Sigma_{\\rm SFR}-\\Sigma_{\\rm gas}$ relation of regular galaxies, where $\\Sigma_{\\rm gas} = \\Sigma_{\\rm mol} + \\Sigma_{\\rm atom}$, even though they are HI dominated. We find a more complex behaviour in terms of the spatially resolved $\\Sigma_{\\rm SFR}-\\Sigma_{\\rm mol}$ relation on sub-kpc scales. 
The majority ($\\sim$60$\\%$) of SF regions in TDGs lie on the same $\\Sigma_{\\rm SFR}-\\Sigma_{\\rm mol}$ relation of normal spiral galaxies but show a higher dispersion around the mean. The remaining fraction of SF regions ($\\sim$40$\\%$) lie in the starburst region and are associated with the formation of massive super star clusters, as shown by Hubble Space Telescope images. We conclude that the local SF activity in TDGs proceeds in a hybrid fashion, with some regions comparable to normal spiral galaxies and others to extreme starbursts.", + "Dwarf galaxies have been extensively studied in the Local Group, in nearby groups, and selected clusters, giving us a robust picture of their global stellar and dynamical properties in particular locations in the Universe. Intense study of these properties has revealed correlations between them, including the well known universal stellar mass-metallicity relation. However, since dwarfs play a role in a vast range of different environments, much can be learned about galaxy formation and evolution through extending the study of these objects to various locations. We present MUSE spectroscopy of a sample of 56 dwarf galaxies as a follow-up to the MATLAS survey in low-to-moderate density environments beyond the Local Volume. The dwarfs have stellar masses in the range of $M_{*}/M_{\\odot}$ = 10$^{6.1}$-10$^{9.4}$ and show a distance range of D = 14-148 Mpc, the majority (75%) of which are located in the range targeted by the MATLAS survey (10-45 Mpc). We thus report a 75% (79% for dwarf ellipticals) success rate for the semi-automatic identification of dwarf galaxies in the MATLAS survey on the here presented subsample. Using pPXF full spectrum fitting, we determine their line-of-sight velocity and can match the majority of them with their massive host galaxy. Close inspection of their spectra reveals that ~30% show clear emission lines and thus star formation activity. 
We estimate their stellar population properties (age and metallicity) and compare our results with other works investigating Local Volume and cluster dwarf galaxies. We find that the dwarf galaxies presented in this work show a systematic offset from the stellar mass-metallicity relation towards lower metallicities at the same stellar mass. A similar deviation is present in other works in the stellar mass range probed in this work and might be attributed to the use of different methodologies for deriving the metallicity.", + " We study a compact nucleus embedded in an early-type dwarf galaxy, MATLAS-167, which is in the process of disruption by the tidal force of the neighboring giant S0 galaxy, NGC\u00a0936, in a group environment. Using the imaging data of the MATLAS survey, we analyze the stellar tidal tail of MATLAS-167 and its central compact nucleus, designated as NGC\u00a0936_UCD. We find that NGC\u00a0936_UCD has a luminosity of Mg = \u221211.43 \u00b10.01 mag and a size of 66.5\u00b117\u00a0pc, sharing the global properties of Ultra Compact Dwarf galaxies (UCDs) but significantly larger and brighter compared to the typical UCD populations observed in the Virgo cluster. By integrating the total luminosity of both the tidal stream and MATLAS-167, we estimate that the disrupted dwarf progenitor possesses a luminosity of Mg = \u221215.92 \u00b10.06 mag, a typical bright dE luminosity. With the help of the optical spectrum observed by the SDSS survey, we derive the simple stellar population properties of NGC\u00a0936_UCD: a light-weighted age of 5.6\u00b10.7\u00a0Gyr and metallicity of [Z/H] = \u22120.83 \u00b10.3 dex. Our findings suggest that tidal threshing is a possible formation mechanism of bright UCD populations in close proximity to giant galaxies.", + " NGC 5291, an early-type galaxy surrounded by a giant H\u00a0i ring, is believed to be formed from collision with another galaxy. 
Several star forming complexes and tidal dwarf galaxies are distributed along the collisional ring which are sites of star formation in environments where extreme dynamical effects are involved. Dynamical effects can affect the star formation properties and the spatial distribution of star forming complexes along the tidal features. To study and quantify the star formation activity in the main body and in the ring structure of the NGC 5291 system, we use high spatial resolution FUV and NUV imaging observations from the Ultraviolet Imaging Telescope onboard AstroSat. A total of 57 star-forming knots are identified to be part of this interacting system out of which 12 are new detections (star forming complexes that lie inside the H\u00a0i contour) compared to the previous measurements from lower resolution UV imaging. We estimate the attenuation in UV for each of the resolved star-forming knots using the UV spectral slope \u03b2, derived from the FUV \u2212 NUV colour. Using the extinction corrected UV fluxes, we derive the star formation rate of the resolved star forming complexes. The extinction corrected total star formation rate of this system is estimated as 1.75 \u00b1 0.04\u00a0M\u2299\u2009yr\u22121. The comparison with dwarf galaxy populations (BCD, Sm and dIm galaxies) in the nearby Universe shows that many of the knots in the NGC 5291 system have SFR values comparable to the SFR of BCD galaxies.", + "We carried out deep mapping observations of the atomic hydrogen (H i) 21 cm line emission in a field centered on the famous galaxy group Stephan's Quintet (SQ), using the Five-hundred-meter Aperture Spherical Telescope (FAST) equipped with a 19-beam receiver. The final data cube reaches an H i column density sensitivity of 5\u03c3 = 2.1 \u00d7 1017 cm\u22122 per 20 km s\u22121 channel with an angular resolution of 4.\u20320. 
The discovery of a large diffuse feature of the H i emission in the outskirts of the intragroup medium of SQ was reported in a previous paper (Xu et al.). Here we present a new study of the total H i emission of SQ and the detection of several neighboring galaxies, exploiting the high sensitivity and the large sky coverage of the FAST observations. A total H i mass of M H I = 3.48 \u00b1 0.35 \u00d7 1010 M \u2609 is found for SQ, which is significantly higher than previous measurements in the literature. This indicates that, contrary to earlier claims, SQ is not H i deficient. The excessive H i gas is mainly found in the velocity ranges of 6200\u20136400 km s\u22121 and 6800\u20137000 km s\u22121, which were undetected in previous observations that are less sensitive than ours. Our results suggest that the \u201cmissing H i\u201d in compact groups may be hidden in the low-density diffuse neutral gas instead of in the ionized gas.", + "This report is the result of a joint discussion between the Rubin and Euclid scientific communities. The work presented in this report was focused on designing and recommending an initial set of Derived Data products (DDPs) that could realize the science goals enabled by joint processing. All interested Rubin and Euclid data rights holders were invited to contribute via an online discussion forum and a series of virtual meetings. Strong interest in enhancing science with joint DDPs emerged from across a wide range of astrophysical domains: Solar System, the Galaxy, the Local Volume, from the nearby to the primaeval Universe, and cosmology.", + " Tidal features in the outskirts of galaxies yield unique information about their past interactions and are a key prediction of the hierarchical structure formation paradigm. The Vera C. 
Rubin Observatory is poised to deliver deep observations for potentially of millions of objects with visible tidal features, but the inference of galaxy interaction histories from such features is not straightforward. Utilising automated techniques and human visual classification in conjunction with realistic mock images produced using the NewHorizon cosmological simulation, we investigate the nature, frequency and visibility of tidal features and debris across a range of environments and stellar masses. In our simulated sample, around 80 per cent of the flux in the tidal features around Milky Way or greater mass galaxies is detected at the 10-year depth of the Legacy Survey of Space and Time (30 \u2212 31 mag arcsec\u22122), falling to 60 per cent assuming a shallower final depth of 29.5 mag arcsec\u22122. The fraction of total flux found in tidal features increases towards higher masses, rising to 10 per cent for the most massive objects in our sample (M\u22c6 \u223c 1011.5\u00a0M\u2299). When observed at sufficient depth, such objects frequently exhibit many distinct tidal features with complex shapes. The interpretation and characterisation of such features varies significantly with image depth and object orientation, introducing significant biases in their classification. Assuming the data reduction pipeline is properly optimised, we expect the Rubin Observatory to be capable of recovering much of the flux found in the outskirts of Milky Way mass galaxies, even at intermediate redshifts (z < 0.2).", + " Shell galaxies make a class of tidally distorted galaxies, characterised by wide concentric arc(s), extending out to large galactocentric distances with sharp outer edges. Recent observations of young massive star clusters in the prominent outer shell of NGC 474 suggest that such systems host extreme conditions of star formation. In this paper, we present a hydrodynamic simulation of a galaxy merger and its transformation into a shell galaxy. 
We analyse how the star formation activity evolves with time, location-wise within the system, and what are the physical conditions for star formation. During the interaction, an excess of dense gas appears, triggering a starburst, i.e. an enhanced star formation rate and a reduced depletion time. Star formation coincides with regions of high molecular gas fraction, such as the galactic nucleus, spiral arms, and occasionally the tidal debris during the early stages of the merger. Tidal interactions scatter stars into a stellar spheroid, while the gas cools down and reforms a disc. The morphological transformation after coalescence stabilises the gas and thus quenches star formation, without the need for feedback from an active galactic nucleus. This evolution shows similarities with a compaction scenario for compact quenched spheroids at high-redshift, yet without a long red nugget phase. Shells appear after coalescence, during the quenched phase, implying that they do not host the conditions necessary for in\u00a0situ star formation. The results suggest that shell-forming mergers might be part of the process of turning blue late-type galaxies into red and dead early-types.", + " Spectral data cubes of the interacting pair of galaxies NGC 2535 and NGC 2536 (the Arp 82 system) targeting bright emission lines in the visible band, obtained with the imaging Fourier transform spectrometer (iFTS) SITELLE attached to the Canada-France-Hawaii Telescope (CFHT), are presented. Analysis of H\u03b1 velocity maps reveals a bar in $\\rm NGC\\, 2536$. In $\\rm NGC\\, 2535$, we find strong non-circular motions outside the ocular ring, in the elliptical arc and tidal tails of $\\rm NGC\\, 2535$ and a misalignment between the kinematic and photometric position angles. We detect 155 HII region complexes in the interacting pair of galaxies and determine oxygen abundances for 66 of them using different calibrators. 
We find, regardless of the indicator used, that the oxygen abundance distribution in $\\rm NGC\\, 2536$ is shallow whereas, in $\\rm NGC\\, 2535$, it is best fitted by two slopes, the break occurring beyond the ocular ring. The inner slope is comparable to the one observed in isolated normal star-forming galaxies but the outer slope is shallow. We present a numerical simulation of the interaction that reproduces the observed tidal features, kinematics, and metallicity distribution, to investigate the effect of the interaction on the galaxies. The model indicates that the galaxies have undergone a close encounter, strongly prograde for the primary, and are half way in their course to a second close encounter.", + "We present a study of the stellar populations of globular clusters (GCs) in the Virgo Cluster core with a homogeneous spectroscopic catalog of 692 GCs within a major-axis distance R maj = 840 kpc from M87. We investigate radial and azimuthal variations in the mean age, total metallicity, [Fe/H], and \u03b1-element abundance of blue (metal-poor) and red (metal-rich) GCs using their co-added spectra. We find that the blue GCs have a steep radial gradient in [Z/H] within R maj = 165 kpc, with roughly equal contributions from [Fe/H] and [\u03b1/Fe], and flat gradients beyond. By contrast, the red GCs show a much shallower gradient in [Z/H], which is entirely driven by [Fe/H]. We use GC-tagged Illustris simulations to demonstrate an accretion scenario where more massive satellites (with more metal- and \u03b1-rich GCs) sink further into the central galaxy than less massive ones, and where the gradient flattening occurs because of the low GC occupation fraction of low-mass dwarfs disrupted at larger distances. The dense environment around M87 may also cause the steep [\u03b1/Fe] gradient of the blue GCs, mirroring what is seen in the dwarf galaxy population. 
The progenitors of red GCs have a narrower mass range than those of blue GCs, which makes their gradients shallower. We also explore spatial inhomogeneity in GC abundances, finding that the red GCs to the northwest of M87 are slightly more metal-rich. Future observations of GC stellar population gradients will be useful diagnostics of halo merger histories.", + "Context. Early-type galaxies (ETGs) are divided into slow and fast rotators (FRs and SRs) according to the degree of ordered rotation of their stellar populations. Cosmological hydrodynamical simulations indicate that galaxies are formed as FRs before their rotational support decreases, usually because of mergers. Aims. We aimed to investigate this process observationally for galaxies outside of clusters. Methods. We made use of the fact that di \ufb00 erent merger types leave di \ufb00 erent traces that have di \ufb00 erent lifetimes. We statistically analyzed multiple characteristics of galaxies that are expected to be in\ufb02uenced by mergers: tidal features, kinematically distinct cores, stellar age, etc. They were taken from the MATLAS and ATLAS 3D databases. We identi\ufb01ed through multilinear regression the quantities that, at a \ufb01xed mass and environmental density of the galaxy, signi\ufb01cantly correlate with a measure of the ordered rotation of the galaxy, \u03bb N R e . Results. We found a negative correlation of the rotational support with the occurrence of tidal disturbances and kinematic substructures and a positive correlation with metallicity and metallicity gradients. For massive galaxies, the rotational support correlates negatively with the abundance of alpha elements, and for the galaxies in low-density environments, it correlates negatively with the central photometric cuspiness. These and additional literature observational constraints are explained the easiest if the mergers that decreased the rotational support of ETGs were typically minor, wet and happening at z \u2248 2. 
They did not form the currently observed tidal features. The observed frequency of tidal features implies a merging rate of 0.07-0.2 per Gyr. This is insu \ufb03 cient for explaining the observed growth of radii of ETGs with redshift by", + " Using Herschel-SPIRE imaging and the Canada-France Imaging Survey (CFIS) Low Surface Brightness data products from the Ultraviolet Near-Infrared Optical Northern Survey (UNIONS), we present a cross-correlation between the cosmic far-infrared background and cosmic optical background fluctuations. The cross-spectrum is measured for two cases: all galaxies are kept in the images; or all individually-detected galaxies are masked to produce \u2018background\u2019 maps. We report the detection of the cross-correlation signal at \u2273 18\u2009\u03c3 (\u2273 14\u2009\u03c3 for the background map). The part of the optical brightness variations that are correlated with the submm emission translates to an rms brightness of \u2243 32.5\u2009mag\u2009arcsec\u22122 in the r band, a level normally unreachable for individual sources. A critical issue is determining what fraction of the cross-power spectrum might be caused by emission from Galactic cirrus. For one of the fields, the Galactic contamination is 10\u00a0times higher than the extragalactic signal; however, for the other fields, the contamination is around 20\u00a0per cent. An additional discriminant is that the cross-power spectrum is of the approximate form P(k)\u221d1/k, much shallower than that of Galactic cirrus. We interpret the results in a halo-model framework, which shows good agreement with independent measurements for the scalings of star-formation rates in galaxies. 
The approach presented in this study holds great promise for future surveys such as FYST/CCAT-prime combined with Euclid or the Vera Rubin Observatory (LSST), which will enable a detailed exploration of the evolution of star formation in galaxies.", + "The characterization of Low Surface Brightness (LSB) stellar structures around galaxies such as tidal debris of on-going or past collisions is essential to constrain models of galactic evolution. Our goal is to obtain quantitative measurements of LSB structures identified in deep images of samples consisting of hundreds of galaxies. We developed an online annotation tool that enables contributors to delineate the shapes of diffuse extended stellar structures, as well as artefacts or foreground structures. All parameters are automatically stored in a database which may be queried to retrieve quantitative measurements. We annotated LSB structures around 352 nearby massive galaxies with deep images obtained with the CFHT as part of two large programs: MATLAS and UNIONS/CFIS. Each LSB structure was delineated and labeled according to its likely nature: stellar shells, streams associated to a disrupted satellite, tails formed in major mergers, ghost reflections or cirrus. From our database containing 8441 annotations, the area, size, median surface brightness and distance to the host of 228 structures were computed. The results confirm the fact that tidal structures defined as streams are thinner than tails, as expected by numerical simulations. In addition, tidal tails appear to exhibit a higher surface brightness than streams (by about 1 mag), which may be related to different survival times for the two types of collisional debris. 
We did not detect any tidal feature fainter than 27.5 mag.arcsec$^{-2}$, while the nominal surface brightness limits of our surveys range between 28.3 and 29 mag.arcsec$^{-2}$, a difference that needs to be taken into account when estimating the sensitivity of future surveys to identify LSB structures. Our annotation database of observed LSB structures may be used for quantitative analysis and as a training set for machine learning algorithms (abbreviated).", + "Context: Early-type galaxies (ETGs) are divided into slow and fast rotators (FRs and SRs) according to the degree of ordered rotation of their stellar populations. Cosmological hydrodynamical simulations indicate that galaxies form as FRs before their rotational support decreases, usually because of mergers. Aims: We aimed to investigate this process observationally for galaxies outside of clusters. Methods: We made use of the fact that different merger types leave different traces that have different lifetimes. We statistically analyzed multiple characteristics of galaxies that are expected to be influenced by mergers, such as tidal features, kinematically distinct cores, and stellar ages. They were taken from the MATLAS and ATLAS$^\\mathrm{3D}$ databases. Through multilinear regression we identified the quantities that, at a fixed mass and environmental density of the galaxy, significantly correlate with a measure of the ordered rotation of the galaxy, $\\lambda_{R_e}^N$. Results: We found a negative correlation of the rotational support with the occurrence of tidal disturbances and kinematic substructures, and a positive correlation with metallicity and metallicity gradients. For massive galaxies, the rotational support correlates negatively with the abundance of alpha elements, and for the galaxies in low-density environments, it correlates negatively with the central photometric cuspiness. 
These and additional literature observational constraints are explained the easiest if the mergers that decreased the rotational support of ETGs were typically minor, wet, and happening at $z\\approx 2$. They did not form the currently observed tidal features. The observed frequency of tidal features implies a merging rate of 0.07-0.2 per Gyr. This is insufficient to explain the observed growth of the radii of ETGs with redshift by mergers.", + "Context. The lenticular galaxy NGC474 hosts a rich system of tidal shells and streams, some of which are exceptionally bright. Two teams recently presented spectroscopic observations of the brightest shells. These were the \ufb01rst shell spectra ever observed in integrated starlight. The authors studied the stellar populations of the shell, of the center of the galaxy, and of its globular clusters. The precise formation scenario for the tidal features of this prominent galaxy still remained unclear, however. Aims. Here, we add further clues on their formation from the radii of the shells, and we present a scenario for the formation of the tidal features that seems to be unique and can explain all available data. Methods. Shell radii were analyzed with the shell identi\ufb01cation method, and we ran self-consistent simulations of the formation of the tidal features. We considered Newtonian as well as MOND gravity. Results. Observations suggest that the tidal features originate from the accretion of a spiral galaxy. According to the shell identi\ufb01cation method, the merging galaxies \ufb01rst collided 1.3Gyr ago and then again 0.9Gyr ago, thereby forming the shells in two generations. This would also explain the young ages of stellar populations in the center of the galaxy and the young age of the globular clusters. The analytic models of shell propagation that underlie the shell identi\ufb01cation method are veri\ufb01ed by a simulation. The simulations reproduce the observed morphology of the tidal features well. 
The accreted spiral likely reached NGC474 on the plane of the sky nearly radially from the south, its rotation axis pointing toward us. It probably had a stellar mass of about one-sixth of NGC474, that is, 10 9 . 8 M (cid:12) . Apparently, all tidal features in the galaxy originate from one merger.", + "The presence of HI gas in galaxies is inextricably linked to their morphology and evolution. This paper aims to understand the HI content of the already identi\ufb01ed 2210 dwarfs located in the low-to-moderate density environments of the Mass Assembly of early-Type GaLAxies with their \ufb01ne Structures (MATLAS) deep imaging survey. We combined the HI observations from the ATLAS 3D survey, with the extragalactic HI sources from the Arecibo Legacy Fast ALFA survey, to extract the HI line width, velocity, and mass of the MATLAS dwarfs. From the 1773 dwarfs in our sample with available HI observations, 8% (145) have an HI line detection. The majority of the dwarfs show an irregular morphology, while 29% (42) are ellipticals, which is the largest sample of HI-bearing dwarf ellipticals (dEs) to date. Of the HI dwarf sample, 2% (three) are ultra-di \ufb00 use galaxies (UDGs), 12% have a transition-type morphology, 5% are tidal dwarf candidates, and 10% appear to be disrupted objects. In our optically selected sample, 9.5% of the dEs, 7% of the UDGs, and 10% of the classical dwarfs are HI-bearing. The HI-bearing dwarfs have, on average, bluer colors than the dwarfs without detected HI. We \ufb01nd relations between the stellar and HI masses, gas fraction, color, and absolute magnitude to be consistent with previous studies of dwarfs probing similar masses and environments. For 79% of the dwarfs identi\ufb01ed as satellites of massive early-type galaxies, we \ufb01nd that the HI mass increases with the projected distance to the host. 
Using the HI line width, we estimate dynamical masses and \ufb01nd that 5% (seven) of the dwarfs are dark matter de\ufb01cient.", + " We present a photometric study of the dwarf galaxy population in the low to moderate density environments of the MATLAS (Mass Assembly of early-Type gaLAxies with their fine Structures) deep imaging survey. The sample consists of 2210 dwarfs, including 508 nucleated. We define a nucleus as a compact source that is close to the galaxy photocentre (within 0.5 $R_\\mathrm{ e}$) which is also the brightest such source within the galaxy\u2019s effective radius. The morphological analysis is performed using a 2D surface brightness profile modelling on the g-band images of both the galaxies and nuclei. Our study reveals that, for similar luminosities, the MATLAS dwarfs show ranges in the distribution of structural properties comparable to cluster (Virgo and Fornax) dwarfs and a range of sizes comparable to the Local Group and Local Volume dwarfs. Colour measurements using the r- and i-band images indicate that the dwarfs in low and moderate density environments are as red as cluster dwarfs on average. The observed similarities between dwarf ellipticals in vastly different environments imply that dEs are not uniquely the product of morphological transformation due to ram-pressure stripping and galaxy harassment in high density environments. We measure that the dwarf nuclei are located predominantly in massive, bright and round dwarfs and observe fewer nuclei in dwarfs with a faint centre and a small size. The colour of the galaxy nucleus shows no clear relation to the colour of the dwarf, in agreement with the migration and wet migration nucleus formation scenarios. The catalogues of the MATLAS dwarfs photometric and structural properties are provided.", + "We present a search for disturbed, candidate ram pressure stripping galaxies across more than 50 spectroscopically selected SDSS groups and clusters. 
Forty-eight ram pressure candidates are visually identified in these systems using high quality UNIONS imaging from the Canada-France Hawaii Telescope, covering \u223c6200 deg2 and \u223c2800 deg2 in the uand r-bands respectively. Ram pressure candidates are found in groups and clusters spanning a wide range in halo mass and include \u223c30 ram pressure candidates in the group regime (Mh < 1014). The observed frequency of ram pressure candidates shows substantial scatter with group/cluster mass, but on average is larger in clusters (Mh \u2265 1014M ) than groups (Mh < 1014M ) by a factor of \u223c2. We find that ram pressure candidates are most commonly low-mass galaxies and have enhanced star formation rates relative to star-forming field galaxies. The enhancement in star formation is largely independent of galaxy mass and strongest for galaxies in clusters. As a result of the large survey footprint and excellent image quality from UNIONS, we are able to identify disturbed galaxies, potentially affected by ram pressure stripping, across a wide range of host environment." + ], + "domain": [ + "Astrophysics", + "Galaxy Formation", + "Intergalactic Medium", + "Dwarf Galaxies" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "601d7dd2-5e26-460f-a329-a9fd23ea1d42": { + "pk": "601d7dd2-5e26-460f-a329-a9fd23ea1d42", + "name": "Albert Gu", + "bio": "I am a researcher deeply engaged in the intersection of large-scale sequence modeling and biological applications, particularly in genomics. My recent work has led to the development of innovative architectures like Caduceus, which addresses the unique challenges of modeling genomic sequences, such as long-range interactions and reverse complementarity. 
Caduceus stands out as the first family of RC equivariant bi-directional long-range DNA language models, achieving remarkable performance on downstream tasks, even surpassing much larger models.\n\nIn addition to my work in genomics, I have explored the potential of state-space models (SSMs) as alternatives to traditional Transformer architectures. My research has revealed the close relationship between SSMs and Transformers, leading to the creation of Mamba-2, a refined architecture that is both faster and competitive in language modeling tasks. Through extensive comparisons, I have demonstrated that while SSMs excel in many areas, hybrid models combining SSMs and attention mechanisms can outperform Transformers, particularly in long-context reasoning tasks.\n\nI am passionate about advancing our understanding of sequence models through a unifying matrix mixer framework, which has allowed me to develop new sub-quadratic sequence models like Hydra. This model not only enhances performance on non-causal tasks but also serves as a drop-in replacement for attention layers, showcasing the potential of structured matrix approaches in sequence modeling. My work aims to push the boundaries of what is possible in both natural language processing and biological sequence analysis, contributing to the broader field of machine learning.", + "collaborators": [ + "Tri Dao", + "Yair Schiff", + "Chia-Hsiang Kao", + "Aaron Gokaslan", + "Volodymyr Kuleshov", + "Aviv Bick", + "Kevin Y. Li", + "Eric P. Xing", + "J. Z. Kolter", + "R. Waleffe", + "Wonmin Byeon", + "Duncan Riach", + "Brandon Norick", + "V. 
Korthikanti", + "Ali Hatamizadeh", + "Sudhakar Singh", + "Deepak Narayanan", + "Garvit Kulshreshtha", + "Vartika Singh", + "Jared Casper", + "Jan Kautz", + "Mohammad Shoeybi", + "Bryan Catanzaro", + "Sukjun Hwang", + "Aakash Lahoti" + ], + "pub_titles": [ + "Caduceus: Bi-Directional Equivariant Long-Range DNA Sequence Modeling", + "Transformers to SSMs: Distilling Quadratic Knowledge to Subquadratic Models", + "Transformers are SSMs: Generalized Models and Efficient Algorithms Through Structured State Space Duality", + "An Empirical Study of Mamba-based Language Models", + "Hydra: Bidirectional State Space Models Through Generalized Matrix Mixers" + ], + "pub_abstracts": [ + "Large-scale sequence modeling has sparked rapid advances that now extend into biology and genomics. However, modeling genomic sequences introduces challenges such as the need to model long-range token interactions, the effects of upstream and downstream regions of the genome, and the reverse complementarity (RC) of DNA. Here, we propose an architecture motivated by these challenges that builds off the long-range Mamba block, and extends it to a BiMamba component that supports bi-directionality, and to a MambaDNA block that additionally supports RC equivariance. We use MambaDNA as the basis of Caduceus, the first family of RC equivariant bi-directional long-range DNA language models, and we introduce pre-training and fine-tuning strategies that yield Caduceus DNA foundation models. Caduceus outperforms previous long-range models on downstream benchmarks; on a challenging long-range variant effect prediction task, Caduceus exceeds the performance of 10x larger models that do not leverage bi-directionality or equivariance.", + "Transformer architectures have become a dominant paradigm for domains like language modeling but suffer in many inference settings due to their quadratic-time self-attention. 
Recently proposed subquadratic architectures, such as Mamba, have shown promise, but have been pretrained with substantially less computational resources than the strongest Transformer models. In this work, we present a method that is able to distill a pretrained Transformer architecture into alternative architectures such as state space models (SSMs). The key idea to our approach is that we can view both Transformers and SSMs as applying different forms of mixing matrices over the token sequences. We can thus progressively distill the Transformer architecture by matching different degrees of granularity in the SSM: first matching the mixing matrices themselves, then the hidden units at each block, and finally the end-to-end predictions. Our method, called MOHAWK, is able to distill a Mamba-2 variant based on the Phi-1.5 architecture (Phi-Mamba) using only 3B tokens and a hybrid version (Hybrid Phi-Mamba) using 5B tokens. Despite using less than 1% of the training data typically used to train models from scratch, Phi-Mamba boasts substantially stronger performance compared to all past open-source non-Transformer models. MOHAWK allows models like SSMs to leverage computational resources invested in training Transformer-based architectures, highlighting a new avenue for building such models.", + "While Transformers have been the main architecture behind deep learning's success in language modeling, state-space models (SSMs) such as Mamba have recently been shown to match or outperform Transformers at small to medium scale. We show that these families of models are actually quite closely related, and develop a rich framework of theoretical connections between SSMs and variants of attention, connected through various decompositions of a well-studied class of structured semiseparable matrices. 
Our state space duality (SSD) framework allows us to design a new architecture (Mamba-2) whose core layer is a refinement of Mamba's selective SSM that is 2-8X faster, while continuing to be competitive with Transformers on language modeling.", + "Selective state-space models (SSMs) like Mamba overcome some of the shortcomings of Transformers, such as quadratic computational complexity with sequence length and large inference-time memory requirements from the key-value cache. Moreover, recent studies have shown that SSMs can match or exceed the language modeling capabilities of Transformers, making them an attractive alternative. In a controlled setting (e.g., same data), however, studies so far have only presented small scale experiments comparing SSMs to Transformers. To understand the strengths and weaknesses of these architectures at larger scales, we present a direct comparison between 8B-parameter Mamba, Mamba-2, and Transformer models trained on the same datasets of up to 3.5T tokens. We also compare these models to a hybrid architecture consisting of 43% Mamba-2, 7% attention, and 50% MLP layers (Mamba-2-Hybrid). Using a diverse set of tasks, we answer the question of whether Mamba models can match Transformers at larger training budgets. Our results show that while pure SSMs match or exceed Transformers on many tasks, they lag behind Transformers on tasks which require strong copying or in-context learning abilities (e.g., 5-shot MMLU, Phonebook) or long-context reasoning. In contrast, we find that the 8B Mamba-2-Hybrid exceeds the 8B Transformer on all 12 standard tasks we evaluated (+2.65 points on average) and is predicted to be up to 8x faster when generating tokens at inference time. To validate long-context capabilities, we provide additional experiments evaluating variants of the Mamba-2-Hybrid and Transformer extended to support 16K, 32K, and 128K sequences. 
On an additional 23 long-context tasks, the hybrid model continues to closely match or exceed the Transformer on average. To enable further study, we release the checkpoints as well as the code used to train our models as part of NVIDIA's Megatron-LM project.", + "A wide array of sequence models are built on a framework modeled after Transformers, comprising alternating sequence mixer and channel mixer layers. This paper studies a unifying matrix mixer view of sequence mixers that can be conceptualized as a linear map on the input sequence. This framework encompasses a broad range of well-known sequence models, including the self-attention of Transformers as well as recent strong alternatives such as structured state space models (SSMs), and allows understanding downstream characteristics such as efficiency and expressivity through properties of their structured matrix class. We identify a key axis of matrix parameterizations termed sequence alignment, which increases the flexibility and performance of matrix mixers, providing insights into the strong performance of Transformers and recent SSMs such as Mamba. Furthermore, the matrix mixer framework offers a systematic approach to developing sequence mixers with desired properties, allowing us to develop several new sub-quadratic sequence models. In particular, we propose a natural bidirectional extension of the Mamba model (Hydra), parameterized as a quasiseparable matrix mixer, which demonstrates superior performance over other sequence models including Transformers on non-causal tasks. As a drop-in replacement for attention layers, Hydra outperforms BERT by 0.8 points on the GLUE benchmark and ViT by 2% Top-1 accuracy on ImageNet." 
+ ], + "domain": [ + "Machine Learning", + "Sequence Modeling", + "State-Space Models", + "Genomics" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "9dfb4734-a216-4c45-bd80-5596c8e0eb97": { + "pk": "9dfb4734-a216-4c45-bd80-5596c8e0eb97", + "name": "Tri Dao", + "bio": "I am a researcher deeply engaged in the intersection of deep learning and genomics, with a particular focus on sequence modeling. My recent work has led to the development of Caduceus, a pioneering family of DNA language models that leverage bi-directionality and reverse complementarity (RC) to enhance long-range genomic sequence predictions. This innovative architecture not only outperforms larger models but also sets a new standard for DNA language modeling.\n\nI have also explored the theoretical underpinnings of state-space models (SSMs) and their relationship to Transformers, culminating in the creation of the Mamba-2 architecture. This model is designed to be significantly faster while maintaining competitive performance in language tasks. My research emphasizes the practical implications of these models, demonstrating their strengths and weaknesses across various scales and tasks, particularly in long-context reasoning.\n\nAdditionally, I have contributed to a unifying framework for sequence models, which has allowed me to develop new architectures like Hydra, a bidirectional extension of Mamba that excels in non-causal tasks. My work aims to push the boundaries of what is possible in sequence modeling, providing insights that not only advance theoretical understanding but also yield practical tools for real-world applications. 
I am committed to sharing my findings with the community, releasing model checkpoints and code to facilitate further research and innovation in this exciting field.", + "collaborators": [ + "Albert Gu", + "Yair Schiff", + "Chia-Hsiang Kao", + "Aaron Gokaslan", + "Volodymyr Kuleshov", + "R. Waleffe", + "Wonmin Byeon", + "Duncan Riach", + "Brandon Norick", + "V. Korthikanti", + "Ali Hatamizadeh", + "Sudhakar Singh", + "Deepak Narayanan", + "Garvit Kulshreshtha", + "Vartika Singh", + "Jared Casper", + "Jan Kautz", + "Mohammad Shoeybi", + "Bryan Catanzaro", + "Sukjun Hwang", + "Aakash Lahoti" + ], + "pub_titles": [ + "Caduceus: Bi-Directional Equivariant Long-Range DNA Sequence Modeling", + "Transformers are SSMs: Generalized Models and Efficient Algorithms Through Structured State Space Duality", + "An Empirical Study of Mamba-based Language Models", + "Hydra: Bidirectional State Space Models Through Generalized Matrix Mixers" + ], + "pub_abstracts": [ + "Large-scale sequence modeling has sparked rapid advances that now extend into biology and genomics. However, modeling genomic sequences introduces challenges such as the need to model long-range token interactions, the effects of upstream and downstream regions of the genome, and the reverse complementarity (RC) of DNA. Here, we propose an architecture motivated by these challenges that builds off the long-range Mamba block, and extends it to a BiMamba component that supports bi-directionality, and to a MambaDNA block that additionally supports RC equivariance. We use MambaDNA as the basis of Caduceus, the first family of RC equivariant bi-directional long-range DNA language models, and we introduce pre-training and fine-tuning strategies that yield Caduceus DNA foundation models. 
Caduceus outperforms previous long-range models on downstream benchmarks; on a challenging long-range variant effect prediction task, Caduceus exceeds the performance of 10x larger models that do not leverage bi-directionality or equivariance.", + "While Transformers have been the main architecture behind deep learning's success in language modeling, state-space models (SSMs) such as Mamba have recently been shown to match or outperform Transformers at small to medium scale. We show that these families of models are actually quite closely related, and develop a rich framework of theoretical connections between SSMs and variants of attention, connected through various decompositions of a well-studied class of structured semiseparable matrices. Our state space duality (SSD) framework allows us to design a new architecture (Mamba-2) whose core layer is a refinement of Mamba's selective SSM that is 2-8X faster, while continuing to be competitive with Transformers on language modeling.", + "Selective state-space models (SSMs) like Mamba overcome some of the shortcomings of Transformers, such as quadratic computational complexity with sequence length and large inference-time memory requirements from the key-value cache. Moreover, recent studies have shown that SSMs can match or exceed the language modeling capabilities of Transformers, making them an attractive alternative. In a controlled setting (e.g., same data), however, studies so far have only presented small scale experiments comparing SSMs to Transformers. To understand the strengths and weaknesses of these architectures at larger scales, we present a direct comparison between 8B-parameter Mamba, Mamba-2, and Transformer models trained on the same datasets of up to 3.5T tokens. We also compare these models to a hybrid architecture consisting of 43% Mamba-2, 7% attention, and 50% MLP layers (Mamba-2-Hybrid). 
Using a diverse set of tasks, we answer the question of whether Mamba models can match Transformers at larger training budgets. Our results show that while pure SSMs match or exceed Transformers on many tasks, they lag behind Transformers on tasks which require strong copying or in-context learning abilities (e.g., 5-shot MMLU, Phonebook) or long-context reasoning. In contrast, we find that the 8B Mamba-2-Hybrid exceeds the 8B Transformer on all 12 standard tasks we evaluated (+2.65 points on average) and is predicted to be up to 8x faster when generating tokens at inference time. To validate long-context capabilities, we provide additional experiments evaluating variants of the Mamba-2-Hybrid and Transformer extended to support 16K, 32K, and 128K sequences. On an additional 23 long-context tasks, the hybrid model continues to closely match or exceed the Transformer on average. To enable further study, we release the checkpoints as well as the code used to train our models as part of NVIDIA's Megatron-LM project.", + "A wide array of sequence models are built on a framework modeled after Transformers, comprising alternating sequence mixer and channel mixer layers. This paper studies a unifying matrix mixer view of sequence mixers that can be conceptualized as a linear map on the input sequence. This framework encompasses a broad range of well-known sequence models, including the self-attention of Transformers as well as recent strong alternatives such as structured state space models (SSMs), and allows understanding downstream characteristics such as efficiency and expressivity through properties of their structured matrix class. We identify a key axis of matrix parameterizations termed sequence alignment, which increases the flexibility and performance of matrix mixers, providing insights into the strong performance of Transformers and recent SSMs such as Mamba. 
Furthermore, the matrix mixer framework offers a systematic approach to developing sequence mixers with desired properties, allowing us to develop several new sub-quadratic sequence models. In particular, we propose a natural bidirectional extension of the Mamba model (Hydra), parameterized as a quasiseparable matrix mixer, which demonstrates superior performance over other sequence models including Transformers on non-causal tasks. As a drop-in replacement for attention layers, Hydra outperforms BERT by 0.8 points on the GLUE benchmark and ViT by 2% Top-1 accuracy on ImageNet." + ], + "domain": [ + "Machine Learning", + "Sequence Modeling", + "State-Space Models", + "Genomics" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "add5f70c-b273-449d-846c-2405a5788b3f": { + "pk": "add5f70c-b273-449d-846c-2405a5788b3f", + "name": "Ian J. Goodfellow", + "bio": "I am a researcher deeply engaged in the field of machine learning, particularly focusing on neural networks and their optimization. My work has explored various aspects of training neural networks, including the challenges posed by local minima and the effectiveness of stochastic gradient descent in overcoming these obstacles. I have developed frameworks such as the generative adversarial network (GAN) model, which innovatively combines generative and discriminative processes to enhance data distribution learning.\n\nMy research also delves into the intricacies of adversarial examples, where I argue that the linear nature of neural networks contributes significantly to their vulnerability. 
I have investigated catastrophic forgetting in neural networks, revealing that dropout training consistently yields the best performance in adapting to new tasks while retaining knowledge of previous ones.\n\nAdditionally, I have contributed to the development of advanced models like the multi-prediction deep Boltzmann machine (MP-DBM) and the partially directed deep Boltzmann machine (PD-DBM), which improve classification tasks without the need for greedy layerwise pretraining. My work on dropout and maxout models has led to state-of-the-art results across various benchmark datasets, demonstrating the power of these techniques in enhancing neural network performance.\n\nThrough my research, I aim to bridge the gap between theoretical insights and practical applications, ultimately advancing the capabilities of machine learning systems in real-world scenarios.", + "collaborators": [ + "Yoshua Bengio", + "Aaron C. Courville", + "Mehdi Mirza", + "David Warde-Farley", + "J. Bergstra", + "Bing Xu", + "Christian Szegedy", + "Vincent Dumoulin", + "Pascal Lamblin", + "Razvan Pascanu", + "Fr\u00e9d\u00e9ric Bastien", + "D. Erhan", + "O. Vinyals", + "Jean Pouget-Abadie", + "Sherjil Ozair", + "Jonathon Shlens", + "Xia Da", + "Yaroslav Bulatov", + "Julian Ibarz", + "Sacha Arnoud", + "Vinay D. Shet", + "Wojciech Zaremba", + "I. Sutskever", + "Joan Bruna", + "R. Fergus", + "P. Carrier", + "Benjamin Hamner", + "William J. Cukierski", + "Yichuan Tang", + "David Thaler", + "Dong-Hyun Lee", + "Yingbo Zhou", + "Chetan Ramaiah", + "Fangxiang Feng", + "Ruifan Li", + "Xiaojie Wang", + "Dimitris Athanasakis", + "J. Shawe-Taylor", + "Maxim Milakov", + "John Park", + "Radu Tudor Ionescu", + "M. Popescu", + "C. 
Grozea", + "Jingjing Xie", + "Lukasz Romaszko", + "Chuang Zhang", + "Olivier Breuleux", + "Olivier Delalleau", + "Guillaume Desjardins", + "Arnaud Bergeron" + ], + "pub_titles": [ + "Qualitatively characterizing neural network optimization problems", + "Generative Adversarial Nets", + "On distinguishability criteria for estimating generative models", + "Explaining and Harnessing Adversarial Examples", + "An Empirical Investigation of Catastrophic Forgeting in Gradient-Based Neural Networks", + "On the Challenges of Physical Implementations of RBMs", + "Pylearn2: a machine learning research library", + "Scaling Up Spike-and-Slab Models for Unsupervised Feature Learning", + "Multi-Prediction Deep Boltzmann Machines", + "Piecewise Linear Multilayer Perceptrons and Dropout", + "An empirical analysis of dropout in piecewise linear networks", + "Joint Training Deep Boltzmann Machines for Classification", + "Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks", + "Maxout Networks", + "Intriguing properties of neural networks", + "Large-Scale Feature Learning With Spike-and-Slab Sparse Coding", + "Theano: Deep Learning on GPUs with Python", + "Joint Training of Partially-Directed Deep Boltzmann Machines" + ], + "pub_abstracts": [ + "Training neural networks involves solving large-scale non-convex optimization problems. This task has long been believed to be extremely difficult, with fear of local minima and other obstacles motivating a variety of schemes to improve optimization, such as unsupervised pretraining. However, modern neural networks are able to achieve negligible training error on complex tasks, using only direct training with stochastic gradient descent. We introduce a simple analysis technique to look for evidence that such networks are overcoming local optima. 
We find that, in fact, on a straight path from initialization to solution, a variety of state of the art neural networks never encounter any significant obstacles.", + "We propose a new framework for estimating generative models via an adversarial process, in which we simultaneously train two models: a generative model G that captures the data distribution, and a discriminative model D that estimates the probability that a sample came from the training data rather than G. The training procedure for G is to maximize the probability of D making a mistake. This framework corresponds to a minimax two-player game. In the space of arbitrary functions G and D, a unique solution exists, with G recovering the training data distribution and D equal to \u00bd everywhere. In the case where G and D are defined by multilayer perceptrons, the entire system can be trained with backpropagation. There is no need for any Markov chains or unrolled approximate inference networks during either training or generation of samples. Experiments demonstrate the potential of the framework through qualitative and quantitative evaluation of the generated samples.", + "Two recently introduced criteria for estimation of generative models are both based on a reduction to binary classification. Noise-contrastive estimation (NCE) is an estimation procedure in which a generative model is trained to be able to distinguish data samples from noise samples. Generative adversarial networks (GANs) are pairs of generator and discriminator networks, with the generator network learning to generate samples by attempting to fool the discriminator network into believing its samples are real data. Both estimation procedures use the same function to drive learning, which naturally raises questions about how they are related to each other, as well as whether this function is related to maximum likelihood estimation (MLE). 
NCE corresponds to training an internal data model belonging to the {\\em discriminator} network but using a fixed generator network. We show that a variant of NCE, with a dynamic generator network, is equivalent to maximum likelihood estimation. Since pairing a learned discriminator with an appropriate dynamically selected generator recovers MLE, one might expect the reverse to hold for pairing a learned generator with a certain discriminator. However, we show that recovering MLE for a learned generator requires departing from the distinguishability game. Specifically: (i) The expected gradient of the NCE discriminator can be made to match the expected gradient of MLE, if one is allowed to use a non-stationary noise distribution for NCE, (ii) No choice of discriminator network can make the expected gradient for the GAN generator match that of MLE, and (iii) The existing theory does not guarantee that GANs will converge in the non-convex case. This suggests that the key next step in GAN research is to determine whether GANs converge, and if not, to modify their training algorithm to force convergence.", + "Several machine learning models, including neural networks, consistently misclassify adversarial examples---inputs formed by applying small but intentionally worst-case perturbations to examples from the dataset, such that the perturbed input results in the model outputting an incorrect answer with high confidence. Early attempts at explaining this phenomenon focused on nonlinearity and overfitting. We argue instead that the primary cause of neural networks' vulnerability to adversarial perturbation is their linear nature. This explanation is supported by new quantitative results while giving the first explanation of the most intriguing fact about them: their generalization across architectures and training sets. Moreover, this view yields a simple and fast method of generating adversarial examples. 
Using this approach to provide examples for adversarial training, we reduce the test set error of a maxout network on the MNIST dataset.", + "Catastrophic forgetting is a problem faced by many machine learning models and algorithms. When trained on one task, then trained on a second task, many machine learning models \"forget\" how to perform the first task. This is widely believed to be a serious problem for neural networks. Here, we investigate the extent to which the catastrophic forgetting problem occurs for modern neural networks, comparing both established and recent gradient-based training algorithms and activation functions. We also examine the effect of the relationship between the first task and the second task on catastrophic forgetting. We find that it is always best to train using the dropout algorithm--the dropout algorithm is consistently best at adapting to the new task, remembering the old task, and has the best tradeoff curve between these two extremes. We find that different tasks and relationships between tasks result in very different rankings of activation function performance. This suggests the choice of activation function should always be cross-validated.", + " Restricted Boltzmann machines (RBMs) are powerful machine learning models, but learning and some kinds of inference in the model require sampling-based approximations, which, in classical digital computers, are implemented using expensive MCMC. Physical computation offers the opportunity to reduce the costof sampling by building physical systems whose natural dynamics correspond to drawing samples from the desired RBM distribution. Such a system avoids the burn-in and mixing cost of a Markov chain. However, hardware implementations of this variety usually entail limitations such as low-precision and limited range of the parameters and restrictions on the size and topology of the RBM. We conduct software simulations to determine how harmful each of these restrictions is. 
Our simulations are based on the D-Wave Two computer, but the issues we investigate arise in most forms of physical computation.Our findings suggest that designers of new physical computing hardware and algorithms for physical computers should focus their efforts on overcoming the limitations imposed by the topology restrictions of currently existing physical computers. ", + "Pylearn2 is a machine learning research library. This does not just mean that it is a collection of machine learning algorithms that share a common API; it means that it has been designed for flexibility and extensibility in order to facilitate research projects that involve new or unusual use cases. In this paper we give a brief history of the library, an overview of its basic philosophy, a summary of the library's architecture, and a description of how the Pylearn2 community functions socially.", + "We describe the use of two spike-and-slab models for modeling real-valued data, with an emphasis on their applications to object recognition. The first model, which we call spike-and-slab sparse coding (S3C), is a preexisting model for which we introduce a faster approximate inference algorithm. We introduce a deep variant of S3C, which we call the partially directed deep Boltzmann machine (PD-DBM) and extend our S3C inference algorithm for use on this model. We describe learning procedures for each. We demonstrate that our inference procedure for S3C enables scaling the model to unprecedented large problem sizes, and demonstrate that using S3C as a feature extractor results in very good object recognition performance, particularly when the number of labeled examples is low. We show that the PD-DBM generates better samples than its shallow counterpart, and that unlike DBMs or DBNs, the PD-DBM may be trained successfully without greedy layerwise training.", + "We introduce the multi-prediction deep Boltzmann machine (MP-DBM). 
The MP-DBM can be seen as a single probabilistic model trained to maximize a variational approximation to the generalized pseudolikelihood, or as a family of recurrent nets that share parameters and approximately solve different inference problems. Prior methods of training DBMs either do not perform well on classification tasks or require an initial learning pass that trains the DBM greedily, one layer at a time. The MP-DBM does not require greedy layerwise pretraining, and outperforms the standard DBM at classification, classification with missing inputs, and mean field prediction tasks.1", + "We propose a new type of hidden layer for a multilayer perceptron, and demonstrate that it obtains the best reported performance for an MLP on the MNIST dataset.", + "The recently introduced dropout training criterion for neural networks has been the subject of much attention due to its simplicity and remarkable effectiveness as a regularizer, as well as its interpretation as a training procedure for an exponentially large ensemble of networks that share parameters. In this work we empirically investigate several questions related to the efficacy of dropout, specifically as it concerns networks employing the popular rectified linear activation function. We investigate the quality of the test time weight-scaling inference procedure by evaluating the geometric average exactly in small models, as well as compare the performance of the geometric mean to the arithmetic mean more commonly employed by ensemble techniques. We explore the effect of tied weights on the ensemble interpretation by training ensembles of masked networks without tied weights. Finally, we investigate an alternative criterion based on a biased estimator of the maximum likelihood ensemble gradient.", + "We introduce a new method for training deep Boltzmann machines jointly. 
Prior methods of training DBMs require an initial learning pass that trains the model greedily, one layer at a time, or do not perform well on classification tasks. In our approach, we train all layers of the DBM simultaneously, using a novel training procedure called multi-prediction training. The resulting model can either be interpreted as a single generative model trained to maximize a variational approximation to the generalized pseudolikelihood, or as a family of recurrent networks that share parameters and may be approximately averaged together using a novel technique we call the multi-inference trick. We show that our approach performs competitively for classification and outperforms previous methods in terms of accuracy of approximate inference and classification with missing inputs.", + "Abstract: Recognizing arbitrary multi-character text in unconstrained natural photographs is a hard problem. In this paper, we address an equally hard sub-problem in this domain viz. recognizing arbitrary multi-digit numbers from Street View imagery. Traditional approaches to solve this problem typically separate out the localization, segmentation, and recognition steps. In this paper we propose a unified approach that integrates these three steps via the use of a deep convolutional neural network that operates directly on the image pixels. We employ the DistBelief implementation of deep neural networks in order to train large, distributed neural networks on high quality images. We find that the performance of this approach increases with the depth of the convolutional network, with the best performance occurring in the deepest architecture we trained, with eleven hidden layers. We evaluate this approach on the publicly available SVHN dataset and achieve over $96\\%$ accuracy in recognizing complete street numbers. We show that on a per-digit recognition task, we improve upon the state-of-the-art, achieving $97.84\\%$ accuracy. 
reCAPTCHA is one of the most secure reverse Turing tests that uses distorted text to distinguish humans from bots.
It suggests that it is the space, rather than the individual units, that contains the semantic information in the high layers of neural networks. Second, we find that deep neural networks learn input-output mappings that are fairly discontinuous to a significant extent.
Theano offers most of NumPy\u2019s functionality, but adds automatic symbolic differentiation, GPU support, and faster expression evaluation. Theano is a general mathematical tool, but it was developed with the goal of facilitating research in deep learning. The Deep Learning Tutorials 2 introduce recent advances in deep learning, and showcase how Theano", + "We introduce a deep probabilistic model which we call the partially directed deep Boltzmann machine (PD-DBM). The PD-DBM is a model of real-valued data based on the deep Boltzmann machine (DBM) and the spike-and-slab sparse coding (S3C) model. We offer a hypothesis for why DBMs may not be trained succesfully without greedy layerwise training, and motivate the PD-DBM as a modified DBM that can be trained jointly." + ], + "domain": [ + "Deep Learning", + "Generative Models", + "Adversarial Learning", + "Neural Networks" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "61cbcc25-a70d-42c4-b7dd-4431afaaccad": { + "pk": "61cbcc25-a70d-42c4-b7dd-4431afaaccad", + "name": "Jean Pouget-Abadie", + "bio": "I am a researcher deeply engaged in the field of natural language processing and generative models. My work primarily focuses on enhancing the capabilities of neural machine translation systems, particularly in addressing the challenges posed by long sentences. In my early research, I developed an innovative approach to automatically segment input sentences into manageable phrases, significantly improving translation quality. This work highlighted the limitations of existing neural translation models compared to traditional phrase-based systems.\n\nAdditionally, I have explored the realm of generative models through an adversarial framework, where I simultaneously train generative and discriminative models in a minimax game setup. 
This approach not only simplifies the training process by eliminating the need for complex Markov chains but also ensures that the generative model accurately captures the underlying data distribution. My experiments have demonstrated the effectiveness of this framework, showcasing its potential through both qualitative and quantitative evaluations of generated samples.\n\nOverall, my research aims to push the boundaries of what is possible in machine translation and generative modeling, striving for more robust and efficient systems that can handle the complexities of human language.", + "collaborators": [ + "Yoshua Bengio", + "M\u00f3nica Mendes Sousa", + "Dzmitry Bahdanau", + "B. V. Merrienboer", + "Kyunghyun Cho", + "I. Goodfellow", + "Mehdi Mirza", + "Bing Xu", + "David Warde-Farley", + "Sherjil Ozair", + "Aaron C. Courville", + "Claude Vital", + "Dominique Ostler", + "R. Fernandes", + "Dominique Carles", + "M. J. Saraiva" + ], + "pub_titles": [ + "Overcoming the Curse of Sentence Length for Neural Machine Translation using Automatic Segmentation", + "Generative Adversarial Nets" + ], + "pub_abstracts": [ + "The authors of (Cho et al., 2014a) have shown that the recently introduced neural network translation systems suffer from a significant drop in translation quality when translating long sentences, unlike existing phrase-based translation systems. In this paper, we propose a way to address this issue by automatically segmenting an input sentence into phrases that can be easily translated by the neural network translation model. Once each segment has been independently translated by the neural machine translation model, the translated clauses are concatenated to form a final translation. 
Empirical results show a significant improvement in translation quality for long sentences.", + "We propose a new framework for estimating generative models via an adversarial process, in which we simultaneously train two models: a generative model G that captures the data distribution, and a discriminative model D that estimates the probability that a sample came from the training data rather than G. The training procedure for G is to maximize the probability of D making a mistake. This framework corresponds to a minimax two-player game. In the space of arbitrary functions G and D, a unique solution exists, with G recovering the training data distribution and D equal to \u00bd everywhere. In the case where G and D are defined by multilayer perceptrons, the entire system can be trained with backpropagation. There is no need for any Markov chains or unrolled approximate inference networks during either training or generation of samples. Experiments demonstrate the potential of the framework through qualitative and quantitative evaluation of the generated samples." + ], + "domain": [ + "Natural Language Processing", + "Generative Models", + "Neural Networks" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "91dfef3e-0566-45f7-b111-302f034c4597": { + "pk": "91dfef3e-0566-45f7-b111-302f034c4597", + "name": "Mehdi Mirza", + "bio": "I am a researcher deeply engaged in the field of generative models and neural networks, with a particular focus on advancing techniques that enhance model performance and adaptability. My work began with the introduction of Conditional Generative Adversarial Nets (cGANs), where I explored how conditioning on specific data can lead to more meaningful and diverse outputs, such as generating MNIST digits based on class labels. 
This foundational work laid the groundwork for my ongoing exploration of adversarial processes in generative modeling.\n\nI have also investigated the challenges of catastrophic forgetting in neural networks, revealing how different training algorithms and activation functions impact a model's ability to retain knowledge across tasks. My findings emphasize the importance of dropout as a robust strategy for balancing the retention of old tasks while adapting to new ones.\n\nIn addition to theoretical advancements, I have contributed to practical applications, such as emotion recognition in video clips, where I combined multiple deep learning architectures to analyze various data modalities effectively. My work on the multi-prediction deep Boltzmann machine (MP-DBM) further demonstrates my commitment to improving classification tasks without the need for complex pretraining.\n\nThrough my research, I aim to push the boundaries of machine learning, developing flexible frameworks and innovative models that not only perform well but also adapt to the evolving landscape of data and tasks. I am passionate about fostering collaboration within the machine learning community, as exemplified by my contributions to the Pylearn2 library, which emphasizes extensibility and usability for researchers.", + "collaborators": [ + "Yoshua Bengio", + "Aaron C. Courville", + "I. Goodfellow", + "David Warde-Farley", + "Bing Xu", + "Pascal Lamblin", + "Razvan Pascanu", + "J. Bergstra", + "Pascal Vincent", + "P. Carrier", + "Simon Osindero", + "Jean Pouget-Abadie", + "Sherjil Ozair", + "Xia Da", + "Vincent Dumoulin", + "Fr\u00e9d\u00e9ric Bastien", + "Samira Ebrahimi Kahou", + "C. Pal", + "Xavier Bouthillier", + "Pierre Froumenty", + "\u00c7aglar G\u00fcl\u00e7ehre", + "R. 
Memisevic", + "Raul Chandias Ferrari", + "S\u00e9bastien Jean", + "Yann Dauphin", + "Nicolas Boulanger-Lewandowski", + "Abhishek Aggarwal", + "Jeremie Zumer", + "Jean-Philippe Raymond", + "Guillaume Desjardins", + "Atousa Torabi", + "Arjun Sharma", + "Emmanuel Bengio", + "K. Konda", + "Zhenzhou Wu", + "D. Erhan", + "Benjamin Hamner", + "William J. Cukierski", + "Yichuan Tang", + "David Thaler", + "Dong-Hyun Lee", + "Yingbo Zhou", + "Chetan Ramaiah", + "Fangxiang Feng", + "Ruifan Li", + "Xiaojie Wang", + "Dimitris Athanasakis", + "J. Shawe-Taylor", + "Maxim Milakov", + "John Park", + "Radu Tudor Ionescu", + "M. Popescu", + "C. Grozea", + "Jingjing Xie", + "Lukasz Romaszko", + "Chuang Zhang", + "Salah Rifai" + ], + "pub_titles": [ + "Conditional Generative Adversarial Nets", + "Generative Adversarial Nets", + "An Empirical Investigation of Catastrophic Forgeting in Gradient-Based Neural Networks", + "Pylearn2: a machine learning research library", + "Combining modality specific deep neural networks for emotion recognition in video", + "Multi-Prediction Deep Boltzmann Machines", + "Maxout Networks" + ], + "pub_abstracts": [ + "Generative Adversarial Nets [8] were recently introduced as a novel way to train generative models. In this work we introduce the conditional version of generative adversarial nets, which can be constructed by simply feeding the data, y, we wish to condition on to both the generator and discriminator. We show that this model can generate MNIST digits conditioned on class labels. 
We also illustrate how this model could be used to learn a multi-modal model, and provide preliminary examples of an application to image tagging in which we demonstrate how this approach can generate descriptive tags which are not part of training labels.", + "We propose a new framework for estimating generative models via an adversarial process, in which we simultaneously train two models: a generative model G that captures the data distribution, and a discriminative model D that estimates the probability that a sample came from the training data rather than G. The training procedure for G is to maximize the probability of D making a mistake. This framework corresponds to a minimax two-player game. In the space of arbitrary functions G and D, a unique solution exists, with G recovering the training data distribution and D equal to \u00bd everywhere. In the case where G and D are defined by multilayer perceptrons, the entire system can be trained with backpropagation. There is no need for any Markov chains or unrolled approximate inference networks during either training or generation of samples. Experiments demonstrate the potential of the framework through qualitative and quantitative evaluation of the generated samples.", + "Catastrophic forgetting is a problem faced by many machine learning models and algorithms. When trained on one task, then trained on a second task, many machine learning models \"forget\" how to perform the first task. This is widely believed to be a serious problem for neural networks. Here, we investigate the extent to which the catastrophic forgetting problem occurs for modern neural networks, comparing both established and recent gradient-based training algorithms and activation functions. We also examine the effect of the relationship between the first task and the second task on catastrophic forgetting. 
We find that it is always best to train using the dropout algorithm--the dropout algorithm is consistently best at adapting to the new task, remembering the old task, and has the best tradeoff curve between these two extremes. We find that different tasks and relationships between tasks result in very different rankings of activation function performance. This suggests the choice of activation function should always be cross-validated.", + "Pylearn2 is a machine learning research library. This does not just mean that it is a collection of machine learning algorithms that share a common API; it means that it has been designed for flexibility and extensibility in order to facilitate research projects that involve new or unusual use cases. In this paper we give a brief history of the library, an overview of its basic philosophy, a summary of the library's architecture, and a description of how the Pylearn2 community functions socially.", + "In this paper we present the techniques used for the University of Montr\u00e9al's team submissions to the 2013 Emotion Recognition in the Wild Challenge. The challenge is to classify the emotions expressed by the primary human subject in short video clips extracted from feature length movies. This involves the analysis of video clips of acted scenes lasting approximately one-two seconds, including the audio track which may contain human voices as well as background music. Our approach combines multiple deep neural networks for different data modalities, including: (1) a deep convolutional neural network for the analysis of facial expressions within video frames; (2) a deep belief net to capture audio information; (3) a deep autoencoder to model the spatio-temporal information produced by the human actions depicted within the entire scene; and (4) a shallow network architecture focused on extracted features of the mouth of the primary human subject in the scene. 
and mean field prediction tasks.
We use maxout and dropout to demonstrate state of the art classification performance on four benchmark datasets: MNIST, CIFAR-10, CIFAR-100, and SVHN." + ], + "domain": [ + "Generative Models", + "Deep Learning", + "Neural Networks", + "Emotion Recognition" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "d38b14c2-4cc2-4c12-9861-f087536cad63": { + "pk": "d38b14c2-4cc2-4c12-9861-f087536cad63", + "name": "Bing Xu", + "bio": "I am a researcher dedicated to the exploration and development of generative models through innovative frameworks. My recent work introduces a novel adversarial process that simultaneously trains a generative model (G) and a discriminative model (D). This approach is grounded in a minimax two-player game, where G aims to capture the true data distribution while D assesses the authenticity of samples. \n\nWhat excites me about this framework is its elegance and efficiency; it allows for the training of both models using backpropagation without the complexities of Markov chains or unrolled approximate inference networks. Through rigorous qualitative and quantitative evaluations, I have demonstrated the framework's potential in generating high-quality samples. My research not only contributes to the theoretical understanding of generative models but also paves the way for practical applications in various domains. I am passionate about pushing the boundaries of machine learning and uncovering new possibilities in generative modeling.", + "collaborators": [ + "I. Goodfellow", + "Mehdi Mirza", + "Aaron C. Courville", + "Yoshua Bengio", + "Jean Pouget-Abadie", + "David Warde-Farley", + "Sherjil Ozair", + "D. Erhan", + "P. Carrier", + "Benjamin Hamner", + "William J. Cukierski", + "Yichuan Tang", + "David Thaler", + "Dong-Hyun Lee", + "Yingbo Zhou", + "Chetan Ramaiah", + "Fangxiang Feng", + "Ruifan Li", + "Xiaojie Wang", + "Dimitris Athanasakis", + "J. 
Shawe-Taylor", + "Maxim Milakov", + "John Park", + "Radu Tudor Ionescu", + "M. Popescu", + "C. Grozea", + "J. Bergstra", + "Jingjing Xie", + "Lukasz Romaszko", + "Chuang Zhang" + ], + "pub_titles": [ + "Generative Adversarial Nets" + ], + "pub_abstracts": [ + "We propose a new framework for estimating generative models via an adversarial process, in which we simultaneously train two models: a generative model G that captures the data distribution, and a discriminative model D that estimates the probability that a sample came from the training data rather than G. The training procedure for G is to maximize the probability of D making a mistake. This framework corresponds to a minimax two-player game. In the space of arbitrary functions G and D, a unique solution exists, with G recovering the training data distribution and D equal to \u00bd everywhere. In the case where G and D are defined by multilayer perceptrons, the entire system can be trained with backpropagation. There is no need for any Markov chains or unrolled approximate inference networks during either training or generation of samples. Experiments demonstrate the potential of the framework through qualitative and quantitative evaluation of the generated samples." + ], + "domain": [ + "Generative Models", + "Adversarial Learning", + "Deep Learning" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "d67b8a7e-d46c-47b8-a2c1-bfe526d90369": { + "pk": "d67b8a7e-d46c-47b8-a2c1-bfe526d90369", + "name": "Ian J. Goodfellow", + "bio": "I am a researcher deeply engaged in the exploration of machine learning, particularly in the realms of neural networks and probabilistic models. My work has primarily focused on addressing critical challenges such as catastrophic forgetting in neural networks, where models struggle to retain knowledge from previous tasks when learning new ones. 
Through my investigations, I have demonstrated that dropout training consistently outperforms other methods in balancing the retention of old task performance while adapting to new tasks.\n\nI have also contributed to the development of advanced models like the partially directed deep Boltzmann machine (PD-DBM) and the multi-prediction deep Boltzmann machine (MP-DBM), which enhance classification capabilities without the need for greedy layer-wise training. My research on spike-and-slab sparse coding (S3C) has led to significant improvements in object recognition tasks, particularly in scenarios with limited labeled data.\n\nIn addition to theoretical advancements, I have been involved in practical applications, such as recognizing multi-digit numbers from Street View imagery, achieving state-of-the-art accuracy through a unified deep learning approach. My work with Theano has also facilitated research in deep learning by providing a robust framework for tensor operations and automatic differentiation.\n\nOverall, my research aims to push the boundaries of machine learning by developing models that are not only effective but also interpretable, addressing the complexities of real-world data and tasks. I am passionate about leveraging these advancements to create more intelligent systems that can learn and adapt in dynamic environments.", + "collaborators": [ + "Yoshua Bengio", + "Aaron C. Courville", + "David Warde-Farley", + "Mehdi Mirza", + "J. Bergstra", + "Pascal Lamblin", + "Razvan Pascanu", + "Fr\u00e9d\u00e9ric Bastien", + "Vincent Dumoulin", + "D. Erhan", + "Guillaume Desjardins", + "Arnaud Bergeron", + "Xia Da", + "Yaroslav Bulatov", + "Julian Ibarz", + "Sacha Arnoud", + "Vinay D. Shet", + "Christian Szegedy", + "Wojciech Zaremba", + "I. Sutskever", + "Joan Bruna", + "R. Fergus", + "P. Carrier", + "Benjamin Hamner", + "William J. 
Cukierski", + "Yichuan Tang", + "David Thaler", + "Dong-Hyun Lee", + "Yingbo Zhou", + "Chetan Ramaiah", + "Fangxiang Feng", + "Ruifan Li", + "Xiaojie Wang", + "Dimitris Athanasakis", + "J. Shawe-Taylor", + "Maxim Milakov", + "John Park", + "Radu Tudor Ionescu", + "M. Popescu", + "C. Grozea", + "Jingjing Xie", + "Lukasz Romaszko", + "Bing Xu", + "Chuang Zhang", + "Olivier Breuleux", + "Olivier Delalleau", + "Nicolas Bouchard", + "Daftar Pustaka", + "Februari", + "Boshra Bahrami", + "Mirsaeid Hosseini", + "Januari", + "Gr\u00e9goire Mesnil", + "Yann Dauphin", + "Xavier Glorot", + "Salah Rifai", + "Y. Bengio", + "Erick Lavoie", + "X. Muller", + "Pascal Vincent" + ], + "pub_titles": [ + "An Empirical Investigation of Catastrophic Forgeting in Gradient-Based Neural Networks", + "On the Challenges of Physical Implementations of RBMs", + "Pylearn2: a machine learning research library", + "Scaling Up Spike-and-Slab Models for Unsupervised Feature Learning", + "Multi-Prediction Deep Boltzmann Machines", + "Piecewise Linear Multilayer Perceptrons and Dropout", + "An empirical analysis of dropout in piecewise linear networks", + "Joint Training Deep Boltzmann Machines for Classification", + "Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks", + "Maxout Networks", + "Intriguing properties of neural networks", + "Large-Scale Feature Learning With Spike-and-Slab Sparse Coding", + "Theano: Deep Learning on GPUs with Python", + "Joint Training of Partially-Directed Deep Boltzmann Machines", + "Theano: new features and speed improvements", + "Spike-and-Slab Sparse Coding for Unsupervised Feature Discovery", + "Joint Training of Deep Boltzmann Machines", + "Mining", + "Unsupervised and Transfer Learning Challenge: a Deep Learning Approach" + ], + "pub_abstracts": [ + "Catastrophic forgetting is a problem faced by many machine learning models and algorithms. 
Physical computation offers the opportunity to reduce the cost of sampling by building physical systems whose natural dynamics correspond to drawing samples from the desired RBM distribution.
Our simulations are based on the D-Wave Two computer, but the issues we investigate arise in most forms of physical computation.Our findings suggest that designers of new physical computing hardware and algorithms for physical computers should focus their efforts on overcoming the limitations imposed by the topology restrictions of currently existing physical computers. ", + "Pylearn2 is a machine learning research library. This does not just mean that it is a collection of machine learning algorithms that share a common API; it means that it has been designed for flexibility and extensibility in order to facilitate research projects that involve new or unusual use cases. In this paper we give a brief history of the library, an overview of its basic philosophy, a summary of the library's architecture, and a description of how the Pylearn2 community functions socially.", + "We describe the use of two spike-and-slab models for modeling real-valued data, with an emphasis on their applications to object recognition. The first model, which we call spike-and-slab sparse coding (S3C), is a preexisting model for which we introduce a faster approximate inference algorithm. We introduce a deep variant of S3C, which we call the partially directed deep Boltzmann machine (PD-DBM) and extend our S3C inference algorithm for use on this model. We describe learning procedures for each. We demonstrate that our inference procedure for S3C enables scaling the model to unprecedented large problem sizes, and demonstrate that using S3C as a feature extractor results in very good object recognition performance, particularly when the number of labeled examples is low. We show that the PD-DBM generates better samples than its shallow counterpart, and that unlike DBMs or DBNs, the PD-DBM may be trained successfully without greedy layerwise training.", + "We introduce the multi-prediction deep Boltzmann machine (MP-DBM). 
and mean field prediction tasks.
Prior methods of training DBMs require an initial learning pass that trains the model greedily, one layer at a time, or do not perform well on classification tasks. In our approach, we train all layers of the DBM simultaneously, using a novel training procedure called multi-prediction training. The resulting model can either be interpreted as a single generative model trained to maximize a variational approximation to the generalized pseudolikelihood, or as a family of recurrent networks that share parameters and may be approximately averaged together using a novel technique we call the multi-inference trick. We show that our approach performs competitively for classification and outperforms previous methods in terms of accuracy of approximate inference and classification with missing inputs.", + "Abstract: Recognizing arbitrary multi-character text in unconstrained natural photographs is a hard problem. In this paper, we address an equally hard sub-problem in this domain viz. recognizing arbitrary multi-digit numbers from Street View imagery. Traditional approaches to solve this problem typically separate out the localization, segmentation, and recognition steps. In this paper we propose a unified approach that integrates these three steps via the use of a deep convolutional neural network that operates directly on the image pixels. We employ the DistBelief implementation of deep neural networks in order to train large, distributed neural networks on high quality images. We find that the performance of this approach increases with the depth of the convolutional network, with the best performance occurring in the deepest architecture we trained, with eleven hidden layers. We evaluate this approach on the publicly available SVHN dataset and achieve over $96\\%$ accuracy in recognizing complete street numbers. We show that on a per-digit recognition task, we improve upon the state-of-the-art, achieving $97.84\\%$ accuracy. 
We also evaluate this approach on an even more challenging dataset generated from Street View imagery containing several tens of millions of street number annotations and achieve over $90\\%$ accuracy. To further explore the applicability of the proposed system to broader text recognition tasks, we apply it to synthetic distorted text from reCAPTCHA. reCAPTCHA is one of the most secure reverse turing tests that uses distorted text to distinguish humans from bots. We report a $99.8\\%$ accuracy on the hardest category of reCAPTCHA. Our evaluations on both tasks indicate that at specific operating thresholds, the performance of the proposed system is comparable to, and in some cases exceeds, that of human operators.", + "We consider the problem of designing models to leverage a recently introduced approximate model averaging technique called dropout. We define a simple new model called maxout (so named because its output is the max of a set of inputs, and because it is a natural companion to dropout) designed to both facilitate optimization by dropout and improve the accuracy of dropout's fast approximate model averaging technique. We empirically verify that the model successfully accomplishes both of these tasks. We use maxout and dropout to demonstrate state of the art classification performance on four benchmark datasets: MNIST, CIFAR-10, CIFAR-100, and SVHN.", + "Deep neural networks are highly expressive models that have recently achieved state of the art performance on speech and visual recognition tasks. While their expressiveness is the reason they succeed, it also causes them to learn uninterpretable solutions that could have counter-intuitive properties. In this paper we report two such properties. First, we find that there is no distinction between individual high level units and random linear combinations of high level units, according to various methods of unit analysis. 
It suggests that it is the space, rather than the individual units, that contains of the semantic information in the high layers of neural networks. Second, we find that deep neural networks learn input-output mappings that are fairly discontinuous to a significant extend. We can cause the network to misclassify an image by applying a certain imperceptible perturbation, which is found by maximizing the network's prediction error. In addition, the specific nature of these perturbations is not a random artifact of learning: the same perturbation can cause a different network, that was trained on a different subset of the dataset, to misclassify the same input.", + "We consider the problem of object recognition with a large number of classes. In order to overcome the low amount of labeled examples available in this setting, we introduce a new feature learning and extraction procedure based on a factor model we call spike-and-slab sparse coding (S3C). Prior work on S3C has not prioritized the ability to exploit parallel architectures and scale S3C to the enormous problem sizes needed for object recognition. We present a novel inference procedure for appropriate for use with GPUs which allows us to dramatically increase both the training set size and the amount of latent factors that S3C may be trained with. We demonstrate that this approach improves upon the supervised learning capabilities of both sparse coding and the spike-and-slab Restricted Boltzmann Machine (ssRBM) on the CIFAR-10 dataset. We use the CIFAR-100 dataset to demonstrate that our method scales to large numbers of classes better than previous methods. Finally, we use our method to win the NIPS 2011 Workshop on Challenges In Learning Hierarchical Models' Transfer Learning Challenge.", + "In this paper, we present Theano 1 , a framework in the Python programming language for defining, optimizing and evaluating expressions involving high-level operations on tensors. 
Theano offers most of NumPy\u2019s functionality, but adds automatic symbolic differentiation, GPU support, and faster expression evaluation. Theano is a general mathematical tool, but it was developed with the goal of facilitating research in deep learning. The Deep Learning Tutorials 2 introduce recent advances in deep learning, and showcase how Theano", + "We introduce a deep probabilistic model which we call the partially directed deep Boltzmann machine (PD-DBM). The PD-DBM is a model of real-valued data based on the deep Boltzmann machine (DBM) and the spike-and-slab sparse coding (S3C) model. We offer a hypothesis for why DBMs may not be trained succesfully without greedy layerwise training, and motivate the PD-DBM as a modified DBM that can be trained jointly.", + "Theano is a linear algebra compiler that optimizes a user's symbolically-specified mathematical computations to produce efficient low-level implementations. In this paper, we present new features and efficiency improvements to Theano, and benchmarks demonstrating Theano's performance relative to Torch7, a recently introduced machine learning library, and to RNNLM, a C++ library targeted at recurrent neural networks.", + "We consider the problem of using a factor model we call {\\em spike-and-slab sparse coding} (S3C) to learn features for a classification task. The S3C model resembles both the spike-and-slab RBM and sparse coding. Since exact inference in this model is intractable, we derive a structured variational inference procedure and employ a variational EM training algorithm. Prior work on approximate inference for this model has not prioritized the ability to exploit parallel architectures and scale to enormous problem sizes. We present an inference procedure appropriate for use with GPUs which allows us to dramatically increase both the training set size and the amount of latent factors. 
We demonstrate that this approach improves upon the supervised learning capabilities of both sparse coding and the ssRBM on the CIFAR-10 dataset. We evaluate our approach's potential for semi-supervised learning on subsets of CIFAR-10. We demonstrate state-of-the art self-taught learning performance on the STL-10 dataset and use our method to win the NIPS 2011 Workshop on Challenges In Learning Hierarchical Models' Transfer Learning Challenge.", + "We introduce a new method for training deep Boltzmann machines jointly. Prior methods require an initial learning pass that trains the deep Boltzmann machine greedily, one layer at a time, or do not perform well on classifi- cation tasks.", + "Purpose. To establish the feasibility of refining deep open\u00adpit mines below the boundary of the use of combined motor\u00adcon\u00ad veyor transport with an increased slope angles of the pit walls using the developed transport unit for reloading rocks to overlying horizons during the reactivation of pillars under transport berms. Methodology. Preparation of a digital block model of the deposit, the elaboration of 3D geomechanical models for the dynam\u00ad ics of mining, 2D and 3D numerical simulation of the rock stress\u00adstrain state of the outcrops of opencast workings, mathematical modeling of stepwise ore reserves and mining schedule, patent research and feasibility study. Findings. It is advisable to carry out mining in terms of the marginal rock state with an increase in the slope of the pit sides below the limit of application of the cyclic and continuous method in ultra\u00addeep open pits. Such design of pit sides is achieved when benches are mined from top to bottom within the boundaries of steeply inclined layers with the use of inter\u00adbench loaders of the developed designed in the completion zone. 
Provisions for the selection and feasibility of using the loader in the deep zone are formulated based on demarcation of application zones of cyclic (road transport) and cyclic\u00adflow (combined road\u00adconveyor trans\u00ad port) technologies. Originality. Schematization of the mining operation was performed based on the calculated values of safety factor of sides, which allows increasing the slope angles of the pit walls of even ultra\u00addeep open pits in the completion zone. It was found that with deepening of mining, the zones of potential sliding move away from the loose overburden to lower ore benches closer to the final depth of the Kacharsky open pit (760 m), but the safety factor corresponds to the required value according to the design standards. Practical value. An increase in the slope of the pit walls in the completion zone can be achieved using the developed loading installation, the main difference of which is that it can be moved without dismantling under conditions of reactivation of transport pillars (with an increase in lifting height by 1.5\u20134.5 times compared to the known equipment).", + "Learning good representations from a large set of unlabeled data is a particularly challenging task. Recent work (see Bengio (2009) for a review) shows that training deep architectures is a good way to extract such representations, by extracting and disentangling gradually higher-level factors of variation characterizing the input distribution. In this paper, we describe different kinds of layers we trained for learning representations in the setting of the Unsupervised and Transfer Learning Challenge. The strategy of our team won the final phase of the challenge. It combined and stacked different one-layer unsupervised learning algorithms, adapted to each of the five datasets of the competition. 
This paper describes that strategy and the particular one-layer learning algorithms feeding a simple linear classifier with a tiny number of labeled training samples (1 to 64 per class)." + ], + "domain": [ + "Deep Learning", + "Neural Networks", + "Representation Learning", + "Object Recognition" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "84e178e3-b34c-4bde-b676-4b145a51feab": { + "pk": "84e178e3-b34c-4bde-b676-4b145a51feab", + "name": "Mehdi Mirza", + "bio": "I am a researcher deeply engaged in the exploration of neural networks and their capabilities, particularly in addressing challenges like catastrophic forgetting. My work has focused on understanding how different training algorithms and activation functions impact a model's ability to retain knowledge across tasks. Through my investigations, I have found that the dropout algorithm consistently outperforms others in balancing the retention of old tasks while adapting to new ones.\n\nIn addition to my work on catastrophic forgetting, I have contributed to the development of Pylearn2, a flexible machine learning research library designed to support innovative research projects. My involvement in the Emotion Recognition in the Wild Challenge showcased my ability to integrate multiple deep learning techniques, including convolutional neural networks and deep belief networks, to analyze complex data modalities like video and audio.\n\nI also introduced the multi-prediction deep Boltzmann machine (MP-DBM), which enhances classification tasks without the need for greedy layerwise pretraining, demonstrating improved performance over traditional models. 
My research on maxout, a model designed to work synergistically with dropout, has led to state-of-the-art results across several benchmark datasets.\n\nOverall, my work aims to push the boundaries of what neural networks can achieve, focusing on both theoretical advancements and practical applications in machine learning.", + "collaborators": [ + "Yoshua Bengio", + "Aaron C. Courville", + "I. Goodfellow", + "David Warde-Farley", + "Pascal Lamblin", + "Razvan Pascanu", + "J. Bergstra", + "Pascal Vincent", + "P. Carrier", + "Xia Da", + "Vincent Dumoulin", + "Fr\u00e9d\u00e9ric Bastien", + "Samira Ebrahimi Kahou", + "C. Pal", + "Xavier Bouthillier", + "Pierre Froumenty", + "\u00c7aglar G\u00fcl\u00e7ehre", + "R. Memisevic", + "Raul Chandias Ferrari", + "S\u00e9bastien Jean", + "Yann Dauphin", + "Nicolas Boulanger-Lewandowski", + "Abhishek Aggarwal", + "Jeremie Zumer", + "Jean-Philippe Raymond", + "Guillaume Desjardins", + "Atousa Torabi", + "Arjun Sharma", + "Emmanuel Bengio", + "K. Konda", + "Zhenzhou Wu", + "D. Erhan", + "Benjamin Hamner", + "William J. Cukierski", + "Yichuan Tang", + "David Thaler", + "Dong-Hyun Lee", + "Yingbo Zhou", + "Chetan Ramaiah", + "Fangxiang Feng", + "Ruifan Li", + "Xiaojie Wang", + "Dimitris Athanasakis", + "J. Shawe-Taylor", + "Maxim Milakov", + "John Park", + "Radu Tudor Ionescu", + "M. Popescu", + "C. Grozea", + "Jingjing Xie", + "Lukasz Romaszko", + "Bing Xu", + "Chuang Zhang", + "Salah Rifai" + ], + "pub_titles": [ + "An Empirical Investigation of Catastrophic Forgeting in Gradient-Based Neural Networks", + "Pylearn2: a machine learning research library", + "Combining modality specific deep neural networks for emotion recognition in video", + "Multi-Prediction Deep Boltzmann Machines", + "Maxout Networks" + ], + "pub_abstracts": [ + "Catastrophic forgetting is a problem faced by many machine learning models and algorithms. 
When trained on one task, then trained on a second task, many machine learning models \"forget\" how to perform the first task. This is widely believed to be a serious problem for neural networks. Here, we investigate the extent to which the catastrophic forgetting problem occurs for modern neural networks, comparing both established and recent gradient-based training algorithms and activation functions. We also examine the effect of the relationship between the first task and the second task on catastrophic forgetting. We find that it is always best to train using the dropout algorithm--the dropout algorithm is consistently best at adapting to the new task, remembering the old task, and has the best tradeoff curve between these two extremes. We find that different tasks and relationships between tasks result in very different rankings of activation function performance. This suggests the choice of activation function should always be cross-validated.", + "Pylearn2 is a machine learning research library. This does not just mean that it is a collection of machine learning algorithms that share a common API; it means that it has been designed for flexibility and extensibility in order to facilitate research projects that involve new or unusual use cases. In this paper we give a brief history of the library, an overview of its basic philosophy, a summary of the library's architecture, and a description of how the Pylearn2 community functions socially.", + "In this paper we present the techniques used for the University of Montr\u00e9al's team submissions to the 2013 Emotion Recognition in the Wild Challenge. The challenge is to classify the emotions expressed by the primary human subject in short video clips extracted from feature length movies. This involves the analysis of video clips of acted scenes lasting approximately one-two seconds, including the audio track which may contain human voices as well as background music. 
Our approach combines multiple deep neural networks for different data modalities, including: (1) a deep convolutional neural network for the analysis of facial expressions within video frames; (2) a deep belief net to capture audio information; (3) a deep autoencoder to model the spatio-temporal information produced by the human actions depicted within the entire scene; and (4) a shallow network architecture focused on extracted features of the mouth of the primary human subject in the scene. We discuss each of these techniques, their performance characteristics and different strategies to aggregate their predictions. Our best single model was a convolutional neural network trained to predict emotions from static frames using two large data sets, the Toronto Face Database and our own set of faces images harvested from Google image search, followed by a per frame aggregation strategy that used the challenge training data. This yielded a test set accuracy of 35.58%. Using our best strategy for aggregating our top performing models into a single predictor we were able to produce an accuracy of 41.03% on the challenge test set. These compare favorably to the challenge baseline test set accuracy of 27.56%.", + "We introduce the multi-prediction deep Boltzmann machine (MP-DBM). The MP-DBM can be seen as a single probabilistic model trained to maximize a variational approximation to the generalized pseudolikelihood, or as a family of recurrent nets that share parameters and approximately solve different inference problems. Prior methods of training DBMs either do not perform well on classification tasks or require an initial learning pass that trains the DBM greedily, one layer at a time. 
The MP-DBM does not require greedy layerwise pretraining, and outperforms the standard DBM at classification, classification with missing inputs, and mean field prediction tasks.1", + "We consider the problem of designing models to leverage a recently introduced approximate model averaging technique called dropout. We define a simple new model called maxout (so named because its output is the max of a set of inputs, and because it is a natural companion to dropout) designed to both facilitate optimization by dropout and improve the accuracy of dropout's fast approximate model averaging technique. We empirically verify that the model successfully accomplishes both of these tasks. We use maxout and dropout to demonstrate state of the art classification performance on four benchmark datasets: MNIST, CIFAR-10, CIFAR-100, and SVHN." + ], + "domain": [ + "Deep Learning", + "Neural Networks", + "Emotion Recognition", + "Model Optimization" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "2b3f501a-7776-4f00-be4a-55aa19473b68": { + "pk": "2b3f501a-7776-4f00-be4a-55aa19473b68", + "name": "David Warde-Farley", + "bio": "I am a researcher deeply engaged in the intersection of machine learning and environmental science, particularly focusing on atmospheric data and reinforcement learning. My recent work has centered on developing innovative methods for compressing high-dimensional atmospheric states, enabling broader access to critical weather and climate data. By leveraging neural network architectures and advanced projection techniques, I have achieved impressive compression ratios while preserving essential features, such as extreme weather events.\n\nIn addition to atmospheric modeling, I have explored biases in machine learning models through the introduction of SkewSize, a novel metric that characterizes model mistakes across subgroups, enhancing our understanding of model performance. 
My research also extends to algorithm design, where I developed RbmSAT, an incomplete algorithm for Maximum Satisfiability tailored for neural network accelerators, demonstrating superior performance in competitive settings.\n\nMy interests also encompass unsupervised skill learning in reinforcement learning, where I have proposed methods like DISDAIN and RVIC to enhance exploration and skill diversity. These contributions aim to empower agents to learn effectively in complex environments without relying on external rewards.\n\nOverall, my work strives to bridge theoretical advancements with practical applications, ensuring that machine learning techniques can be effectively utilized in real-world scenarios, from climate science to autonomous agents. I am passionate about pushing the boundaries of what is possible in these fields and contributing to a deeper understanding of both machine learning and the earth system.", + "collaborators": [ + "S. Hansen", + "Volodymyr Mnih", + "Yoshua Bengio", + "Simon Osindero", + "Kate Baumli", + "T. Wiele", + "Dzmitry Bahdanau", + "J. Chorowski", + "Aaron C. Courville", + "Mehdi Mirza", + "C. Pal", + "Mihaela Rosca", + "Guillaume Desjardins", + "I. Goodfellow", + "Sherjil Ozair", + "Nicolas Boulanger-Lewandowski", + "Xavier Bouthillier", + "A. D. Br\u00e9bisson", + "Yann Dauphin", + "Laurent Dinh", + "Vincent Dumoulin", + "Samira Ebrahimi Kahou", + "Orhan Firat", + "\u00c7aglar G\u00fcl\u00e7ehre", + "S\u00e9bastien Jean", + "Pascal Lamblin", + "C\u00e9sar Laurent", + "R. Memisevic", + "B. V. Merrienboer", + "Vincent Michalski", + "M. Pezeshki", + "Dmitriy Serdyuk", + "Pascal Vincent", + "Ying Zhang", + "Piotr Mirowski", + "Matthew Koichi Grimes", + "Yana Hasson", + "Hyunjik Kim", + "M'elanie Rey", + "Suman V. Ravuri", + "Shakir Mohamed", + "Isabela Albuquerque", + "Jessica Schrouff", + "Ali Taylan Cemgil", + "Sven Gowal", + "Olivia Wiles", + "Vinod Nair", + "Yujia Li", + "Ivan Lobov", + "Felix Gimeno", + "N. Heess", + "D. 
Strouse", + "Vlad Mnih", + "A. Mnih", + "Will Dabney", + "Andr\u00e9 Barreto", + "Tejas D. Kulkarni", + "Catalin Ionescu", + "Jean Pouget-Abadie", + "Bing Xu", + "Balaji Lakshminarayanan", + "S. Mohamed", + "Rami Al-Rfou", + "Guillaume Alain", + "Amjad Almahairi", + "Christof Angerm\u00fcller", + "Nicolas Ballas", + "Fr\u00e9d\u00e9ric Bastien", + "Justin Bayer", + "A. Belikov", + "A. Belopolsky", + "Arnaud Bergeron", + "J. Bergstra", + "Valentin Bisson", + "Josh Bleecher Snyder", + "Nicolas Bouchard", + "Olivier Breuleux", + "P. Carrier", + "Kyunghyun Cho", + "P. Christiano", + "Tim Cooijmans", + "Marc-Alexandre C\u00f4t\u00e9", + "Myriam C\u00f4t\u00e9", + "Olivier Delalleau", + "Julien Demouth", + "S. Dieleman", + "M\u00e9lanie Ducoffe", + "D. Erhan", + "Ziye Fan", + "M. Germain", + "Xavier Glorot", + "M. Graham", + "P. Hamel", + "Iban Harlouchet", + "J. Heng", + "Bal\u00e1zs Hidasi", + "S. Honari", + "Arjun Jain", + "Kai Jia", + "Mikhail Korobov" + ], + "pub_titles": [ + "Neural Compression of Atmospheric States", + "Evaluating Model Bias Requires Characterizing its Mistakes", + "Solving MaxSAT with Matrix Multiplication", + "Entropic Desired Dynamics for Intrinsic Control", + "Learning more skills through optimistic exploration", + "Relative Variational Intrinsic Control", + "Q-Learning in enormous action spaces via amortized approximate maximization", + "Fast Task Inference with Variational Intrinsic Successor Features", + "Unsupervised Control Through Non-Parametric Discriminative Rewards", + "Generative Adversarial Networks for Image Steganography", + "Variational Approaches for Auto-Encoding Generative Adversarial Networks", + "Theano: A Python framework for fast computation of mathematical expressions", + "Improving Generative Adversarial Networks with Denoising Feature Matching", + "Blocks and Fuel: Frameworks for deep learning" + ], + "pub_abstracts": [ + "Atmospheric states derived from reanalysis comprise a substantial portion of weather and climate 
simulation outputs. Many stakeholders -- such as researchers, policy makers, and insurers -- use this data to better understand the earth system and guide policy decisions. Atmospheric states have also received increased interest as machine learning approaches to weather prediction have shown promising results. A key issue for all audiences is that dense time series of these high-dimensional states comprise an enormous amount of data, precluding all but the most well resourced groups from accessing and using historical data and future projections. To address this problem, we propose a method for compressing atmospheric states using methods from the neural network literature, adapting spherical data to processing by conventional neural architectures through the use of the area-preserving HEALPix projection. We investigate two model classes for building neural compressors: the hyperprior model from the neural image compression literature and recent vector-quantised models. We show that both families of models satisfy the desiderata of small average error, a small number of high-error reconstructed pixels, faithful reproduction of extreme events such as hurricanes and heatwaves, preservation of the spectral power distribution across spatial scales. We demonstrate compression ratios in excess of 1000x, with compression and decompression at a rate of approximately one second per global atmospheric state.", + "The ability to properly benchmark model performance in the face of spurious correlations is important to both build better predictors and increase confidence that models are operating as intended. We demonstrate that characterizing (as opposed to simply quantifying) model mistakes across subgroups is pivotal to properly reflect model biases, which are ignored by standard metrics such as worst-group accuracy or accuracy gap. 
Inspired by the hypothesis testing framework, we introduce SkewSize, a principled and flexible metric that captures bias from mistakes in a model's predictions. It can be used in multi-class settings or generalised to the open vocabulary setting of generative models. SkewSize is an aggregation of the effect size of the interaction between two categorical variables: the spurious variable representing the bias attribute and the model's prediction. We demonstrate the utility of SkewSize in multiple settings including: standard vision models trained on synthetic data, vision models trained on ImageNet, and large scale vision-and-language models from the BLIP-2 family. In each case, the proposed SkewSize is able to highlight biases not captured by other metrics, while also providing insights on the impact of recently proposed techniques, such as instruction tuning.", + "We propose an incomplete algorithm for Maximum Satisfiability (MaxSAT) specifically designed to run on neural network accelerators such as GPUs and TPUs. Given a MaxSAT problem instance in conjunctive normal form, our procedure constructs a Restricted Boltzmann Machine (RBM) with an equilibrium distribution wherein the probability of a Boolean assignment is exponential in the number of clauses it satisfies. Block Gibbs sampling is used to stochastically search the space of assignments with parallel Markov chains. Since matrix multiplication is the main computational primitive for block Gibbs sampling in an RBM, our approach leads to an elegantly simple algorithm (40 lines of JAX) well-suited for neural network accelerators. Theoretical results about RBMs guarantee that the required number of visible and hidden units of the RBM scale only linearly with the number of variables and constant-sized clauses in the MaxSAT instance, ensuring that the computational cost of a Gibbs step scales reasonably with the instance size. 
Search throughput can be increased by batching parallel chains within a single accelerator as well as by distributing them across multiple accelerators. As a further enhancement, a heuristic based on unit propagation running on CPU is periodically applied to the sampled assignments. Our approach, which we term RbmSAT, is a new design point in the algorithm-hardware co-design space for MaxSAT. We present timed results on a subset of problem instances from the annual MaxSAT Evaluation's Incomplete Unweighted Track for the years 2018 to 2021. When allotted the same running time and CPU compute budget (but no TPUs), RbmSAT outperforms other participating solvers on problems drawn from three out of the four years' competitions. Given the same running time on a TPU cluster for which RbmSAT is uniquely designed, it outperforms all solvers on problems drawn from all four years.", + "An agent might be said, informally, to have mastery of its environment when it has maximised the effective number of states it can reliably reach. In practice, this often means maximizing the number of latent codes that can be discriminated from future states under some short time horizon (e.g. [15]). By situating these latent codes in a globally consistent coordinate system, we show that agents can reliably reach more states in the long term while still optimizing a local objective. A simple instantiation of this idea, E ntropic D esired D ynamics for I ntrinsic C on T rol (EDDICT), assumes \ufb01xed additive latent dynamics, which results in tractable learning and an interpretable latent space. 
Compared to prior methods, EDDICT\u2019s globally consistent codes allow it to be far more exploratory, as demonstrated by improved state coverage and increased unsupervised performance on hard exploration games such as Montezuma\u2019s Revenge.", + "Unsupervised skill learning objectives (Gregor et al., 2016, Eysenbach et al., 2018) allow agents to learn rich repertoires of behavior in the absence of extrinsic rewards. They work by simultaneously training a policy to produce distinguishable latent-conditioned trajectories, and a discriminator to evaluate distinguishability by trying to infer latents from trajectories. The hope is for the agent to explore and master the environment by encouraging each skill (latent) to reliably reach different states. However, an inherent exploration problem lingers: when a novel state is actually encountered, the discriminator will necessarily not have seen enough training data to produce accurate and confident skill classifications, leading to low intrinsic reward for the agent and effective penalization of the sort of exploration needed to actually maximize the objective. To combat this inherent pessimism towards exploration, we derive an information gain auxiliary objective that involves training an ensemble of discriminators and rewarding the policy for their disagreement. Our objective directly estimates the epistemic uncertainty that comes from the discriminator not having seen enough training examples, thus providing an intrinsic reward more tailored to the true objective compared to pseudocount-based methods (Burda et al., 2019). We call this exploration bonus discriminator disagreement intrinsic reward, or DISDAIN. We demonstrate empirically that DISDAIN improves skill learning both in a tabular grid world (Four Rooms) and the 57 games of the Atari Suite (from pixels). 
Thus, we encourage researchers to treat pessimism with DISDAIN.", + "In the absence of external rewards, agents can still learn useful behaviors by identifying and mastering a set of diverse skills within their environment. Existing skill learning methods use mutual information objectives to incentivize each skill to be diverse and distinguishable from the rest. However, if care is not taken to constrain the ways in which the skills are diverse, trivially diverse skill sets can arise. To ensure useful skill diversity, we propose a novel skill learning objective, Relative Variational Intrinsic Control (RVIC), which incentivizes learning skills that are distinguishable in how they change the agent's relationship to its environment. The resulting set of skills tiles the space of affordances available to the agent. We qualitatively analyze skill behaviors on multiple environments and show how RVIC skills are more useful than skills discovered by existing methods in hierarchical reinforcement learning.", + "Applying Q-learning to high-dimensional or continuous action spaces can be difficult due to the required maximization over the set of possible actions. Motivated by techniques from amortized inference, we replace the expensive maximization over all actions with a maximization over a small subset of possible actions sampled from a learned proposal distribution. The resulting approach, which we dub Amortized Q-learning (AQL), is able to handle discrete, continuous, or hybrid action spaces while maintaining the benefits of Q-learning. Our experiments on continuous control tasks with up to 21 dimensional actions show that AQL outperforms D3PG (Barth-Maron et al, 2018) and QT-Opt (Kalashnikov et al, 2018). 
Experiments on structured discrete action spaces demonstrate that AQL can efficiently learn good policies in spaces with thousands of discrete actions.", + "It has been established that diverse behaviors spanning the controllable subspace of an Markov decision process can be trained by rewarding a policy for being distinguishable from other policies \\citep{gregor2016variational, eysenbach2018diversity, warde2018unsupervised}. However, one limitation of this formulation is generalizing behaviors beyond the finite set being explicitly learned, as is needed for use on subsequent tasks. Successor features \\citep{dayan93improving, barreto2017successor} provide an appealing solution to this generalization problem, but require defining the reward function as linear in some grounded feature space. In this paper, we show that these two techniques can be combined, and that each method solves the other's primary limitation. To do so we introduce Variational Intrinsic Successor FeatuRes (VISR), a novel algorithm which learns controllable features that can be leveraged to provide enhanced generalization and fast task inference through the successor feature framework. We empirically validate VISR on the full Atari suite, in a novel setup wherein the rewards are only exposed briefly after a long unsupervised phase. Achieving human-level performance on 14 games and beating all baselines, we believe VISR represents a step towards agents that rapidly learn from limited feedback.", + "Learning to control an environment without hand-crafted rewards or expert data remains challenging and is at the frontier of reinforcement learning research. We present an unsupervised learning algorithm to train agents to achieve perceptually-specified goals using only a stream of observations and actions. Our agent simultaneously learns a goal-conditioned policy and a goal achievement reward function that measures how similar a state is to the goal state. 
This dual optimization leads to a co-operative game, giving rise to a learned reward function that reflects similarity in controllable aspects of the environment instead of distance in the space of observations. We demonstrate the efficacy of our agent to learn, in an unsupervised manner, to reach a diverse set of goals on three domains -- Atari, the DeepMind Control Suite and DeepMind Lab.", + "Steganography is a collection of methods to hide secret information (\"payload\") within non-secret information (\"container\"). Its counterpart, Steganalysis, is the practice of determining if a message contains a hidden payload, and recovering it if possible. Presence of hidden payloads is typically detected by a binary classifier. In the present study, we propose a new model for generating image-like containers based on Deep Convolutional Generative Adversarial Networks (DCGAN). This approach allows to generate more steganalysis-secure message embedding using standard steganography algorithms. Experiment results demonstrate that the new model successfully deceives the steganography analyzer, and for this reason, can be used in steganographic applications.", + "Auto-encoding generative adversarial networks (GANs) combine the standard GAN algorithm, which discriminates between real and model-generated data, with a reconstruction loss given by an auto-encoder. Such models aim to prevent mode collapse in the learned generative model by ensuring that it is grounded in all the available training data. In this paper, we develop a principle upon which auto-encoders can be combined with generative adversarial networks by exploiting the hierarchical structure of the generative model. 
The underlying principle shows that variational inference can be used as a basic tool for learning, but with the intractable likelihood replaced by a synthetic likelihood, and the unknown posterior distribution replaced by an implicit distribution; both synthetic likelihoods and implicit posterior distributions can be learned using discriminators. This allows us to develop a natural fusion of variational auto-encoders and generative adversarial networks, combining the best of both these methods. We describe a unified objective for optimization, discuss the constraints needed to guide learning, connect to the wide range of existing work, and use a battery of tests to systematically and quantitatively assess the performance of our method.", + "Theano is a Python library that allows to define, optimize, and evaluate mathematical expressions involving multi-dimensional arrays efficiently. Since its introduction, it has been one of the most used CPU and GPU mathematical compilers - especially in the machine learning community - and has shown steady performance improvements. Theano is being actively and continuously developed since 2008, multiple frameworks have been built on top of it and it has been used to produce many state-of-the-art machine learning models. The present article is structured as follows. Section I provides an overview of the Theano software and its community. Section II presents the principal features of Theano and how to use them, and compares them with other similar projects. Section III focuses on recently-introduced functionalities and improvements. Section IV compares the performance of Theano against Torch7 and TensorFlow on several machine learning models. 
Section V discusses current limitations of Theano and potential ways of improving it.", + "We propose an augmented training procedure for generative adversarial networks designed to address shortcomings of the original by directing the generator towards probable configurations of abstract discriminator features. We estimate and track the distribution of these features, as computed from data, with a denoising auto-encoder, and use it to propose high-level targets for the generator. We combine this new loss with the original and evaluate the hybrid criterion on the task of unsupervised image synthesis from datasets comprising a diverse set of visual categories, noting a qualitative and quantitative improvement in the \u201cobjectness\u201d of the resulting samples.", + "We introduce two Python frameworks to train neural networks on large datasets: Blocks and Fuel. Blocks is based on Theano, a linear algebra compiler with CUDA-support (Bastien et al., 2012; Bergstra et al., 2010). It facilitates the training of complex neural network models by providing parametrized Theano operations, attaching metadata to Theano\u2019s symbolic computational graph, and providing an extensive set of utilities to assist training the networks, e.g. training algorithms, logging, monitoring, visualization, and serialization. Fuel provides a standard format for machine learning datasets. It allows the user to easily iterate over large datasets, performing many types of pre-processing on the fly." + ], + "domain": [ + "Machine Learning", + "Reinforcement Learning", + "Neural Networks", + "Generative Models" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "e72df3d5-fc04-440f-88f0-974acb1929a2": { + "pk": "e72df3d5-fc04-440f-88f0-974acb1929a2", + "name": "Sherjil Ozair", + "bio": "I am a researcher dedicated to advancing the fields of generative models, reinforcement learning, and representation learning. 
My recent work includes the development of Genie, an innovative generative interactive environment that leverages unlabelled Internet videos to create dynamic virtual worlds. This foundation model, with its 11 billion parameters, allows users to engage with generated environments in unprecedented ways.\n\nI have also focused on enhancing low-resource language models, particularly for Thai, by creating a synthetic data framework that emphasizes fluency, diversity, and cultural context. This work demonstrates that effective instruction-tuning can be achieved with minimal data, significantly improving performance compared to traditional methods.\n\nIn the realm of reinforcement learning, I have contributed to the establishment of benchmarks like AlphaStar Unplugged, which challenges offline RL algorithms in complex environments such as StarCraft II. My research extends to model-based approaches, where I introduced Stochastic MuZero, a method that incorporates stochastic models for improved planning in uncertain environments.\n\nAdditionally, I have explored the intricacies of mutual information in representation learning, proposing novel methods to enhance generalization and data efficiency. My work on DeepNash has pushed the boundaries of AI in imperfect information games, achieving human expert-level performance in Stratego.\n\nOverall, my research aims to bridge the gap between theoretical advancements and practical applications, fostering the development of intelligent agents capable of navigating complex, real-world scenarios.", + "collaborators": [ + "A\u00e4ron van den Oord", + "Yoshua Bengio", + "Ioannis Antonoglou", + "Julian Schrittwieser", + "David Silver", + "Yazhe Li", + "Alex Lamb", + "R. Devon Hjelm", + "Konrad Zolna", + "Nando de Freitas", + "Satinder Singh", + "Erica Moreira", + "L. Sifre", + "Petko Georgiev", + "O. Vinyals", + "Ali Razavi", + "Mina Khan", + "Ankesh Anand", + "Vikas Verma", + "David Ha", + "Ben Poole", + "Alexander A. Alemi", + "G. 
Tucker", + "Aaron C. Courville", + "Jake Bruce", + "Michael D. Dennis", + "Ashley Edwards", + "Jack Parker-Holder", + "Yuge Shi", + "Edward Hughes", + "Matthew Lai", + "Aditi Mavalankar", + "Richie Steigerwald", + "Chris Apps", + "Y. Aytar", + "Sarah Bechtle", + "Feryal M. P. Behbahani", + "Stephanie Chan", + "N. Heess", + "Lucy Gonzalez", + "Simon Osindero", + "Scott Reed", + "Jingwei Zhang", + "Jeff Clune", + "Tim Rocktaschel", + "Parinthapat Pengpun", + "Can Udomcharoenchaikit \u2020 Weerayut Buaphet", + "Peerat Limkonchotiwat", + "Xuan-Phi Nguyen", + "Wenxuan Zhang", + "Xin Li", + "Mahani Aljunied", + "Qingyu Tan", + "Liying Cheng", + "Guanzheng Chen", + "Yue Deng", + "Sen Yang", + "Chaoqun Liu", + "Haim-ing Bao", + "Mo Bavarian", + "J. Belgum", + "Ir-wan Bello", + "Jake Berdine", + "Gabriel Bernadett-Shapiro", + "Christopher Berner", + "Lenny Bogdonoff", + "Oleg Boiko", + "Made-laine Boyd", + "Anna-Luisa Brakman", + "Greg Brock-man", + "Tim Brooks", + "M. Brundage", + "Kevin Button", + "Trevor Cai", + "Rosie Campbell", + "Andrew Cann", + "Brittany Carey", + "Chelsea Carlson", + "Rory Carmichael", + "Brooke Chan", + "Che Chang", + "Fotis Chantzis", + "Derek Chen", + "Sully Chen", + "Ruby Chen", + "Jason Chen", + "Mark Chen", + "B. 
Chess", + "Chester Cho", + "Hyung Casey Chu", + "Won Chung", + "Dave Cummings", + "Jeremiah Currier", + "Yunxing Dai", + "Tarun Goel", + "Gabriel Gogineni", + "Rapha Goh", + "Jonathan Gontijo-Lopes", + "Morgan Gordon", + "Scott Grafstein" + ], + "pub_titles": [ + "Genie: Generative Interactive Environments", + "Seed-Free Synthetic Data Generation Framework for Instruction-Tuning LLMs: A Case Study in Thai", + "AlphaStar Unplugged: Large-Scale Offline Reinforcement Learning", + "[Re] Explaining in Style: Training a GAN to explain a classifier in StyleSpace", + "Model-based language-instructed reinforcement learning", + "Planning in Stochastic Environments with a Learned Model", + "Mastering the game of Stratego with model-free multiagent reinforcement learning", + "Pretrained Encoders are All You Need", + "Procedural Generalization by Planning with Self-Supervised World Models", + "Vector Quantized Models for Planning", + "SketchTransfer: A New Dataset for Exploring Detail-Invariance and the Abstractions Learned by Deep Networks", + "Unsupervised State Representation Learning in Atari", + "On Variational Bounds of Mutual Information", + "The Journey is the Reward: Unsupervised Learning of Influential Trajectories", + "SketchTransfer: A Challenging New Task for Exploring Detail-Invariance and the Abstractions Learned by Deep Networks", + "Wasserstein Dependency Measure for Representation Learning", + "Learning Generative Models with Locally Disentangled Latent Factors", + "Mutual Information Neural Estimation", + "On variational lower bounds of mutual information", + "Generative Adversarial Networks for Image Steganography" + ], + "pub_abstracts": [ + "We introduce Genie, the first generative interactive environment trained in an unsupervised manner from unlabelled Internet videos. The model can be prompted to generate an endless variety of action-controllable virtual worlds described through text, synthetic images, photographs, and even sketches. 
At 11B parameters, Genie can be considered a foundation world model. It is comprised of a spatiotemporal video tokenizer, an autoregressive dynamics model, and a simple and scalable latent action model. Genie enables users to act in the generated environments on a frame-by-frame basis despite training without any ground-truth action labels or other domain-specific requirements typically found in the world model literature. Further the resulting learned latent action space facilitates training agents to imitate behaviors from unseen videos, opening the path for training generalist agents of the future.", + "We present a synthetic data approach for instruction-tuning large language models (LLMs) for low-resource languages in a data-efficient manner, specifically focusing on Thai. We identify three key properties that contribute to the effectiveness of instruction-tuning datasets: fluency, diversity, and cultural context. We propose a seed-data-free framework for generating synthetic instruction-tuning data that incorporates these essential properties. Our framework employs an LLM to generate diverse topics, retrieve relevant contexts from Wikipedia, and create instructions for various tasks, such as question answering, summarization, and conversation. The experimental results show that our best-performing synthetic dataset, which incorporates all three key properties, achieves competitive performance using only 5,000 instructions when compared to state-of-the-art Thai LLMs trained on hundreds of thousands of instructions. Our code and dataset are publicly available at https://github.com/parinzee/seed-free-synthetic-instruct.", + "StarCraft II is one of the most challenging simulated reinforcement learning environments; it is partially observable, stochastic, multi-agent, and mastering StarCraft II requires strategic planning over long time horizons with real-time low-level execution. It also has an active professional competitive scene. 
StarCraft II is uniquely suited for advancing offline RL algorithms, both because of its challenging nature and because Blizzard has released a massive dataset of millions of StarCraft II games played by human players. This paper leverages that and establishes a benchmark, called AlphaStar Unplugged, introducing unprecedented challenges for offline reinforcement learning. We define a dataset (a subset of Blizzard's release), tools standardizing an API for machine learning methods, and an evaluation protocol. We also present baseline agents, including behavior cloning, offline variants of actor-critic and MuZero. We improve the state of the art of agents using only offline data, and we achieve 90% win rate against previously published AlphaStar behavior cloning agent.", + "StylEx is an approach for classifier-conditioned training of a StyleGAN2 [6], intending to capture classifier-specific 3 attributes in its disentangled StyleSpace [15]. Attributes can be adjusted to generate counterfactual explanations of 4 the classifier decisions. StylEx is domain and classifier-agnostic, while its explanations are claimed to be human5 interpretable, distinct, coherent and sufficient to produce flipped classifier decisions. We verify these claims by 6 reproducing a selection of the experiments in the paper. 7", + "We explore how we can build accurate world 001 models which are partially specified by lan-002 guage and how we can plan with them in the 003 face of novelty and uncertainty. We propose the 004 first Model-Based Reinforcement Learning ap-005 proach to tackle the environment Read To Fight 006 Monsters (Zhong et al., 2019), a grounded 007 policy learning problem. In RTFM an agent 008 has to reason over a set of rules and a goal, 009 both described in a language manual, and the 010 observations, while taking into account the 011 uncertainty arising from the stochasticity of 012 the environment, in order to generalize suc-013 cessfully its policy to test episodes. 
We provide a sample-efficient proof-of-concept of the model-based approach for the basic dynamic task of RTFM. Furthermore, we show that the main open challenge of RTFM is learning the language-dependent reward function and suggest that future research should focus primarily on that task.", + "Model-based reinforcement learning has proven highly successful. However, learning a model in isolation from its use during planning is problematic in complex environments. To date, the most effective techniques have instead combined value-equivalent model learning with powerful tree-search methods. This approach is exemplified by MuZero, which has achieved state-of-the-art performance in a wide range of domains, from board games to visually rich environments, with discrete and continuous action spaces, in online and offline settings. However, previous instantiations of this approach were limited to the use of deterministic models. This limits their performance in environments that are inherently stochastic, partially observed, or so large and complex that they appear stochastic to a finite agent. In this paper we extend this approach to learn and plan with stochastic models. Specifically, we introduce a new algorithm, Stochastic MuZero, that learns a stochastic model incorporating afterstates, and uses this model to perform a stochastic tree search. Stochastic MuZero matched or exceeded the state of the art in a set of canonical single and multi-agent environments, including 2048 and backgammon, while maintaining the superhuman performance of standard MuZero in the game of Go.", + "We introduce DeepNash, an autonomous agent that plays the imperfect information game Stratego at a human expert level. Stratego is one of the few iconic board games that artificial intelligence (AI) has not yet mastered. 
It is a game characterized by a twin challenge: It requires long-term strategic thinking as in chess, but it also requires dealing with imperfect information as in poker. The technique underpinning DeepNash uses a game-theoretic, model-free deep reinforcement learning method, without search, that learns to master Stratego through self-play from scratch. DeepNash beat existing state-of-the-art AI methods in Stratego and achieved a year-to-date (2022) and all-time top-three ranking on the Gravon games platform, competing with human expert players. Description Machine learning to play Stratego Stratego is a popular two-player imperfect information board game. Because of its complexity stemming from its enormous game tree, decision-making under imperfect information, and a piece deployment phase at the start, Stratego poses a challenge for artificial intelligence (AI). Previous computer programs only performed at an amateur level at best. Perolat et al. introduce a model-free multiagent reinforcement learning methodology and show that it can achieve human expert\u2013level performance in Stratego. The present work not only adds to the growing list of games that AI systems can play as well or even better than humans but may also facilitate further applications of reinforcement learning methods in real-world, large-scale multiagent problems that are characterized by imperfect information and thus are currently unsolvable. \u2014YS Reinforcement learning achieves human expert\u2013level performance in the large-scale imperfect information board game Stratego.", + "Data-efficiency and generalization are key challenges in deep learning and deep reinforcement learning as many models are trained on large-scale, domain-specific, and expensive-to-label datasets. Self-supervised models trained on large-scale uncurated datasets have shown successful transfer to diverse settings. 
We investigate using pretrained image representations and spatio-temporal attention for state representation learning in Atari. We also explore fine-tuning pretrained representations with self-supervised techniques, i.e., contrastive predictive coding, spatio-temporal contrastive learning, and augmentations. Our results show that pretrained representations are at par with state-of-the-art self-supervised methods trained on domain-specific data. Pretrained representations, thus, yield data and compute-efficient state representations. https://github.com/PAL-ML/PEARL_v1", + "One of the key promises of model-based reinforcement learning is the ability to generalize using an internal model of the world to make predictions in novel environments and tasks. However, the generalization ability of model-based agents is not well understood because existing work has focused on model-free agents when benchmarking generalization. Here, we explicitly measure the generalization ability of model-based agents in comparison to their model-free counterparts. We focus our analysis on MuZero (Schrittwieser et al., 2020), a powerful model-based agent, and evaluate its performance on both procedural and task generalization. We identify three factors of procedural generalization -- planning, self-supervised representation learning, and procedural data diversity -- and show that by combining these techniques, we achieve state-of-the art generalization performance and data efficiency on Procgen (Cobbe et al., 2019). However, we find that these factors do not always provide the same benefits for the task generalization benchmarks in Meta-World (Yu et al., 2019), indicating that transfer remains a challenge and may require different approaches than procedural generalization. 
Overall, we suggest that building generalizable agents requires moving beyond the single-task, model-free paradigm and towards self-supervised model-based agents that are trained in rich, procedural, multi-task environments.", + "Recent developments in the field of model-based RL have proven successful in a range of environments, especially ones where planning is essential. However, such successes have been limited to deterministic fully-observed environments. We present a new approach that handles stochastic and partially-observable environments. Our key insight is to use discrete autoencoders to capture the multiple possible effects of an action in a stochastic environment. We use a stochastic variant of Monte Carlo tree search to plan over both the agent's actions and the discrete latent variables representing the environment's response. Our approach significantly outperforms an offline version of MuZero on a stochastic interpretation of chess where the opponent is considered part of the environment. We also show that our approach scales to DeepMind Lab, a first-person 3D environment with large visual observations and partial observability.", + "Deep networks have achieved excellent results in perceptual tasks, yet their ability to generalize to variations not seen during training has come under increasing scrutiny. In this work we focus on their ability to have invariance towards the presence or absence of details. For example, humans are able to watch cartoons, which are missing many visual details, without being explicitly trained to do so. As another example, 3D rendering software is a relatively recent development, yet people are able to understand such rendered scenes even though they are missing details (consider a film like Toy Story). This capability goes beyond visual data: humans are easily able to recognize isolated melodies from musical pieces when heard for the first time, even if the only piece they've listened to previously is from an orchestra. 
Thus the failure of machine learning algorithms to do this indicates a significant gap in generalization between human abilities and the abilities of deep networks. We propose a dataset that will make it easier to study the detail-invariance problem concretely. We produce a concrete task for this: SketchTransfer, and we show that state-of-the-art domain transfer algorithms still struggle with this task. The state-of-the-art technique which achieves over 95% on MNIST \u2192 SVHN transfer only achieves 59% accuracy on the SketchTransfer task, which is much better than random (11% accuracy) but falls short of the 87% accuracy of a classifier trained directly on labeled sketches. This indicates that this task is approachable with today's best methods but has substantial room for improvement.", + "State representation learning, or the ability to capture latent generative factors of an environment, is crucial for building intelligent agents that can perform a wide variety of tasks. Learning such representations without supervision from rewards is a challenging open problem. We introduce a method that learns state representations by maximizing mutual information across spatially and temporally distinct features of a neural encoder of the observations. We also introduce a new benchmark based on Atari 2600 games where we evaluate representations based on how well they capture the ground truth state variables. We believe this new framework for evaluating representation learning models will be crucial for future representation learning research. Finally, we compare our technique with other state-of-the-art generative and contrastive representation learning methods. The code associated with this work is available at this https URL", + "Estimating and optimizing Mutual Information (MI) is core to many problems in machine learning; however, bounding MI in high dimensions is challenging. 
To establish tractable and scalable objectives, recent work has turned to variational bounds parameterized by neural networks, but the relationships and tradeoffs between these bounds remains unclear. In this work, we unify these recent developments in a single framework. We find that the existing variational lower bounds degrade when the MI is large, exhibiting either high bias or high variance. To address this problem, we introduce a continuum of lower bounds that encompasses previous bounds and flexibly trades off bias and variance. On high-dimensional, controlled problems, we empirically characterize the bias and variance of the bounds and their gradients and demonstrate the effectiveness of our new bounds for estimation and representation learning.", + "Unsupervised exploration and representation learning become increasingly important when learning in diverse and sparse environments. The information-theoretic principle of empowerment formalizes an unsupervised exploration objective through an agent trying to maximize its influence on the future states of its environment. Previous approaches carry certain limitations in that they either do not employ closed-loop feedback or do not have an internal state. As a consequence, a privileged final state is taken as an influence measure, rather than the full trajectory. We provide a model-free method which takes into account the whole trajectory while still offering the benefits of option-based approaches. We successfully apply our approach to settings with large action spaces, where discovery of meaningful action sequences is particularly difficult.", + "Deep networks have achieved excellent results in perceptual tasks, yet their ability to generalize to variations not seen during training has come under increasing scrutiny. In this work we focus on their ability to have invariance towards the presence or absence of details. 
For example, humans are able to watch cartoons, which are missing many visual details, without being explicitly trained to do so. As another example, 3D rendering software is a relatively recent development, yet people are able to understand such rendered scenes even though they are missing details (consider a film like Toy Story). The failure of machine learning algorithms to do this indicates a significant gap in generalization between human abilities and the abilities of deep networks. We propose a dataset that will make it easier to study the detail-invariance problem concretely. We produce a concrete task for this: SketchTransfer, and we show that state-of-the-art domain transfer algorithms still struggle with this task. The state-of-the-art technique which achieves over 95% on MNIST \u2192 SVHN transfer only achieves 59% accuracy on the SketchTransfer task, which is much better than random (11% accuracy) but falls short of the 87% accuracy of a classifier trained directly on labeled sketches. This indicates that this task is approachable with today\u2019s best methods but has substantial room for improvement.", + "Mutual information maximization has emerged as a powerful learning objective for unsupervised representation learning obtaining state-of-the-art performance in applications such as object recognition, speech recognition, and reinforcement learning. However, such approaches are fundamentally limited since a tight lower bound of mutual information requires sample size exponential in the mutual information. This limits the applicability of these approaches for prediction tasks with high mutual information, such as in video understanding or reinforcement learning. In these settings, such techniques are prone to overfit, both in theory and in practice, and capture only a few of the relevant factors of variation. This leads to incomplete representations that are not optimal for downstream tasks. 
In this work, we empirically demonstrate that mutual information-based representation learning approaches do fail to learn complete representations on a number of designed and real-world tasks. To mitigate these problems we introduce the Wasserstein dependency measure, which learns more complete representations by using the Wasserstein distance instead of the KL divergence in the mutual information estimator. We show that a practical approximation to this theoretically motivated solution, constructed using Lipschitz constraint techniques from the GAN literature, achieves substantially improved results on tasks where incomplete representations are a major challenge.", + "One of the most successful techniques in generative models has been decomposing a complicated generation task into a series of simpler generation tasks. For example, generating an image at a low resolution and then learning to refine that into a high resolution image often improves results substantially. Here we explore a novel strategy for decomposing generation for complicated objects in which we first generate latent variables which describe a subset of the observed variables, and then map from these latent variables to the observed space. We show that this allows us to achieve decoupled training of complicated generative models and present both theoretical and experimental results supporting the benefit of such an approach.", + "We argue that the estimation of mutual information between high dimensional continuous random variables can be achieved by gradient descent over neural networks. We present a Mutual Information Neural Estimator (MINE) that is linearly scalable in dimensionality as well as in sample size, trainable through back-prop, and strongly consistent. We present a handful of applications on which MINE can be used to minimize or maximize mutual information. We apply MINE to improve adversarially trained generative models. 
We also use MINE to implement Information Bottleneck, applying it to supervised classification; our results demonstrate substantial improvement in flexibility and performance in these settings.", + "Estimating and maximizing mutual information (MI) is core to many objectives in machine learning, but tractably lower bounding MI in high dimensions is challenging. Recent work has introduced variational lower bounds with neural networks to attack this problem, but the tradeoffs and relationships between these techniques remain unclear. Here, we present several results that begin to demystify these techniques: we show that the bias-corrected gradient in MINE (Belghazi et al., 2018) can be derived as an unbiased gradient of a new lower bound on MI, present a stabler Jensen-Shannon-based training algorithm for the critic, provide a new interpretation of contrastive predictive coding (CPC, van den Oord et al. (2018)) and prove this variant is a lower bound on MI, and demonstrate the batch-size dependence of CPC. Empirically, we show that the effectiveness of these bounds depends on properties of the data being modeled and the structure of the critic, with no one bound uniformly dominating.", + "Steganography is a collection of methods to hide secret information (\"payload\") within non-secret information (\"container\"). Its counterpart, Steganalysis, is the practice of determining if a message contains a hidden payload, and recovering it if possible. Presence of hidden payloads is typically detected by a binary classifier. In the present study, we propose a new model for generating image-like containers based on Deep Convolutional Generative Adversarial Networks (DCGAN). This approach allows to generate more steganalysis-secure message embedding using standard steganography algorithms. Experiment results demonstrate that the new model successfully deceives the steganography analyzer, and for this reason, can be used in steganographic applications."
+ ], + "domain": [ + "Generative Models", + "Reinforcement Learning", + "Representation Learning", + "Machine Learning" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "889412ee-3cc2-4643-8f4b-270de4eee311": { + "pk": "889412ee-3cc2-4643-8f4b-270de4eee311", + "name": "Yoshua Bengio", + "bio": "I am a researcher deeply engaged in the exploration of neural networks and their applications across various domains, particularly focusing on the challenges of catastrophic forgetting, generative models, and deep learning methodologies. My work has investigated the nuances of how neural networks retain knowledge when transitioning between tasks, revealing that dropout algorithms consistently outperform others in balancing the retention of old tasks while adapting to new ones.\n\nI have also contributed to advancements in scene classification by leveraging object detection features, demonstrating significant improvements in accuracy while reducing dimensionality. My research extends to the realm of recommendation systems, where I developed calibration techniques to mitigate biases in off-policy evaluations, particularly in video game matchmaking.\n\nIn the generative modeling space, I introduced multimodal transition distributions for Generative Stochastic Networks (GSNs), enhancing their ability to capture complex data distributions. My work on deep recurrent neural networks (RNNs) has led to novel architectures that improve performance in tasks like polyphonic music prediction and language modeling.\n\nAdditionally, I have explored the intersection of deep learning and affective computing, aiming to model emotions through advanced AI techniques. 
My commitment to making deep learning accessible is reflected in my tutorials, which demystify complex algorithms for natural language processing.\n\nOverall, my research is driven by a passion for understanding and improving the capabilities of machine learning models, with a focus on practical applications and theoretical advancements that push the boundaries of what is possible in AI.", + "collaborators": [ + "Aaron C. Courville", + "I. Goodfellow", + "Mehdi Mirza", + "\u00c7aglar G\u00fcl\u00e7ehre", + "Razvan Pascanu", + "Pascal Vincent", + "Eric Thibodeau-Laufer", + "Raul Chandias Ferrari", + "Kyunghyun Cho", + "Vincent Dumoulin", + "L. Yao", + "David Warde-Farley", + "Pascal Lamblin", + "Yann Dauphin", + "Xia Da", + "Gr\u00e9goire Mesnil", + "Salah Rifai", + "Antoine Bordes", + "Xavier Glorot", + "Li Yao", + "Olivier Delalleau", + "Sherjil Ozair", + "J. Bergstra", + "Fr\u00e9d\u00e9ric Bastien", + "H. P. Mart\u00ednez", + "Georgios N. Yannakakis", + "Samira Ebrahimi Kahou", + "C. Pal", + "Xavier Bouthillier", + "Pierre Froumenty", + "R. Memisevic", + "S\u00e9bastien Jean", + "P. Carrier", + "Nicolas Boulanger-Lewandowski", + "Abhishek Aggarwal", + "Jeremie Zumer", + "Jean-Philippe Raymond", + "Guillaume Desjardins", + "Atousa Torabi", + "Arjun Sharma", + "Emmanuel Bengio", + "K. Konda", + "Zhenzhou Wu", + "R. Socher", + "Christopher D. Manning", + "Guillaume Alain", + "J. 
Yosinski", + "Nicholas L\u00e9onard" + ], + "pub_titles": [ + "An Empirical Investigation of Catastrophic Forgeting in Gradient-Based Neural Networks", + "Unsupervised and Transfer Learning under Uncertainty - From Object Detections to Scene Categorization", + "Stacked calibration of off-policy policy evaluation for video game matchmaking", + "On the Challenges of Physical Implementations of RBMs", + "Multimodal Transitions for Generative Stochastic Networks", + "Pylearn2: a machine learning research library", + "Big Neural Networks Waste Capacity", + "Estimating or Propagating Gradients Through Stochastic Neurons", + "Learning deep physiological models of affect", + "Scaling Up Spike-and-Slab Models for Unsupervised Feature Learning", + "Combining modality specific deep neural networks for emotion recognition in video", + "Knowledge Matters: Importance of Prior Information for Optimization", + "How to Construct Deep Recurrent Neural Networks", + "Multi-Prediction Deep Boltzmann Machines", + "Deep Learning for NLP (without Magic)", + "Bounding the Test Log-Likelihood of Generative Models", + "Deep Generative Stochastic Networks Trainable by Backprop", + "Estimating or Propagating Gradients Through Stochastic Neurons for Conditional Computation" + ], + "pub_abstracts": [ + "Catastrophic forgetting is a problem faced by many machine learning models and algorithms. When trained on one task, then trained on a second task, many machine learning models \"forget\" how to perform the first task. This is widely believed to be a serious problem for neural networks. Here, we investigate the extent to which the catastrophic forgetting problem occurs for modern neural networks, comparing both established and recent gradient-based training algorithms and activation functions. We also examine the effect of the relationship between the first task and the second task on catastrophic forgetting. 
We find that it is always best to train using the dropout algorithm--the dropout algorithm is consistently best at adapting to the new task, remembering the old task, and has the best tradeoff curve between these two extremes. We find that different tasks and relationships between tasks result in very different rankings of activation function performance. This suggests the choice of activation function should always be cross-validated.", + "Classifying scenes (e.g. into \u201cstreet\u201d, \u201chome\u201d or \u201cleisure\u201d) is an important but complicated task nowadays, because images come with variability, ambiguity, and a wide range of illumination or scale conditions. Standard approaches build an intermediate representation of the global image and learn classifiers on it. Recently, it has been proposed to depict an image as an aggregation of its contained objects:the representation on which classifiers are trained is composed of many heterogeneous feature vectors derived from various object detectors. In this paper, we propose to study different approaches to efficiently combine the data extracted by these detectors. We use the features provided by Object-Bank (Li-Jia Li and Fei-Fei, 2010a) (177 different object detectors producing 252 attributes each), and show on several benchmarks for scene categorization that careful combinations, taking into account the structure of the data, allows to greatly improve over original results (from +5% to +11%) while drastically reducing the dimensionality of the representation by 97% (from", + "We consider an industrial strength application of recommendation systems for video-game matchmaking in which off-policy policy evaluation is important but where standard approaches can hardly be applied. The objective of the policy is to sequentially form teams of players from those waiting to be matched, in such a way as to produce well-balanced matches. 
Unfortunately, the available training data comes from a policy that is not known perfectly and that is not stochastic, making it impossible to use methods based on importance weights. Furthermore, we observe that when the estimated reward function and the policy are obtained by training from the same off-policy dataset, the policy evaluation using the estimated reward function is biased. We present a simple calibration procedure that is similar to stacked regression and that removes most of the bias, in the experiments we performed. Data collected during beta tests of Ghost Recon Online, a first person shooter from Ubisoft, were used for the experiments.", + " Restricted Boltzmann machines (RBMs) are powerful machine learning models, but learning and some kinds of inference in the model require sampling-based approximations, which, in classical digital computers, are implemented using expensive MCMC. Physical computation offers the opportunity to reduce the cost of sampling by building physical systems whose natural dynamics correspond to drawing samples from the desired RBM distribution. Such a system avoids the burn-in and mixing cost of a Markov chain. However, hardware implementations of this variety usually entail limitations such as low-precision and limited range of the parameters and restrictions on the size and topology of the RBM. We conduct software simulations to determine how harmful each of these restrictions is. Our simulations are based on the D-Wave Two computer, but the issues we investigate arise in most forms of physical computation. Our findings suggest that designers of new physical computing hardware and algorithms for physical computers should focus their efforts on overcoming the limitations imposed by the topology restrictions of currently existing physical computers. 
", + "Generative Stochastic Networks (GSNs) have been recently introduced as an alternative to traditional probabilistic modeling: instead of parametrizing the data distribution directly, one parametrizes a transition operator for a Markov chain whose stationary distribution is an estimator of the data generating distribution. The result of training is therefore a machine that generates samples through this Markov chain. However, the previously introduced GSN consistency theorems suggest that in order to capture a wide class of distributions, the transition operator in general should be multimodal, something that has not been done before this paper. We introduce for the first time multimodal transition distributions for GSNs, in particular using models in the NADE family (Neural Autoregressive Density Estimator) as output distributions of the transition operator. A NADE model is related to an RBM (and can thus model multimodal distributions) but its likelihood (and likelihood gradient) can be computed easily. The parameters of the NADE are obtained as a learned function of the previous state of the learned Markov chain. Experiments clearly illustrate the advantage of such multimodal transition distributions over unimodal GSNs.", + "Pylearn2 is a machine learning research library. This does not just mean that it is a collection of machine learning algorithms that share a common API; it means that it has been designed for flexibility and extensibility in order to facilitate research projects that involve new or unusual use cases. In this paper we give a brief history of the library, an overview of its basic philosophy, a summary of the library's architecture, and a description of how the Pylearn2 community functions socially.", + "This article exposes the failure of some big neural networks to leverage added capacity to reduce underfitting. Past research suggest diminishing returns when increasing the size of neural networks. 
Our experiments on ImageNet LSVRC-2010 show that this may be due to the fact there are highly diminishing returns for capacity in terms of training error, leading to underfitting. This suggests that the optimization method - first order gradient descent - fails at this regime. Directly attacking this problem, either through the optimization method or the choices of parametrization, may allow to improve the generalization error on large datasets, for which a large capacity is required.", + "Stochastic neurons can be useful for a number of reasons in deep learning models, but in many cases they pose a challenging problem: how to estimate the gradient of a loss function with respect to the input of such stochastic neurons, i.e., can we \u201cback-propagate\u201d through these stochastic neurons? We examine this question, existing approaches, and present two novel families of solutions, applicable in different settings. In particular, it is demonstrated that a simple biologically plausible formula gives rise to an unbiased (but noisy) estimator of the gradient with respect to a binary stochastic neuron firing probability. Unlike other estimators which view the noise as a small perturbation in order to estimate gradients by finite differences, this estimator is unbiased even without assuming that the stochastic perturbation is small. This estimator is also interesting because it can be applied in very general settings which do not allow gradient back-propagation, including the estimation of the gradient with respect to future rewards, as required in reinforcement learning setups. We also propose an approach to approximating this unbiased but high-variance estimator by learning to predict it using a biased estimator. 
The second approach we propose assumes that an estimator of the gradient can be back-propagated and it provides an unbiased estimator of the gradient, but can only work with non-linearities unlike the hard threshold, but like the rectifier, that are not flat for all of their range. This is similar to traditional sigmoidal units but has the advantage that for many inputs, a hard decision (e.g., a 0 output) can be produced, which would be convenient for conditional computation and achieving sparse representations and sparse gradients.", + "More than 15 years after the early studies in Affective Computing (AC), [1] the problem of detecting and modeling emotions in the context of human-computer interaction (HCI) remains complex and largely unexplored. The detection and modeling of emotion is, primarily, the study and use of artificial intelligence (AI) techniques for the construction of computational models of emotion. The key challenges one faces when attempting to model emotion [2] are inherent in the vague definitions and fuzzy boundaries of emotion, and in the modeling methodology followed. In this context, open research questions are still present in all key components of the modeling process. These include, first, the appropriateness of the modeling tool employed to map emotional manifestations and responses to annotated affective states; second, the processing of signals that express these manifestations (i.e., model input); and third, the way affective annotation (i.e., model output) is handled. This paper touches upon all three key components of an affective model (i.e., input, model, output) and introduces the use of deep learning (DL) [3], [4], [5] methodologies for affective modeling from multiple physiological signals.", + "We describe the use of two spike-and-slab models for modeling real-valued data, with an emphasis on their applications to object recognition. 
The first model, which we call spike-and-slab sparse coding (S3C), is a preexisting model for which we introduce a faster approximate inference algorithm. We introduce a deep variant of S3C, which we call the partially directed deep Boltzmann machine (PD-DBM) and extend our S3C inference algorithm for use on this model. We describe learning procedures for each. We demonstrate that our inference procedure for S3C enables scaling the model to unprecedented large problem sizes, and demonstrate that using S3C as a feature extractor results in very good object recognition performance, particularly when the number of labeled examples is low. We show that the PD-DBM generates better samples than its shallow counterpart, and that unlike DBMs or DBNs, the PD-DBM may be trained successfully without greedy layerwise training.", + "In this paper we present the techniques used for the University of Montr\u00e9al's team submissions to the 2013 Emotion Recognition in the Wild Challenge. The challenge is to classify the emotions expressed by the primary human subject in short video clips extracted from feature length movies. This involves the analysis of video clips of acted scenes lasting approximately one-two seconds, including the audio track which may contain human voices as well as background music. Our approach combines multiple deep neural networks for different data modalities, including: (1) a deep convolutional neural network for the analysis of facial expressions within video frames; (2) a deep belief net to capture audio information; (3) a deep autoencoder to model the spatio-temporal information produced by the human actions depicted within the entire scene; and (4) a shallow network architecture focused on extracted features of the mouth of the primary human subject in the scene. We discuss each of these techniques, their performance characteristics and different strategies to aggregate their predictions. 
Our best single model was a convolutional neural network trained to predict emotions from static frames using two large data sets, the Toronto Face Database and our own set of faces images harvested from Google image search, followed by a per frame aggregation strategy that used the challenge training data. This yielded a test set accuracy of 35.58%. Using our best strategy for aggregating our top performing models into a single predictor we were able to produce an accuracy of 41.03% on the challenge test set. These compare favorably to the challenge baseline test set accuracy of 27.56%.", + "We explore the effect of introducing prior information into the intermediate level of neural networks for a learning task on which all the state-of-the-art machine learning algorithms tested failed to learn. We motivate our work from the hypothesis that humans learn such intermediate concepts from other individuals via a form of supervision or guidance using a curriculum. The experiments we have conducted provide positive evidence in favor of this hypothesis. In our experiments, a two-tiered MLP architecture is trained on a dataset with 64x64 binary inputs images, each image with three sprites. The final task is to decide whether all the sprites are the same or one of them is different. Sprites are pentomino tetris shapes and they are placed in an image with different locations using scaling and rotation transformations. The first part of the two-tiered MLP is pre-trained with intermediate-level targets being the presence of sprites at each location, while the second part takes the output of the first part as input and predicts the final task's target binary event. The two-tiered MLP architecture, with a few tens of thousand examples, was able to learn the task perfectly, whereas all other algorithms (include unsupervised pre-training, but also traditional algorithms like SVMs, decision trees and boosting) all perform no better than chance. 
We hypothesize that the optimization difficulty involved when the intermediate pre-training is not performed is due to the {\\em composition} of two highly non-linear tasks. Our findings are also consistent with hypotheses on cultural learning inspired by the observations of optimization problems with deep learning, presumably because of effective local minima.", + "In this paper, we explore different ways to extend a recurrent neural network (RNN) to a \\textit{deep} RNN. We start by arguing that the concept of depth in an RNN is not as clear as it is in feedforward neural networks. By carefully analyzing and understanding the architecture of an RNN, however, we find three points of an RNN which may be made deeper; (1) input-to-hidden function, (2) hidden-to-hidden transition and (3) hidden-to-output function. Based on this observation, we propose two novel architectures of a deep RNN which are orthogonal to an earlier attempt of stacking multiple recurrent layers to build a deep RNN (Schmidhuber, 1992; El Hihi and Bengio, 1996). We provide an alternative interpretation of these deep RNNs using a novel framework based on neural operators. The proposed deep RNNs are empirically evaluated on the tasks of polyphonic music prediction and language modeling. The experimental result supports our claim that the proposed deep RNNs benefit from the depth and outperform the conventional, shallow RNNs.", + "We introduce the multi-prediction deep Boltzmann machine (MP-DBM). The MP-DBM can be seen as a single probabilistic model trained to maximize a variational approximation to the generalized pseudolikelihood, or as a family of recurrent nets that share parameters and approximately solve different inference problems. Prior methods of training DBMs either do not perform well on classification tasks or require an initial learning pass that trains the DBM greedily, one layer at a time. 
The MP-DBM does not require greedy layerwise pretraining, and outperforms the standard DBM at classification, classification with missing inputs, and mean field prediction tasks.1", + "Machine learning is everywhere in today\u2019s NLP, but by and large machine learning amounts to numerical optimization of weights for human designed representations and features. The goal of deep learning is to explore how computers can take advantage of data to develop features and representations appropriate for complex interpretation tasks. This tutorial aims to cover the basic motivation, ideas, models and learning algorithms in deep learning for natural language processing. Recently, these methods have been shown to perform very well on various NLP tasks such as language modeling, POS tagging, named entity recognition, sentiment analysis and paraphrase detection, among others. The most attractive quality of these techniques is that they can perform well without any external hand-designed resources or time-intensive feature engineering. Despite these advantages, many researchers in NLP are not familiar with these methods. Our focus is on insight and understanding, using graphical illustrations and simple, intuitive derivations. The goal of the tutorial is to make the inner workings of these techniques transparent, intuitive and their results interpretable, rather than black boxes labeled \u201dmagic here\u201d. The first part of the tutorial presents the basics of neural networks, neural word vectors, several simple models based on local windows and the math and algorithms of training via backpropagation. In this section applications include language modeling and POS tagging. In the second section we present recursive neural networks which can learn structured tree outputs as well as vector representations for phrases and sentences. We cover both equations as well as applications. 
We show how training can be achieved by a modified version of the backpropagation algorithm introduced before. These modifications allow the algorithm to work on tree structures. Applications include sentiment analysis and paraphrase detection. We also draw connections to recent work in semantic compositionality in vector spaces. The principle goal, again, is to make these methods appear intuitive and interpretable", + "Several interesting generative learning algorithms involve a complex probability distribution over many random variables, involving intractable normalization constants or latent variable normalization. Some of them may even not have an analytic expression for the unnormalized probability function and no tractable approximation. This makes it difficult to estimate the quality of these models, once they have been trained, or to monitor their quality (e.g. for early stopping) while training. A previously proposed method is based on constructing a non-parametric density estimator of the model's probability function from samples generated by the model. We revisit this idea, propose a more efficient estimator, and prove that it provides a lower bound on the true test log-likelihood, and an unbiased estimator as the number of generated samples goes to infinity, although one that incorporates the effect of poor mixing. We further propose a biased variant of the estimator that can be used reliably with a finite number of samples for the purpose of model comparison.", + "We introduce a novel training principle for probabilistic models that is an alternative to maximum likelihood. The proposed Generative Stochastic Networks (GSN) framework is based on learning the transition operator of a Markov chain whose stationary distribution estimates the data distribution. 
The transition distribution of the Markov chain is conditional on the previous state, generally involving a small move, so this conditional distribution has fewer dominant modes, being unimodal in the limit of small moves. Thus, it is easier to learn because it is easier to approximate its partition function, more like learning to perform supervised function approximation, with gradients that can be obtained by backprop. We provide theorems that generalize recent work on the probabilistic interpretation of denoising autoencoders and obtain along the way an interesting justification for dependency networks and generalized pseudolikelihood, along with a definition of an appropriate joint distribution and sampling mechanism even when the conditionals are not consistent. GSNs can be used with missing inputs and can be used to sample subsets of variables given the rest. We validate these theoretical results with experiments on two image datasets using an architecture that mimics the Deep Boltzmann Machine Gibbs sampler but allows training to proceed with simple backprop, without the need for layerwise pretraining.", + "Stochastic neurons and hard non-linearities can be useful for a number of reasons in deep learning models, but in many cases they pose a challenging problem: how to estimate the gradient of a loss function with respect to the input of such stochastic or non-smooth neurons? I.e., can we \"back-propagate\" through these stochastic neurons? We examine this question, existing approaches, and compare four families of solutions, applicable in different settings. One of them is the minimum variance unbiased gradient estimator for stochastic binary neurons (a special case of the REINFORCE algorithm). A second approach, introduced here, decomposes the operation of a binary stochastic neuron into a stochastic binary part and a smooth differentiable part, which approximates the expected effect of the pure stochastic binary neuron to first order. 
A third approach involves the injection of additive or multiplicative noise in a computational graph that is otherwise differentiable. A fourth approach heuristically copies the gradient with respect to the stochastic output directly as an estimator of the gradient with respect to the sigmoid argument (we call this the straight-through estimator). To explore a context where these estimators are useful, we consider a small-scale version of {\\em conditional computation}, where sparse stochastic units form a distributed representation of gaters that can turn off in combinatorially many ways large chunks of the computation performed in the rest of the neural network. In this case, it is important that the gating units produce an actual 0 most of the time. The resulting sparsity can be potentially be exploited to greatly reduce the computational cost of large deep networks for which conditional computation would be useful." + ], + "domain": [ + "Deep Learning", + "Neural Networks", + "Affective Computing", + "Generative Models" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "994cf795-e9b3-43d1-a6ea-faa7ecb19355": { + "pk": "994cf795-e9b3-43d1-a6ea-faa7ecb19355", + "name": "Geoffrey Hinton", + "bio": "I am a researcher dedicated to advancing the fields of computer vision and machine learning through innovative methodologies and frameworks. My recent work focuses on unifying diverse computer vision tasks, such as object detection and image captioning, under a shared pixel-to-sequence interface. This approach allows for a single model architecture to handle multiple tasks without the need for task-specific customizations, demonstrating competitive performance against specialized models.\n\nI have also explored dynamic evaluation techniques for language models, introducing Fast Weight Layers (FWLs) that enhance performance while minimizing computational costs. 
My research extends to novel learning procedures, such as the Forward-Forward algorithm, which simplifies the training process by utilizing two forward passes instead of traditional backpropagation.\n\nIn addition, I have developed innovative solutions for panoptic segmentation and discrete data generation through diffusion models, achieving state-of-the-art results in various benchmarks. My work on representation learning in medical AI, particularly the REMEDIS framework, addresses the challenges of data-efficient generalization, significantly improving diagnostic accuracy with minimal retraining data.\n\nI am passionate about creating flexible and efficient learning frameworks that not only enhance model performance but also provide insights into the underlying processes of neural networks. My goal is to bridge the gap between complex tasks and effective learning strategies, ultimately contributing to the broader impact of AI in real-world applications.", + "collaborators": [ + "David J. Fleet", + "Ting Chen", + "Simon Kornblith", + "Mohammad Norouzi", + "Saurabh Saxena", + "Lala Li", + "Shekoofeh Azizi", + "J. Freyberg", + "Sebastien Baur", + "S. S. Mahdavi", + "Ellery Wulczyn", + "Boris Babenko", + "Aaron Loh", + "Po-Hsuan Cameron Chen", + "Yuan Liu", + "Pinal Bavishi", + "S. McKinney", + "Jim Winkens", + "Abhijit Guha Roy", + "Zach Beaver", + "Justin D. Krogue", + "M. Etemadi", + "Umesh Telang", + "Yun Liu", + "L. Peng", + "G. Corrado", + "D. Webster", + "N. Houlsby", + "A. Karthikesalingam", + "Vivek Natarajan", + "Mengye Ren", + "Renjie Liao", + "S. Sabour", + "Richard F. Rashid", + "Laura Culp", + "Basil Mustafa", + "Nenad Toma\u0161ev", + "Jovana Mitrovic", + "Patricia Strachan", + "Megan Walker", + "Fiona Ryan", + "R. Gartner", + "Jessica Bundy", + "Maria Jung", + "Tyler J King", + "Jane B. Sprott", + "Fernando \u00c1vila", + "J. Briggs", + "Daniel Konikof", + "Alex Luscombe", + "Audrey Macklin", + "H. 
Pelvin", + "Tsung-Yi Lin", + "Kevin Clark", + "Kelvin Guu", + "Ming-Wei Chang", + "Panupong Pasupat", + "Ruixiang Zhang", + "Laura J. Culp", + "L. Culp", + "B. Mustafa", + "Patricia MacWilliams", + "Megan Wilson", + "F. Ryan", + "Xiaodong He", + "Jianfeng Gao", + "L. Deng", + "S. Yih", + "Yu", + "J. Markoff", + "Dong Yu", + "Yoshua Bengio", + "Yann LeCun", + "A. Tagliasacchi", + "S. Yazdani", + "Geoffrey I. Webb", + "Johannes F\u00fcrnkranz", + "Claude Sammut", + "Joerg Sander", + "M. Vlachos", + "Yee Whye Teh", + "Ying Yang", + "D. Mladen\u00ed", + "J. Brank", + "M. Grobelnik", + "Ying Zhao", + "G. Karypis", + "Susan Craw", + "M. Puterman", + "J. Patrick", + "Aniruddh Raghu", + "M. Raghu", + "D. Duvenaud" + ], + "pub_titles": [ + "Volume 19, Number 5", + "A Unified Sequence Interface for Vision Tasks", + "Meta-Learning Fast Weight Language Models", + "The Forward-Forward Algorithm: Some Preliminary Investigations", + "Scaling Forward Gradient With Local Losses", + "A Generalist Framework for Panoptic Segmentation of Images and Videos", + "Analog Bits: Generating Discrete Data using Diffusion Models with Self-Conditioning", + "Gaussian-Bernoulli RBMs Without Tears", + "Testing GLOM's ability to infer wholes from ambiguous parts", + "Robust and Efficient Medical Imaging with Self-Supervision", + "Pix2seq: A Language Modeling Framework for Object Detection", + "Deep Learning for Natural Language Processing", + "How to Represent Part-Whole Hierarchies in a Neural Network", + "Deep learning for AI", + "Unsupervised part representation by Flow Capsules", + "The Next Generation of Neural Networks", + "Teaching with Commentaries" + ], + "pub_abstracts": [ + "September/October 2023", + "While language tasks are naturally expressed in a single, unified, modeling framework, i.e., generating sequences of tokens, this has not been the case in computer vision. As a result, there is a proliferation of distinct architectures and loss functions for different vision tasks. 
In this work we show that a diverse set of\"core\"computer vision tasks can also be unified if formulated in terms of a shared pixel-to-sequence interface. We focus on four tasks, namely, object detection, instance segmentation, keypoint detection, and image captioning, all with diverse types of outputs, e.g., bounding boxes or dense masks. Despite that, by formulating the output of each task as a sequence of discrete tokens with a unified interface, we show that one can train a neural network with a single model architecture and loss function on all these tasks, with no task-specific customization. To solve a specific task, we use a short prompt as task description, and the sequence output adapts to the prompt so it can produce task-specific output. We show that such a model can achieve competitive performance compared to well-established task-specific models.", + "Dynamic evaluation of language models (LMs) adapts model parameters at test time using gradient information from previous tokens and substantially improves LM performance. However, it requires over 3x more compute than standard inference. We present Fast Weight Layers (FWLs), a neural component that provides the benefits of dynamic evaluation much more efficiently by expressing gradient updates as linear attention. A key improvement over dynamic evaluation is that FWLs can also be applied at training time, so the model learns to make good use of gradient updates. FWLs can easily be added on top of existing transformer models, require relatively little extra compute or memory to run, and significantly improve language modeling perplexity.", + "The aim of this paper is to introduce a new learning procedure for neural networks and to demonstrate that it works well enough on a few small problems to be worth further investigation. The Forward-Forward algorithm replaces the forward and backward passes of backpropagation by two forward passes, one with positive (i.e. 
real) data and the other with negative data which could be generated by the network itself. Each layer has its own objective function which is simply to have high goodness for positive data and low goodness for negative data. The sum of the squared activities in a layer can be used as the goodness but there are many other possibilities, including minus the sum of the squared activities. If the positive and negative passes could be separated in time, the negative passes could be done offline, which would make the learning much simpler in the positive pass and allow video to be pipelined through the network without ever storing activities or stopping to propagate derivatives.", + "Forward gradient learning computes a noisy directional gradient and is a biologically plausible alternative to backprop for learning deep neural networks. However, the standard forward gradient algorithm, when applied naively, suffers from high variance when the number of parameters to be learned is large. In this paper, we propose a series of architectural and algorithmic modifications that together make forward gradient learning practical for standard deep learning benchmark tasks. We show that it is possible to substantially reduce the variance of the forward gradient estimator by applying perturbations to activations rather than weights. We further improve the scalability of forward gradient by introducing a large number of local greedy loss functions, each of which involves only a small number of learnable parameters, and a new MLPMixer-inspired architecture, LocalMixer, that is more suitable for local learning. Our approach matches backprop on MNIST and CIFAR-10 and significantly outperforms previously proposed backprop-free algorithms on ImageNet.", + "Panoptic segmentation assigns semantic and instance ID labels to every pixel of an image. As permutations of instance IDs are also valid solutions, the task requires learning of high-dimensional one-to-many mapping. 
As a result, state-of-the-art approaches use customized architectures and task-specific loss functions. We formulate panoptic segmentation as a discrete data generation problem, without relying on inductive bias of the task. A diffusion model is proposed to model panoptic masks, with a simple architecture and generic loss function. By simply adding past predictions as a conditioning signal, our method is capable of modeling video (in a streaming setting) and thereby learns to track object instances automatically. With extensive experiments, we demonstrate that our simple approach can perform competitively to state-of-the-art specialist methods in similar settings. 1", + "We present Bit Diffusion: a simple and generic approach for generating discrete data with continuous state and continuous time diffusion models. The main idea behind our approach is to first represent the discrete data as binary bits, and then train a continuous diffusion model to model these bits as real numbers which we call analog bits. To generate samples, the model first generates the analog bits, which are then thresholded to obtain the bits that represent the discrete variables. We further propose two simple techniques, namely Self-Conditioning and Asymmetric Time Intervals, which lead to a significant improvement in sample quality. Despite its simplicity, the proposed approach can achieve strong performance in both discrete image generation and image captioning tasks. For discrete image generation, we significantly improve previous state-of-the-art on both CIFAR-10 (which has 3K discrete 8-bit tokens) and ImageNet-64x64 (which has 12K discrete 8-bit tokens), outperforming the best autoregressive model in both sample quality (measured by FID) and efficiency. 
For image captioning on MS-COCO dataset, our approach achieves competitive results compared to autoregressive models.", + "We revisit the challenging problem of training Gaussian-Bernoulli restricted Boltzmann machines (GRBMs), introducing two innovations. We propose a novel Gibbs-Langevin sampling algorithm that outperforms existing methods like Gibbs sampling. We propose a modified contrastive divergence (CD) algorithm so that one can generate images with GRBMs starting from noise. This enables direct comparison of GRBMs with deep generative models, improving evaluation protocols in the RBM literature. Moreover, we show that modified CD and gradient clipping are enough to robustly train GRBMs with large learning rates, thus removing the necessity of various tricks in the literature. Experiments on Gaussian Mixtures, MNIST, FashionMNIST, and CelebA show GRBMs can generate good samples, despite their single-hidden-layer architecture. Our code is released at: \\url{https://github.com/lrjconan/GRBM}.", + "The GLOM architecture proposed by Hinton [2021] is a recurrent neural network for parsing an image into a hierarchy of wholes and parts. When a part is ambiguous, GLOM assumes that the ambiguity can be resolved by allowing the part to make multi-modal predictions for the pose and identity of the whole to which it belongs and then using attention to similar predictions coming from other possibly ambiguous parts to settle on a common mode that is predicted by several different parts. In this study, we describe a highly simplified version of GLOM that allows us to assess the effectiveness of this way of dealing with ambiguity. 
Our results show that, with supervised training, GLOM is able to successfully form islands of very similar embedding vectors for all of the locations occupied by the same object and it is also robust to strong noise injections in the input and to out-of-distribution input transformations.", + "Recent progress in Medical Artificial Intelligence (AI) has delivered systems that can reach clinical expert level performance. However, such systems tend to demonstrate sub-optimal\"out-of-distribution\"performance when evaluated in clinical settings different from the training environment. A common mitigation strategy is to develop separate systems for each clinical setting using site-specific data [1]. However, this quickly becomes impractical as medical data is time-consuming to acquire and expensive to annotate [2]. Thus, the problem of\"data-efficient generalization\"presents an ongoing difficulty for Medical AI development. Although progress in representation learning shows promise, their benefits have not been rigorously studied, specifically for out-of-distribution settings. To meet these challenges, we present REMEDIS, a unified representation learning strategy to improve robustness and data-efficiency of medical imaging AI. REMEDIS uses a generic combination of large-scale supervised transfer learning with self-supervised learning and requires little task-specific customization. We study a diverse range of medical imaging tasks and simulate three realistic application scenarios using retrospective data. REMEDIS exhibits significantly improved in-distribution performance with up to 11.5% relative improvement in diagnostic accuracy over a strong supervised baseline. More importantly, our strategy leads to strong data-efficient generalization of medical imaging AI, matching strong supervised baselines using between 1% to 33% of retraining data across tasks. 
These results suggest that REMEDIS can significantly accelerate the life-cycle of medical imaging AI development thereby presenting an important step forward for medical imaging AI to deliver broad impact.", + "We present Pix2Seq, a simple and generic framework for object detection. Unlike existing approaches that explicitly integrate prior knowledge about the task, we cast object detection as a language modeling task conditioned on the observed pixel inputs. Object descriptions (e.g., bounding boxes and class labels) are expressed as sequences of discrete tokens, and we train a neural network to perceive the image and generate the desired sequence. Our approach is based mainly on the intuition that if a neural network knows about where and what the objects are, we just need to teach it how to read them out. Beyond the use of task-specific data augmentations, our approach makes minimal assumptions about the task, yet it achieves competitive results on the challenging COCO dataset, compared to highly specialized and well optimized detection algorithms.", + ",", + "Abstract This article does not describe a working system. Instead, it presents a single idea about representation that allows advances made by several different groups to be combined into an imaginary system called GLOM.1 The advances include transformers, neural fields, contrastive representation learning, distillation, and capsules. GLOM answers the question: How can a neural network with a fixed architecture parse an image into a part-whole hierarchy that has a different structure for each image? The idea is simply to use islands of identical vectors to represent the nodes in the parse tree. 
If GLOM can be made to work, it should significantly improve the interpretability of the representations produced by transformer-like systems when applied to vision or language.", + "How can neural networks learn the rich internal representations required for difficult tasks such as recognizing objects or understanding language?", + "Capsule networks are designed to parse an image into a hierarchy of objects, parts and relations. While promising, they remain limited by an inability to learn effective low level part descriptions. To address this issue we propose a novel self-supervised method for learning part descriptors of an image. During training, we exploit motion as a powerful perceptual cue for part definition, using an expressive decoder for part generation and layered image formation with occlusion. Experiments demonstrate robust part discovery in the presence of multiple objects, cluttered backgrounds, and significant occlusion. The resulting part descriptors, a.k.a. part capsules, are decoded into shape masks, filling in occluded pixels, along with relative depth on single images. We also report unsupervised object classification using our capsule parts in a stacked capsule autoencoder.", + "The most important unsolved problem with artificial neural networks is how to do unsupervised learning as effectively as the brain. There are currently two main approaches to unsupervised learning. In the first approach, exemplified by BERT and Variational Autoencoders, a deep neural network is used to reconstruct its input. This is problematic for images because the deepest layers of the network need to encode the fine details of the image. An alternative approach, introduced by Becker and Hinton in 1992, is to train two copies of a deep neural network to produce output vectors that have high mutual information when given two different crops of the same image as their inputs. 
This approach was designed to allow the representations to be untethered from irrelevant details of the input. The method of optimizing mutual information used by Becker and Hinton was flawed (for a subtle reason that I will explain) so Pacannaro and Hinton (2001) replaced it by a discriminative objective in which one vector representation must select a corresponding vector representation from among many alternatives. With faster hardware, contrastive learning of representations has recently become very popular and is proving to be very effective, but it suffers from a major flaw: To learn pairs of representation vectors that have N bits of mutual information we need to contrast the correct corresponding vector with about 2N incorrect alternatives. I will describe a novel and effective way of dealing with this limitation. I will also show that this leads to a simple way of implementing perceptual learning in cortex.", + "Effective training of deep neural networks can be challenging, and there remain many open questions on how to best learn these models. Recently developed methods to improve neural network training examine teaching: providing learned information during the training process to improve downstream model performance. In this paper, we take steps towards extending the scope of teaching. We propose a flexible teaching framework using commentaries, meta-learned information helpful for training on a particular task or dataset. We present an efficient and scalable gradient-based method to learn commentaries, leveraging recent work on implicit differentiation. We explore diverse applications of commentaries, from learning weights for individual training examples, to parameterizing label-dependent data augmentation policies, to representing attention masks that highlight salient image regions. In these settings, we find that commentaries can improve training speed and/or performance and also provide fundamental insights about the dataset and training process." 
+ ], + "domain": [ + "Computer Vision", + "Natural Language Processing", + "Neural Networks", + "Representation Learning" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "be76cb2d-9483-49ba-93e3-29c96509ab30": { + "pk": "be76cb2d-9483-49ba-93e3-29c96509ab30", + "name": "Oriol Vinyals", + "bio": "I am a researcher deeply engaged in the intersection of multimodal machine learning and generative models, with a focus on enhancing the understanding and performance of complex tasks such as event argument role labeling (EARL) and long-context reasoning. My recent work, including the development of GenEARL, showcases my commitment to creating innovative frameworks that leverage generative models to improve performance in challenging tasks without the need for extensive training data. \n\nI have also contributed to the Gemini family of models, which excel in multimodal reasoning and have set new benchmarks in medical applications, demonstrating the potential of AI in critical domains. My exploration of lightweight models, such as Gemma, reflects my dedication to making advanced AI accessible and efficient. \n\nIn addition to my work on specific models, I am interested in understanding the dynamics of large language models and their multitasking capabilities. My research has revealed insights into task competition and the emergence of abilities in these models, paving the way for more effective training strategies. \n\nI am passionate about applying my findings to real-world challenges, such as improving weather forecasting with GraphCast and optimizing video compression using reinforcement learning. 
My goal is to push the boundaries of what is possible in AI, ensuring that our models not only perform well but also adapt seamlessly to new tasks and domains.", + "collaborators": [ + "Sebastian Borgeaud", + "Katie Millican", + "Jean-Baptiste Alayrac", + "Elena Buchatskaya", + "Julian Schrittwieser", + "Roman Ring", + "Eliza Rutherford", + "Zhitao Gong", + "Tom Hennigan", + "Eric Noland", + "D. Hassabis", + "Karsten Roth", + "Zeynep Akata", + "Antoine Miech", + "Iain Barr", + "Karel Lenc", + "A. Mensch", + "Malcolm Reynolds", + "Sina Samangooei", + "Andy Brock", + "Andrew Zisserman", + "Lisa Anne Hendricks", + "Machel Reid", + "Eli Collins", + "Siamak Shakeri", + "Aakanksha Chowdhery", + "Petko Georgiev", + "Jean-Baptiste Lespiau", + "Charline Le Lan", + "Paul Michel", + "Evan Senter", + "Mateo Wirth", + "Amol Mandhane", + "Minh Giang", + "R. Comanescu", + "Alek Andreev", + "K. Kavukcuoglu", + "Joelle Barral", + "D. Mankowitz", + "A. Zhernov", + "T. Hubert", + "Jeff Donahue", + "Pauline Luc", + "Yana Hasson", + "Serkan Cabi", + "Tengda Han", + "Marianne Monteiro", + "Jacob Menick", + "Aida Nematzadeh", + "Sahand Sharifzadeh", + "Ricardo Barreira", + "Tom Brown", + "Benjamin Mann", + "Jared Kaplan", + "Pranav Shyam", + "Jordan Hoffmann", + "Trevor Cai", + "Diego de", + "Las Casas", + "Johannes Welbl", + "Aidan Clark", + "Rohan Anil", + "P. Barham", + "Ross McIlroy", + "Melvin Johnson", + "Erica Moreira", + "H. Michalewski", + "James Keeling", + "Oscar Chang", + "George Tucker", + "Tom\u00e1s Kocisk\u00fd", + "Evgenii Eltyshev", + "Ambrose Slone", + "Ben Caine", + "J Christopher Love", + "N. Houlsby", + "Luheng He", + "Yong Cheng", + "Yujia Li", + "Albert Webson", + "Rahma Chaabouni", + "T. Paine", + "Behnam Neyshabur", + "Jack W. Rae", + "Boxi Wu", + "Basil Mustafa", + "Emilio Parisotto", + "Chenjie Gu", + "A. Pritzel", + "J. 
Mao-Jones", + "Hannah Sheahan", + "James Svensson", + "Bogdan Damoc", + "George van den Driessche", + "Justin Chiu", + "Adri\u00e0 Recasens", + "S'ebastien M. R. Arnold", + "Lisa Lee", + "Kartikeya Badola", + "Joshua Newlan" + ], + "pub_titles": [ + "GenEARL: A Training-Free Generative Framework for Multimodal Event Argument Role Labeling", + "Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context", + "Capabilities of Gemini Models in Medicine", + "Understanding Inverse Scaling and Emergence in Multitask Representation Learning", + "Gemma: Open Models Based on Gemini Research and Technology", + "A Practitioner's Guide to Continual Multimodal Pretraining", + "Gemma 2: Improving Open Language Models at a Practical Size", + "AlphaStar Unplugged: Large-Scale Offline Reinforcement Learning", + "Waffling around for Performance: Visual Classification with Random Words and Broad Concepts", + "Learning skillful medium-range global weather forecasting", + "Fantastic Gains and Where to Find Them: On the Existence and Prospect of General Knowledge Transfer between Any Pretrained Model", + "Optimizing Memory Mapping Using Deep Reinforcement Learning", + "General-purpose, long-context autoregressive modeling with Perceiver AR", + "Flamingo: a Visual Language Model for Few-Shot Learning", + "Hierarchical Perceiver", + "Non-isotropy Regularization for Proxy-based Deep Metric Learning", + "A Generalist Agent", + "MuZero with Self-competition for Rate Control in VP9 Video Compression" + ], + "pub_abstracts": [ + "Multimodal event argument role labeling (EARL), a task that assigns a role for each event participant (object) in an image is a complex challenge. It requires reasoning over the entire image, the depicted event, and the interactions between various objects participating in the event. 
Existing models heavily rely on high-quality event-annotated training data to understand the event semantics and structures, and they fail to generalize to new event types and domains. In this paper, we propose GenEARL, a training-free generative framework that harness the power of the modern generative models to understand event task descriptions given image contexts to perform the EARL task. Specifically, GenEARL comprises two stages of generative prompting with a frozen vision-language model (VLM) and a frozen large language model (LLM). First, a generative VLM learns the semantics of the event argument roles and generates event-centric object descriptions based on the image. Subsequently, a LLM is prompted with the generated object descriptions with a predefined template for EARL (i.e., assign an object with an event argument role). We show that GenEARL outperforms the contrastive pretraining (CLIP) baseline by 9.4% and 14.2% accuracy for zero-shot EARL on the M2E2 and SwiG datasets, respectively. In addition, we outperform CLIP-Event by 22% precision on M2E2 dataset. The framework also allows flexible adaptation and generalization to unseen domains.", + "In this report, we introduce the Gemini 1.5 family of models, representing the next generation of highly compute-efficient multimodal models capable of recalling and reasoning over fine-grained information from millions of tokens of context, including multiple long documents and hours of video and audio. The family includes two new models: (1) an updated Gemini 1.5 Pro, which exceeds the February version on the great majority of capabilities and benchmarks; (2) Gemini 1.5 Flash, a more lightweight variant designed for efficiency with minimal regression in quality. 
Gemini 1.5 models achieve near-perfect recall on long-context retrieval tasks across modalities, improve the state-of-the-art in long-document QA, long-video QA and long-context ASR, and match or surpass Gemini 1.0 Ultra's state-of-the-art performance across a broad set of benchmarks. Studying the limits of Gemini 1.5's long-context ability, we find continued improvement in next-token prediction and near-perfect retrieval (>99%) up to at least 10M tokens, a generational leap over existing models such as Claude 3.0 (200k) and GPT-4 Turbo (128k). Finally, we highlight real-world use cases, such as Gemini 1.5 collaborating with professionals on completing their tasks achieving 26 to 75% time savings across 10 different job categories, as well as surprising new capabilities of large language models at the frontier; when given a grammar manual for Kalamang, a language with fewer than 200 speakers worldwide, the model learns to translate English to Kalamang at a similar level to a person who learned from the same content.", + "Excellence in a wide variety of medical applications poses considerable challenges for AI, requiring advanced reasoning, access to up-to-date medical knowledge and understanding of complex multimodal data. Gemini models, with strong general capabilities in multimodal and long-context reasoning, offer exciting possibilities in medicine. Building on these core strengths of Gemini, we introduce Med-Gemini, a family of highly capable multimodal models that are specialized in medicine with the ability to seamlessly use web search, and that can be efficiently tailored to novel modalities using custom encoders. We evaluate Med-Gemini on 14 medical benchmarks, establishing new state-of-the-art (SoTA) performance on 10 of them, and surpass the GPT-4 model family on every benchmark where a direct comparison is viable, often by a wide margin. 
On the popular MedQA (USMLE) benchmark, our best-performing Med-Gemini model achieves SoTA performance of 91.1% accuracy, using a novel uncertainty-guided search strategy. On 7 multimodal benchmarks including NEJM Image Challenges and MMMU (health&medicine), Med-Gemini improves over GPT-4V by an average relative margin of 44.5%. We demonstrate the effectiveness of Med-Gemini's long-context capabilities through SoTA performance on a needle-in-a-haystack retrieval task from long de-identified health records and medical video question answering, surpassing prior bespoke methods using only in-context learning. Finally, Med-Gemini's performance suggests real-world utility by surpassing human experts on tasks such as medical text summarization, alongside demonstrations of promising potential for multimodal medical dialogue, medical research and education. Taken together, our results offer compelling evidence for Med-Gemini's potential, although further rigorous evaluation will be crucial before real-world deployment in this safety-critical domain.", + "Large language models exhibit strong multi-tasking capabilities, however, their learning dynamics as a function of task characteristics, sample size, and model complexity remain mysterious. For instance, it is known that, as the model size grows, large language models exhibit emerging abilities where certain tasks can abruptly jump from poor to respectable performance. Such phenomena motivate a deeper understanding of how individual tasks evolve during multitasking. To this aim, we study a multitask representation learning setup where tasks can have distinct distributions , quantified by their covariance priors. Through random matrix theory, we precisely characterize the optimal linear representation for few-shot learning that minimizes the average test risk in terms of task covariances. 
When tasks have equal sample sizes, we prove a reduction to an equivalent problem with a single effective covariance from which the individual task risks of the original problem can be deduced. Importantly, we introduce \u201c task competition \u201d to explain how tasks with dominant covariance eigen-spectrum emerge faster than others. We show that task competition can potentially explain the inverse scaling of certain tasks i.e. reduced test accuracy as the model grows. Overall, this work sheds light on the risk and emergence of individual tasks and uncovers new high-dimensional phenomena (including multiple-descent risk curves) that arise in multitask representation learning.", + "This work introduces Gemma, a family of lightweight, state-of-the art open models built from the research and technology used to create Gemini models. Gemma models demonstrate strong performance across academic benchmarks for language understanding, reasoning, and safety. We release two sizes of models (2 billion and 7 billion parameters), and provide both pretrained and fine-tuned checkpoints. Gemma outperforms similarly sized open models on 11 out of 18 text-based tasks, and we present comprehensive evaluations of safety and responsibility aspects of the models, alongside a detailed description of model development. We believe the responsible release of LLMs is critical for improving the safety of frontier models, and for enabling the next wave of LLM innovations.", + "Multimodal foundation models serve numerous applications at the intersection of vision and language. Still, despite being pretrained on extensive data, they become outdated over time. To keep models updated, research into continual pretraining mainly explores scenarios with either (1) infrequent, indiscriminate updates on large-scale new data, or (2) frequent, sample-level updates. 
However, practical model deployment often operates in the gap between these two limit cases, as real-world applications often demand adaptation to specific subdomains, tasks or concepts -- spread over the entire, varying life cycle of a model. In this work, we complement current perspectives on continual pretraining through a research test bed as well as provide comprehensive guidance for effective continual model updates in such scenarios. We first introduce FoMo-in-Flux, a continual multimodal pretraining benchmark with realistic compute constraints and practical deployment requirements, constructed over 63 datasets with diverse visual and semantic coverage. Using FoMo-in-Flux, we explore the complex landscape of practical continual pretraining through multiple perspectives: (1) A data-centric investigation of data mixtures and stream orderings that emulate real-world deployment situations, (2) a method-centric investigation ranging from simple fine-tuning and traditional continual learning strategies to parameter-efficient updates and model merging, (3) meta learning rate schedules and mechanistic design choices, and (4) the influence of model and compute scaling. Together, our insights provide a practitioner's guide to continual multimodal pretraining for real-world deployment. Our benchmark and code is here: https://github.com/ExplainableML/fomo_in_flux.", + "In this work, we introduce Gemma 2, a new addition to the Gemma family of lightweight, state-of-the-art open models, ranging in scale from 2 billion to 27 billion parameters. In this new version, we apply several known technical modifications to the Transformer architecture, such as interleaving local-global attentions (Beltagy et al., 2020a) and group-query attention (Ainslie et al., 2023). We also train the 2B and 9B models with knowledge distillation (Hinton et al., 2015) instead of next token prediction. 
The resulting models deliver the best performance for their size, and even offer competitive alternatives to models that are 2-3 times bigger. We release all our models to the community.", + "StarCraft II is one of the most challenging simulated reinforcement learning environments; it is partially observable, stochastic, multi-agent, and mastering StarCraft II requires strategic planning over long time horizons with real-time low-level execution. It also has an active professional competitive scene. StarCraft II is uniquely suited for advancing offline RL algorithms, both because of its challenging nature and because Blizzard has released a massive dataset of millions of StarCraft II games played by human players. This paper leverages that and establishes a benchmark, called AlphaStar Unplugged, introducing unprecedented challenges for offline reinforcement learning. We define a dataset (a subset of Blizzard's release), tools standardizing an API for machine learning methods, and an evaluation protocol. We also present baseline agents, including behavior cloning, offline variants of actor-critic and MuZero. We improve the state of the art of agents using only offline data, and we achieve 90% win rate against previously published AlphaStar behavior cloning agent.", + "The visual classification performance of vision-language models such as CLIP has been shown to benefit from additional semantic knowledge from large language models (LLMs) such as GPT-3. In particular, averaging over LLM-generated class descriptors, e.g. \"waffle, which has a round shape\", can notably improve generalization performance. In this work, we critically study this behavior and propose WaffleCLIP, a framework for zero-shot visual classification which simply replaces LLM-generated descriptors with random character and word descriptors. Without querying external models, we achieve comparable performance gains on a large number of visual classification tasks. 
This allows WaffleCLIP to both serve as a low-cost alternative, as well as a sanity check for any future LLM-based vision-language model extensions. We conduct an extensive experimental study on the impact and shortcomings of additional semantics introduced with LLM-generated descriptors, and showcase how - if available - semantic context is better leveraged by querying LLMs for high-level concepts, which we show can be done to jointly resolve potential class name ambiguities. Code is available here: https://github.com/ExplainableML/WaffleCLIP.", + "Global medium-range weather forecasting is critical to decision-making across many social and economic domains. Traditional numerical weather prediction uses increased compute resources to improve forecast accuracy but does not directly use historical weather data to improve the underlying model. Here, we introduce GraphCast, a machine learning\u2013based method trained directly from reanalysis data. It predicts hundreds of weather variables for the next 10 days at 0.25\u00b0 resolution globally in under 1 minute. GraphCast significantly outperforms the most accurate operational deterministic systems on 90% of 1380 verification targets, and its forecasts support better severe event prediction, including tropical cyclone tracking, atmospheric rivers, and extreme temperatures. GraphCast is a key advance in accurate and efficient weather forecasting and helps realize the promise of machine learning for modeling complex dynamical systems. Editor\u2019s summary The numerical models used to predict weather are large, complex, and computationally demanding and do not learn from past weather patterns. Lam et al. introduced a machine learning\u2013based method that has been trained directly from reanalysis data of past atmospheric conditions. In this way, the authors were able to quickly predict hundreds of weather variables globally up to 10 days in advance and at high resolution. 
Their predictions were more accurate than those of traditional weather models in 90% of tested cases and displayed better severe event prediction for tropical cyclones, atmospheric rivers, and extreme temperatures. \u2014H. Jesse Smith Machine learning leads to better, faster, and cheaper weather forecasting.", + "Training deep networks requires various design decisions regarding for instance their architecture, data augmentation, or optimization. In this work, we find these training variations to result in networks learning unique feature sets from the data. Using public model libraries comprising thousands of models trained on canonical datasets like ImageNet, we observe that for arbitrary pairings of pretrained models, one model extracts significant data context unavailable in the other -- independent of overall performance. Given any arbitrary pairing of pretrained models and no external rankings (such as separate test sets, e.g. due to data privacy), we investigate if it is possible to transfer such\"complementary\"knowledge from one model to another without performance degradation -- a task made particularly difficult as additional knowledge can be contained in stronger, equiperformant or weaker models. Yet facilitating robust transfer in scenarios agnostic to pretrained model pairings would unlock auxiliary gains and knowledge fusion from any model repository without restrictions on model and problem specifics - including from weaker, lower-performance models. This work therefore provides an initial, in-depth exploration on the viability of such general-purpose knowledge transfer. Across large-scale experiments, we first reveal the shortcomings of standard knowledge distillation techniques, and then propose a much more general extension through data partitioning for successful transfer between nearly all pretrained models, which we show can also be done unsupervised. 
Finally, we assess both the scalability and impact of fundamental model properties on successful model-agnostic knowledge transfer.", + "Resource scheduling and allocation is a critical component of many high impact systems ranging from congestion control to cloud computing. Finding more optimal solutions to these problems often has significant impact on resource and time savings, reducing device wear-and-tear, and even potentially improving carbon emissions. In this paper, we focus on a specific instance of a scheduling problem, namely the memory mapping problem that occurs during compilation of machine learning programs: That is, mapping tensors to different memory layers to optimize execution time. We introduce an approach for solving the memory mapping problem using Reinforcement Learning. RL is a solution paradigm well-suited for sequential decision making problems that are amenable to planning, and combinatorial search spaces with high-dimensional data inputs. We formulate the problem as a single-player game, which we call the mallocGame, such that high-reward trajectories of the game correspond to efficient memory mappings on the target hardware. We also introduce a Reinforcement Learning agent, mallocMuZero, and show that it is capable of playing this game to discover new and improved memory mapping solutions that lead to faster execution times on real ML workloads on ML accelerators. We compare the performance of mallocMuZero to the default solver used by the Accelerated Linear Algebra (XLA) compiler on a benchmark of realistic ML workloads. In addition, we show that mallocMuZero is capable of improving the execution time of the recently published AlphaTensor matrix multiplication model.", + "Real-world data is high-dimensional: a book, image, or musical performance can easily contain hundreds of thousands of elements even after compression. 
However, the most commonly used autoregressive models, Transformers, are prohibitively expensive to scale to the number of inputs and layers needed to capture this long-range structure. We develop Perceiver AR, an autoregressive, modality-agnostic architecture which uses cross-attention to map long-range inputs to a small number of latents while also maintaining end-to-end causal masking. Perceiver AR can directly attend to over a hundred thousand tokens, enabling practical long-context density estimation without the need for hand-crafted sparsity patterns or memory mechanisms. When trained on images or music, Perceiver AR generates outputs with clear long-term coherence and structure. Our architecture also obtains state-of-the-art likelihood on long-sequence benchmarks, including 64 x 64 ImageNet images and PG-19 books.", + "Building models that can be rapidly adapted to novel tasks using only a handful of annotated examples is an open challenge for multimodal machine learning research. We introduce Flamingo, a family of Visual Language Models (VLM) with this ability. We propose key architectural innovations to: (i) bridge powerful pretrained vision-only and language-only models, (ii) handle sequences of arbitrarily interleaved visual and textual data, and (iii) seamlessly ingest images or videos as inputs. Thanks to their flexibility, Flamingo models can be trained on large-scale multimodal web corpora containing arbitrarily interleaved text and images, which is key to endow them with in-context few-shot learning capabilities. We perform a thorough evaluation of our models, exploring and measuring their ability to rapidly adapt to a variety of image and video tasks. These include open-ended tasks such as visual question-answering, where the model is prompted with a question which it has to answer; captioning tasks, which evaluate the ability to describe a scene or an event; and close-ended tasks such as multiple-choice visual question-answering. 
For tasks lying anywhere on this spectrum, a single Flamingo model can achieve a new state of the art with few-shot learning, simply by prompting the model with task-specific examples. On numerous benchmarks, Flamingo outperforms models fine-tuned on thousands of times more task-specific data.", + "General perception systems such as Perceivers can process arbitrary modalities in any combination and are able to handle up to a few hundred thousand inputs. They achieve this generality by using exclusively global attention operations. This however hinders them from scaling up to the inputs sizes required to process raw high-resolution images or video. In this paper, we show that some degree of locality can be introduced back into these models, greatly improving their efficiency while preserving their generality. To scale them further, we introduce a self-supervised approach that enables learning dense low-dimensional positional embeddings for very large signals. We call the resulting model a Hierarchical Perceiver (HiP). In sum our contributions are: 1) scaling Perceiver-type models to raw high-resolution images and audio+video, 2) showing the feasibility of learning 1M+ positional embeddings from scratch using masked auto-encoding, 3) demonstrating competitive performance on raw data from ImageNet, AudioSet, PASCAL VOC, ModelNet40 and Kinetics datasets with the same exact, unchanged model and without specialized preprocessing or any tokenization.", + "Deep Metric Learning (DML) aims to learn representation spaces on which semantic relations can simply be expressed through predefined distance metrics. Best performing approaches commonly leverage class proxies as sample stand-ins for better convergence and generalization. However, these proxy-methods solely optimize for sample-proxy distances. 
Given the inherent non-bijectiveness of used distance functions, this can induce locally isotropic sample distributions, leading to crucial semantic context being missed due to difficulties resolving local structures and intraclass relations between samples. To alleviate this problem, we propose non-isotropy regularization $(\\mathbb{NIR})$ for proxy-based Deep Metric Learning. By leveraging Normalizing Flows, we enforce unique translatability of samples from their respective class proxies. This allows us to explicitly induce a non-isotropic distribution of samples around a proxy to optimize for. In doing so, we equip proxy-based objectives to better learn local structures. Extensive experiments highlight consistent generalization benefits of NIR while achieving competitive and state-of-the-art performance on the standard benchmarks CUB200-2011, Cars196 and Stanford Online Products. In addition, we find the superior convergence properties of proxy-based methods to still be retained or even improved, making NIR very attractive for practical usage. Code available at github.com/ExplainableML/NonIsotropicProxyDML.", + "Inspired by progress in large-scale language modeling, we apply a similar approach towards building a single generalist agent beyond the realm of text outputs. The agent, which we refer to as Gato, works as a multi-modal, multi-task, multi-embodiment generalist policy. The same network with the same weights can play Atari, caption images, chat, stack blocks with a real robot arm and much more, deciding based on its context whether to output text, joint torques, button presses, or other tokens. In this report we describe the model and the data, and document the current capabilities of Gato.", + "Video streaming usage has seen a significant rise as entertainment, education, and business increasingly rely on online video. 
Optimizing video compression has the potential to increase access and quality of content to users, and reduce energy use and costs overall. In this paper, we present an application of the MuZero algorithm to the challenge of video compression. Specifically, we target the problem of learning a rate control policy to select the quantization parameters (QP) in the encoding process of libvpx, an open source VP9 video compression library widely used by popular video-on-demand (VOD) services. We treat this as a sequential decision making problem to maximize the video quality with an episodic constraint imposed by the target bitrate. Notably, we introduce a novel self-competition based reward mechanism to solve constrained RL with variable constraint satisfaction difficulty, which is challenging for existing constrained RL methods. We demonstrate that the MuZero-based rate control achieves an average 6.28% reduction in size of the compressed videos for the same delivered video quality level (measured as PSNR BD-rate) compared to libvpx's two-pass VBR rate control policy, while having better constraint satisfaction behavior." + ], + "domain": [ + "Multimodal Learning", + "Reinforcement Learning", + "Deep Learning", + "Natural Language Processing" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "0144157b-9040-4a58-a17e-b0488cce7896": { + "pk": "0144157b-9040-4a58-a17e-b0488cce7896", + "name": "Jeff Dean", + "bio": "I am a researcher deeply engaged in the intersection of machine learning and systems design, with a particular focus on optimizing large-scale computational frameworks. My recent work includes the development of Pathways, a sophisticated orchestration layer for accelerators that enables efficient parallel computations across thousands of devices while maintaining high performance for current models. 
This system exemplifies my commitment to pushing the boundaries of machine learning infrastructure, allowing for the exploration of innovative research ideas.\n\nThroughout my career, I have contributed to significant advancements in neural networks and their applications, particularly in natural language processing and computer vision. My work on Google's Neural Machine Translation (GNMT) system highlights my dedication to improving translation accuracy and efficiency, addressing challenges such as rare word handling and computational costs. Additionally, I have explored the potential of learned indexes to replace traditional data structures, demonstrating how deep learning can revolutionize data management systems.\n\nI am passionate about making machine learning accessible and effective for a wide range of applications, as evidenced by my contributions to TensorFlow, which has become a cornerstone for researchers and developers alike. My research not only focuses on theoretical advancements but also emphasizes practical implementations that can be leveraged in real-world scenarios, ultimately aiming to enhance the capabilities of AI across various domains.", + "collaborators": [ + "G. Corrado", + "Sanjay Ghemawat", + "M. Isard", + "M. Devin", + "Jonathon Shlens", + "P. Barham", + "Sherry Moore", + "G. Irving", + "Z. Chen", + "R. Monga", + "Vincent Vanhoucke", + "Quoc V. Le", + "Samy Bengio", + "Marc'Aurelio Ranzato", + "Tomas Mikolov", + "Yonghui Wu", + "I. Goodfellow", + "A. Harp", + "Yangqing Jia", + "R. J\u00f3zefowicz", + "Mart\u00edn Abadi", + "Andy Davis", + "M. Kudlur", + "J. Levenberg", + "D. Murray", + "Benoit Steiner", + "P. Tucker", + "Vijay Vasudevan", + "Pete Warden", + "M. Wicke", + "Yuan Yu", + "Lukasz Kaiser", + "M. Schuster", + "I. Sutskever", + "O. Vinyals", + "Mohammad Norouzi", + "Andrea Frome", + "Y. Singer", + "Patrick Nguyen", + "A. Senior", + "Aakanksha Chowdhery", + "S. Hand", + "D. 
Hurt", + "Hyeontaek Lim", + "Ruoming Pang", + "Sudip Roy", + "Brennan Saeta", + "Parker Schuh", + "Ryan Sepassi", + "Laurent El Shafey", + "C. A. Thekkath", + "E. Real", + "Thomas Breuel", + "Tim Kraska", + "Alex Beutel", + "Ed H. Chi", + "N. Polyzotis", + "A. Jaffey", + "Jianmin Chen", + "Xiaoqiang Zhang", + "Ashish Agarwal", + "E. Brevdo", + "C. Citro", + "Dandelion Man\u00e9", + "C. Olah", + "Kunal Talwar", + "F. Vi\u00e9gas", + "M. Wattenberg", + "Xiaoqiang Zheng", + "Wolfgang Macherey", + "M. Krikun", + "Yuan Cao", + "Qin Gao", + "Klaus Macherey", + "J. Klingner", + "Apurva Shah", + "Melvin Johnson", + "Xiaobing Liu", + "Stephan Gouws", + "Yoshikiyo Kato", + "Taku Kudo", + "H. Kazawa", + "K. Stevens", + "George Kurian", + "Nishant Patil", + "Wei Wang", + "C. Young", + "Jason R. Smith", + "Jason Riesa", + "Alex Rudnick", + "Macduff Hughes", + "D. Erhan", + "Eugene Ie", + "Andrew Rabinovich", + "Matthew D. Zeiler", + "Mark Z. Mao", + "K. Yang", + "Geoffrey E. Hinton", + "Kai Chen", + "G. 
Heigold" + ], + "pub_titles": [ + "Pathways: Asynchronous Distributed Dataflow for ML", + "A Golden Decade of Deep Learning: Computing Systems & Applications", + "The Deep Learning Revolution and Its Implications for Computer Architecture and Chip Design", + "The Case for Learned Index Structures", + "DLVM: A MODERN COMPILER FRAMEWORK FOR NEURAL NETWORK DSLS", + "Large-Scale Deep Learning For Building Intelligent Computer Systems", + "TensorFlow: A system for large-scale machine learning", + "TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems", + "Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation", + "The rise of cloud computing systems", + "DeViSE: A Deep Visual-Semantic Embedding Model", + "Using Web Co-occurrence Statistics for Improving Image Categorization", + "On rectified linear units for speech processing", + "Distributed Representations of Words and Phrases and their Compositionality", + "Multilingual acoustic models using distributed deep neural networks", + "The tail at scale", + "Zero-Shot Learning by Convex Combination of Semantic Embeddings" + ], + "pub_abstracts": [ + "We present the design of a new large scale orchestration layer for accelerators. Our system, Pathways, is explicitly designed to enable exploration of new systems and ML research ideas, while retaining state of the art performance for current models. Pathways uses a sharded dataflow graph of asynchronous operators that consume and produce futures, and efficiently gang-schedules heterogeneous parallel computations on thousands of accelerators while coordinating data transfers over their dedicated interconnects. Pathways makes use of a novel asynchronous distributed dataflow design that lets the control plane execute in parallel despite dependencies in the data plane. 
This design, with careful engineering, allows Pathways to adopt a single-controller model that makes it easier to express complex new parallelism patterns. We demonstrate that Pathways can achieve performance parity (~100% accelerator utilization) with state-of-the-art systems when running SPMD computations over 2048 TPUs, while also delivering throughput comparable to the SPMD case for Transformer models that are pipelined across 16 stages, or sharded across two islands of accelerators connected over a data center network.", + "Abstract The past decade has seen tremendous progress in the field of artificial intelligence thanks to the resurgence of neural networks through deep learning. This has helped improve the ability for computers to see, hear, and understand the world around them, leading to dramatic advances in the application of AI to many fields of science and other areas of human endeavor. In this essay, I examine the reasons for this progress, including the confluence of progress in computing hardware designed to accelerate machine learning and the emergence of open-source software frameworks to dramatically expand the set of people who can use machine learning effectively. I also present a broad overview of some of the areas in which machine learning has been applied over the past decade. Finally, I sketch out some likely directions from which further progress in artificial intelligence will come.", + "The past decade has seen a remarkable series of advances in machine learning, and in particular deep learning approaches based on artificial neural networks, to improve our abilities to build more accurate systems across a broad range of areas, including computer vision, speech recognition, language translation, and natural language understanding tasks. 
This paper is a companion paper to a keynote talk at the 2020 International Solid-State Circuits Conference (ISSCC) discussing some of the advances in machine learning, and their implications on the kinds of computational devices we need to build, especially in the post-Moore's Law-era. It also discusses some of the ways that machine learning may also be able to help with some aspects of the circuit design process. Finally, it provides a sketch of at least one interesting direction towards much larger-scale multi-task models that are sparsely activated and employ much more dynamic, example- and task-based routing than the machine learning models of today.", + "Indexes are models: a \\btree-Index can be seen as a model to map a key to the position of a record within a sorted array, a Hash-Index as a model to map a key to a position of a record within an unsorted array, and a BitMap-Index as a model to indicate if a data record exists or not. In this exploratory research paper, we start from this premise and posit that all existing index structures can be replaced with other types of models, including deep-learning models, which we term \\em learned indexes. We theoretically analyze under which conditions learned indexes outperform traditional index structures and describe the main challenges in designing learned index structures. Our initial results show that our learned indexes can have significant advantages over traditional indexes. More importantly, we believe that the idea of replacing core components of a data management system through learned models has far reaching implications for future systems designs and that this work provides just a glimpse of what might be possible.", + "Many current approaches to deep learning make use of high-level toolkits such as TensorFlow, Torch, or Caffe. Toolkits such as Caffe have a layer-based programming framework with hard-coded gradients specified for each layer type, making research using novel layer types problematic. 
Toolkits such as Torch and TensorFlow define a computation graph in a host language such as Python, where each node represents a linear algebra operation parallelized as a compute kernel on GPU and stores the result of evaluation; some of these toolkits subsequently perform runtime interpretation over that graph, storing the results of forward calculations and reverse-accumulated gradients at each node. This approach is more flexible, but these toolkits take a very limited and ad-hoc approach to performing optimization. Also problematic are the facts that most toolkits lack type safety, and target only a single (usually GPU) architecture, limiting users\u2019 abilities to make use of heterogeneous and emerging hardware architectures. We introduce a novel framework for high-level programming that addresses all of the above shortcomings.", + "For the past five years, the Google Brain team has focused on conducting research in difficult problems in artificial intelligence, on building large-scale computer systems for machine learning research, and, in collaboration with many teams at Google, on applying our research and systems to dozens of Google products. Our group has recently open-sourced the TensorFlow system (tensorflow.org), a system designed to easily express machine ideas, and to quickly train, evaluate and deploy machine learning systems. In this talk, I'll highlight some of the design decisions we made in building TensorFlow, discuss research results produced within our group, and describe ways in which these ideas have been applied to a variety of problems in Google's products, usually in close collaboration with other teams. This talk describes joint work with many people at Google.", + "TensorFlow is a machine learning system that operates at large scale and in heterogeneous environments. Tensor-Flow uses dataflow graphs to represent computation, shared state, and the operations that mutate that state. 
It maps the nodes of a dataflow graph across many machines in a cluster, and within a machine across multiple computational devices, including multicore CPUs, general-purpose GPUs, and custom-designed ASICs known as Tensor Processing Units (TPUs). This architecture gives flexibility to the application developer: whereas in previous \"parameter server\" designs the management of shared state is built into the system, TensorFlow enables developers to experiment with novel optimizations and training algorithms. TensorFlow supports a variety of applications, with a focus on training and inference on deep neural networks. Several Google services use TensorFlow in production, we have released it as an open-source project, and it has become widely used for machine learning research. In this paper, we describe the TensorFlow dataflow model and demonstrate the compelling performance that TensorFlow achieves for several real-world applications.", + "TensorFlow is an interface for expressing machine learning algorithms, and an implementation for executing such algorithms. A computation expressed using TensorFlow can be executed with little or no change on a wide variety of heterogeneous systems, ranging from mobile devices such as phones and tablets up to large-scale distributed systems of hundreds of machines and thousands of computational devices such as GPU cards. The system is flexible and can be used to express a wide variety of algorithms, including training and inference algorithms for deep neural network models, and it has been used for conducting research and for deploying machine learning systems into production across more than a dozen areas of computer science and other fields, including speech recognition, computer vision, robotics, information retrieval, natural language processing, geographic information extraction, and computational drug discovery. 
This paper describes the TensorFlow interface and an implementation of that interface that we have built at Google. The TensorFlow API and a reference implementation were released as an open-source package under the Apache 2.0 license in November, 2015 and are available at www.tensorflow.org.", + "Neural Machine Translation (NMT) is an end-to-end learning approach for automated translation, with the potential to overcome many of the weaknesses of conventional phrase-based translation systems. Unfortunately, NMT systems are known to be computationally expensive both in training and in translation inference. Also, most NMT systems have difficulty with rare words. These issues have hindered NMT's use in practical deployments and services, where both accuracy and speed are essential. In this work, we present GNMT, Google's Neural Machine Translation system, which attempts to address many of these issues. Our model consists of a deep LSTM network with 8 encoder and 8 decoder layers using attention and residual connections. To improve parallelism and therefore decrease training time, our attention mechanism connects the bottom layer of the decoder to the top layer of the encoder. To accelerate the final translation speed, we employ low-precision arithmetic during inference computations. To improve handling of rare words, we divide words into a limited set of common sub-word units (\"wordpieces\") for both input and output. This method provides a good balance between the flexibility of \"character\"-delimited models and the efficiency of \"word\"-delimited models, naturally handles translation of rare words, and ultimately improves the overall accuracy of the system. Our beam search technique employs a length-normalization procedure and uses a coverage penalty, which encourages generation of an output sentence that is most likely to cover all the words in the source sentence. 
On the WMT'14 English-to-French and English-to-German benchmarks, GNMT achieves competitive results to state-of-the-art. Using a human side-by-side evaluation on a set of isolated simple sentences, it reduces translation errors by an average of 60% compared to Google's phrase-based production system.", + "In this talk I will describe the development of systems that underlie modern cloud computing systems. This development shares much of its motivation with the related fields of transaction processing systems and high performance computing, but because of scale, these systems tend to have more emphasis on fault tolerance using software techniques. Important developments in the development of modern cloud systems include very high performance distributed file system, such as the Google File System (Ghemawat et al., SOSP 2003), reliable computational frameworks such as MapReduce (Dean & Ghemawat, OSDI 2004) and Dryad (Isard et al., 2007), and large scale structured storage systems such as BigTable (Chang et al. 2006), Dynamo (DeCandia et al., 2007), and Spanner (Corbett et al., 2012). Scheduling computations can either be done using virtual machines (exemplified by VMWare's products), or as individual processes or containers. The development of public cloud platforms such as AWS, Microsoft Azure, and Google Cloud Platform, allow external developers to utilize these large-scale services to build new and interesting services and products, benefiting from the economies of scale of large datacenters and the ability to grow and shrink computing resources on demand across millions of customers.", + "Modern visual recognition systems are often limited in their ability to scale to large numbers of object categories. This limitation is in part due to the increasing difficulty of acquiring sufficient training data in the form of labeled images as the number of object categories grows. 
One remedy is to leverage data from other sources - such as text data - both to train visual models and to constrain their predictions. In this paper we present a new deep visual-semantic embedding model trained to identify visual objects using both labeled image data as well as semantic information gleaned from unannotated text. We demonstrate that this model matches state-of-the-art performance on the 1000-class ImageNet object recognition challenge while making more semantically reasonable errors, and also show that the semantic information can be exploited to make predictions about tens of thousands of image labels not observed during training. Semantic knowledge improves such zero-shot predictions achieving hit rates of up to 18% across thousands of novel labels never seen by the visual model.", + "Object recognition and localization are important tasks in computer vision. The focus of this work is the incorporation of contextual information in order to improve object recognition and localization. For instance, it is natural to expect not to see an elephant to appear in the middle of an ocean. We consider a simple approach to encapsulate such common sense knowledge using co-occurrence statistics from web documents. By merely counting the number of times nouns (such as elephants, sharks, oceans, etc.) co-occur in web documents, we obtain a good estimate of expected co-occurrences in visual data. We then cast the problem of combining textual co-occurrence statistics with the predictions of image-based classifiers as an optimization problem. The resulting optimization problem serves as a surrogate for our inference procedure. Albeit the simplicity of the resulting optimization problem, it is effective in improving both recognition and localization accuracy. 
Concretely, we observe significant improvements in recognition and localization rates for both ImageNet Detection 2012 and Sun 2012 datasets.", + "Deep neural networks have recently become the gold standard for acoustic modeling in speech recognition systems. The key computational unit of a deep network is a linear projection followed by a point-wise non-linearity, which is typically a logistic function. In this work, we show that we can improve generalization and make training of deep networks faster and simpler by substituting the logistic units with rectified linear units. These units are linear when their input is positive and zero otherwise. In a supervised setting, we can successfully train very deep nets from random initialization on a large vocabulary speech recognition task achieving lower word error rates than using a logistic network with the same topology. Similarly in an unsupervised setting, we show how we can learn sparse features that can be useful for discriminative tasks. All our experiments are executed in a distributed environment using several hundred machines and several hundred hours of speech data.", + "The recently introduced continuous Skip-gram model is an efficient method for learning high-quality distributed vector representations that capture a large number of precise syntactic and semantic word relationships. In this paper we present several extensions that improve both the quality of the vectors and the training speed. By subsampling of the frequent words we obtain significant speedup and also learn more regular word representations. We also describe a simple alternative to the hierarchical softmax called negative sampling. An inherent limitation of word representations is their indifference to word order and their inability to represent idiomatic phrases. For example, the meanings of \"Canada\" and \"Air\" cannot be easily combined to obtain \"Air Canada\". 
Motivated by this example, we present a simple method for finding phrases in text, and show that learning good vector representations for millions of phrases is possible.", + "Today's speech recognition technology is mature enough to be useful for many practical applications. In this context, it is of paramount importance to train accurate acoustic models for many languages within given resource constraints such as data, processing power, and time. Multilingual training has the potential to solve the data issue and close the performance gap between resource-rich and resource-scarce languages. Neural networks lend themselves naturally to parameter sharing across languages, and distributed implementations have made it feasible to train large networks. In this paper, we present experimental results for cross- and multi-lingual network training of eleven Romance languages on 10k hours of data in total. The average relative gains over the monolingual baselines are 4%/2% (data-scarce/data-rich languages) for cross- and 7%/2% for multi-lingual training. However, the additional gain from jointly training the languages on all data comes at an increased training time of roughly four weeks, compared to two weeks (monolingual) and one week (crosslingual).", + "Software techniques that tolerate latency variability are vital to building responsive large-scale Web services.", + "Abstract: Several recent publications have proposed methods for mapping images into continuous semantic embedding spaces. In some cases the embedding space is trained jointly with the image transformation. In other cases the semantic embedding space is established by an independent natural language processing task, and then the image transformation into that space is learned in a second stage. 
Proponents of these image embedding systems have stressed their advantages over the traditional \\nway{} classification framing of image understanding, particularly in terms of the promise for zero-shot learning -- the ability to correctly annotate images of previously unseen object categories. In this paper, we propose a simple method for constructing an image embedding system from any existing \\nway{} image classifier and a semantic word embedding model, which contains the $\\n$ class labels in its vocabulary. Our method maps images into the semantic embedding space via convex combination of the class label embedding vectors, and requires no additional training. We show that this simple and direct method confers many of the advantages associated with more complex image embedding schemes, and indeed outperforms state of the art methods on the ImageNet zero-shot learning task." + ], + "domain": [ + "Machine Learning", + "Deep Learning", + "Neural Networks", + "Computer Vision" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "14ee7bb9-eb0e-4350-8b68-3728df1e3f59": { + "pk": "14ee7bb9-eb0e-4350-8b68-3728df1e3f59", + "name": "Cynthia Rudin", + "bio": "I am a researcher dedicated to advancing methods for observational causal inference and machine learning, with a strong emphasis on interpretability, scalability, and practical application. My work revolves around developing frameworks like Model-to-Match, which combines outcome modeling with matching techniques to enhance treatment effect estimation while ensuring auditability and flexibility. I have also contributed to the field of nonlinear dynamical systems through my algorithm, OKRidge, which efficiently solves sparse ridge regression problems.\n\nA significant aspect of my research is the exploration of the Rashomon set, which allows for the identification of diverse, near-optimal models. 
This work culminated in the development of Timbertrek, an interactive visualization tool that empowers users to navigate and select models based on their specific needs and values. My commitment to interpretability extends to various domains, including healthcare, where I have designed interpretable deep learning models for clinical decision-making.\n\nI am particularly passionate about bridging the gap between complex machine learning models and domain expertise, ensuring that our methodologies not only perform well but are also understandable and actionable. My research aims to create tools that facilitate better decision-making in high-stakes environments, ultimately contributing to more responsible and effective applications of machine learning in society.", + "collaborators": [ + "M. Seltzer", + "Chudi Zhong", + "Jiachang Liu", + "Zhi Chen", + "A. Volfovsky", + "Rui Xin", + "Quinn Lanners", + "Harsh Parikh", + "David Page", + "Takuya Takagi", + "Sam Rosen", + "Stephen Hahn", + "Rico Zhu", + "Simon Mak", + "Yue Jiang", + "Marco Morucci", + "Jacob Peloquin", + "A. Kirillova", + "L. Brinson", + "K. Gall", + "Fernanda Bravo", + "Yaron Shaposhnik", + "Yuting Yuan", + "Chunxiao Li", + "T. McCormick", + "Boxuan Li", + "A. Barnett", + "Vaibhav Sharma", + "Neel Gajjar", + "Jerry Fang", + "F. Schwartz", + "J. Lo", + "Y. Mansour", + "Michal Moshkovitz", + "Haiyang Huang", + "Vaishali Jain", + "Ted Enamorado", + "Zijie J. 
Wang", + "Duen Horng Chau", + "Ali Behrouz", + "Mathias L\u00e9cuyer" + ], + "pub_titles": [ + "Variable importance matching for causal inference", + "OKRidge: Scalable Optimal k-Sparse Ridge Regression for Learning Dynamical Systems", + "From Feature Importance to Distance Metric: An Almost Exact Matching Approach for Causal Inference", + "Exploring and Interacting with the Set of Good Sparse Generalized Additive Models", + "An Interpretable, Flexible, and Interactive Probabilistic Framework for Melody Generation", + "Matched Machine Learning: A Generalized Framework for Treatment Effect Inference With Learned Metrics", + "Interpretable Prediction Rules for Congestion Risk in Intensive Care Units", + "Fast Sparse Classification for Generalized Linear and Additive Models", + "Rethinking Nonlinear Instrumental Variable Models through Prediction Validity", + "FasterRisk: Fast and Accurate Interpretable Risk Scores", + "Optimal Sparse Regression Trees", + "Interpretable deep learning models for better clinician-AI communication in clinical mammography", + "There is no Accuracy-Interpretability Tradeoff in Reinforcement Learning for Mazes", + "SegDiscover: Visual Concept Discovery via Unsupervised Semantic Segmentation", + "Exploring the Whole Rashomon Set of Sparse Decision Trees", + "TimberTrek: Exploring and Curating Sparse Decision Trees with Interactive Visualization", + "Fast optimization of weighted sparse decision trees for use in optimal treatment regimes and optimal policy design" + ], + "pub_abstracts": [ + "Our goal is to produce methods for observational causal inference that are auditable, easy to troubleshoot, accurate for treatment effect estimation, and scalable to high-dimensional data. We describe a general framework called Model-to-Match that achieves these goals by (i) learning a distance metric via outcome modeling, (ii) creating matched groups using the distance metric, and (iii) using the matched groups to estimate treatment effects. 
Model-to-Match uses variable importance measurements to construct a distance metric, making it a flexible framework that can be adapted to various applications. Concentrating on the scalability of the problem in the number of potential confounders, we operationalize the Model-to-Match framework with LASSO. We derive performance guarantees for settings where LASSO outcome modeling consistently identifies all confounders (importantly without requiring the linear model to be correctly specified). We also provide experimental results demonstrating the method's auditability, accuracy, and scalability as well as extensions to more general nonparametric outcome modeling.", + "We consider an important problem in scientific discovery, namely identifying sparse governing equations for nonlinear dynamical systems. This involves solving sparse ridge regression problems to provable optimality in order to determine which terms drive the underlying dynamics. We propose a fast algorithm, OKRidge, for sparse ridge regression, using a novel lower bound calculation involving, first, a saddle point formulation, and from there, either solving (i) a linear system or (ii) using an ADMM-based approach, where the proximal operators can be efficiently evaluated by solving another linear system and an isotonic regression problem. We also propose a method to warm-start our solver, which leverages a beam search. Experimentally, our methods attain provable optimality with run times that are orders of magnitude faster than those of the existing MIP formulations solved by the commercial solver Gurobi.", + ",", + "In real applications, interaction between machine learning models and domain experts is critical; however, the classical machine learning paradigm that usually produces only a single model does not facilitate such interaction. 
Approximating and exploring the Rashomon set, i.e., the set of all near-optimal models, addresses this practical challenge by providing the user with a searchable space containing a diverse set of models from which domain experts can choose. We present algorithms to efficiently and accurately approximate the Rashomon set of sparse, generalized additive models with ellipsoids for fixed support sets and use these ellipsoids to approximate Rashomon sets for many different support sets. The approximated Rashomon set serves as a cornerstone to solve practical challenges such as (1) studying the variable importance for the model class; (2) finding models under user-specified constraints (monotonicity, direct editing); and (3) investigating sudden changes in the shape functions. Experiments demonstrate the fidelity of the approximated Rashomon set and its effectiveness in solving practical challenges.", + "The fast-growing demand for algorithmic music generation is found throughout entertainment, art, education, etc. Unfortunately, most recent models are practically impossible to interpret or musically fine-tune, as they use deep neural networks with thousands of parameters. We introduce an interpretable, flexible, and interactive model, SchenkComposer, for melody generation that empowers users to be creative in all aspects of the music generation pipeline and allows them to learn from the process. We divide the task of melody generation into steps based on the process that a human composer using music-theoretical domain knowledge might use. First, the model determines phrase structure based on form analysis and identifies an appropriate number of measures. Using concepts from Schenkerian analysis, the model then finds a fitting harmonic rhythm, middleground harmonic progression, foreground rhythm, and melody in a hierarchical, scaffolded approach using a probabilistic context-free grammar based on musical contours. 
By incorporating theories of musical form and harmonic structure, our model produces music with long-term structural coherence. In extensive human experiments, we find that music generated with our approach successfully passes a Turing test in human experiments while current state-of-the-art approaches fail, and we further demonstrate superior performance and preference for our melodies compared to existing melody generation methods. Additionally, we developed and deployed a public website for SchenkComposer, and conducted preliminary user surveys. Through analysis, we show the strong viability and enjoyability of SchenkComposer.", + "We introduce Matched Machine Learning, a framework that combines the flexibility of machine learning black boxes with the interpretability of matching, a longstanding tool in observational causal inference. Interpretability is paramount in many high-stakes application of causal inference. Current tools for nonparametric estimation of both average and individualized treatment effects are black-boxes that do not allow for human auditing of estimates. Our framework uses machine learning to learn an optimal metric for matching units and estimating outcomes, thus achieving the performance of machine learning black-boxes, while being interpretable. Our general framework encompasses several published works as special cases. We provide asymptotic inference theory for our proposed framework, enabling users to construct approximate confidence intervals around estimates of both individualized and average treatment effects. We show empirically that instances of Matched Machine Learning perform on par with black-box machine learning methods and better than existing matching methods for similar problems. 
Finally, in our application we show how Matched Machine Learning can be used to perform causal inference even when covariate data are highly complex: we study an image dataset, and produce high quality matches and estimates of treatment effects.", + "We study the problem of predicting congestion risk in intensive care units (ICUs). Congestion is associated with poor service experience, high costs, and poor health outcomes. By predicting future congestion, decision makers can initiate preventive measures, such as rescheduling activities or increasing short-term capacity, to mitigate the effects of congestion. To this end, we consider well-established queueing models of ICUs and define \u201chigh-risk states\u201d as system states that are likely to lead to congestion in the near future. We strive to formulate rules for determining whether a given system state is high risk. We design the rules to be interpretable (informally, easy to understand) for their practical appeal to stakeholders. We show that for simple Markovian queueing systems, such as the [Formula: see text] queue with multiple patient classes, our rules take the form of linear and quadratic functions on the state space. For more general queueing systems, we employ methods from queueing theory, simulation, and machine learning (ML) to devise interpretable prediction rules, and we demonstrate their effectiveness through an extensive computational study, which includes a large-scale ICU model validated using data. Our study shows that congestion risk can be effectively and transparently predicted using linear ML models and interpretable features engineered from the queueing model representation of the system. History: This paper has been accepted for the Service Science/Stochastic Systems Joint Special Issue. Supplemental Material: The online appendix is available at https://doi.org/10.1287/stsy.2022.0018 .", + "We present fast classification techniques for sparse generalized linear and additive models. 
These techniques can handle thousands of features and thousands of observations in minutes, even in the presence of many highly correlated features. For fast sparse logistic regression, our computational speed-up over other best-subset search techniques owes to linear and quadratic surrogate cuts for the logistic loss that allow us to efficiently screen features for elimination, as well as use of a priority queue that favors a more uniform exploration of features. As an alternative to the logistic loss, we propose the exponential loss, which permits an analytical solution to the line search at each iteration. Our algorithms are generally 2 to 5 times faster than previous approaches. They produce interpretable models that have accuracy comparable to black box models on challenging datasets.", + "Instrumental variables (IV) are widely used in the social and health sciences in situations where a researcher would like to measure a causal effect but cannot perform an experiment. For valid causal inference in an IV model, there must be external (exogenous) variation that (i) has a sufficiently large impact on the variable of interest (called the relevance assumption) and where (ii) the only pathway through which the external variation impacts the outcome is via the variable of interest (called the exclusion restriction). For statistical inference, researchers must also make assumptions about the functional form of the relationship between the three variables. Current practice assumes (i) and (ii) are met, then postulates a functional form with limited input from the data. In this paper, we describe a framework that leverages machine learning to validate these typically unchecked but consequential assumptions in the IV framework, providing the researcher empirical evidence about the quality of the instrument given the data at hand. Central to the proposed approach is the idea of prediction validity. 
Prediction validity checks that error terms \u2013 which should be independent from the instrument \u2013 cannot be modeled with machine learning any better than a model that is identically zero. We use prediction validity to develop both one-stage and two-stage approaches for IV, and demonstrate their performance on an example relevant to climate change policy.", + "Over the last century, risk scores have been the most popular form of predictive model used in healthcare and criminal justice. Risk scores are sparse linear models with integer coefficients; often these models can be memorized or placed on an index card. Typically, risk scores have been created either without data or by rounding logistic regression coefficients, but these methods do not reliably produce high-quality risk scores. Recent work used mathematical programming, which is computationally slow. We introduce an approach for efficiently producing a collection of high-quality risk scores learned from data. Specifically, our approach produces a pool of almost-optimal sparse continuous solutions, each with a different support set, using a beam-search algorithm. Each of these continuous solutions is transformed into a separate risk score through a\"star ray\"search, where a range of multipliers are considered before rounding the coefficients sequentially to maintain low logistic loss. Our algorithm returns all of these high-quality risk scores for the user to consider. This method completes within minutes and can be valuable in a broad variety of applications.", + "Regression trees are one of the oldest forms of AI models, and their predictions can be made without a calculator, which makes them broadly useful, particularly for high-stakes applications. Within the large literature on regression trees, there has been little effort towards full provable optimization, mainly due to the computational hardness of the problem. 
This work proposes a dynamic-programming-with-bounds approach to the construction of provably-optimal sparse regression trees. We leverage a novel lower bound based on an optimal solution to the k-Means clustering algorithm on one dimensional data. We are often able to find optimal sparse trees in seconds, even for challenging datasets that involve large numbers of samples and highly-correlated features.", + "There is increasing interest in using deep learning and computer vision to help guide clinical decisions, such as whether to order a biopsy based on a mammogram. Existing networks are typically black box, unable to explain how they make their predictions. We present an interpretable deep-learning network which explains its predictions in terms of BI-RADS features mass shape and mass margin. Our model predicts mass margin and mass shape, then uses the logits from those interpretable models to predict malignancy, also using an interpretable model. The interpretable mass margin model explains its predictions using a prototypical parts model. The interpretable mass shape model predicts segmentations, fits an ellipse, then determines shape based on the goodness of fit and eccentricity of the fitted ellipse. While including mass shape logits in the malignancy prediction model did not improve performance, we present this technique as part of a framework for better clinician-AI communication.", + "Interpretability is an essential building block for trustworthiness in reinforcement learning systems. However, interpretability might come at the cost of deteriorated performance, leading many researchers to build complex models. Our goal is to analyze the cost of interpretability. We show that in certain cases, one can achieve policy interpretability while maintaining its optimality. We focus on a classical problem from reinforcement learning: mazes with $k$ obstacles in $\\mathbb{R}^d$. 
We prove the existence of a small decision tree with a linear function at each inner node and depth $O(\\log k + 2^d)$ that represents an optimal policy. Note that for the interesting case of a constant $d$, we have $O(\\log k)$ depth. Thus, in this setting, there is no accuracy-interpretability tradeoff. To prove this result, we use a new\"compressing\"technique that might be useful in additional settings.", + "Visual concept discovery has long been deemed important to improve interpretability of neural networks, because a bank of semantically meaningful concepts would provide us with a starting point for building machine learning models that exhibit intelligible reasoning process. Previous methods have disadvantages: either they rely on labelled support sets that incorporate human biases for objects that are\"useful,\"or they fail to identify multiple concepts that occur within a single image. We reframe the concept discovery task as an unsupervised semantic segmentation problem, and present SegDiscover, a novel framework that discovers semantically meaningful visual concepts from imagery datasets with complex scenes without supervision. Our method contains three important pieces: generating concept primitives from raw images, discovering concepts by clustering in the latent space of a self-supervised pretrained encoder, and concept refinement via neural network smoothing. Experimental results provide evidence that our method can discover multiple concepts within a single image and outperforms state-of-the-art unsupervised methods on complex datasets such as Cityscapes and COCO-Stuff. Our method can be further used as a neural network explanation tool by comparing results obtained by different encoders.", + "In any given machine learning problem, there might be many models that explain the data almost equally well. 
However, most learning algorithms return only one of these models, leaving practitioners with no practical way to explore alternative models that might have desirable properties beyond what could be expressed by a loss function. The Rashomon set is the set of these all almost-optimal models. Rashomon sets can be large in size and complicated in structure, particularly for highly nonlinear function classes that allow complex interaction terms, such as decision trees. We provide the first technique for completely enumerating the Rashomon set for sparse decision trees; in fact, our work provides the first complete enumeration of any Rashomon set for a non-trivial problem with a highly nonlinear discrete function class. This allows the user an unprecedented level of control over model choice among all models that are approximately equally good. We represent the Rashomon set in a specialized data structure that supports efficient querying and sampling. We show three applications of the Rashomon set: 1) it can be used to study variable importance for the set of almost-optimal trees (as opposed to a single tree), 2) the Rashomon set for accuracy enables enumeration of the Rashomon sets for balanced accuracy and F1-score, and 3) the Rashomon set for a full dataset can be used to produce Rashomon sets constructed with only subsets of the data set. Thus, we are able to examine Rashomon sets across problems with a new lens, enabling users to choose models rather than be at the mercy of an algorithm that produces only a single model.", + "Given thousands of equally accurate machine learning (ML) models, how can users choose among them? A recent ML technique enables domain experts and data scientists to generate a complete Rashomon set for sparse decision trees-a huge set of almost-optimal inter-pretable ML models. 
To help ML practitioners identify models with desirable properties from this Rashomon set, we develop Tim-bertrek, the first interactive visualization system that summarizes thousands of sparse decision trees at scale. Two usage scenarios high-light how Timbertrek can empower users to easily explore, compare, and curate models that align with their domain knowledge and values. Our open-source tool runs directly in users' computational notebooks and web browsers, lowering the barrier to creating more responsible ML models. Timbertrek is available at the following public demo link: https: //poloclub. github. io/timbertrek.", + "Sparse decision trees are one of the most common forms of interpretable models. While recent advances have produced algorithms that fully optimize sparse decision trees for prediction, that work does not address policy design, because the algorithms cannot handle weighted data samples. Specifically, they rely on the discreteness of the loss function, which means that real-valued weights cannot be directly used. For example, none of the existing techniques produce policies that incorporate inverse propensity weighting on individual data points. We present three algorithms for efficient sparse weighted decision tree optimization. The first approach directly optimizes the weighted loss function; however, it tends to be computationally inefficient for large datasets. Our second approach, which scales more efficiently, transforms weights to integer values and uses data duplication to transform the weighted decision tree optimization problem into an unweighted (but larger) counterpart. Our third algorithm, which scales to much larger datasets, uses a randomized procedure that samples each data point with a probability proportional to its weight. 
We present theoretical bounds on the error of the two fast methods and show experimentally that these methods can be two orders of magnitude faster than the direct optimization of the weighted loss, without losing significant accuracy." + ], + "domain": [ + "Causal Inference", + "Machine Learning", + "Interpretability", + "Decision Trees" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "13289ce2-2acd-4eb9-a43a-971d09ee4ab6": { + "pk": "13289ce2-2acd-4eb9-a43a-971d09ee4ab6", + "name": "Sebastian Raschka", + "bio": "I am a researcher with a diverse background in machine learning, bioinformatics, and computational biology. My work primarily focuses on developing innovative algorithms and models that enhance privacy, improve predictive analytics, and deepen our understanding of biological interactions. \n\nOne of my notable contributions is the design of a convolutional autoencoder that protects gender privacy in face images while maintaining recognition accuracy. This project involved a novel semi-adversarial training scheme, showcasing my ability to blend machine learning techniques with practical applications in privacy protection.\n\nIn the realm of bioinformatics, I have explored protein-ligand interactions through my SiteInterlock approach, which leverages rigidity theory to predict native binding modes. This work not only enhances the accuracy of binding predictions but also provides insights into the cooperative nature of molecular interactions.\n\nAdditionally, I have developed tools for sentiment analysis in music, utilizing naive Bayes classifiers to predict emotional responses based on lyrics. 
My passion for teaching and sharing knowledge is reflected in my contributions to literature on Python machine learning, where I guide readers through the complexities of data science and predictive analytics.\n\nOverall, my research is driven by a commitment to bridging theoretical concepts with real-world applications, whether it\u2019s through enhancing privacy in facial recognition or advancing our understanding of molecular biology.", + "collaborators": [ + "R. Olson", + "Pronojit Saha", + "Nathan", + "Randy J. Carnevale", + "Ted", + "kadarakos", + "ktkirk", + "Daniel", + "derekjanni", + "screwed", + "Vahid Mirjalili", + "F. O'Donovan", + "Grishma Jena", + "A. Namboodiri", + "A. Ross", + "S. Turner", + "Daniel S. Standage", + "Cui Jie", + "Phelim Bradley", + "Daniel E Cook", + "deepstop", + "\u00c9. Normandeau", + "HLiang", + "Joseph Bemister-Buffington", + "L. Kuhn", + "Akshay Varik", + "weixuanfu", + "Randal S. Olson" + ], + "pub_titles": [ + "Semi-adversarial Networks: Convolutional Autoencoders for Imparting Privacy to Face Images", + "BioPandas: Working with molecular structures in pandas DataFrames", + "MusicMood: Predicting the mood of music from song lyrics using machine learning", + "Detecting the native ligand orientation by interfacial rigidity: SiteInterlock", + "Python machine learning : unlock deeper insights into machine learning with this vital guide to cutting-edge predictive analytics", + "Naive Bayes and Text Classification I - Introduction and Theory", + "An Overview of General Performance Metrics of Binary Classifier Systems", + "Statistical Identification of Potential CLAVATA2 Interactors by Fluorescence Resonance Energy Transfer Analysis" + ], + "pub_abstracts": [ + "In this paper, we design and evaluate a convolutional autoencoder that perturbs an input face image to impart privacy to a subject. 
Specifically, the proposed autoencoder transforms an input face image such that the transformed image can be successfully used for face recognition but not for gender classification. In order to train this autoencoder, we propose a novel training scheme, referred to as semi-adversarial training in this work. The training is facilitated by attaching a semi-adversarial module consisting of an auxiliary gender classifier and an auxiliary face matcher to the autoencoder. The objective function utilized for training this network has three terms: one to ensure that the perturbed image is a realistic face image; another to ensure that the gender attributes of the face are confounded; and a third to ensure that biometric recognition performance due to the perturbed image is not impacted. Extensive experiments confirm the efficacy of the proposed architecture in extending gender privacy to face images.", + "Furthermore, useful small-molecule related functions are provided for reading and parsing millions of small molecule structures (from multi-MOL2 files (Tripos 2007)) fast and efficiently in virtual screening applications. Inbuilt functions for filtering molecules by the presence of functional groups and their pair-wise distances to each other make BioPandas a particularly attractive utility library for virtual screening and protein-ligand docking applications.", + "Sentiment prediction of contemporary music can have a wide-range of applications in modern society, for instance, selecting music for public institutions such as hospitals or restaurants to potentially improve the emotional well-being of personnel, patients, and customers, respectively. In this project, music recommendation system built upon on a naive Bayes classifier, trained to predict the sentiment of songs based on song lyrics alone. 
The experimental results show that music corresponding to a happy mood can be detected with high precision based on text features obtained from song lyrics.", + "Understanding the physical attributes of protein\u2010ligand interfaces, the source of most biological activity, is a fundamental problem in biophysics. Knowing the characteristic features of interfaces also enables the design of molecules with potent and selective interactions. Prediction of native protein\u2010ligand interactions has traditionally focused on the development of physics\u2010based potential energy functions, empirical scoring functions that are fit to binding data, and knowledge\u2010based potentials that assess the likelihood of pairwise interactions. Here we explore a new approach, testing the hypothesis that protein\u2010ligand binding results in computationally detectable rigidification of the protein\u2010ligand interface. Our SiteInterlock approach uses rigidity theory to efficiently measure the relative interfacial rigidity of a series of small\u2010molecule ligand orientations and conformations for a number of protein complexes. In the majority of cases, SiteInterlock detects a near\u2010native binding mode as being the most rigid, with particularly robust performance relative to other methods when the ligand\u2010free conformation of the protein is provided. The interfacial rigidification of both the protein and ligand prove to be important characteristics of the native binding mode. This measure of rigidity is also sensitive to the spatial coupling of interactions and bond\u2010rotational degrees of freedom in the interface. While the predictive performance of SiteInterlock is competitive with the best of the five other scoring functions tested, its measure of rigidity encompasses cooperative rather than just additive binding interactions, providing novel information for detecting native\u2010like complexes. 
SiteInterlock shows special strength in enhancing the prediction of native complexes by ruling out inaccurate poses. Proteins 2016; 84:1888\u20131901. \u00a9 2016 Wiley Periodicals, Inc.", + "Unlock deeper insights into Machine Leaning with this vital guide to cutting-edge predictive analytics About This Book * Leverage Python's most powerful open-source libraries for deep learning, data wrangling, and data visualization * Learn effective strategies and best practices to improve and optimize machine learning systems and algorithms * Ask and answer tough questions of your data with robust statistical models, built for a range of datasets Who This Book Is For If you want to find out how to use Python to start answering critical questions of your data, pick up Python Machine Learning whether you want to get started from scratch or want to extend your data science knowledge, this is an essential and unmissable resource. What You Will Learn * Explore how to use different machine learning models to ask different questions of your data * Learn how to build neural networks using Keras and Theano * Find out how to write clean and elegant Python code that will optimize the strength of your algorithms * Discover how to embed your machine learning model in a web application for increased accessibility * Predict continuous target outcomes using regression analysis * Uncover hidden patterns and structures in data with clustering * Organize data using effective pre-processing techniques * Get to grips with sentiment analysis to delve deeper into textual and social media data In Detail Machine learning and predictive analytics are transforming the way businesses and other organizations operate. Being able to understand trends and patterns in complex data is critical to success, becoming one of the key strategies for unlocking growth in a challenging contemporary marketplace. 
Python can help you deliver key insights into your data its unique capabilities as a language let you build sophisticated algorithms and statistical models that can reveal new perspectives and answer key questions that are vital for success. Python Machine Learning gives you access to the world of predictive analytics and demonstrates why Python is one of the world's leading data science languages. If you want to ask better questions of data, or need to improve and extend the capabilities of your machine learning systems, this practical data science book is invaluable. Covering a wide range of powerful Python libraries, including scikit-learn, Theano, and Keras, and featuring guidance and tips on everything from sentiment analysis to neural networks, you'll soon be able to answer some of the most important questions facing you and your organization. Style and approach Python Machine Learning connects the fundamental theoretical principles behind machine learning to their practical application in a way that focuses you on asking and answering the right questions. It walks you through the key elements of Python and its powerful machine learning libraries, while demonstrating how to get to grips with a range of statistical models.", + "Naive Bayes classifiers, a family of classifiers that are based on the popular Bayes' probability theorem, are known for creating simple yet well performing models, especially in the fields of document classification and disease prediction. In this article, we will look at the main concepts of naive Bayes classification in the context of document categorization.", + "This document provides a brief overview of different metrics and terminology that is used to measure the performance of binary classification systems.", + "The overall goal of this study was to identify potential interactors of the CLAVATA2 (CLV2) membrane receptor, which is participating in the stem cell signaling pathway of the model plant Arabidopsis thaliana. 
In order to investigate the physical interaction between those proteins, a fluorescence resonance energy transfer (FRET) analysis was conducted. Data have been collected all by myself during my undergraduate laboratory experiences in August, 2011, at the Department of Developmental Genetics at Heinrich-Heine University D\u00fcsseldorf." + ], + "domain": [ + "Machine Learning", + "Computer Vision", + "Bioinformatics", + "Sentiment Analysis" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "660ac6f5-6dea-40ba-bdd9-e56de39a7aaf": { + "pk": "660ac6f5-6dea-40ba-bdd9-e56de39a7aaf", + "name": "Rupesh Kumar Srivastava", + "bio": "I am a researcher dedicated to advancing the fields of machine learning and optimization, with a particular focus on the intersection of neural networks and design optimization under uncertainty. My work has explored innovative approaches to automatically generate image descriptions using multimodal models, achieving state-of-the-art results on benchmarks like Microsoft COCO. I have delved into the dynamics of neural network activation functions, revealing how local competition among neurons can enhance performance and mitigate issues like catastrophic forgetting.\n\nMy research also emphasizes the application of evolutionary algorithms (EAs) in design optimization, particularly in scenarios where uncertainty is a significant factor. I have developed bi-objective evolutionary approaches that leverage evidence theory to optimize design objectives while minimizing the plausibility of constraint violations. 
This work has demonstrated substantial computational efficiencies, particularly through GPU-based parallelization, allowing for rapid evaluations in complex design problems.\n\nAdditionally, I am intrigued by the concept of curiosity-driven problem-solving, as exemplified by the POWERPLAY framework, which fosters the development of artificial explorers capable of generating and solving novel problems. My recent efforts in Generalized Compressed Network Search (GCNS) aim to enhance neural network efficiency by identifying critical frequencies for better compression and generalization.\n\nOverall, my research is characterized by a commitment to pushing the boundaries of machine learning and optimization, with a focus on practical applications and innovative methodologies that address real-world challenges.", + "collaborators": [ + "J. Schmidhuber", + "K. Deb", + "Faustino J. Gomez", + "Jonathan Masci", + "Bas R. Steunebrink", + "Hao Fang", + "Saurabh Gupta", + "F. Iandola", + "L. Deng", + "Piotr Doll\u00e1r", + "Jianfeng Gao", + "Xiaodong He", + "Margaret Mitchell", + "John C. Platt", + "C. L. Zitnick", + "G. Zweig", + "Rupesh Tulshyan", + "Sohrob Kazerounian", + "Marijn F. Stollenga" + ], + "pub_titles": [ + "From captions to visual concepts and back", + "Understanding Locally Competitive Networks", + "An evolutionary algorithm based approach to design optimization using evidence theory", + "Compete to Compute", + "An evolutionary based Bayesian design optimization approach under incomplete information", + "Continually adding self-invented problems to the repertoire: First experiments with POWERPLAY", + "Generalized compressed network search", + "An EA-based approach to design optimization using evidence theory" + ], + "pub_abstracts": [ + "This paper presents a novel approach for automatically generating image descriptions: visual detectors, language models, and multimodal similarity models learnt directly from a dataset of image captions. 
We use multiple instance learning to train visual detectors for words that commonly occur in captions, including many different parts of speech such as nouns, verbs, and adjectives. The word detector outputs serve as conditional inputs to a maximum-entropy language model. The language model learns from a set of over 400,000 image descriptions to capture the statistics of word usage. We capture global semantics by re-ranking caption candidates using sentence-level features and a deep multimodal similarity model. Our system is state-of-the-art on the official Microsoft COCO benchmark, producing a BLEU-4 score of 29.1%. When human judges compare the system captions to ones written by other people on our held-out test set, the system captions have equal or better quality 34% of the time.", + "Recently proposed neural network activation functions such as rectified linear, maxout, and local winner-take-all have allowed for faster and more effective training of deep neural architectures on large and complex datasets. The common trait among these functions is that they implement local competition between small groups of computational units within a layer, so that only part of the network is activated for any given input pattern. In this paper, we attempt to visualize and understand this self-modularization, and suggest a unified explanation for the beneficial properties of such networks. We also show how our insights can be directly useful for efficiently performing retrieval over large datasets using neural networks.", + "For problems involving uncertainties in design variables and parameters, a bi-objective evolutionary algorithm (EA) based approach to design optimization using evidence theory is proposed and implemented in this paper. In addition to a functional objective, a plausibility measure of failure of constraint satisfaction is minimized. Despite some interests in classical optimization literature, this is the first attempt to use evidence theory with an EA. 
Due to EA\u2019s flexibility in its operators, non-requirement of any gradient, its ability to handle multiple conflicting objectives, and ease of parallelization, evidence-based design optimization using an EA is promising. Results on a test problem and a couple of engineering design problems show that the modified evolutionary multi-objective optimization (EMO) algorithm is capable of finding a widely distributed trade-off frontier showing different optimal solutions corresponding to different levels of plausibility failure limits. Furthermore, a single-objective evidence based EA is found to produce better optimal solutions than a previously reported classical optimization procedure. The use of a GPU based parallel computing platform demonstrates EA\u2019s performance enhancement around 160 to 700 times in implementing plausibility computations. Handling uncertainties of different types are getting increasingly popular in applied optimization studies and this EA based study should motivate further studies in handling uncertainties.", + "Local competition among neighboring neurons is common in biological neural networks (NNs). In this paper, we apply the concept to gradient-based, backprop-trained artificial multilayer NNs. NNs with competing linear units tend to outperform those with non-competing nonlinear units, and avoid catastrophic forgetting when training sets change over time.", + "Design optimization in the absence of complete information about uncertain quantities has been recently gaining consideration, as expensive repetitive computation tasks are becoming tractable due to the invention of faster and parallel computers. This work uses Bayesian inference to quantify design reliability when only sample measurements of the uncertain quantities are available. A generalized Bayesian reliability based design optimization algorithm has been proposed and implemented for numerical as well as engineering design problems. 
The approach uses an evolutionary algorithm (EA) to obtain a trade-off front between design objectives and reliability. The Bayesian approach provides a well-defined link between the amount of available information and the reliability through a confidence measure, and the EA acts as an efficient optimizer for a discrete and multi-dimensional objective space. Additionally, a GPU-based parallelization study shows computational speed-up of close to 100 times in a simulated scenario wherein the constraint qualification checks may be time consuming and could render a sequential implementation that can be impractical for large sample sets. These results show promise for the use of a parallel implementation of EAs in handling design optimization problems under uncertainties.", + "Pure scientists do not only invent new methods to solve given problems. They also invent new problems. The recent POWERPLAY framework formalizes this type of curiosity and creativity in a new, general, yet practical way. To acquire problem solving prowess through playing, POWERPLAY-based artificial explorers by design continually come up with the fastest to find, initially novel, but eventually solvable problems. They also continually simplify or speed up solutions to previous problems. We report on results of first experiments with POWERPLAY. A self-delimiting recurrent neural network (SLIM RNN) is used as a general computational architecture to implement the system's solver. Its weights can encode arbitrary, self-delimiting, halting or non-halting programs affecting both environment (through effectors) and internal states encoding abstractions of event sequences. 
In open-ended fashion, our POWERPLAY-driven RNNs learn to become increasingly general problem solvers, continually adding new problem solving procedures to the growing repertoire, exhibiting interesting developmental stages.", + "This paper presents initial results of Generalized Compressed Network Search (GCNS), a method for automatically identifying the important frequencies for neural networks encoded as a set of Fourier-type coefficients (i.e. \"compressed\" networks). GCNS achieves better compression than our previous approach, and promises better generalization capabilities. Results for a high-dimensional Octopus arm control problem show that a high fitness 3680-weight network can be encoded using less than 10 coefficients, using the frequencies identified by GCNS.", + "For problems involving uncertainties in design variables and parameters, a bi-objective evolutionary algorithm (EA) based approach to design optimization using evidence theory is proposed and implemented in this paper. In addition to a functional objective, a plausibility measure of failure of constraint satisfaction is minimized. Despite some interests in classical optimization literature, such a consideration in EA is rare. Due to EA's flexibility in its operators, non-requirement of any gradient, its ability to handle multiple conflicting objectives, and ease of parallelization, evidence-based design optimization using an EA is promising. Results on a test problem and a couple of engineering design problems show that the modified evolutionary multi-objective optimization (EMO) algorithm is capable of finding a widely distributed trade-off frontier showing different optimal solutions corresponding to different levels of plausibility failure limits. Furthermore, a single-objective evidence based EA is found to produce better optimal solutions than a previously reported classical optimization procedure. 
Handling uncertainties of different types are getting increasingly popular in applied optimization studies and more such studies using EAs will make EAs more useful and pragmatic in practical optimization problem-solving tasks." + ], + "domain": [ + "Computer Vision", + "Neural Networks", + "Evolutionary Algorithms", + "Design Optimization" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "86ae7766-7ca5-4702-aa08-3f1cc1223d1a": { + "pk": "86ae7766-7ca5-4702-aa08-3f1cc1223d1a", + "name": "Jan Koutn\u00edk", + "bio": "I am a researcher specializing in the intersection of neuroevolution and reinforcement learning (RL), with a focus on developing innovative neural network architectures that can effectively handle high-dimensional input spaces. My work has led to significant advancements in evolving compact recurrent neural networks (RNNs) that can control complex tasks, such as driving in the TORCS racing simulator, using only visual input. By employing deep learning techniques, I have successfully transformed high-dimensional visual data into compact feature vectors, enabling the evolution of efficient neural controllers.\n\nOne of my notable contributions is the introduction of the Clockwork RNN (CW-RNN), which partitions the hidden layer into modules that process inputs at varying temporal granularities. This architecture not only simplifies the model but also enhances performance across various tasks, including audio signal generation and handwriting recognition. Additionally, I have explored indirect encoding methods for neural networks, utilizing Fourier coefficients to represent weight matrices, which significantly reduces the search space dimensionality and accelerates convergence in complex RL tasks.\n\nMy research also includes the development of benchmarks, such as the Super Mario Bros. 
RL benchmark, which allows for the investigation of learning strategies in high-dimensional environments. Through my work, I aim to push the boundaries of neuroevolution and RL, making these powerful techniques more accessible and effective for a wide range of applications.", + "collaborators": [ + "J. Schmidhuber", + "Faustino J. Gomez", + "M. Snorek", + "Jan Drchal", + "Giuseppe Cuccu", + "V. K\u016frkov\u00e1", + "Roman Neruda", + "Klaus Greff", + "Bas R. Steunebrink", + "K. Th\u00f3risson", + "Eric Nivel", + "T. Glasmachers", + "V. Graziano", + "P. Kord\u00edk", + "O. Kov\u00e1r\u00edk", + "Miroslav epek", + "J. Togelius", + "S. Karakovskiy", + "Ondrej Kapral", + "Zden\u011bk Buk" + ], + "pub_titles": [ + "Evolving deep unsupervised convolutional networks for vision-based reinforcement learning", + "A Clockwork RNN", + "Evolving large-scale neural networks for vision-based TORCS", + "Evolving large-scale neural networks for vision-based reinforcement learning", + "Complexity search for compressed neural networks", + "A Frequency-Domain Encoding for Neuroevolution", + "Searching for Minimal Neural Networks in Fourier Space", + "Evolving neural networks in compressed weight space", + "Super mario evolution", + "HyperNEAT controlled robots learn how to drive on roads in simulated environment" + ], + "pub_abstracts": [ + "Dealing with high-dimensional input spaces, like visual input, is a challenging task for reinforcement learning (RL). Neuroevolution (NE), used for continuous RL problems, has to either reduce the problem dimensionality by (1) compressing the representation of the neural network controllers or (2) employing a pre-processor (compressor) that transforms the high-dimensional raw inputs into low-dimensional features. In this paper, we are able to evolve extremely small recurrent neural network (RNN) controllers for a task that previously required networks with over a million weights. 
The high-dimensional visual input, which the controller would normally receive, is first transformed into a compact feature vector through a deep, max-pooling convolutional neural network (MPCNN). Both the MPCNN preprocessor and the RNN controller are evolved successfully to control a car in the TORCS racing simulator using only visual input. This is the first use of deep learning in the context evolutionary RL.", + "Sequence prediction and classification are ubiquitous and challenging problems in machine learning that can require identifying complex dependencies between temporally distant inputs. Recurrent Neural Networks (RNNs) have the ability, in theory, to cope with these temporal dependencies by virtue of the short-term memory implemented by their recurrent (feedback) connections. However, in practice they are difficult to train successfully when long-term memory is required. This paper introduces a simple, yet powerful modification to the simple RNN (SRN) architecture, the Clockwork RNN (CW-RNN), in which the hidden layer is partitioned into separate modules, each processing inputs at its own temporal granularity, making computations only at its prescribed clock rate. Rather than making the standard RNN models more complex, CW-RNN reduces the number of SRN parameters, improves the performance significantly in the tasks tested, and speeds up the network evaluation. The network is demonstrated in preliminary experiments involving three tasks: audio signal generation, TIMIT spoken word classification, where it outperforms both SRN and LSTM networks, and online handwriting recognition, where it outperforms SRNs.", + "The TORCS racing simulator has become a standard testbed used in many recent reinforcement learning competitions, where an agent must learn to drive a car around a track using a small set of task-specific features. 
In this paper, large, recurrent neural networks (with over 1 million weights) are evolved to solve a much more challenging version of the task that instead uses only a stream of images from the driver\u2019s perspective as input. Evolving such large nets is made possible by representing them in the frequency domain as a set of coefficients that are transformed into weight matrices via an inverse Fourier-type transform. To our knowledge this is the first attempt to tackle TORCS using vision, and successfully evolve a neural network controllers of this size.", + "The idea of using evolutionary computation to train artificial neural networks, or neuroevolution (NE), for reinforcement learning (RL) tasks has now been around for over 20 years. However, as RL tasks become more challenging, the networks required become larger, as do their genomes. But, scaling NE to large nets (i.e. tens of thousands of weights) is infeasible using direct encodings that map genes one-to-one to network components. In this paper, we scale-up our compressed network encoding where network weight matrices are represented indirectly as a set of Fourier-type coefficients, to tasks that require very-large networks due to the high-dimensionality of their input space. The approach is demonstrated successfully on two reinforcement learning tasks in which the control networks receive visual input: (1) a vision-based version of the octopus control task requiring networks with over 3 thousand weights, and (2) a version of the TORCS driving game where networks with over 1 million weights are evolved to drive a car around a track using video images from the driver's perspective.", + "In this paper, we introduce a method, called Compressed Network Complexity Search (CNCS), for automatically determining the complexity of compressed networks (neural networks encoded indirectly by Fourier-type coefficients) that favors parsimonious solutions. 
CNCS maintains a probability distribution over complexity classes that it uses to select which class to optimize. Class probabilities are adapted based on their expected fitness, starting with a prior biased toward the simplest networks. Experiments on a challenging non-linear version of the helicopter hovering task, show that the method consistently finds simple solutions.", + "Neuroevolution has yet to scale up to complex reinforcement learning tasks that require large networks. Networks with many inputs (e.g. raw video) imply a very high dimensional search space if encoded directly. Indirect methods use a more compact genotype representation that is transformed into networks of potentially arbitrary size. In this paper, we present an indirect method where networks are encoded by a set of Fourier coefficients which are transformed into network weight matrices via an inverse Fourier-type transform. Because there often exist network solutions whose weight matrices contain regularity (i.e. adjacent weights are correlated), the number of coefficients required to represent these networks in the frequency domain is much smaller than the number of weights (in the same way that natural images can be compressed by ignore high-frequency components). This \"compressed\" encoding is compared to the direct approach where search is conducted in the weight space on the high-dimensional octopus arm task. The results show that representing networks in the frequency domain can reduce the search-space dimensionality by as much as two orders of magnitude, both accelerating convergence and yielding more general solutions.", + "The principle of minimum description length suggests looking for the simplest network that works well on the training examples, where simplicity is measured by network description size based on a reasonable programming language for encoding networks. 
Previous work used an assembler-like universal network encoding language (NEL) and Speed Priorbased search (related to Levin\u2019s Universal Search) to quickly find low-complexity nets with excellent generalization performance. Here we define a more natural and often more practical NEL whose instructions are frequency domain coefficients. Frequency coefficients may get encoded by few bits, hence huge weight matrices may just be low-complexity superpositions of patterns computed by programs with few elementary instructions. On various benchmarks this weight matrix encoding greatly accelerates the search. The scheme was tested on pole-balancing, long-term dependency T-maze, and ball throwing. Some of the solutions turn out to be unexpectedly simple as they are computable by fairly short bit", + "We propose a new indirect encoding scheme for neural networks in which the weight matrices are represented in the frequency domain by sets Fourier coefficients. This scheme exploits spatial regularities in the matrix to reduce the dimensionality of the representation by ignoring high-frequency coefficients, as is done in lossy image compression. We compare the efficiency of searching in this \"compressed\" network space to searching in the space of directly encoded networks, using the CoSyNE neuroevolution algorithm on three benchmark problems: pole-balancing, ball throwing and octopus arm control. The results show that this encoding can dramatically reduce the search space dimensionality such that solutions can be found in significantly fewer evaluations", + "We introduce a new reinforcement learning benchmark based on the classic platform game Super Mario Bros. The benchmark has a high-dimensional input space, and achieving a good score requires sophisticated and varied strategies. However, it has tunable difficulty, and at the lowest difficulty setting decent score can be achieved using rudimentary strategies and a small fraction of the input space. 
To investigate the properties of the benchmark, we evolve neural network-based controllers using different network architectures and input spaces. We show that it is relatively easy to learn basic strategies capable of clearing individual levels of low difficulty, but that these controllers have problems with generalization to unseen levels and with taking larger parts of the input space into account. A number of directions worth exploring for learning betterperforming strategies are discussed.", + "In this paper we describe simulation of autonomous robots controlled by recurrent neural networks, which are evolved through indirect encoding using HyperNEAT algorithm. The robots utilize 180 degree wide sensor array. Thanks to the scalability of the neural network generated by HyperNEAT, the sensor array can have various resolution. This would allow to use camera as an input for neural network controller used in real robot. The robots were simulated using software simulation environment. In the experiments the robots were trained to drive with imaximum average speed. Such fitness forces them to learn how to drive on roads and avoid collisions. Evolved neural networks show excellent scalability. Scaling of the sensory input breaks performance of the robots, which should be gained back with re-training of the robot with a different sensory input resolution." + ], + "domain": [ + "Reinforcement Learning", + "Neuroevolution", + "Neural Networks", + "Deep Learning" + ], + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "8bb82077-222d-4e19-b754-7bb824bb421b": { + "pk": "8bb82077-222d-4e19-b754-7bb824bb421b", + "name": "Bas R. Steunebrink", + "bio": "I am a researcher dedicated to exploring the intersection of artificial intelligence and human communication, with a particular focus on how artificial agents can learn language and social skills through observation. 
My work has led to the development of the auto-catalytic, endogenous, reflective architecture (AERA), which enables agents to learn complex multimodal language by observing human interactions in real-time scenarios, such as mock television interviews. Through my experiments, I have demonstrated that agents can acquire a rich understanding of language, including pragmatics, semantics, and syntax, without any pre-defined grammar, relying solely on high-level goals and a small ontology.\n\nI am also deeply interested in the broader implications of artificial general intelligence (AGI) and the importance of self-reflective systems. My research emphasizes the need for AGI systems to reason about their own code and improve autonomously, which I believe is crucial for achieving true intelligence. Additionally, I have explored the role of emotions in decision-making, proposing that understanding human emotions can enhance the design of artificial agents, allowing them to interact more effectively with humans.\n\nMy work is driven by a desire to bridge the gap between cognitive science and AI, advocating for a more holistic approach to understanding intelligence. I aim to create systems that not only perform tasks but also learn and adapt in dynamic environments, ultimately contributing to the development of intelligent agents that can thrive in the complexities of human society.", + "collaborators": [ + "J. Schmidhuber", + "M. Dastani", + "K. Th\u00f3risson", + "J. Meyer", + "Eric Nivel", + "H. Dindo", + "G. Pezzulo", + "D. Ognibene", + "Helgi P\u00e1ll Helgason", + "A. Chella", + "R. Sanz", + "Manuel Rodr\u00edguez", + "C. Hern\u00e1ndez", + "G. Jonsson", + "C. H. Corbato", + "R. Srivastava", + "Nivel Nivel", + "R. S. Bravo", + "Manuel Hernandez", + "Pei Wang", + "Kristinn R. \u00de\u00f3risson", + "J. Koutn\u00edk", + "M. Rodriguez", + "Marijn F. 
Stollenga" + ], + "pub_titles": [ + "AUTONOMOUS ACQUISITION OF NATURAL LANGUAGE", + "Autonomous Acquisition of Natural Situated Communication", + "What Should AGI Learn From AI & CogSci ?", + "Bounded Recursive Self-Improvement", + "Continually adding self-invented problems to the repertoire: First experiments with POWERPLAY", + "Towards an Actual G\u00f6del Machine Implementation: a Lesson in Self-Reflective Systems", + "Emotions to control agent deliberation", + "The logical structure of emotions", + "Modularity in BDI-Based Multi-agent Programming Languages", + "The OCC Model Revisited", + "A Formal Model of Emotions: Integrating Qualitative and Quantitative Aspects" + ], + "pub_abstracts": [ + "An important part of human intelligence is the ability to use language. Humans learn how to use language in a society of language users, which is probably the most effective way to learn a language from the ground up. Principles that might allow an artificial agents to learn language this way are not known at present. Here we present a framework which begins to address this challenge. Our auto-catalytic, endogenous, reflective architecture (AERA) supports the creation of agents that can learn natural language by observation. We present results from two experiments where our S1 agent learns human communication by observing two humans interacting in a realtime mock television interview, using gesture and situated language. Results show that S1 can learn multimodal complex language and multimodal communicative acts, using a vocabulary of 100 words with numerous sentence formats, by observing unscripted interaction between the humans, with no grammar being provided to it a priori, and only high-level information about the format of the human interaction in the form of high-level goals of the interviewer and interviewee and a small ontology. 
The agent learns both the pragmatics, semantics, and syntax of complex sentences spoken by the human subjects on the topic of recycling of objects such as aluminum cans, glass bottles, plastic, and wood, as well as use of manual deictic reference and anaphora.", + "An important part of human intelligence, both historically and operationally, is our ability to communicate. We learn how to communicate, and maintain our communicative skills, in a society of communicators \u2013 a highly effective way to reach and maintain proficiency in this complex skill. Principles that might allow artificial agents to learn language this way are in completely known at present \u2013 the multi-dimensional nature of socio-communicative skills are beyond every machine learning framework so far proposed. Our work begins to address the challenge of proposing a way for observation-based machine learning of natural language and communication. Our framework can learn complex communicative skills with minimal up-front knowledge. The system learns by incrementally producing predictive models of causal relationships in observed data, guided by goal-inference and reasoning using forward-inverse models. We present results from two experiments where our S1 agent learns human communication by observing two humans interacting in a realtime TV-style interview, using multimodal communicative gesture and situated language to talk about recycling of various materials and objects. S1 can learn multimodal complex language and multimodal communicative acts, a vocabulary of 100 words forming natural sentences with relatively complex sentence structure, including manual deictic reference and anaphora. S1 is seeded only with high-level information about goals of the interviewer and interviewee, and a small ontology; no grammar or other information is provided to S1 a priori. 
The agent learns the pragmatics, semantics, and syntax of complex utterances spoken and gestures from scratch, by observing the humans compare and contrast the cost and pollution related to recycling aluminum cans, glass bottles, newspaper, plastic, and wood. After 20 hours of observation S1 can perform an unscripted TV interview with a human, in the same style, without making mistakes.", + "While the fields of artificial intelligence (AI) and cognitive science (CogSci) both originated from a deep interest in the same phenomenon \u2013 intelligence \u2013 and both setting themselves high aims in their early days, each has since greatly narrowed its focus, and all but abandoned their core subject for a more limited version of the phenomenon. The many non-obvious causes for this change over the decades are perhaps understandable, but they have significantly reduced the potential of both fields to impact our understanding of the fundamentals of intelligence \u2013 in the wild and in the laboratory. This position paper argues that researchers in the field of artificial general intelligence (AGI) should carefully posit their research objectives and methodology to avoid repeating the same mistakes. 1 The Big Picture of Intelligence and Cognition Roughly speaking, artificial intelligence (AI) and cognitive science (CogSci) come from the same observation and imagination, namely that in a certain sense, the human mind and the electronic computer are \u2013 or can become \u2013 similar to each other. The similarities (and differences) have been suggested by many people, including Wiener [26], Turing [16], von Neumann [9], McCulloch and Pitt [7], though each from a different perspective. Initiated in this atmosphere, AI and CogSci can be seen as two sides of the same coin: while the former attempts to build a mind-like machine [11], the latter tries to study the mind as a machine [1]. 
Their relation is like that between engineering and science in general, that is, there is a strong mutual dependence. It is obvious that, to build an intelligent system, one has to have a clear idea about how intelligence works, and most of our knowledge on that topic comes from the study of the human mind. On the other hand, to evaluate the correctness of a theory of cognition, a straightforward way is to model it in an artifact to see if it produces the expected results. Given this relation, it is natural for AI to get inspiration from CogSci, as well as for CogSci to use AI models. Various theories have been proposed both to explain the phenomena observed in human cognition and to guide the design of machine intelligence (cf. [8, 10]). However, as the difficulties in this research became more and more clear, the mainstream in both fields gradually departed from the original objective to", + "We have designed a machine that becomes increasingly better at behaving in underspecified circumstances, in a goal-directed way, on the job, by modeling itself and its environment as experience accumulates. Based on principles of autocatalysis, endogeny, and reflectivity, the work provides an architectural blueprint for constructing systems with high levels of operational autonomy in underspecified circumstances, starting from a small seed. Through value-driven dynamic priority scheduling controlling the parallel execution of a vast number of reasoning threads, the system achieves recursive self-improvement after it leaves the lab, within the boundaries imposed by its designers. A prototype system has been implemented and demonstrated to learn a complex real-world task, real-time multimodal dialogue with humans, by on-line observation. Our work presents solutions to several challenges that must be solved for achieving artificial general intelligence.", + "Pure scientists do not only invent new methods to solve given problems. They also invent new problems. 
The recent POWERPLAY framework formalizes this type of curiosity and creativity in a new, general, yet practical way. To acquire problem solving prowess through playing, POWERPLAY-based artificial explorers by design continually come up with the fastest to find, initially novel, but eventually solvable problems. They also continually simplify or speed up solutions to previous problems. We report on results of first experiments with POWERPLAY. A self-delimiting recurrent neural network (SLIM RNN) is used as a general computational architecture to implement the system's solver. Its weights can encode arbitrary, self-delimiting, halting or non-halting programs affecting both environment (through effectors) and internal states encoding abstractions of event sequences. In open-ended fashion, our POWERPLAY-driven RNNs learn to become increasingly general problem solvers, continually adding new problem solving procedures to the growing repertoire, exhibiting interesting developmental stages.", + "Recently, interest has been revived in self-reflective systems in the context of Artificial General Intelligence (AGI). An AGI system should be intelligent enough to be able to reason about its own program code, and make modifications where it sees fit, improving on the initial code written by human programmers. A pertinent example is the G\u00f6del Machine, which employs a proof searcher\u2014in parallel to its regular problem solving duties\u2014to find a self-rewrite of which it can prove that it will be beneficial.", + "The execution of an artificial agent is usually implemented with a sense--reason--act cycle. This cycle includes tasks such as event processing, generating and revising plans, and selecting actions to execute. However, there are typically many choices in the design of such a cycle, which are often hard-coded in the cycle in an ad hoc way. 
The question of this paper is how one decides, in a principled way, how often and which reasoning rules to apply, how to interleave the execution of plans, or when to start replanning. This paper proposes and formalizes the eliciting conditions of hope, fear, joy, and distress according to a well-known psychological model of human emotion. These conditions are then used to reduce the choices an agent can make in each state. They formalize the idea that emotions focus an agent's attention on what is important in each state.", + "Even though emotions sometimes lead us astray, there is mounting evidence from psychology and neurology that emotions have---on the whole---a positive effect on intelligent decision making and acting. Emotions help both overtly and covertly by focusing a person's attention to what is important and pruning unpromising directions of reasoning. Like humans, artificial agents---such as robots and virtual characters---have to act intelligently under resource constraints. A deep understanding of how human emotions function as innate and learned heuristics can help us in designing more effective artificial agents. Even if one does not want artificial agents to behave emotionally, it will still be useful to make these agents have knowledge of human emotions, so that they can take these into account when interacting or cooperating with humans. In order to incorporate emotions in artificial agents, a bridge must be built from psychological models of human emotions to computer science. This is done in this dissertation by capturing an emotion theory in a formal agent specification language. This formalization both serves as a foundation for the implementation of emotions in artificial agents, and enables us to formally analyze properties of the psychological model, leading to a more precise understanding of the workings of human emotions", + "This paper proposes a module-based vision for designing BDI-based multi-agent programming languages. 
The introduced concept of modules enables common programming techniques such as encapsulation and information hiding for BDI-based programs, and facilitates the implementation of agent roles and profiles. This vision is applied to a BDI-based agent programming language to which specific programming constructs are added to allow the implementation of modules. The syntax and intuitive semantics of module based programming constructs are explained. An example is presented to illustrate how modules can be used to implement BDI-based multi-agent systems." \ No newline at end of file diff --git a/research_bench/Profile.pkl b/research_bench/Profile.pkl new file mode 100644 index 00000000..f399fc6a Binary files /dev/null and b/research_bench/Profile.pkl differ diff --git a/research_bench/build_agent_number_ablation_paper_subset.py b/research_bench/build_agent_number_ablation_paper_subset.py new file mode 100644 index 00000000..1284136e --- /dev/null +++ b/research_bench/build_agent_number_ablation_paper_subset.py @@ -0,0 +1,44 @@ +import json + +with open('./paper_bench/paper_bench_hard_500_filtered_1205.json', 'r') as f: + dataset_hard = json.load(f) + +with open('./paper_bench/paper_bench_mid_500_filtered_1205.json', 'r') as f: + dataset_mid = json.load(f) + +with open('./paper_bench/paper_bench_easy_500_filtered_1205.json', 'r') as f: + dataset_easy = json.load(f) + +dataset = {**dataset_hard} + + +agent_number_ablation_dataset = {} +for key, value in dataset.items(): + author_num = len(value['paper_data']['authors']) + if author_num >= 5: + agent_number_ablation_dataset[key] = value + +print(len(agent_number_ablation_dataset)) +with open('./paper_bench/agent_number_ablation_paper_bench.json', 'w') as f: + json.dump(agent_number_ablation_dataset, f, indent=4) + +''' +paper_number_ablation_dataset = {} +for key, value in dataset.items(): + reference = value['paper_data']['references'][0] + if 'reference_section' in reference: + references = 
value['paper_data']['references'] + section_names = [] + + for ref in references: + if 'reference_section' in ref.keys(): + if ref['reference_section'] is not None: + for section_name in ref['reference_section']: + section_names.append(section_name.lower()) + + if 'related work' in section_names and 'introduction' in section_names: + paper_number_ablation_dataset[key] = value + +with open('./paper_bench/paper_number_ablation_paper_bench.json', 'w') as f: + json.dump(paper_number_ablation_dataset, f, indent=4) +''' \ No newline at end of file diff --git a/research_bench/check_arxiv_paper_topic.py b/research_bench/check_arxiv_paper_topic.py new file mode 100644 index 00000000..a12ddeef --- /dev/null +++ b/research_bench/check_arxiv_paper_topic.py @@ -0,0 +1,37 @@ +import arxiv +import json +from tqdm import tqdm +from concurrent.futures import ThreadPoolExecutor + +# Load arXiv IDs +with open('./paper_bench/paper_bench_full.json', 'r') as f: + paper_bench_full = json.load(f) + arxiv_ids = list(paper_bench_full.keys()) + +# Function to process a batch of arXiv IDs +def process_batch(batch): + updated_papers = {} + search = arxiv.Search(id_list=batch) + for result in search.results(): + categories = result.categories + updated_papers[result.entry_id.split('/')[-1].split('v')[0]] = {"categories": categories} + return updated_papers + +# Batch size +batch_size = 10 + +# Collect results +updated_results = {} +with ThreadPoolExecutor() as executor: + for i in tqdm(range(0, len(arxiv_ids), batch_size)): + batch = arxiv_ids[i:i+batch_size] + results = executor.submit(process_batch, batch).result() + updated_results.update(results) + +# Update original dictionary +for arxiv_id, data in updated_results.items(): + paper_bench_full[arxiv_id]['paper_data']['categories'] = data['categories'] + +# Write results to file +with open('./paper_bench/paper_bench_full_with_categories.json', 'w') as f: + json.dump(paper_bench_full, f, indent=4) diff --git 
a/research_bench/clean_cross_domain_paper.py b/research_bench/clean_cross_domain_paper.py new file mode 100644 index 00000000..ff37eabc --- /dev/null +++ b/research_bench/clean_cross_domain_paper.py @@ -0,0 +1,41 @@ +import json + +with open('./oodbench/oodbench_1203.json', 'r') as f: + dataset = json.load(f) + +with open('./oodbench/oodbench_ml_1203.json', 'r') as f: + ml_dataset = json.load(f) + +dataset = {**dataset, **ml_dataset} + +#with open('./oodbench/oodbench_paper_titles.txt') as f: +# paper_titles = f.read().splitlines() + +#filtered_dataset = {} +#for key, value in dataset.items(): +# filtered_dataset[key] = value + + +new_dataset = {} +for key, data in dataset.items(): + authors = data['paper_data']['authors'] + title = data['paper_data']['title'] + author_info_dict = data['author_data'] + valid_references = [ref for ref in data['paper_data']['references'] if ref['abstract'] is not None] + if len(authors) != len(author_info_dict): + print(len(authors), len(author_info_dict)) + continue + if len(valid_references) < 5: + continue + if data['paper_data']['abstract'] is None or len(data['paper_data']['abstract']) < 5: + continue + if data['paper_data']['introduction'] is None or len(data['paper_data']['introduction']) < 5: + continue + new_dataset[key] = data + +# select 100 papers in new_dataset +new_dataset = dict(list(new_dataset.items())[:100]) +print(len(new_dataset)) + +with open('./oodbench/oodbench_1203_filtered.json', 'w') as f: + json.dump(new_dataset, f, indent=4) \ No newline at end of file diff --git a/research_bench/compare_mlbench_and_mlbench_full.py b/research_bench/compare_mlbench_and_mlbench_full.py new file mode 100644 index 00000000..9c583279 --- /dev/null +++ b/research_bench/compare_mlbench_and_mlbench_full.py @@ -0,0 +1,17 @@ +import json + +with open('./mlbench/mlbench_full.json', 'r') as f: + mlbench_full = json.load(f) + +with open('./mlbench/mlbench.json', 'r') as f: + mlbench = json.load(f) + +import pdb; pdb.set_trace() +assert 
list(mlbench_full.keys()) == list(mlbench.keys()) + +for key in mlbench_full.keys(): + full_data = mlbench_full[key] + data = mlbench[key] + assert full_data['paper_data'] == data['paper_data'] + #assert full_data['author_data'] == data['author_data'] + assert full_data['reference_proposal'] == data['reference_proposal'] diff --git a/research_bench/create_bench_from_paper_links.py b/research_bench/create_bench_from_paper_links.py index 9564b647..892d3c94 100644 --- a/research_bench/create_bench_from_paper_links.py +++ b/research_bench/create_bench_from_paper_links.py @@ -28,14 +28,14 @@ def get_arxiv_ids(input_file: str) -> List[str]: return arxiv_ids -def process_single_arxiv_id(arxiv_id: str, config: Config) -> Tuple[str, Any]: +def process_single_arxiv_id(arxiv_id: str, config: Config, with_year_limit: bool) -> Tuple[str, Any]: """Processes a single arXiv ID, handling any errors gracefully.""" try: paper_data = get_paper_data(arxiv_id) return arxiv_id, { 'paper_data': paper_data, 'author_data': get_author_data( - arxiv_id, paper_data['authors'], paper_data['title'], config + arxiv_id, paper_data['authors'], paper_data['title'], config, with_year_limit=with_year_limit, ), 'reference_proposal': get_proposal_from_paper( arxiv_id, paper_data['introduction'], config @@ -56,7 +56,7 @@ def save_benchmark_data(data: Dict[str, Any], output: str) -> None: def process_arxiv_ids( - arxiv_ids: List[str], output: str, config: Config, num_processes: int + arxiv_ids: List[str], output: str, config: Config, num_processes: int, with_year_limit: bool ) -> None: """Processes arXiv IDs using multiprocessing, saving results after each batch.""" arxiv_ids_chunks = [ @@ -69,14 +69,14 @@ def process_arxiv_ids( if num_processes == 1: # Single-process mode results = [ - process_single_arxiv_id(arxiv_id, config) for arxiv_id in chunk + process_single_arxiv_id(arxiv_id, config, with_year_limit) for arxiv_id in chunk ] else: # Multiprocessing mode with Pool(processes=num_processes) as pool: 
results = pool.starmap( process_single_arxiv_id, - [(arxiv_id, config) for arxiv_id in chunk], + [(arxiv_id, config, with_year_limit) for arxiv_id in chunk], ) # Filter out None results and save data @@ -101,6 +101,11 @@ def parse_args() -> argparse.Namespace: default=1, help='Number of processes to use. Set to 1 for single-process mode. Default is based on available CPU cores.', ) + parser.add_argument( + '--with_year_limit', + action='store_true', + help='Limit the number of papers to those published within the same year as the input paper.', + ) return parser.parse_args() @@ -108,7 +113,7 @@ def main() -> None: args = parse_args() arxiv_ids = get_arxiv_ids(args.input) config = Config('../configs') - process_arxiv_ids(arxiv_ids, args.output, config, args.num_processes) + process_arxiv_ids(arxiv_ids, args.output, config, args.num_processes, args.with_year_limit) if __name__ == '__main__': diff --git a/research_bench/create_crossbench.sh b/research_bench/create_crossbench.sh index b9237072..99b8184f 100644 --- a/research_bench/create_crossbench.sh +++ b/research_bench/create_crossbench.sh @@ -1 +1 @@ -python create_bench_from_paper_links.py --input ./crossbench/crossbench_paper_links.txt --output ./crossbench/crossbench.json +python create_bench_from_paper_links.py --input ./crossbench/crossbench_paper_links_cross_arxiv_category.txt --output ./crossbench/crossbench_1201.json diff --git a/research_bench/create_oodbench.sh b/research_bench/create_oodbench.sh new file mode 100644 index 00000000..ffa697db --- /dev/null +++ b/research_bench/create_oodbench.sh @@ -0,0 +1 @@ +python create_bench_from_paper_links.py --input ./oodbench/oodbench_ml_arxiv_links.txt --output ./oodbench/oodbench_ml_1203.json --with_year_limit diff --git a/research_bench/crossbench/crossbench_paper_links.txt b/research_bench/crossbench/crossbench_paper_links.txt index 83cdd012..412c3317 100644 --- a/research_bench/crossbench/crossbench_paper_links.txt +++ 
b/research_bench/crossbench/crossbench_paper_links.txt @@ -18,3 +18,7 @@ https://arxiv.org/abs/2401.14656 https://arxiv.org/abs/2401.11052 https://arxiv.org/abs/2311.12410 https://arxiv.org/abs/2311.10776 +https://arxiv.org/abs/2407.20248 +https://arxiv.org/abs/2405.06684 +https://arxiv.org/abs/2405.18732 +https://arxiv.org/abs/2406.09471 diff --git a/research_bench/crossbench/crossbench_paper_links_cross_arxiv_category.txt b/research_bench/crossbench/crossbench_paper_links_cross_arxiv_category.txt new file mode 100644 index 00000000..018d1c54 --- /dev/null +++ b/research_bench/crossbench/crossbench_paper_links_cross_arxiv_category.txt @@ -0,0 +1,1086 @@ +http://arxiv.org/abs/cs/0308038v1 +http://arxiv.org/abs/2411.18440v1 +http://arxiv.org/abs/2410.16347v1 +http://arxiv.org/abs/2409.01825v1 +http://arxiv.org/abs/2407.07494v1 +http://arxiv.org/abs/2404.01780v1 +http://arxiv.org/abs/2403.19912v2 +http://arxiv.org/abs/2403.14235v1 +http://arxiv.org/abs/2403.01692v1 +http://arxiv.org/abs/2403.00897v1 +http://arxiv.org/abs/2312.04948v1 +http://arxiv.org/abs/2312.02910v1 +http://arxiv.org/abs/2312.02908v1 +http://arxiv.org/abs/2311.00186v1 +http://arxiv.org/abs/2308.05166v1 +http://arxiv.org/abs/2307.01090v2 +http://arxiv.org/abs/2305.09121v2 +http://arxiv.org/abs/2305.01720v1 +http://arxiv.org/abs/2302.02005v2 +http://arxiv.org/abs/2211.14543v2 +http://arxiv.org/abs/2211.12809v1 +http://arxiv.org/abs/2410.16347v1 +http://arxiv.org/abs/2408.10871v1 +http://arxiv.org/abs/2403.14235v1 +http://arxiv.org/abs/2402.19455v2 +http://arxiv.org/abs/2308.05166v1 +http://arxiv.org/abs/2307.01090v2 +http://arxiv.org/abs/2211.14543v2 +http://arxiv.org/abs/2210.04143v1 +http://arxiv.org/abs/2206.14820v2 +http://arxiv.org/abs/2205.07368v1 +http://arxiv.org/abs/2205.00701v4 +http://arxiv.org/abs/2204.13713v2 +http://arxiv.org/abs/2201.03131v1 +http://arxiv.org/abs/2110.00023v2 +http://arxiv.org/abs/2109.10915v1 +http://arxiv.org/abs/2109.10360v1 +http://arxiv.org/abs/2109.09747v1 
+http://arxiv.org/abs/2109.09715v2 +http://arxiv.org/abs/2102.13123v2 +http://arxiv.org/abs/2102.00277v3 +http://arxiv.org/abs/2409.17178v1 +http://arxiv.org/abs/2409.11383v1 +http://arxiv.org/abs/2404.14661v1 +http://arxiv.org/abs/2310.17158v2 +http://arxiv.org/abs/2310.11888v1 +http://arxiv.org/abs/2308.14650v1 +http://arxiv.org/abs/2306.11159v1 +http://arxiv.org/abs/2211.09806v1 +http://arxiv.org/abs/2211.02239v2 +http://arxiv.org/abs/2209.07414v1 +http://arxiv.org/abs/2208.02053v1 +http://arxiv.org/abs/2206.08277v1 +http://arxiv.org/abs/2112.15068v2 +http://arxiv.org/abs/2106.07113v2 +http://arxiv.org/abs/2106.06523v1 +http://arxiv.org/abs/2101.04812v1 +http://arxiv.org/abs/2012.04460v2 +http://arxiv.org/abs/2009.13852v1 +http://arxiv.org/abs/1909.05917v1 +http://arxiv.org/abs/1906.08826v3 +http://arxiv.org/abs/2312.04640v1 +http://arxiv.org/abs/2311.08080v2 +http://arxiv.org/abs/2212.05497v1 +http://arxiv.org/abs/2210.08382v1 +http://arxiv.org/abs/2207.00591v1 +http://arxiv.org/abs/2109.01381v1 +http://arxiv.org/abs/2106.06718v1 +http://arxiv.org/abs/1904.09609v1 +http://arxiv.org/abs/2411.14078v2 +http://arxiv.org/abs/2410.16116v1 +http://arxiv.org/abs/2410.16347v1 +http://arxiv.org/abs/2410.10841v1 +http://arxiv.org/abs/2409.17178v1 +http://arxiv.org/abs/2409.14587v1 +http://arxiv.org/abs/2409.01825v1 +http://arxiv.org/abs/2408.11768v1 +http://arxiv.org/abs/2408.10871v1 +http://arxiv.org/abs/2406.17323v2 +http://arxiv.org/abs/2406.11054v1 +http://arxiv.org/abs/2405.09864v1 +http://arxiv.org/abs/2405.07842v2 +http://arxiv.org/abs/2405.03408v1 +http://arxiv.org/abs/2404.15552v1 +http://arxiv.org/abs/2404.01780v1 +http://arxiv.org/abs/2403.19912v2 +http://arxiv.org/abs/2403.14235v1 +http://arxiv.org/abs/2403.08851v1 +http://arxiv.org/abs/2403.05452v3 +http://arxiv.org/abs/2411.18070v1 +http://arxiv.org/abs/2410.17816v1 +http://arxiv.org/abs/2410.16116v1 +http://arxiv.org/abs/2410.02530v1 +http://arxiv.org/abs/2410.10841v1 +http://arxiv.org/abs/2408.11768v1 
+http://arxiv.org/abs/2406.11054v1 +http://arxiv.org/abs/2405.03408v1 +http://arxiv.org/abs/2403.18347v1 +http://arxiv.org/abs/2311.08080v2 +http://arxiv.org/abs/2311.00186v1 +http://arxiv.org/abs/2309.14483v1 +http://arxiv.org/abs/2309.10784v1 +http://arxiv.org/abs/2306.15308v1 +http://arxiv.org/abs/2305.09327v1 +http://arxiv.org/abs/2212.06717v1 +http://arxiv.org/abs/2210.06478v2 +http://arxiv.org/abs/2210.02635v2 +http://arxiv.org/abs/2209.15036v1 +http://arxiv.org/abs/2208.09512v1 +http://arxiv.org/abs/2410.10713v1 +http://arxiv.org/abs/2409.01496v1 +http://arxiv.org/abs/2406.12220v1 +http://arxiv.org/abs/2402.16991v2 +http://arxiv.org/abs/2402.06784v2 +http://arxiv.org/abs/2310.08430v1 +http://arxiv.org/abs/2307.06542v3 +http://arxiv.org/abs/2304.13061v2 +http://arxiv.org/abs/2302.07253v2 +http://arxiv.org/abs/2203.10204v2 +http://arxiv.org/abs/2202.06201v1 +http://arxiv.org/abs/2201.01778v3 +http://arxiv.org/abs/2112.09741v2 +http://arxiv.org/abs/2106.09035v2 +http://arxiv.org/abs/2010.00029v5 +http://arxiv.org/abs/2006.05467v3 +http://arxiv.org/abs/2001.00030v1 +http://arxiv.org/abs/1803.00500v1 +http://arxiv.org/abs/1710.07393v2 +http://arxiv.org/abs/1705.10589v1 +http://arxiv.org/abs/2411.12516v1 +http://arxiv.org/abs/2410.13594v1 +http://arxiv.org/abs/2405.19076v3 +http://arxiv.org/abs/2402.13699v4 +http://arxiv.org/abs/2312.11206v1 +http://arxiv.org/abs/2312.03110v1 +http://arxiv.org/abs/2301.08654v2 +http://arxiv.org/abs/2209.03837v2 +http://arxiv.org/abs/2206.04272v1 +http://arxiv.org/abs/2201.01778v3 +http://arxiv.org/abs/2112.09362v3 +http://arxiv.org/abs/2010.00500v2 +http://arxiv.org/abs/2410.20558v1 +http://arxiv.org/abs/2410.20402v1 +http://arxiv.org/abs/2409.11438v2 +http://arxiv.org/abs/2409.02648v1 +http://arxiv.org/abs/2408.12732v1 +http://arxiv.org/abs/2408.01558v1 +http://arxiv.org/abs/2405.19076v3 +http://arxiv.org/abs/2402.18286v2 +http://arxiv.org/abs/2402.15815v1 +http://arxiv.org/abs/2402.13353v1 +http://arxiv.org/abs/2401.00065v1 
+http://arxiv.org/abs/2312.17251v1 +http://arxiv.org/abs/2311.08585v1 +http://arxiv.org/abs/2309.00305v2 +http://arxiv.org/abs/2308.13917v1 +http://arxiv.org/abs/2307.07912v1 +http://arxiv.org/abs/2307.06322v1 +http://arxiv.org/abs/2306.15319v3 +http://arxiv.org/abs/2305.19302v3 +http://arxiv.org/abs/2305.16467v1 +http://arxiv.org/abs/2305.16526v2 +http://arxiv.org/abs/2205.09114v2 +http://arxiv.org/abs/2111.04881v2 +http://arxiv.org/abs/2101.05404v2 +http://arxiv.org/abs/2409.11438v2 +http://arxiv.org/abs/2309.00058v1 +http://arxiv.org/abs/2305.16467v1 +http://arxiv.org/abs/2203.13875v2 +http://arxiv.org/abs/2202.10983v1 +http://arxiv.org/abs/2411.02604v1 +http://arxiv.org/abs/2409.05800v1 +http://arxiv.org/abs/2211.01779v2 +http://arxiv.org/abs/2202.08177v1 +http://arxiv.org/abs/2112.09741v2 +http://arxiv.org/abs/2109.13925v2 +http://arxiv.org/abs/2012.12854v2 +http://arxiv.org/abs/2006.09179v2 +http://arxiv.org/abs/2002.01599v1 +http://arxiv.org/abs/1809.08406v1 +http://arxiv.org/abs/1710.09875v1 +http://arxiv.org/abs/1511.06036v1 +http://arxiv.org/abs/1510.07740v2 +http://arxiv.org/abs/1501.00834v1 +http://arxiv.org/abs/1404.3012v5 +http://arxiv.org/abs/cs/0604011v2 +http://arxiv.org/abs/cs/0410017v1 +http://arxiv.org/abs/2211.01779v2 +http://arxiv.org/abs/2012.11841v2 +http://arxiv.org/abs/2009.09932v1 +http://arxiv.org/abs/2001.00030v1 +http://arxiv.org/abs/1906.06329v1 +http://arxiv.org/abs/1710.05520v1 +http://arxiv.org/abs/2404.15552v1 +http://arxiv.org/abs/2303.13917v1 +http://arxiv.org/abs/2211.01369v1 +http://arxiv.org/abs/2210.08382v1 +http://arxiv.org/abs/2207.00591v1 +http://arxiv.org/abs/2205.00701v4 +http://arxiv.org/abs/1911.01915v1 +http://arxiv.org/abs/1810.11027v2 +http://arxiv.org/abs/1803.09933v1 +http://arxiv.org/abs/1706.07446v1 +http://arxiv.org/abs/2408.10599v1 +http://arxiv.org/abs/2406.03233v1 +http://arxiv.org/abs/2403.11934v2 +http://arxiv.org/abs/2310.19695v3 +http://arxiv.org/abs/2308.09025v2 +http://arxiv.org/abs/2308.02559v2 
+http://arxiv.org/abs/2306.13606v1 +http://arxiv.org/abs/2210.13869v5 +http://arxiv.org/abs/2204.01681v3 +http://arxiv.org/abs/2203.06210v1 +http://arxiv.org/abs/2107.07714v2 +http://arxiv.org/abs/2104.14659v3 +http://arxiv.org/abs/2103.06509v1 +http://arxiv.org/abs/2102.01033v1 +http://arxiv.org/abs/2101.06189v1 +http://arxiv.org/abs/2101.05108v2 +http://arxiv.org/abs/2012.09719v2 +http://arxiv.org/abs/2012.08526v2 +http://arxiv.org/abs/2012.06181v2 +http://arxiv.org/abs/2006.01993v1 +http://arxiv.org/abs/1912.12410v1 +http://arxiv.org/abs/1909.06238v2 +http://arxiv.org/abs/2403.11934v2 +http://arxiv.org/abs/2303.00693v1 +http://arxiv.org/abs/2210.13869v5 +http://arxiv.org/abs/2203.06210v1 +http://arxiv.org/abs/2106.06718v1 +http://arxiv.org/abs/2103.06115v2 +http://arxiv.org/abs/2012.09719v2 +http://arxiv.org/abs/2012.08526v2 +http://arxiv.org/abs/2010.08201v1 +http://arxiv.org/abs/1909.06238v2 +http://arxiv.org/abs/1902.08570v3 +http://arxiv.org/abs/1807.02701v1 +http://arxiv.org/abs/2105.13926v1 +http://arxiv.org/abs/1703.07915v1 +http://arxiv.org/abs/2310.20095v1 +http://arxiv.org/abs/2012.13346v1 +http://arxiv.org/abs/1707.07310v2 +http://arxiv.org/abs/1704.01297v1 +http://arxiv.org/abs/1110.0107v2 +http://arxiv.org/abs/0812.5064v2 +http://arxiv.org/abs/cs/0410017v1 +http://arxiv.org/abs/cs/0005027v1 +http://arxiv.org/abs/cs/0410017v1 +http://arxiv.org/abs/2405.20717v2 +http://arxiv.org/abs/2404.17865v1 +http://arxiv.org/abs/2301.07541v2 +http://arxiv.org/abs/2210.11921v2 +http://arxiv.org/abs/2105.14412v2 +http://arxiv.org/abs/2006.09179v2 +http://arxiv.org/abs/1704.01297v1 +http://arxiv.org/abs/2308.11846v1 +http://arxiv.org/abs/cs/0410017v1 +http://arxiv.org/abs/2403.00689v1 +http://arxiv.org/abs/2204.04955v1 +http://arxiv.org/abs/2008.02757v3 +http://arxiv.org/abs/1810.10350v3 +http://arxiv.org/abs/2403.13858v1 +http://arxiv.org/abs/2411.16098v1 +http://arxiv.org/abs/2411.05420v1 +http://arxiv.org/abs/2410.19814v1 +http://arxiv.org/abs/2410.04135v1 
+http://arxiv.org/abs/2410.08218v1 +http://arxiv.org/abs/2409.16320v2 +http://arxiv.org/abs/2409.07961v3 +http://arxiv.org/abs/2409.17354v1 +http://arxiv.org/abs/2408.15993v1 +http://arxiv.org/abs/2408.15122v1 +http://arxiv.org/abs/2408.11438v2 +http://arxiv.org/abs/2408.11032v1 +http://arxiv.org/abs/2407.11666v2 +http://arxiv.org/abs/2406.14399v2 +http://arxiv.org/abs/2406.11217v2 +http://arxiv.org/abs/2405.17455v1 +http://arxiv.org/abs/2404.19605v1 +http://arxiv.org/abs/2404.09415v1 +http://arxiv.org/abs/2404.07395v1 +http://arxiv.org/abs/2404.05758v1 +http://arxiv.org/abs/2410.15229v1 +http://arxiv.org/abs/2410.08826v1 +http://arxiv.org/abs/2409.18326v2 +http://arxiv.org/abs/2409.16069v1 +http://arxiv.org/abs/2404.06657v1 +http://arxiv.org/abs/2403.14324v1 +http://arxiv.org/abs/2403.11035v1 +http://arxiv.org/abs/2401.17695v2 +http://arxiv.org/abs/2401.16779v1 +http://arxiv.org/abs/2401.08923v1 +http://arxiv.org/abs/2401.07929v1 +http://arxiv.org/abs/2401.07856v1 +http://arxiv.org/abs/2311.16652v1 +http://arxiv.org/abs/2311.10278v2 +http://arxiv.org/abs/2311.04473v1 +http://arxiv.org/abs/2309.09215v1 +http://arxiv.org/abs/2308.15019v2 +http://arxiv.org/abs/2308.02952v1 +http://arxiv.org/abs/2307.14243v1 +http://arxiv.org/abs/2303.02753v1 +http://arxiv.org/abs/2411.03129v1 +http://arxiv.org/abs/2408.07786v1 +http://arxiv.org/abs/2404.08549v2 +http://arxiv.org/abs/2403.17293v1 +http://arxiv.org/abs/2403.12970v1 +http://arxiv.org/abs/2312.14830v2 +http://arxiv.org/abs/2310.09441v1 +http://arxiv.org/abs/2305.17193v2 +http://arxiv.org/abs/2305.07822v1 +http://arxiv.org/abs/2111.12854v1 +http://arxiv.org/abs/2111.05315v1 +http://arxiv.org/abs/2109.10472v2 +http://arxiv.org/abs/2106.13064v1 +http://arxiv.org/abs/2010.10781v1 +http://arxiv.org/abs/2009.14303v1 +http://arxiv.org/abs/2009.02264v3 +http://arxiv.org/abs/1908.08631v1 +http://arxiv.org/abs/1610.04579v1 +http://arxiv.org/abs/1310.4249v2 +http://arxiv.org/abs/1304.1209v1 +http://arxiv.org/abs/2310.18367v1 
+http://arxiv.org/abs/2305.19302v3 +http://arxiv.org/abs/2304.12972v1 +http://arxiv.org/abs/2209.07414v1 +http://arxiv.org/abs/2105.03688v1 +http://arxiv.org/abs/1908.05841v1 +http://arxiv.org/abs/1502.02077v3 +http://arxiv.org/abs/2211.01369v1 +http://arxiv.org/abs/2411.11693v1 +http://arxiv.org/abs/2411.02604v1 +http://arxiv.org/abs/2410.04123v1 +http://arxiv.org/abs/2410.03920v1 +http://arxiv.org/abs/2410.01376v1 +http://arxiv.org/abs/2407.14983v1 +http://arxiv.org/abs/2406.03919v2 +http://arxiv.org/abs/2404.06517v1 +http://arxiv.org/abs/2402.17745v2 +http://arxiv.org/abs/2312.15136v1 +http://arxiv.org/abs/2312.01464v2 +http://arxiv.org/abs/2311.16652v1 +http://arxiv.org/abs/2309.13457v3 +http://arxiv.org/abs/2306.14070v1 +http://arxiv.org/abs/2305.18944v3 +http://arxiv.org/abs/2304.02104v2 +http://arxiv.org/abs/2212.07564v3 +http://arxiv.org/abs/2205.11521v4 +http://arxiv.org/abs/2205.02568v1 +http://arxiv.org/abs/2204.05699v1 +http://arxiv.org/abs/2411.12897v1 +http://arxiv.org/abs/2409.17178v1 +http://arxiv.org/abs/2408.10356v2 +http://arxiv.org/abs/2406.03233v1 +http://arxiv.org/abs/2405.20559v2 +http://arxiv.org/abs/2404.05758v1 +http://arxiv.org/abs/2401.07825v2 +http://arxiv.org/abs/2312.07586v5 +http://arxiv.org/abs/2311.10904v1 +http://arxiv.org/abs/2310.11888v1 +http://arxiv.org/abs/2310.08430v1 +http://arxiv.org/abs/2309.07087v2 +http://arxiv.org/abs/2306.17210v2 +http://arxiv.org/abs/2305.05542v1 +http://arxiv.org/abs/2301.07541v2 +http://arxiv.org/abs/2211.00002v1 +http://arxiv.org/abs/2208.02693v2 +http://arxiv.org/abs/2207.10625v1 +http://arxiv.org/abs/2207.08148v1 +http://arxiv.org/abs/2206.02564v1 +http://arxiv.org/abs/2411.17059v1 +http://arxiv.org/abs/2411.16417v1 +http://arxiv.org/abs/2408.12662v1 +http://arxiv.org/abs/2406.03325v1 +http://arxiv.org/abs/2405.17260v2 +http://arxiv.org/abs/2404.01352v1 +http://arxiv.org/abs/2403.12226v1 +http://arxiv.org/abs/2401.15913v1 +http://arxiv.org/abs/2312.14635v1 +http://arxiv.org/abs/2311.06557v1 
+http://arxiv.org/abs/2309.13457v3 +http://arxiv.org/abs/2309.10172v2 +http://arxiv.org/abs/2306.12915v1 +http://arxiv.org/abs/2305.16656v3 +http://arxiv.org/abs/2305.11884v2 +http://arxiv.org/abs/2305.05542v1 +http://arxiv.org/abs/2305.02116v1 +http://arxiv.org/abs/2304.02104v2 +http://arxiv.org/abs/2303.14109v1 +http://arxiv.org/abs/2302.14470v1 +http://arxiv.org/abs/2411.16098v1 +http://arxiv.org/abs/2411.02627v1 +http://arxiv.org/abs/2411.00911v1 +http://arxiv.org/abs/2410.21462v1 +http://arxiv.org/abs/2410.15907v1 +http://arxiv.org/abs/2410.08231v1 +http://arxiv.org/abs/2409.10259v1 +http://arxiv.org/abs/2409.03878v1 +http://arxiv.org/abs/2407.18100v3 +http://arxiv.org/abs/2406.14815v4 +http://arxiv.org/abs/2406.07482v1 +http://arxiv.org/abs/2404.03754v3 +http://arxiv.org/abs/2403.02774v1 +http://arxiv.org/abs/2401.03131v1 +http://arxiv.org/abs/2312.10568v1 +http://arxiv.org/abs/2312.08194v1 +http://arxiv.org/abs/2310.08430v1 +http://arxiv.org/abs/2310.04430v1 +http://arxiv.org/abs/2309.06062v2 +http://arxiv.org/abs/2309.01066v1 +http://arxiv.org/abs/2410.20558v1 +http://arxiv.org/abs/2409.05159v1 +http://arxiv.org/abs/2403.00689v1 +http://arxiv.org/abs/2312.17265v1 +http://arxiv.org/abs/2311.10904v1 +http://arxiv.org/abs/2308.09025v2 +http://arxiv.org/abs/2304.14503v1 +http://arxiv.org/abs/2303.15319v1 +http://arxiv.org/abs/2301.02757v1 +http://arxiv.org/abs/2211.15218v1 +http://arxiv.org/abs/2211.01505v1 +http://arxiv.org/abs/2211.01159v1 +http://arxiv.org/abs/2208.07196v1 +http://arxiv.org/abs/2207.10530v1 +http://arxiv.org/abs/2207.00089v2 +http://arxiv.org/abs/2205.08736v1 +http://arxiv.org/abs/2205.07690v1 +http://arxiv.org/abs/2205.03549v1 +http://arxiv.org/abs/2204.14228v1 +http://arxiv.org/abs/2204.11592v1 +http://arxiv.org/abs/2411.18249v1 +http://arxiv.org/abs/2411.15322v1 +http://arxiv.org/abs/2411.13120v1 +http://arxiv.org/abs/2411.10308v1 +http://arxiv.org/abs/2411.07918v1 +http://arxiv.org/abs/2411.07503v1 +http://arxiv.org/abs/2411.05302v1 
+http://arxiv.org/abs/2411.01291v1 +http://arxiv.org/abs/2411.00594v1 +http://arxiv.org/abs/2410.23386v1 +http://arxiv.org/abs/2410.23329v1 +http://arxiv.org/abs/2410.20073v1 +http://arxiv.org/abs/2410.18461v1 +http://arxiv.org/abs/2410.17557v1 +http://arxiv.org/abs/2410.15229v1 +http://arxiv.org/abs/2410.12940v1 +http://arxiv.org/abs/2410.11491v1 +http://arxiv.org/abs/2409.20409v3 +http://arxiv.org/abs/2410.10826v1 +http://arxiv.org/abs/2409.13477v1 +http://arxiv.org/abs/2411.13120v1 +http://arxiv.org/abs/2411.08995v1 +http://arxiv.org/abs/2410.20073v1 +http://arxiv.org/abs/2410.20055v1 +http://arxiv.org/abs/2410.13295v1 +http://arxiv.org/abs/2410.05413v2 +http://arxiv.org/abs/2410.04123v1 +http://arxiv.org/abs/2409.20013v1 +http://arxiv.org/abs/2409.18614v1 +http://arxiv.org/abs/2409.05159v1 +http://arxiv.org/abs/2408.13782v1 +http://arxiv.org/abs/2409.00028v1 +http://arxiv.org/abs/2408.06681v1 +http://arxiv.org/abs/2407.18456v3 +http://arxiv.org/abs/2407.10897v2 +http://arxiv.org/abs/2405.20559v2 +http://arxiv.org/abs/2404.19201v1 +http://arxiv.org/abs/2404.17503v2 +http://arxiv.org/abs/2404.06657v1 +http://arxiv.org/abs/2405.01558v2 +http://arxiv.org/abs/2408.00781v1 +http://arxiv.org/abs/2109.00926v1 +http://arxiv.org/abs/2408.10356v2 +http://arxiv.org/abs/2408.00781v1 +http://arxiv.org/abs/2407.11013v1 +http://arxiv.org/abs/2312.03447v1 +http://arxiv.org/abs/2304.09914v4 +http://arxiv.org/abs/2210.10247v1 +http://arxiv.org/abs/2203.11078v1 +http://arxiv.org/abs/2104.04545v1 +http://arxiv.org/abs/2101.12010v2 +http://arxiv.org/abs/2011.14326v2 +http://arxiv.org/abs/2010.09648v1 +http://arxiv.org/abs/2005.04567v1 +http://arxiv.org/abs/2004.04907v1 +http://arxiv.org/abs/1711.00536v1 +http://arxiv.org/abs/1706.02850v1 +http://arxiv.org/abs/1611.01843v3 +http://arxiv.org/abs/1207.3809v1 +http://arxiv.org/abs/0812.5032v1 +http://arxiv.org/abs/2305.18944v3 +http://arxiv.org/abs/2109.01381v1 +http://arxiv.org/abs/2407.14473v1 +http://arxiv.org/abs/2308.07481v1 
+http://arxiv.org/abs/2308.00408v1 +http://arxiv.org/abs/2204.00660v1 +http://arxiv.org/abs/2004.12270v1 +http://arxiv.org/abs/2409.16333v1 +http://arxiv.org/abs/2408.05526v1 +http://arxiv.org/abs/2407.18338v1 +http://arxiv.org/abs/2404.10178v1 +http://arxiv.org/abs/2402.17960v1 +http://arxiv.org/abs/2401.06169v1 +http://arxiv.org/abs/2312.14432v1 +http://arxiv.org/abs/2306.07274v2 +http://arxiv.org/abs/2302.06091v2 +http://arxiv.org/abs/2210.07387v1 +http://arxiv.org/abs/2205.02169v2 +http://arxiv.org/abs/2203.08138v4 +http://arxiv.org/abs/2102.03881v1 +http://arxiv.org/abs/2011.11020v1 +http://arxiv.org/abs/2009.05277v3 +http://arxiv.org/abs/2008.04757v4 +http://arxiv.org/abs/2008.03646v1 +http://arxiv.org/abs/2006.08532v1 +http://arxiv.org/abs/2006.09275v2 +http://arxiv.org/abs/1912.12476v1 +http://arxiv.org/abs/2312.03509v1 +http://arxiv.org/abs/2309.08745v2 +http://arxiv.org/abs/2207.06215v1 +http://arxiv.org/abs/2202.00813v1 +http://arxiv.org/abs/2111.06425v2 +http://arxiv.org/abs/2110.04921v1 +http://arxiv.org/abs/2106.12548v1 +http://arxiv.org/abs/2010.12011v2 +http://arxiv.org/abs/2007.09469v1 +http://arxiv.org/abs/1910.12326v1 +http://arxiv.org/abs/1710.08149v3 +http://arxiv.org/abs/1403.3780v1 +http://arxiv.org/abs/1207.3127v1 +http://arxiv.org/abs/2411.02796v1 +http://arxiv.org/abs/2411.00749v1 +http://arxiv.org/abs/2410.00945v1 +http://arxiv.org/abs/2403.11375v1 +http://arxiv.org/abs/2403.01927v1 +http://arxiv.org/abs/2312.15320v2 +http://arxiv.org/abs/2309.07332v1 +http://arxiv.org/abs/2308.01839v2 +http://arxiv.org/abs/2306.09391v3 +http://arxiv.org/abs/2304.06819v2 +http://arxiv.org/abs/2303.09987v1 +http://arxiv.org/abs/2211.06764v1 +http://arxiv.org/abs/2206.01897v1 +http://arxiv.org/abs/2206.00455v1 +http://arxiv.org/abs/2204.13705v2 +http://arxiv.org/abs/2203.02794v3 +http://arxiv.org/abs/2108.02278v1 +http://arxiv.org/abs/2107.00648v1 +http://arxiv.org/abs/2101.00304v2 +http://arxiv.org/abs/2010.03420v1 +http://arxiv.org/abs/2205.11676v1 
+http://arxiv.org/abs/2411.10596v1 +http://arxiv.org/abs/2411.05712v1 +http://arxiv.org/abs/2411.05188v1 +http://arxiv.org/abs/2411.05825v1 +http://arxiv.org/abs/2411.00238v1 +http://arxiv.org/abs/2411.00888v1 +http://arxiv.org/abs/2410.16945v1 +http://arxiv.org/abs/2410.16524v1 +http://arxiv.org/abs/2410.15614v1 +http://arxiv.org/abs/2410.15433v1 +http://arxiv.org/abs/2410.19810v1 +http://arxiv.org/abs/2410.09614v3 +http://arxiv.org/abs/2410.05342v1 +http://arxiv.org/abs/2410.03952v2 +http://arxiv.org/abs/2410.02430v1 +http://arxiv.org/abs/2410.02094v2 +http://arxiv.org/abs/2409.19174v1 +http://arxiv.org/abs/2409.17510v3 +http://arxiv.org/abs/2409.18967v1 +http://arxiv.org/abs/2409.02390v1 +http://arxiv.org/abs/2409.16488v1 +http://arxiv.org/abs/2406.05170v1 +http://arxiv.org/abs/2211.15472v1 +http://arxiv.org/abs/2209.05097v1 +http://arxiv.org/abs/2204.11716v2 +http://arxiv.org/abs/2006.00067v2 +http://arxiv.org/abs/2411.10074v2 +http://arxiv.org/abs/2408.00160v1 +http://arxiv.org/abs/2405.15976v1 +http://arxiv.org/abs/2403.18028v2 +http://arxiv.org/abs/2311.00936v1 +http://arxiv.org/abs/2305.18207v1 +http://arxiv.org/abs/2302.08062v3 +http://arxiv.org/abs/2211.02537v1 +http://arxiv.org/abs/2209.07802v1 +http://arxiv.org/abs/2201.10526v1 +http://arxiv.org/abs/2108.05258v1 +http://arxiv.org/abs/2004.02498v1 +http://arxiv.org/abs/1912.03538v3 +http://arxiv.org/abs/1807.04975v2 +http://arxiv.org/abs/1605.00775v1 +http://arxiv.org/abs/2411.15076v1 +http://arxiv.org/abs/2411.14975v1 +http://arxiv.org/abs/2411.14833v1 +http://arxiv.org/abs/2411.14743v1 +http://arxiv.org/abs/2411.12707v1 +http://arxiv.org/abs/2411.14467v1 +http://arxiv.org/abs/2411.09766v2 +http://arxiv.org/abs/2411.06583v1 +http://arxiv.org/abs/2411.06315v1 +http://arxiv.org/abs/2411.05028v1 +http://arxiv.org/abs/2411.02354v1 +http://arxiv.org/abs/2411.01508v1 +http://arxiv.org/abs/2411.01034v1 +http://arxiv.org/abs/2411.00922v2 +http://arxiv.org/abs/2410.23327v1 +http://arxiv.org/abs/2410.21560v1 
+http://arxiv.org/abs/2410.11281v1 +http://arxiv.org/abs/2410.02988v1 +http://arxiv.org/abs/2410.00152v1 +http://arxiv.org/abs/2409.20013v1 +http://arxiv.org/abs/2305.17193v2 +http://arxiv.org/abs/2304.05065v1 +http://arxiv.org/abs/1511.05286v1 +http://arxiv.org/abs/2411.07503v1 +http://arxiv.org/abs/2411.00749v1 +http://arxiv.org/abs/2410.21560v1 +http://arxiv.org/abs/2409.20407v2 +http://arxiv.org/abs/2407.19821v2 +http://arxiv.org/abs/2407.19305v2 +http://arxiv.org/abs/2407.17157v2 +http://arxiv.org/abs/2407.09540v1 +http://arxiv.org/abs/2406.12808v3 +http://arxiv.org/abs/2406.10893v1 +http://arxiv.org/abs/2404.04983v1 +http://arxiv.org/abs/2404.15318v2 +http://arxiv.org/abs/2403.18233v1 +http://arxiv.org/abs/2403.16695v1 +http://arxiv.org/abs/2403.16678v1 +http://arxiv.org/abs/2405.10950v1 +http://arxiv.org/abs/2403.05949v3 +http://arxiv.org/abs/2403.05220v1 +http://arxiv.org/abs/2403.04142v1 +http://arxiv.org/abs/2403.01927v1 +http://arxiv.org/abs/2409.16333v1 +http://arxiv.org/abs/2305.04884v1 +http://arxiv.org/abs/1907.10046v3 +http://arxiv.org/abs/1907.09567v2 +http://arxiv.org/abs/2407.15339v3 +http://arxiv.org/abs/2403.19915v1 +http://arxiv.org/abs/2403.10916v2 +http://arxiv.org/abs/2310.10050v1 +http://arxiv.org/abs/2308.12477v1 +http://arxiv.org/abs/2307.13221v1 +http://arxiv.org/abs/2307.10549v1 +http://arxiv.org/abs/2305.14672v1 +http://arxiv.org/abs/2304.03464v3 +http://arxiv.org/abs/2304.02737v2 +http://arxiv.org/abs/2303.04204v2 +http://arxiv.org/abs/2303.02230v2 +http://arxiv.org/abs/2209.10148v1 +http://arxiv.org/abs/2204.00052v2 +http://arxiv.org/abs/2112.13850v2 +http://arxiv.org/abs/2104.04545v1 +http://arxiv.org/abs/2010.05970v2 +http://arxiv.org/abs/2009.05455v1 +http://arxiv.org/abs/1910.04879v1 +http://arxiv.org/abs/2012.06573v2 +http://arxiv.org/abs/2306.02848v1 +http://arxiv.org/abs/2305.16364v1 +http://arxiv.org/abs/2411.13615v2 +http://arxiv.org/abs/2307.08650v2 +http://arxiv.org/abs/2305.04884v1 +http://arxiv.org/abs/2303.09323v1 
+http://arxiv.org/abs/2112.10139v1 +http://arxiv.org/abs/2107.01273v2 +http://arxiv.org/abs/2008.09471v1 +http://arxiv.org/abs/2107.01273v2 +http://arxiv.org/abs/1907.10046v3 +http://arxiv.org/abs/1907.09567v2 +http://arxiv.org/abs/2308.08276v1 +http://arxiv.org/abs/2302.00117v1 +http://arxiv.org/abs/2102.12061v2 +http://arxiv.org/abs/2102.03239v1 +http://arxiv.org/abs/2101.10862v2 +http://arxiv.org/abs/2011.09052v3 +http://arxiv.org/abs/1908.02166v1 +http://arxiv.org/abs/1907.07786v2 +http://arxiv.org/abs/1807.07155v2 +http://arxiv.org/abs/2407.15339v3 +http://arxiv.org/abs/2403.19915v1 +http://arxiv.org/abs/2403.10916v2 +http://arxiv.org/abs/2310.10050v1 +http://arxiv.org/abs/2308.12477v1 +http://arxiv.org/abs/2307.13221v1 +http://arxiv.org/abs/2307.10549v1 +http://arxiv.org/abs/2305.14672v1 +http://arxiv.org/abs/2304.03464v3 +http://arxiv.org/abs/2304.02737v2 +http://arxiv.org/abs/2303.04204v2 +http://arxiv.org/abs/2303.02230v2 +http://arxiv.org/abs/2209.10148v1 +http://arxiv.org/abs/2204.00052v2 +http://arxiv.org/abs/2112.13850v2 +http://arxiv.org/abs/2104.04545v1 +http://arxiv.org/abs/2010.05970v2 +http://arxiv.org/abs/2009.05455v1 +http://arxiv.org/abs/1910.04879v1 +http://arxiv.org/abs/2401.01916v2 +http://arxiv.org/abs/2309.06126v1 +http://arxiv.org/abs/2211.08513v2 +http://arxiv.org/abs/2401.01916v2 +http://arxiv.org/abs/2309.06126v1 +http://arxiv.org/abs/2309.06126v1 +http://arxiv.org/abs/2409.19750v1 +http://arxiv.org/abs/2406.17324v2 +http://arxiv.org/abs/2404.10757v1 +http://arxiv.org/abs/2403.08851v1 +http://arxiv.org/abs/2401.01916v2 +http://arxiv.org/abs/2312.08579v2 +http://arxiv.org/abs/2309.06126v1 +http://arxiv.org/abs/2212.00744v1 +http://arxiv.org/abs/2210.14760v2 +http://arxiv.org/abs/2112.00590v1 +http://arxiv.org/abs/1902.00027v2 +http://arxiv.org/abs/2404.10757v1 +http://arxiv.org/abs/2401.01916v2 +http://arxiv.org/abs/2411.05421v1 +http://arxiv.org/abs/2410.12375v1 +http://arxiv.org/abs/2409.05556v1 +http://arxiv.org/abs/2408.15138v1 
+http://arxiv.org/abs/2408.02288v2 +http://arxiv.org/abs/2406.00048v3 +http://arxiv.org/abs/2402.07148v2 +http://arxiv.org/abs/2311.08166v1 +http://arxiv.org/abs/2310.19998v1 +http://arxiv.org/abs/2309.14913v2 +http://arxiv.org/abs/2305.04934v2 +http://arxiv.org/abs/2304.07235v4 +http://arxiv.org/abs/2112.08628v2 +http://arxiv.org/abs/2008.06996v3 +http://arxiv.org/abs/2003.01787v1 +http://arxiv.org/abs/1711.01416v1 +http://arxiv.org/abs/1710.10248v2 +http://arxiv.org/abs/1609.03207v2 +http://arxiv.org/abs/1606.06737v3 +http://arxiv.org/abs/2411.05421v1 +http://arxiv.org/abs/2410.12375v1 +http://arxiv.org/abs/2405.19076v3 +http://arxiv.org/abs/2403.11996v3 +http://arxiv.org/abs/2310.19998v1 +http://arxiv.org/abs/2310.10605v3 +http://arxiv.org/abs/2410.21317v1 +http://arxiv.org/abs/2410.12375v1 +http://arxiv.org/abs/2409.14572v1 +http://arxiv.org/abs/2409.13732v1 +http://arxiv.org/abs/2409.05556v1 +http://arxiv.org/abs/2408.04661v1 +http://arxiv.org/abs/2407.15459v1 +http://arxiv.org/abs/2406.13163v1 +http://arxiv.org/abs/2405.19076v3 +http://arxiv.org/abs/2405.02128v1 +http://arxiv.org/abs/2403.11996v3 +http://arxiv.org/abs/2402.19462v2 +http://arxiv.org/abs/2402.06964v1 +http://arxiv.org/abs/2402.05200v2 +http://arxiv.org/abs/2401.17244v3 +http://arxiv.org/abs/2311.08166v1 +http://arxiv.org/abs/2311.06303v1 +http://arxiv.org/abs/2310.19998v1 +http://arxiv.org/abs/2310.14029v1 +http://arxiv.org/abs/2310.10605v3 +http://arxiv.org/abs/2411.05421v1 +http://arxiv.org/abs/2403.11996v3 +http://arxiv.org/abs/2402.07148v2 +http://arxiv.org/abs/2402.04268v1 +http://arxiv.org/abs/2305.04934v2 +http://arxiv.org/abs/2408.15138v1 +http://arxiv.org/abs/2408.02288v2 +http://arxiv.org/abs/2405.17088v1 +http://arxiv.org/abs/2310.06960v1 +http://arxiv.org/abs/2306.11232v2 +http://arxiv.org/abs/2304.07235v4 +http://arxiv.org/abs/2112.08628v2 +http://arxiv.org/abs/1811.07208v1 +http://arxiv.org/abs/1802.10411v1 +http://arxiv.org/abs/1302.5526v2 +http://arxiv.org/abs/0710.0009v2 
+http://arxiv.org/abs/cs/0410017v1 +http://arxiv.org/abs/cond-mat/0203436v1 +http://arxiv.org/abs/cond-mat/0202383v1 +http://arxiv.org/abs/2309.10923v2 +http://arxiv.org/abs/2210.15600v2 +http://arxiv.org/abs/1812.01995v4 +http://arxiv.org/abs/2404.08001v1 +http://arxiv.org/abs/2312.04576v1 +http://arxiv.org/abs/2405.04620v2 +http://arxiv.org/abs/2404.08001v1 +http://arxiv.org/abs/2304.10327v2 +http://arxiv.org/abs/2304.02034v1 +http://arxiv.org/abs/1807.00735v1 +http://arxiv.org/abs/2405.20818v3 +http://arxiv.org/abs/1808.05439v2 +http://arxiv.org/abs/1705.09731v3 +http://arxiv.org/abs/cs/0410017v1 +http://arxiv.org/abs/2201.07341v3 +http://arxiv.org/abs/1907.04211v1 +http://arxiv.org/abs/cs/0410017v1 +http://arxiv.org/abs/2106.03181v2 +http://arxiv.org/abs/cs/0410017v1 +http://arxiv.org/abs/2405.08888v1 +http://arxiv.org/abs/2405.01359v1 +http://arxiv.org/abs/2409.19058v1 +http://arxiv.org/abs/2406.11217v2 +http://arxiv.org/abs/2402.19462v2 +http://arxiv.org/abs/2310.19998v1 +http://arxiv.org/abs/2408.07154v1 +http://arxiv.org/abs/2407.14556v2 +http://arxiv.org/abs/2409.13989v1 +http://arxiv.org/abs/2408.06150v2 +http://arxiv.org/abs/2407.01603v3 +http://arxiv.org/abs/2406.13193v1 +http://arxiv.org/abs/2405.00972v1 +http://arxiv.org/abs/2405.00949v1 +http://arxiv.org/abs/2311.06303v1 +http://arxiv.org/abs/2309.16235v1 +http://arxiv.org/abs/2308.01423v2 +http://arxiv.org/abs/2307.15759v1 +http://arxiv.org/abs/2306.11976v1 +http://arxiv.org/abs/2306.11296v2 +http://arxiv.org/abs/2304.05332v1 +http://arxiv.org/abs/2303.11593v4 +http://arxiv.org/abs/2206.07048v1 +http://arxiv.org/abs/2205.09175v1 +http://arxiv.org/abs/2109.08830v3 +http://arxiv.org/abs/2012.06051v1 +http://arxiv.org/abs/2010.09885v2 +http://arxiv.org/abs/1907.01356v2 +http://arxiv.org/abs/2404.08001v1 +http://arxiv.org/abs/2401.07777v1 +http://arxiv.org/abs/1812.01995v4 +http://arxiv.org/abs/cs/0410017v1 +http://arxiv.org/abs/2411.00062v1 +http://arxiv.org/abs/2205.09674v1 
+http://arxiv.org/abs/2205.05390v1 +http://arxiv.org/abs/2205.00148v1 +http://arxiv.org/abs/1802.10411v1 +http://arxiv.org/abs/1708.01677v2 +http://arxiv.org/abs/1705.00545v3 +http://arxiv.org/abs/1412.2486v2 +http://arxiv.org/abs/1310.5884v3 +http://arxiv.org/abs/1302.4471v1 +http://arxiv.org/abs/1302.4465v1 +http://arxiv.org/abs/1112.6045v1 +http://arxiv.org/abs/1109.6018v1 +http://arxiv.org/abs/1006.3271v1 +http://arxiv.org/abs/cs/0504089v2 +http://arxiv.org/abs/cond-mat/0203436v1 +http://arxiv.org/abs/2407.10991v1 +http://arxiv.org/abs/2304.08341v1 +http://arxiv.org/abs/1810.03480v1 +http://arxiv.org/abs/2304.10327v2 +http://arxiv.org/abs/2201.10222v1 +http://arxiv.org/abs/2309.10066v2 +http://arxiv.org/abs/2306.08000v1 +http://arxiv.org/abs/2303.09038v3 +http://arxiv.org/abs/2407.13787v2 +http://arxiv.org/abs/2308.02962v2 +http://arxiv.org/abs/2407.09364v1 +http://arxiv.org/abs/2406.14522v2 +http://arxiv.org/abs/2405.07764v2 +http://arxiv.org/abs/2403.04009v1 +http://arxiv.org/abs/2402.11895v3 +http://arxiv.org/abs/2312.15523v1 +http://arxiv.org/abs/2310.16181v2 +http://arxiv.org/abs/2308.05502v2 +http://arxiv.org/abs/2306.15551v2 +http://arxiv.org/abs/2212.00237v1 +http://arxiv.org/abs/2203.07135v2 +http://arxiv.org/abs/2201.00083v1 +http://arxiv.org/abs/2112.03033v1 +http://arxiv.org/abs/2111.12061v2 +http://arxiv.org/abs/2110.13710v1 +http://arxiv.org/abs/2109.09010v2 +http://arxiv.org/abs/2105.13025v1 +http://arxiv.org/abs/2105.11519v3 +http://arxiv.org/abs/2105.02570v4 +http://arxiv.org/abs/2104.10210v1 +http://arxiv.org/abs/2411.11875v1 +http://arxiv.org/abs/2410.15592v2 +http://arxiv.org/abs/2410.15165v1 +http://arxiv.org/abs/2410.03553v2 +http://arxiv.org/abs/2410.02647v1 +http://arxiv.org/abs/2409.13989v1 +http://arxiv.org/abs/2408.11866v1 +http://arxiv.org/abs/2408.06150v2 +http://arxiv.org/abs/2407.13780v1 +http://arxiv.org/abs/2406.05797v1 +http://arxiv.org/abs/2405.10625v1 +http://arxiv.org/abs/2405.06690v1 +http://arxiv.org/abs/2405.00949v1 
+http://arxiv.org/abs/2405.06662v1 +http://arxiv.org/abs/2404.14850v1 +http://arxiv.org/abs/2403.07179v2 +http://arxiv.org/abs/2403.05602v1 +http://arxiv.org/abs/2403.04395v1 +http://arxiv.org/abs/2403.13830v1 +http://arxiv.org/abs/2403.01528v2 +http://arxiv.org/abs/2410.01795v1 +http://arxiv.org/abs/2406.13133v1 +http://arxiv.org/abs/2402.08777v3 +http://arxiv.org/abs/2307.10634v1 +http://arxiv.org/abs/2305.18410v1 +http://arxiv.org/abs/2304.09667v3 +http://arxiv.org/abs/2205.03853v1 +http://arxiv.org/abs/2008.08727v3 +http://arxiv.org/abs/1811.08162v1 +http://arxiv.org/abs/cs/0410017v1 +http://arxiv.org/abs/2405.18605v1 +http://arxiv.org/abs/2411.11061v1 +http://arxiv.org/abs/2410.20053v1 +http://arxiv.org/abs/2410.12866v1 +http://arxiv.org/abs/2410.04383v1 +http://arxiv.org/abs/2410.00257v1 +http://arxiv.org/abs/2409.19710v1 +http://arxiv.org/abs/2409.16322v1 +http://arxiv.org/abs/2409.10715v2 +http://arxiv.org/abs/2407.09450v2 +http://arxiv.org/abs/2407.04680v1 +http://arxiv.org/abs/2405.18639v1 +http://arxiv.org/abs/2405.16714v1 +http://arxiv.org/abs/2405.11459v3 +http://arxiv.org/abs/2405.06725v3 +http://arxiv.org/abs/2404.14024v2 +http://arxiv.org/abs/2403.17299v1 +http://arxiv.org/abs/2403.00854v1 +http://arxiv.org/abs/2402.14701v2 +http://arxiv.org/abs/2402.02243v1 +http://arxiv.org/abs/2401.08851v1 +http://arxiv.org/abs/2407.14556v2 +http://arxiv.org/abs/2112.02097v1 +http://arxiv.org/abs/2101.11710v1 +http://arxiv.org/abs/1908.00785v1 +http://arxiv.org/abs/cs/0308032v1 +http://arxiv.org/abs/2405.20818v3 +http://arxiv.org/abs/2404.04671v3 +http://arxiv.org/abs/2206.05603v1 +http://arxiv.org/abs/2104.10210v1 +http://arxiv.org/abs/1406.2963v2 +http://arxiv.org/abs/0710.0009v2 +http://arxiv.org/abs/2411.17669v1 +http://arxiv.org/abs/2411.14721v1 +http://arxiv.org/abs/2410.24022v1 +http://arxiv.org/abs/2410.21127v1 +http://arxiv.org/abs/2410.15592v2 +http://arxiv.org/abs/2409.19710v1 +http://arxiv.org/abs/2409.13057v2 +http://arxiv.org/abs/2409.00544v1 
+http://arxiv.org/abs/2408.13378v3 +http://arxiv.org/abs/2408.03732v1 +http://arxiv.org/abs/2408.01869v1 +http://arxiv.org/abs/2406.15534v1 +http://arxiv.org/abs/2406.14021v1 +http://arxiv.org/abs/2406.12950v2 +http://arxiv.org/abs/2406.09454v1 +http://arxiv.org/abs/2406.05540v2 +http://arxiv.org/abs/2405.14225v1 +http://arxiv.org/abs/2405.00972v1 +http://arxiv.org/abs/2404.16880v1 +http://arxiv.org/abs/2404.03969v1 +http://arxiv.org/abs/2411.08726v1 +http://arxiv.org/abs/2410.17266v1 +http://arxiv.org/abs/2410.00031v1 +http://arxiv.org/abs/2408.13214v1 +http://arxiv.org/abs/2408.09420v3 +http://arxiv.org/abs/2408.06634v2 +http://arxiv.org/abs/2407.18324v1 +http://arxiv.org/abs/2406.02969v1 +http://arxiv.org/abs/2402.11728v2 +http://arxiv.org/abs/2402.10481v2 +http://arxiv.org/abs/2402.01734v2 +http://arxiv.org/abs/2401.03737v2 +http://arxiv.org/abs/2311.08533v1 +http://arxiv.org/abs/2309.11979v1 +http://arxiv.org/abs/2308.14634v1 +http://arxiv.org/abs/2306.05803v1 +http://arxiv.org/abs/2306.03763v4 +http://arxiv.org/abs/2305.17457v1 +http://arxiv.org/abs/2301.09279v2 +http://arxiv.org/abs/2210.14304v1 +http://arxiv.org/abs/2410.21359v1 +http://arxiv.org/abs/2410.10665v1 +http://arxiv.org/abs/2409.19325v1 +http://arxiv.org/abs/2409.18417v1 +http://arxiv.org/abs/2407.15339v3 +http://arxiv.org/abs/2406.15593v1 +http://arxiv.org/abs/2406.09490v1 +http://arxiv.org/abs/2407.09480v1 +http://arxiv.org/abs/2402.01766v3 +http://arxiv.org/abs/2312.03194v1 +http://arxiv.org/abs/2312.02181v1 +http://arxiv.org/abs/2310.10050v1 +http://arxiv.org/abs/2308.12477v1 +http://arxiv.org/abs/2307.10549v1 +http://arxiv.org/abs/2305.14672v1 +http://arxiv.org/abs/2305.08524v2 +http://arxiv.org/abs/2304.03464v3 +http://arxiv.org/abs/2212.04277v1 +http://arxiv.org/abs/2205.01317v1 +http://arxiv.org/abs/2203.09128v1 +http://arxiv.org/abs/2407.17624v1 +http://arxiv.org/abs/2404.07221v2 +http://arxiv.org/abs/2311.10723v2 +http://arxiv.org/abs/2309.13064v1 +http://arxiv.org/abs/2307.10485v2 
+http://arxiv.org/abs/2303.17564v3 +http://arxiv.org/abs/2301.03136v2 +http://arxiv.org/abs/2209.15293v1 +http://arxiv.org/abs/2207.01402v1 +http://arxiv.org/abs/2406.02969v1 +http://arxiv.org/abs/2312.14203v1 +http://arxiv.org/abs/2111.00526v2 +http://arxiv.org/abs/2107.08721v1 +http://arxiv.org/abs/2112.03868v3 +http://arxiv.org/abs/2411.02558v1 +http://arxiv.org/abs/2410.17266v1 +http://arxiv.org/abs/2409.17909v1 +http://arxiv.org/abs/2407.17624v1 +http://arxiv.org/abs/2406.09765v2 +http://arxiv.org/abs/2406.03614v1 +http://arxiv.org/abs/2404.18470v2 +http://arxiv.org/abs/2401.16458v2 +http://arxiv.org/abs/2308.11138v3 +http://arxiv.org/abs/2308.00065v1 +http://arxiv.org/abs/1812.10479v1 +http://arxiv.org/abs/1811.05270v1 +http://arxiv.org/abs/2411.04788v1 +http://arxiv.org/abs/2411.02558v1 +http://arxiv.org/abs/2410.07225v1 +http://arxiv.org/abs/2409.11408v1 +http://arxiv.org/abs/2408.06634v2 +http://arxiv.org/abs/2408.04948v1 +http://arxiv.org/abs/2407.18324v1 +http://arxiv.org/abs/2405.14767v2 +http://arxiv.org/abs/2405.10584v1 +http://arxiv.org/abs/2405.12990v1 +http://arxiv.org/abs/2404.02053v2 +http://arxiv.org/abs/2404.01338v1 +http://arxiv.org/abs/2404.01337v1 +http://arxiv.org/abs/2403.12285v1 +http://arxiv.org/abs/2403.00782v1 +http://arxiv.org/abs/2402.10481v2 +http://arxiv.org/abs/2402.03659v3 +http://arxiv.org/abs/2311.15548v1 +http://arxiv.org/abs/2311.07598v1 +http://arxiv.org/abs/2310.05627v1 +http://arxiv.org/abs/2411.04788v1 +http://arxiv.org/abs/2405.14767v2 +http://arxiv.org/abs/2404.18470v2 +http://arxiv.org/abs/2404.08665v1 +http://arxiv.org/abs/2403.12285v1 +http://arxiv.org/abs/2306.12659v1 +http://arxiv.org/abs/2306.06031v1 +http://arxiv.org/abs/2206.00648v2 +http://arxiv.org/abs/2112.13593v5 +http://arxiv.org/abs/2112.02095v1 +http://arxiv.org/abs/2407.00890v1 +http://arxiv.org/abs/2406.17972v1 +http://arxiv.org/abs/2112.07985v1 +http://arxiv.org/abs/2105.00817v2 +http://arxiv.org/abs/2103.01126v4 +http://arxiv.org/abs/2006.00707v1 
+http://arxiv.org/abs/1909.00154v1 +http://arxiv.org/abs/2410.21359v1 +http://arxiv.org/abs/2410.10665v1 +http://arxiv.org/abs/2409.19325v1 +http://arxiv.org/abs/2409.18417v1 +http://arxiv.org/abs/2407.15339v3 +http://arxiv.org/abs/2406.15593v1 +http://arxiv.org/abs/2406.09490v1 +http://arxiv.org/abs/2407.09480v1 +http://arxiv.org/abs/2402.01766v3 +http://arxiv.org/abs/2312.03194v1 +http://arxiv.org/abs/2312.02181v1 +http://arxiv.org/abs/2310.10050v1 +http://arxiv.org/abs/2308.12477v1 +http://arxiv.org/abs/2307.10549v1 +http://arxiv.org/abs/2305.14672v1 +http://arxiv.org/abs/2305.08524v2 +http://arxiv.org/abs/2304.03464v3 +http://arxiv.org/abs/2212.04277v1 +http://arxiv.org/abs/2205.01317v1 +http://arxiv.org/abs/2203.09128v1 +http://arxiv.org/abs/2308.06907v1 +http://arxiv.org/abs/2203.00070v2 diff --git a/research_bench/discussion1_add_profile_relatedness_to_agent_ablation.py b/research_bench/discussion1_add_profile_relatedness_to_agent_ablation.py new file mode 100644 index 00000000..124c2148 --- /dev/null +++ b/research_bench/discussion1_add_profile_relatedness_to_agent_ablation.py @@ -0,0 +1,32 @@ +import voyageai +import json +import numpy as np +from tqdm import tqdm + +# embed + +with open('./paper_bench/agent_number_ablation_paper_bench.json', 'r') as f: + dataset = json.load(f) + +for key, data in tqdm(dataset.items()): + author_data = data['author_data'] + bios = [] + pks = [] + for author in author_data.values(): + bios.append(author['bio']) + + for author_pk in author_data.keys(): + pks.append(author_pk) + + abstract = data['paper_data']['abstract'] + + abstract_embed = voyageai.get_embedding(abstract, model='voyage-3', input_type='document') + + for pk, bio in zip(pks, bios): + bio_embed = voyageai.get_embedding(bio, model='voyage-3', input_type='document') + similarity = np.dot(abstract_embed, bio_embed) / (np.linalg.norm(abstract_embed) * np.linalg.norm(bio_embed)) + print(similarity) + dataset[key]['author_data'][pk]['bio_relatedness_with_abstract'] 
= similarity + +with open('./paper_bench/agent_number_ablation_paper_bench_with_relatedness.json', 'w') as f: + json.dump(dataset, f, indent=4) \ No newline at end of file diff --git a/research_bench/discussion1_check_correlation_between_bio_relatedness_and_similarity.py b/research_bench/discussion1_check_correlation_between_bio_relatedness_and_similarity.py new file mode 100644 index 00000000..e5fe1b8b --- /dev/null +++ b/research_bench/discussion1_check_correlation_between_bio_relatedness_and_similarity.py @@ -0,0 +1,125 @@ +import json +import jsonlines +import numpy as np + + +def load_json(filepath): + """ + Load a JSON file. + """ + with open(filepath, 'r') as f: + return json.load(f) + + +def load_jsonlines(filepath): + """ + Load a JSONL file and return a list of records. + """ + with jsonlines.open(filepath, 'r') as f: + return [line for line in f] + + +def calculate_mean_scores(results, prefix, num_questions=5): + """ + Calculate mean scores for each paper ID based on given results. 
+ """ + sim_dict = {} + for result in results: + sim_dict[result['paper_id']] = np.mean( + [result[f'{prefix}_sim_q{i}'] for i in range(1, num_questions + 1)] + ) + return sim_dict + +if __name__ == '__main__': + # Load datasets + dataset = load_json('./paper_bench/agent_number_ablation_paper_bench_with_relatedness.json') + + first_author_results = load_jsonlines( + './results/agent_number_ablation_record_each_agent_output_paper_bench_first_author_result_4o_mini_fake_research_town.jsonl' + ) + second_author_results = load_jsonlines( + './results/agent_number_ablation_record_each_agent_output_paper_bench_second_author_result_4o_mini_fake_research_town.jsonl' + ) + third_author_results = load_jsonlines( + './results/agent_number_ablation_record_each_agent_output_paper_bench_third_author_result_4o_mini_fake_research_town.jsonl' + ) + fourth_author_results = load_jsonlines( + './results/agent_number_ablation_record_each_agent_output_paper_bench_fourth_author_result_4o_mini_fake_research_town.jsonl' + ) + fifth_author_results = load_jsonlines( + './results/agent_number_ablation_record_each_agent_output_paper_bench_fifth_author_result_4o_mini_fake_research_town.jsonl' + ) + + # Calculate similarity scores for first author + first_author_openai_sim_dict = calculate_mean_scores(first_author_results, 'openai') + first_author_voyageai_sim_dict = calculate_mean_scores(first_author_results, 'voyageai') + + # Example: Process other authors' results if needed + second_author_openai_sim_dict = calculate_mean_scores(second_author_results, 'openai') + second_author_voyageai_sim_dict = calculate_mean_scores(second_author_results, 'voyageai') + + third_author_openai_sim_dict = calculate_mean_scores(third_author_results, 'openai') + third_author_voyageai_sim_dict = calculate_mean_scores(third_author_results, 'voyageai') + + fourth_author_openai_sim_dict = calculate_mean_scores(fourth_author_results, 'openai') + fourth_author_voyageai_sim_dict = 
calculate_mean_scores(fourth_author_results, 'voyageai') + + fifth_author_openai_sim_dict = calculate_mean_scores(fifth_author_results, 'openai') + fifth_author_voyageai_sim_dict = calculate_mean_scores(fifth_author_results, 'voyageai') + + import numpy as np + from scipy.stats import pearsonr + + all_bio = [] + all_openai = [] + all_voyageai = [] + + for paper_id, data in dataset.items(): + author_data = data['author_data'] + # Extract bio-relatedness from first five authors + bio_relatedness = [author_info['bio_relatedness_with_abstract'] for author_info in author_data.values()][:5] + + # Extract similarity scores for the same five authors + openai_sims = [ + first_author_openai_sim_dict[paper_id], + second_author_openai_sim_dict[paper_id], + third_author_openai_sim_dict[paper_id], + fourth_author_openai_sim_dict[paper_id], + fifth_author_openai_sim_dict[paper_id] + ] + + voyageai_sims = [ + first_author_voyageai_sim_dict[paper_id], + second_author_voyageai_sim_dict[paper_id], + third_author_voyageai_sim_dict[paper_id], + fourth_author_voyageai_sim_dict[paper_id], + fifth_author_voyageai_sim_dict[paper_id] + ] + + bio_array = np.array(bio_relatedness) + openai_array = np.array(openai_sims) + voyageai_array = np.array(voyageai_sims) + + # if bio array has one much higher than the rest, calculate + # correlation between the two lower ones + sorted_bio_array = np.sort(bio_array) + if sorted_bio_array[-1] - sorted_bio_array[-2] < 0.1: + continue + + + # Compute Pearson correlation coefficients + bio_openai_corr, bio_openai_p = pearsonr(bio_array, openai_array) + bio_voyageai_corr, bio_voyageai_p = pearsonr(bio_array, voyageai_array) + openai_voyageai_corr, openai_voyageai_p = pearsonr(openai_array, voyageai_array) + + print(f"Paper ID: {paper_id}") + print(f"Bio vs OpenAI: Correlation={bio_openai_corr:.3f}, p-value={bio_openai_p:.3f}") + print(f"Bio vs VoyageAI: Correlation={bio_voyageai_corr:.3f}, p-value={bio_voyageai_p:.3f}") + print(f"OpenAI vs VoyageAI: 
Correlation={openai_voyageai_corr:.3f}, p-value={openai_voyageai_p:.3f}\n") + + print(bio_array) + print(openai_array) + print(voyageai_array) + print('\n\n') + +# average bio_relatedness and similarity scores for each paper diff --git a/research_bench/discussion2_per_agent_vs_sampling.py b/research_bench/discussion2_per_agent_vs_sampling.py new file mode 100644 index 00000000..d56ef850 --- /dev/null +++ b/research_bench/discussion2_per_agent_vs_sampling.py @@ -0,0 +1,80 @@ +import json +import numpy as np +import matplotlib.pyplot as plt + +num = 3 + +# Load and process without_bio data +with open(f'./results/agent_number_ablation_record_each_agent_output_paper_bench_with_bio_sample_5_result_4o_mini_fake_research_town.jsonl', 'r') as f: + without_bio_results = [json.loads(line) for line in f] + for res in without_bio_results: + q1_agents = res['openai_sim_q1_per_agent'] + q2_agents = res['openai_sim_q2_per_agent'] + q3_agents = res['openai_sim_q3_per_agent'] + q4_agents = res['openai_sim_q4_per_agent'] + q5_agents = res['openai_sim_q5_per_agent'] + + agents_scores = zip(q1_agents, q2_agents, q3_agents, q4_agents, q5_agents) + + avg_openai_sim = [] + for agent_scores in agents_scores: + if 0 in agent_scores: + continue + avg_openai_sim.append(sum(agent_scores) / 5.0) + + res['avg_openai_sim'] = avg_openai_sim + +all_averages_without_bio = [val for res in without_bio_results for val in res['avg_openai_sim']] + +# Load and process with_bio data +with open(f'./results/agent_number_ablation_record_each_agent_output_paper_bench_with_bio_sample_1_result_4o_mini_fake_research_town.jsonl', 'r') as f: + with_bio_results = [json.loads(line) for line in f] + for res in with_bio_results: + q1_agents = res['openai_sim_q1_per_agent'] + q2_agents = res['openai_sim_q2_per_agent'] + q3_agents = res['openai_sim_q3_per_agent'] + q4_agents = res['openai_sim_q4_per_agent'] + q5_agents = res['openai_sim_q5_per_agent'] + + agents_scores = zip(q1_agents, q2_agents, q3_agents, q4_agents, 
q5_agents) + + avg_openai_sim = [] + for agent_scores in agents_scores: + if 0 in agent_scores: + continue + avg_openai_sim.append(sum(agent_scores) / 5.0) + + res['avg_openai_sim'] = avg_openai_sim + +all_averages_with_bio = [val for res in with_bio_results for val in res['avg_openai_sim']] + +print('with bio') +print(len(all_averages_with_bio)) +print('without bio') +print(len(all_averages_without_bio)) + +# Compute statistics +mean_without_bio = np.mean(all_averages_without_bio) if all_averages_without_bio else 0 +std_without_bio = np.std(all_averages_without_bio) if all_averages_without_bio else 0 + +mean_with_bio = np.mean(all_averages_with_bio) if all_averages_with_bio else 0 +std_with_bio = np.std(all_averages_with_bio) if all_averages_with_bio else 0 + +print("Overall statistics (without_bio):") +print(" Mean:", mean_without_bio) +print(" Std:", std_without_bio) + +print("Overall statistics (with_bio):") +print(" Mean:", mean_with_bio) +print(" Std:", std_with_bio) + +# Create violin plot +data = [all_averages_without_bio, all_averages_with_bio] +plt.violinplot(data, showmeans=True, showextrema=True, showmedians=False) + +plt.xticks([1, 2], ['Without Bio', 'With Bio']) +plt.title('Distribution of Per-Agent Average Similarities') +plt.xlabel('Condition') +plt.ylabel('Average OpenAI Similarity') + +plt.show() diff --git a/research_bench/eval.py b/research_bench/eval.py index 9d347b9b..95949dac 100644 --- a/research_bench/eval.py +++ b/research_bench/eval.py @@ -8,7 +8,7 @@ from litellm import embedding from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu from rouge_score import rouge_scorer - +import torch from research_town.utils.model_prompting import model_prompting # Initialize NLTK resources @@ -42,9 +42,10 @@ def compute_rouge_l(reference: str, hypothesis: str) -> float: def compute_bertscore(reference: str, hypothesis: str) -> float: try: + device = 'cuda' if torch.cuda.is_available() else 'cpu' # Compute BERTScore P, R, F1 = score( - 
[hypothesis], [reference], lang='en', rescale_with_baseline=True + [hypothesis], [reference], lang='en', rescale_with_baseline=True, device=device ) return float(F1.mean().item()) except Exception as e: @@ -214,14 +215,39 @@ def compute_voyageai_embedding_similarity_per_question( print(f'Error computing embedding similarity per question: {e}') return [0.0] * len(questions) +def compute_bertscore_per_question(reference: str, hypothesis: str) -> List[float]: + try: + questions = [ + 'What is the problem?', + 'Why is it interesting and important?', + 'Why is it hard?', + "Why hasn't it been solved before?", + 'What are the key components of my approach and results?', + ] + + ref_questions = extract_and_clean_question_content(reference, questions) + hyp_questions = extract_and_clean_question_content(hypothesis, questions) + + similarities = [] + + for ref_text, hyp_text in zip(ref_questions, hyp_questions): + if not ref_text or not hyp_text: + print(f'Empty question: {ref_text} vs {hyp_text}') + similarities.append(0.0) + continue + + cosine_sim = compute_bertscore(ref_text, hyp_text) + similarities.append(float(cosine_sim)) + + return similarities + + except Exception as e: + print(f'Error computing BERTScore per question: {e}') + return [0.0] * len(questions) def compute_proposal_metrics(reference: str, generation: str) -> Dict[str, float]: bleu = compute_bleu(reference, generation) rouge_l = compute_rouge_l(reference, generation) - bert_score = compute_bertscore(reference, generation) - gpt_metric = compute_proposal_gpt_metric(reference, generation) - openai_sim = compute_openai_embedding_similarity(reference, generation) - voyageai_sim = compute_voyageai_embedding_similarity(reference, generation) openai_sim_per_question = compute_openai_embedding_similarity_per_question( reference, generation ) @@ -232,10 +258,6 @@ def compute_proposal_metrics(reference: str, generation: str) -> Dict[str, float return { 'bleu': bleu, 'rouge_l': rouge_l, - 'gpt_metric_score': 
def convert_aligned_to_metrics(aligned_metrics):
    """Transpose a list of per-paper metric records into metric-name -> list.

    The per-question similarity scores (BLEU, ROUGE-L, and the OpenAI /
    VoyageAI / NVScore q1-q5 values) are copied through unchanged. The
    ``*_per_agent_avg`` keys are intentionally filled with 0 placeholders:
    per-agent aggregation is disabled in this analysis.
    """
    base_keys = ['bleu', 'rouge_l']
    for prefix in ('openai_sim', 'voyageai_sim', 'nvscore'):
        base_keys += [f'{prefix}_q{i}' for i in range(1, 6)]
    per_agent_keys = [
        f'{prefix}_q{i}_per_agent_avg'
        for prefix in ('openai_sim', 'voyageai_sim')
        for i in range(1, 6)
    ]

    metrics = {key: [] for key in base_keys + per_agent_keys}
    for record in aligned_metrics:
        for key in base_keys:
            metrics[key].append(record[key])
        for key in per_agent_keys:
            # Placeholder: per-agent averages are not recomputed here.
            metrics[key].append(0)
    return metrics


def get_shared_ids(file1_path, file2_path):
    """Return the set of paper_ids present in both JSONL result files."""
    with open(file1_path, 'r') as f1, open(file2_path, 'r') as f2:
        ids_file1 = {json.loads(line)['paper_id'] for line in f1}
        ids_file2 = {json.loads(line)['paper_id'] for line in f2}
    return ids_file1.intersection(ids_file2)
def load_metrics_shared_as_dict(file_path, shared_ids):
    """Load JSONL metric records keyed by paper_id, keeping only shared ids."""
    metrics_dict = {}
    with open(file_path, 'r') as f:
        for line in f:
            obj = json.loads(line)
            if obj['paper_id'] in shared_ids:
                metrics_dict[obj['paper_id']] = obj
    return metrics_dict


def compute_weighted_metric(metrics):
    """Aggregate per-question metric lists into five scalar scores.

    Each similarity family (OpenAI, VoyageAI, NVScore) is the uniformly
    weighted (0.2 each) combination of the per-question means for q1-q5.

    Returns:
        (openai_metric, voyageai_metric, nvscore_metric, bleu, rouge_l)
    """
    weights = [0.2] * 5
    openai_metric = np.dot(
        weights, [np.mean(metrics[f'openai_sim_q{i}']) for i in range(1, 6)]
    )
    voyageai_metric = np.dot(
        weights, [np.mean(metrics[f'voyageai_sim_q{i}']) for i in range(1, 6)]
    )
    nvscore_metric = np.dot(
        weights, [np.mean(metrics[f'nvscore_q{i}']) for i in range(1, 6)]
    )
    # BLEU / ROUGE-L have no per-question breakdown. The previous code
    # dotted the weights against five copies of the same mean (an f-string
    # with no placeholder), which reduces to the plain mean.
    bleu = np.mean(metrics['bleu'])
    rouge_l = np.mean(metrics['rouge_l'])
    return openai_metric, voyageai_metric, nvscore_metric, bleu, rouge_l


def plot_sorted_metrics(metric1, metric2):
    """Plot both metric series, ordered by ascending *metric2*."""
    order = np.argsort(metric2)
    plt.plot(np.array(metric1)[order], label='Metric 1 (sorted by Metric 2)', marker='o')
    plt.plot(np.array(metric2)[order], label='Metric 2 (sorted)', marker='x')
    plt.legend()
    plt.grid(True)
    plt.show()
'./results/paper_bench_easy_500_result_4o_mini_fake_research_town.jsonl' + #file2_path = './results/paper_bench_easy_500_result_4o_mini_citation_only.jsonl' + + # file1_path = './results/mlbench_use_all_citations_result_4o_mini_fake_research_town.jsonl' + # file2_path = './results/mlbench_use_all_citations_result_4o_mini_citation_only.jsonl' + + # file1_path = './results/mlbench_use_only_related_work_result_4o_mini_fake_research_town.jsonl' + # file2_path = './results/mlbench_use_only_related_work_result_4o_mini_citation_only.jsonl' + + # file1_path = './results/crossbench_result_4o_mini_fake_research_town_first_and_last_author.jsonl' + # file2_path = './results/crossbench_result_4o_mini_citation_only.jsonl' + + # file1_path = './results/mlbench_result_4o_mini_citation_only.jsonl' + # file2_path = './results/mlbench_use_only_related_work_result_4o_mini_citation_only.jsonl' + + # file1_path = './results/mlbench_full_filtered_use_all_citations_result_4o_mini_fake_research_town.jsonl' + # file2_path = './results/mlbench_full_filtered_use_all_citations_result_4o_mini_citation_only.jsonl' + + # file1_path = './results/paper_bench_result_4o_mini_fake_research_town.jsonl' + # file2_path = './results/paper_bench_result_4o_mini_citation_only_part1.jsonl' + + #file1_path = './results/cross_bench_1202_result_4o_mini_fake_research_town.jsonl' + #file2_path = './results/cross_bench_1202_result_4o_mini_fake_research_town_twice.jsonl' + + #file1_path = './results/paper_bench_mid_500_result_4o_mini_fake_research_town.jsonl' + #file2_path = './results/paper_bench_mid_500_result_4o_mini_fake_research_town_twice.jsonl' + + #file1_path = './results/paper_bench_hard_500_result_4o_mini_swarm_with_related_work.jsonl' + #file2_path = './results/paper_bench_hard_500_result_4o_mini_swarm.jsonl' + + #file1_path = './results/oodbench_1203_filtered_result_4o_mini_fake_research_town.jsonl' + #file2_path = './results/oodbench_1203_filtered_result_4o_mini_citation_only.jsonl' + + file1_path = 
'./results/crossbench_1205_filtered_result_4o_mini_fake_research_town.jsonl' + file2_path = './results/crossbench_1205_filtered_result_4o_mini_fake_research_town.jsonl' + + #file1_path ='./results/oodbench_1203_filtered_result_4o_mini_fake_research_town.jsonl' + #file2_path = './results/oodbench_1203_filtered_result_4o_mini_fake_research_town.jsonl' + + #file1_path = './results/agent_number_ablation_paper_bench_author_1_result_4o_mini_fake_research_town.jsonl' + #file2_path = './results/agent_number_ablation_paper_bench_author_5_result_4o_mini_fake_research_town.jsonl' + + #file1_path = './results/paper_number_ablation_paper_bench_all_references_result_4o_mini_fake_research_town.jsonl' + #file2_path = './results/paper_number_ablation_paper_bench_introduction_only_result_4o_mini_fake_research_town.jsonl' + + file1_path = './results/paper_bench_hard_500_result_4o_mini_citation_only_with_nv_filtered.jsonl' + file2_path = './results/agent_number_ablation_record_each_agent_output_paper_bench_last_author_result_4o_mini_fake_research_town.jsonl' + + file1_path = './results/agent_number_ablation_record_each_agent_output_paper_bench_without_bio_sample_3_result_4o_mini_fake_research_town.jsonl' + file2_path = './results/agent_number_ablation_record_each_agent_output_paper_bench_without_bio_sample_1_result_4o_mini_fake_research_town.jsonl' + + file1_path = './results/agent_number_ablation_record_each_agent_output_paper_bench_author_5_result_4o_mini_fake_research_town_filtered.jsonl' + file2_path = './results/agent_number_ablation_record_each_agent_output_paper_bench_author_1_result_4o_mini_fake_research_town_filtered.jsonl' + + file1_path = './results/agent_number_ablation_record_each_agent_output_paper_bench_with_bio_sample_1_result_4o_mini_fake_research_town.jsonl' + file2_path = './results/agent_number_ablation_record_each_agent_output_paper_bench_with_bio_sample_5_result_4o_mini_fake_research_town.jsonl' + + file1_path = 
'./results/paper_number_ablation_all_papers_result_4o_mini_citation_only.jsonl' + file2_path = './results/paper_number_ablation_other_only_result_4o_mini_citation_only.jsonl' + + #file1_path = './results/agent_number_ablation_two_author_result_4o_mini_author_only.jsonl' + #file2_path = './results/agent_number_ablation_five_author_result_4o_mini_author_only.jsonl' + + file1_path = './results/paper_bench_hard_500_result_4o_mini_zero_shot_resplit.jsonl' + file2_path = './results/paper_bench_hard_500_result_4o_mini_author_only_resplit.jsonl' + + print("Finding shared paper_ids...") + shared_ids = get_shared_ids(file1_path, file2_path) + print(f"Number of shared paper_ids: {len(shared_ids)}") + + print("Loading metrics...") + metrics_file1_dict = load_metrics_shared_as_dict(file1_path, shared_ids) + metrics_file2_dict = load_metrics_shared_as_dict(file2_path, shared_ids) + + aligned_metrics_file1 = [metrics_file1_dict[pid] for pid in shared_ids] + aligned_metrics_file2 = [metrics_file2_dict[pid] for pid in shared_ids] + + metrics_file1 = convert_aligned_to_metrics(aligned_metrics_file1) + metrics_file2 = convert_aligned_to_metrics(aligned_metrics_file2) + + print("Computing weighted metrics...") + metric1_openai, metric1_voyageai, metric1_nvscore, metric1_bleu, metric1_rougel = compute_weighted_metric(metrics_file1) + metric2_openai, metric2_voyageai, metric2_nvscore, metric2_bleu, metric2_rougel = compute_weighted_metric(metrics_file2) + + print(f"File 1 - OpenAI metric: {metric1_openai}, VoyageAI metric: {metric1_voyageai}, NVScore metric: {metric1_nvscore}") + print(f"File 2 - OpenAI metric: {metric2_openai}, VoyageAI metric: {metric2_voyageai}, NVScore metric: {metric2_nvscore}") + print(f"File 1 - BLEU metric: {metric1_bleu}, ROUGE-L metric: {metric1_rougel}") + print(f"File 2 - BLEU metric: {metric2_bleu}, ROUGE-L metric: {metric2_rougel}") + + print("Performing paired t-tests on openai...") + t_stat, p_value = ttest_rel( + [np.dot([0.2] * 5, 
[metrics_file1[f'openai_sim_q{i}'][j] for i in range(1, 6)]) for j in range(len(shared_ids))], + [np.dot([0.2] * 5, [metrics_file2[f'openai_sim_q{i}'][j] for i in range(1, 6)]) for j in range(len(shared_ids))] + ) + + # print average of q1 to q5 separately + for i in range(1, 6): + t_stat, p_value = ttest_rel( + [metrics_file1[f'openai_sim_q{i}'][j] for j in range(len(shared_ids))], + [metrics_file2[f'openai_sim_q{i}'][j] for j in range(len(shared_ids))] + ) + print(f"average score for q{i} in file1: {np.mean([metrics_file1[f'openai_sim_q{i}'][j] for j in range(len(shared_ids))])}") + print(f"average score for q{i} in file2: {np.mean([metrics_file2[f'openai_sim_q{i}'][j] for j in range(len(shared_ids))])}") + print(f"Paired t-test for q{i}: t-statistic = {t_stat}, p-value = {p_value}") + + # average score for average of q1 to q5 + print('====') + print(f"average score for average of q1 to q5 in file1: {np.mean([np.dot([0.2] * 5, [metrics_file1[f'openai_sim_q{i}'][j] for i in range(1, 6)]) for j in range(len(shared_ids))])}") + print(f"std score for average of q1 to q5 in file1: {np.std([np.dot([0.2] * 5, [metrics_file1[f'openai_sim_q{i}'][j] for i in range(1, 6)]) for j in range(len(shared_ids))])}") + print('====') + print(f"average score for average of q1 to q5 in file2: {np.mean([np.dot([0.2] * 5, [metrics_file2[f'openai_sim_q{i}'][j] for i in range(1, 6)]) for j in range(len(shared_ids))])}") + print(f"std score for average of q1 to q5 in file2: {np.std([np.dot([0.2] * 5, [metrics_file2[f'openai_sim_q{i}'][j] for i in range(1, 6)]) for j in range(len(shared_ids))])}") + print('====') + + for i in range(1, 6): + print(f"average per agent score for q{i} in file1: {np.mean([metrics_file1[f'openai_sim_q{i}_per_agent_avg'][j] for j in range(len(shared_ids))])}") + print(f"std per agent score for q{i} in file1: {np.std([metrics_file1[f'openai_sim_q{i}_per_agent_avg'][j] for j in range(len(shared_ids))])}") + print(f"average per agent score for q{i} in file2: 
{np.mean([metrics_file2[f'openai_sim_q{i}_per_agent_avg'][j] for j in range(len(shared_ids))])}") + print(f"std per agent score for q{i} in file2: {np.std([metrics_file2[f'openai_sim_q{i}_per_agent_avg'][j] for j in range(len(shared_ids))])}") + + + print("Performing paired t-tests on voyageai...") + t_stat, p_value = ttest_rel( + [np.dot([0.2] * 5, [metrics_file1[f'voyageai_sim_q{i}'][j] for i in range(1, 6)]) for j in range(len(shared_ids))], + [np.dot([0.2] * 5, [metrics_file2[f'voyageai_sim_q{i}'][j] for i in range(1, 6)]) for j in range(len(shared_ids))] + ) + + + + for i in range(1, 6): + t_stat, p_value = ttest_rel( + [metrics_file1[f'voyageai_sim_q{i}'][j] for j in range(len(shared_ids))], + [metrics_file2[f'voyageai_sim_q{i}'][j] for j in range(len(shared_ids))] + ) + print(f"average score for q{i} in file1: {np.mean([metrics_file1[f'voyageai_sim_q{i}'][j] for j in range(len(shared_ids))])}") + print(f"average score for q{i} in file2: {np.mean([metrics_file2[f'voyageai_sim_q{i}'][j] for j in range(len(shared_ids))])}") + print(f"Paired t-test for q{i}: t-statistic = {t_stat}, p-value = {p_value}") + + print("Performing paired t-tests on nvscore...") + t_stat, p_value = ttest_rel( + [np.dot([0.2] * 5, [metrics_file1[f'nvscore_q{i}'][j] for i in range(1, 6)]) for j in range(len(shared_ids))], + [np.dot([0.2] * 5, [metrics_file2[f'nvscore_q{i}'][j] for i in range(1, 6)]) for j in range(len(shared_ids))] + ) + + for i in range(1, 6): + t_stat, p_value = ttest_rel( + [metrics_file1[f'nvscore_q{i}'][j] for j in range(len(shared_ids))], + [metrics_file2[f'nvscore_q{i}'][j] for j in range(len(shared_ids))] + ) + print(f"average score for q{i} in file1: {np.mean([metrics_file1[f'nvscore_q{i}'][j] for j in range(len(shared_ids))])}") + print(f"average score for q{i} in file2: {np.mean([metrics_file2[f'nvscore_q{i}'][j] for j in range(len(shared_ids))])}") + print(f"Paired t-test for q{i}: t-statistic = {t_stat}, p-value = {p_value}") + + for i in range(1, 6): + 
print(f"average per agent score for q{i} in file1: {np.mean([metrics_file1[f'voyageai_sim_q{i}_per_agent_avg'][j] for j in range(len(shared_ids))])}") + print(f"std per agent score for q{i} in file1: {np.std([metrics_file1[f'voyageai_sim_q{i}_per_agent_avg'][j] for j in range(len(shared_ids))])}") + print(f"average per agent score for q{i} in file2: {np.mean([metrics_file2[f'voyageai_sim_q{i}_per_agent_avg'][j] for j in range(len(shared_ids))])}") + print(f"std per agent score for q{i} in file2: {np.std([metrics_file2[f'voyageai_sim_q{i}_per_agent_avg'][j] for j in range(len(shared_ids))])}") + + + plot_sorted_metrics(metrics_file1['openai_sim_q5'], metrics_file2['openai_sim_q5']) diff --git a/research_bench/fill_in_all_metrics.py b/research_bench/fill_in_all_metrics.py new file mode 100644 index 00000000..bfd92d93 --- /dev/null +++ b/research_bench/fill_in_all_metrics.py @@ -0,0 +1,45 @@ +import jsonlines +from research_bench.eval import compute_bertscore_per_question +from tqdm import tqdm +from multiprocessing import Pool + +def process_file(file_path): + try: + # Open the file and load its data + with jsonlines.open(file_path, 'r') as f: + dataset = [line for line in f] + + # Process each data entry in the file + for idx, data in enumerate(tqdm(dataset, desc=f"Processing {file_path}")): + ref_proposal = data['ref_proposal'] + gen_proposal = data['gen_proposal'] + if 'bertscore_q1' not in data or 'bertscore_q2' not in data or 'bertscore_q3' not in data or 'bertscore_q4' not in data or 'bertscore_q5' not in data: + bert_score_per_question = compute_bertscore_per_question(ref_proposal, gen_proposal) + dataset[idx]['bertscore_q1'] = bert_score_per_question[0] + dataset[idx]['bertscore_q2'] = bert_score_per_question[1] + dataset[idx]['bertscore_q3'] = bert_score_per_question[2] + dataset[idx]['bertscore_q4'] = bert_score_per_question[3] + dataset[idx]['bertscore_q5'] = bert_score_per_question[4] + + # Write the updated data back to the file + with 
jsonlines.open(file_path, 'w') as f: + for data in dataset: + f.write(data) + print(f"Finished processing {file_path}") + except Exception as e: + print(f"Error processing {file_path}: {e}") + +if __name__ == "__main__": + # List of file paths to process + file_paths = [ + './results/paper_bench_hard_500_result_4o_mini_fake_research_town.jsonl', + './results/paper_bench_hard_500_result_4o_mini_citation_only.jsonl', + './results/paper_bench_mid_500_result_4o_mini_fake_research_town.jsonl', + './results/paper_bench_mid_500_result_4o_mini_citation_only.jsonl', + './results/paper_bench_easy_500_result_4o_mini_fake_research_town.jsonl', + './results/paper_bench_easy_500_result_4o_mini_citation_only.jsonl', + ] + + # Create a pool of workers and process the files in parallel + with Pool(processes=len(file_paths)) as pool: + pool.map(process_file, file_paths) diff --git a/research_bench/filter_out_agent_number_ablation_study.py b/research_bench/filter_out_agent_number_ablation_study.py new file mode 100644 index 00000000..27571fb9 --- /dev/null +++ b/research_bench/filter_out_agent_number_ablation_study.py @@ -0,0 +1,16 @@ +import json + +with open('./paper_bench/agent_number_ablation_paper_bench.json', 'r') as f: + dataset = json.load(f) + +for i in range(1, 6): + + with open(f'./results/agent_number_ablation_record_each_agent_output_paper_bench_author_{i}_result_4o_mini_fake_research_town.jsonl', 'r') as f: + results = [json.loads(line) for line in f] + + filtered_results = [data for data in results if data['paper_id'] in dataset] + + with open(f'./results/agent_number_ablation_record_each_agent_output_paper_bench_author_{i}_result_4o_mini_fake_research_town_filtered.jsonl', 'w') as f: + for result in filtered_results: + json.dump(result, f) + f.write('\n') \ No newline at end of file diff --git a/research_bench/filter_results.py b/research_bench/filter_results.py new file mode 100644 index 00000000..d14d0139 --- /dev/null +++ b/research_bench/filter_results.py @@ -0,0 
+1,41 @@ +import json + +with open('./mlbench/mlbench_full.json', 'r') as f: + mlbench_full = json.load(f) + +with open('./iclrbench/iclrbench.json', 'r') as f: + iclrbench_full = json.load(f) + +filtered_data = {} +for key, value in mlbench_full.items(): + count = 0 + references = value['paper_data']['references'] + for ref in references: + abstract = ref['abstract'] + if not abstract: + continue + count += 1 + if count >= 20: + filtered_data[key] = value + +for key, value in iclrbench_full.items(): + count = 0 + references = value['paper_data']['references'] + for ref in references: + abstract = ref['abstract'] + if not abstract: + continue + count += 1 + if count >= 20: + filtered_data[key] = value + + +print(len(mlbench_full)) +print(len(iclrbench_full)) +print(len(filtered_data)) + +import pdb; pdb.set_trace() + + +with open('./paper_bench/paper_bench_full.json', 'w') as f: + json.dump(filtered_data, f) \ No newline at end of file diff --git a/research_bench/get_cross_domain_paper.py b/research_bench/get_cross_domain_paper.py new file mode 100644 index 00000000..7a8d9d55 --- /dev/null +++ b/research_bench/get_cross_domain_paper.py @@ -0,0 +1,87 @@ +import feedparser +from urllib.parse import urlencode, quote +from tqdm import tqdm + +def search_arxiv(categories, query=None, max_results=50): + """ + Search arXiv for papers in specified categories and an optional query. + + Args: + categories (list): List of categories to filter papers (e.g., ['cs', 'econ']). + query (str): Optional search query for additional filtering. + max_results (int): Maximum number of results to fetch. + + Returns: + list: List of papers with title, authors, summary, and link. + """ + base_url = "http://export.arxiv.org/api/query?" 
+ + # Construct the category filter + category_filter = " AND ".join([f"cat:{cat}" for cat in categories]) + search_query = f"({category_filter})" + if query: + search_query += f" AND ({query})" + + # URL encode the query parameters + encoded_query = urlencode({"search_query": search_query, "start": 0, "max_results": max_results, + "sortBy": "submittedDate", "sortOrder": "descending"}) + + # Construct the full URL + url = f"{base_url}{encoded_query}" + + # Parse the response + feed = feedparser.parse(url) + papers = [] + for entry in feed.entries: + paper = { + "title": entry.title, + "authors": [author.name for author in entry.authors], + "summary": entry.summary, + "link": entry.link, + "published": entry.published, + } + papers.append(paper) + return papers + +arxiv_categories = [ + # Physics + "astro-ph", "astro-ph.GA", "astro-ph.CO", "astro-ph.EP", "astro-ph.HE", "astro-ph.IM", "astro-ph.SR", + "cond-mat", "cond-mat.dis-nn", "cond-mat.mes-hall", "cond-mat.mtrl-sci", "cond-mat.other", + "cond-mat.quant-gas", "cond-mat.soft", "cond-mat.stat-mech", "cond-mat.str-el", "cond-mat.supr-con", + "gr-qc", "hep-ex", "hep-lat", "hep-ph", "hep-th", "math-ph", "nlin.AO", "nlin.CG", "nlin.CD", + "nlin.SI", "nlin.PS", "nucl-ex", "nucl-th", "physics.acc-ph", "physics.ao-ph", "physics.app-ph", + "physics.atm-clus", "physics.atom-ph", "physics.bio-ph", "physics.chem-ph", "physics.class-ph", + "physics.comp-ph", "physics.data-an", "physics.flu-dyn", "physics.gen-ph", "physics.geo-ph", + "physics.hist-ph", "physics.ins-det", "physics.med-ph", "physics.optics", "physics.ed-ph", + "physics.soc-ph", "physics.plasm-ph", "physics.pop-ph", "physics.space-ph", + + # Quantitative Biology + "q-bio.BM", "q-bio.CB", "q-bio.GN", "q-bio.MN", "q-bio.NC", "q-bio.OT", "q-bio.PE", "q-bio.QM", + "q-bio.SC", "q-bio.TO", + + # Quantitative Finance + "q-fin.CP", "q-fin.EC", "q-fin.GN", "q-fin.MF", "q-fin.PM", "q-fin.PR", "q-fin.RM", "q-fin.ST", + "q-fin.TR", + + # Economics + "econ.EM", "econ.GN", 
"econ.TH" +] + +links = [] +for category in tqdm(arxiv_categories): + # Example usage + categories = ["cs.CL", category] + query = "machine learning" # Optional; use None if you don't have a specific query + max_results = 20 + + papers = search_arxiv(categories, query, max_results) + + # Display the papers + for idx, paper in enumerate(papers): + print(f"Paper {idx + 1}:") + print(f"Title: {paper['title']}") + links.append(paper['link']) + +with open('cross_domain_arxiv_links.txt', 'a') as f: + for link in links: + f.write(f"{link}\n") \ No newline at end of file diff --git a/research_bench/get_url.py b/research_bench/get_url.py new file mode 100644 index 00000000..d5295ee3 --- /dev/null +++ b/research_bench/get_url.py @@ -0,0 +1,14 @@ +import json + +with open('./mlbench/mlbench_full.json', 'r') as f: + mlbench_full = json.load(f) + +import pdb; pdb.set_trace() + +links = [] +for key, value in mlbench_full.items(): + links.append(value['paper_data']['url']) + +with open('./mlbench/links.txt', 'w') as f: + for link in links: + f.write(f'{link}\n') \ No newline at end of file diff --git a/research_bench/mlbench/links.txt b/research_bench/mlbench/links.txt new file mode 100644 index 00000000..5f3a8cc7 --- /dev/null +++ b/research_bench/mlbench/links.txt @@ -0,0 +1,1285 @@ +http://arxiv.org/abs/2406.05346v2 +http://arxiv.org/abs/2406.06419v2 +http://arxiv.org/abs/2401.11374v3 +http://arxiv.org/abs/2406.11741v4 +http://arxiv.org/abs/2402.16811v1 +http://arxiv.org/abs/2406.00819v1 +http://arxiv.org/abs/2406.09397v1 +http://arxiv.org/abs/2406.00488v1 +http://arxiv.org/abs/2406.17863v2 +http://arxiv.org/abs/2402.05421v2 +http://arxiv.org/abs/2405.13226v1 +http://arxiv.org/abs/2409.18433v1 +http://arxiv.org/abs/2410.05499v1 +http://arxiv.org/abs/2403.13117v2 +http://arxiv.org/abs/2405.13985v1 +http://arxiv.org/abs/2405.20053v1 +http://arxiv.org/abs/2312.02027v5 +http://arxiv.org/abs/2405.17374v2 +http://arxiv.org/abs/2401.08140v2 +http://arxiv.org/abs/2407.17492v1 
+http://arxiv.org/abs/2404.11568v4 +http://arxiv.org/abs/2405.15285v1 +http://arxiv.org/abs/2405.03917v1 +http://arxiv.org/abs/2405.19562v1 +http://arxiv.org/abs/2410.14574v1 +http://arxiv.org/abs/2403.00867v2 +http://arxiv.org/abs/2403.04317v1 +http://arxiv.org/abs/2405.14014v3 +http://arxiv.org/abs/2405.14578v4 +http://arxiv.org/abs/2406.09639v2 +http://arxiv.org/abs/2410.02629v1 +http://arxiv.org/abs/2405.16493v1 +http://arxiv.org/abs/2210.07893v4 +http://arxiv.org/abs/2405.13721v1 +http://arxiv.org/abs/2406.00147v3 +http://arxiv.org/abs/2409.15393v1 +http://arxiv.org/abs/2311.10483v2 +http://arxiv.org/abs/2406.14477v1 +http://arxiv.org/abs/2409.17978v1 +http://arxiv.org/abs/2404.01318v4 +http://arxiv.org/abs/2406.03417v1 +http://arxiv.org/abs/2410.03919v1 +http://arxiv.org/abs/2405.17382v1 +http://arxiv.org/abs/2406.06040v1 +http://arxiv.org/abs/2409.18055v1 +http://arxiv.org/abs/2402.03883v1 +http://arxiv.org/abs/2405.13987v1 +http://arxiv.org/abs/2405.16806v2 +http://arxiv.org/abs/2407.05484v1 +http://arxiv.org/abs/2402.14744v2 +http://arxiv.org/abs/2406.05183v1 +http://arxiv.org/abs/2407.12043v1 +http://arxiv.org/abs/2405.14066v1 +http://arxiv.org/abs/2409.19433v2 +http://arxiv.org/abs/2405.12221v1 +http://arxiv.org/abs/2404.15146v2 +http://arxiv.org/abs/2405.09831v5 +http://arxiv.org/abs/2404.16811v2 +http://arxiv.org/abs/2406.03679v4 +http://arxiv.org/abs/2405.20348v1 +http://arxiv.org/abs/2404.16666v3 +http://arxiv.org/abs/2211.14960v5 +http://arxiv.org/abs/2205.13608v1 +http://arxiv.org/abs/2402.14904v1 +http://arxiv.org/abs/2407.05622v1 +http://arxiv.org/abs/2407.00316v1 +http://arxiv.org/abs/2401.05821v3 +http://arxiv.org/abs/2405.20724v1 +http://arxiv.org/abs/2405.17187v2 +http://arxiv.org/abs/2408.15205v1 +http://arxiv.org/abs/2408.11370v1 +http://arxiv.org/abs/2402.01382v1 +http://arxiv.org/abs/2407.19198v1 +http://arxiv.org/abs/2312.09841v1 +http://arxiv.org/abs/2402.17805v1 +http://arxiv.org/abs/2406.03003v1 +http://arxiv.org/abs/2310.06836v3 
+http://arxiv.org/abs/2410.08091v2 +http://arxiv.org/abs/2402.15898v4 +http://arxiv.org/abs/2311.09308v2 +http://arxiv.org/abs/2410.16415v1 +http://arxiv.org/abs/2312.07000v1 +http://arxiv.org/abs/2409.19345v1 +http://arxiv.org/abs/2406.02742v1 +http://arxiv.org/abs/2405.11780v2 +http://arxiv.org/abs/2406.19258v1 +http://arxiv.org/abs/2410.02164v1 +http://arxiv.org/abs/2410.11251v1 +http://arxiv.org/abs/2405.17705v2 +http://arxiv.org/abs/2402.01607v2 +http://arxiv.org/abs/2403.08312v1 +http://arxiv.org/abs/2409.01369v1 +http://arxiv.org/abs/2402.02552v1 +http://arxiv.org/abs/2406.06769v2 +http://arxiv.org/abs/2406.01006v1 +http://arxiv.org/abs/2405.19946v1 +http://arxiv.org/abs/2406.09373v1 +http://arxiv.org/abs/2405.13587v1 +http://arxiv.org/abs/2405.14440v1 +http://arxiv.org/abs/2407.02315v2 +http://arxiv.org/abs/2410.04492v3 +http://arxiv.org/abs/2410.14970v1 +http://arxiv.org/abs/2405.19585v1 +http://arxiv.org/abs/2405.17673v1 +http://arxiv.org/abs/2406.08332v1 +http://arxiv.org/abs/2311.17295v1 +http://arxiv.org/abs/2405.13766v5 +http://arxiv.org/abs/2402.11894v3 +http://arxiv.org/abs/2405.16718v1 +http://arxiv.org/abs/2402.07067v2 +http://arxiv.org/abs/2406.17736v1 +http://arxiv.org/abs/2410.12490v1 +http://arxiv.org/abs/2406.09949v1 +http://arxiv.org/abs/2404.11833v2 +http://arxiv.org/abs/2311.17245v5 +http://arxiv.org/abs/2410.05550v1 +http://arxiv.org/abs/2404.04286v2 +http://arxiv.org/abs/2406.08465v1 +http://arxiv.org/abs/2406.03537v1 +http://arxiv.org/abs/2409.01977v1 +http://arxiv.org/abs/2408.15784v1 +http://arxiv.org/abs/2406.05869v1 +http://arxiv.org/abs/2406.16540v2 +http://arxiv.org/abs/2402.02425v3 +http://arxiv.org/abs/2408.13242v1 +http://arxiv.org/abs/2310.03253v2 +http://arxiv.org/abs/2408.15241v1 +http://arxiv.org/abs/2406.01257v1 +http://arxiv.org/abs/2405.16436v1 +http://arxiv.org/abs/2405.14913v1 +http://arxiv.org/abs/2409.19734v2 +http://arxiv.org/abs/2405.02140v2 +http://arxiv.org/abs/2406.09353v1 +http://arxiv.org/abs/2407.08713v1 
+http://arxiv.org/abs/2405.15306v2 +http://arxiv.org/abs/2409.06762v1 +http://arxiv.org/abs/2405.19534v3 +http://arxiv.org/abs/2410.14030v1 +http://arxiv.org/abs/2312.10523v1 +http://arxiv.org/abs/2406.02507v1 +http://arxiv.org/abs/2410.04386v1 +http://arxiv.org/abs/2407.02632v1 +http://arxiv.org/abs/2402.04838v4 +http://arxiv.org/abs/2406.00519v1 +http://arxiv.org/abs/2402.09723v3 +http://arxiv.org/abs/2406.05532v2 +http://arxiv.org/abs/2402.07844v4 +http://arxiv.org/abs/2406.03437v2 +http://arxiv.org/abs/2406.15283v2 +http://arxiv.org/abs/2405.13912v1 +http://arxiv.org/abs/2406.11235v1 +http://arxiv.org/abs/2402.17106v3 +http://arxiv.org/abs/2405.19463v1 +http://arxiv.org/abs/2406.09028v1 +http://arxiv.org/abs/2410.07685v1 +http://arxiv.org/abs/2402.10176v1 +http://arxiv.org/abs/2405.16441v1 +http://arxiv.org/abs/2409.18591v1 +http://arxiv.org/abs/2405.15743v1 +http://arxiv.org/abs/2405.21048v1 +http://arxiv.org/abs/2405.20165v1 +http://arxiv.org/abs/2406.10248v2 +http://arxiv.org/abs/2406.12214v3 +http://arxiv.org/abs/2403.11808v2 +http://arxiv.org/abs/2410.11559v2 +http://arxiv.org/abs/2405.03987v2 +http://arxiv.org/abs/2405.16732v1 +http://arxiv.org/abs/2303.04209v2 +http://arxiv.org/abs/2402.03941v1 +http://arxiv.org/abs/2404.09591v2 +http://arxiv.org/abs/2404.00986v1 +http://arxiv.org/abs/2406.18777v1 +http://arxiv.org/abs/2409.19808v1 +http://arxiv.org/abs/2406.03689v2 +http://arxiv.org/abs/2404.01595v1 +http://arxiv.org/abs/2407.19448v1 +http://arxiv.org/abs/2409.04500v1 +http://arxiv.org/abs/2410.15059v1 +http://arxiv.org/abs/2406.02543v2 +http://arxiv.org/abs/2406.11944v1 +http://arxiv.org/abs/2406.05027v2 +http://arxiv.org/abs/2410.04372v1 +http://arxiv.org/abs/2407.19474v1 +http://arxiv.org/abs/2406.09175v1 +http://arxiv.org/abs/2406.17557v1 +http://arxiv.org/abs/2402.08097v2 +http://arxiv.org/abs/2406.02234v1 +http://arxiv.org/abs/2410.09600v2 +http://arxiv.org/abs/2410.10098v1 +http://arxiv.org/abs/2402.05785v4 +http://arxiv.org/abs/2402.01489v2 
+http://arxiv.org/abs/2409.00119v1 +http://arxiv.org/abs/2405.11533v1 +http://arxiv.org/abs/2405.14366v2 +http://arxiv.org/abs/2407.04945v1 +http://arxiv.org/abs/2407.01800v1 +http://arxiv.org/abs/2402.19469v1 +http://arxiv.org/abs/2402.04010v1 +http://arxiv.org/abs/2406.05882v1 +http://arxiv.org/abs/2406.07407v1 +http://arxiv.org/abs/2409.17808v1 +http://arxiv.org/abs/2406.01461v1 +http://arxiv.org/abs/2404.14329v2 +http://arxiv.org/abs/2402.10946v1 +http://arxiv.org/abs/2405.13456v1 +http://arxiv.org/abs/2406.01781v1 +http://arxiv.org/abs/2407.12528v1 +http://arxiv.org/abs/2410.15629v2 +http://arxiv.org/abs/2404.14469v2 +http://arxiv.org/abs/2407.19996v1 +http://arxiv.org/abs/2402.08090v3 +http://arxiv.org/abs/2410.09355v2 +http://arxiv.org/abs/2406.10738v1 +http://arxiv.org/abs/2407.17686v1 +http://arxiv.org/abs/2405.14544v2 +http://arxiv.org/abs/2410.07112v1 +http://arxiv.org/abs/2407.05921v2 +http://arxiv.org/abs/2408.15792v1 +http://arxiv.org/abs/2202.00769v5 +http://arxiv.org/abs/2406.06407v1 +http://arxiv.org/abs/2405.14780v1 +http://arxiv.org/abs/2310.01636v3 +http://arxiv.org/abs/2311.14127v2 +http://arxiv.org/abs/2405.20782v1 +http://arxiv.org/abs/2402.07712v2 +http://arxiv.org/abs/2410.10587v1 +http://arxiv.org/abs/2405.19088v1 +http://arxiv.org/abs/2406.13215v2 +http://arxiv.org/abs/2403.20287v2 +http://arxiv.org/abs/2409.08302v1 +http://arxiv.org/abs/2405.14394v2 +http://arxiv.org/abs/2410.06171v1 +http://arxiv.org/abs/2408.17052v1 +http://arxiv.org/abs/2405.15821v1 +http://arxiv.org/abs/2407.03878v2 +http://arxiv.org/abs/2410.00447v1 +http://arxiv.org/abs/2401.10225v4 +http://arxiv.org/abs/2406.07230v2 +http://arxiv.org/abs/2405.14591v1 +http://arxiv.org/abs/2402.02017v2 +http://arxiv.org/abs/2410.05429v1 +http://arxiv.org/abs/2405.16075v1 +http://arxiv.org/abs/2406.15955v1 +http://arxiv.org/abs/2406.01249v1 +http://arxiv.org/abs/2405.15593v1 +http://arxiv.org/abs/2403.04081v1 +http://arxiv.org/abs/2406.00120v2 +http://arxiv.org/abs/2403.09603v2 
+http://arxiv.org/abs/2405.15769v2 +http://arxiv.org/abs/2406.04103v1 +http://arxiv.org/abs/2406.14596v2 +http://arxiv.org/abs/2406.11840v1 +http://arxiv.org/abs/2402.03559v2 +http://arxiv.org/abs/2406.07217v1 +http://arxiv.org/abs/2404.13872v2 +http://arxiv.org/abs/2406.12538v2 +http://arxiv.org/abs/2404.12038v3 +http://arxiv.org/abs/2402.10998v2 +http://arxiv.org/abs/2402.06861v2 +http://arxiv.org/abs/2406.04485v3 +http://arxiv.org/abs/2405.14540v2 +http://arxiv.org/abs/2405.17398v4 +http://arxiv.org/abs/2406.03314v2 +http://arxiv.org/abs/2207.05209v2 +http://arxiv.org/abs/2403.05812v1 +http://arxiv.org/abs/2404.14743v2 +http://arxiv.org/abs/2406.15480v2 +http://arxiv.org/abs/2406.03072v2 +http://arxiv.org/abs/2406.04320v1 +http://arxiv.org/abs/2306.13041v1 +http://arxiv.org/abs/2407.03418v1 +http://arxiv.org/abs/2410.10817v1 +http://arxiv.org/abs/2405.15424v1 +http://arxiv.org/abs/2402.04114v2 +http://arxiv.org/abs/2406.16192v1 +http://arxiv.org/abs/2405.15106v1 +http://arxiv.org/abs/2406.18451v2 +http://arxiv.org/abs/2405.19276v1 +http://arxiv.org/abs/2408.06257v2 +http://arxiv.org/abs/2405.14365v1 +http://arxiv.org/abs/2408.14393v1 +http://arxiv.org/abs/2405.17815v1 +http://arxiv.org/abs/2406.03694v1 +http://arxiv.org/abs/2407.18158v1 +http://arxiv.org/abs/2410.14837v1 +http://arxiv.org/abs/2410.15618v1 +http://arxiv.org/abs/2406.05184v2 +http://arxiv.org/abs/2310.17712v3 +http://arxiv.org/abs/2403.13749v1 +http://arxiv.org/abs/2406.19861v1 +http://arxiv.org/abs/2402.17176v1 +http://arxiv.org/abs/2409.18330v1 +http://arxiv.org/abs/2410.13032v1 +http://arxiv.org/abs/2402.17902v1 +http://arxiv.org/abs/2409.20012v1 +http://arxiv.org/abs/2405.19985v1 +http://arxiv.org/abs/2406.11717v2 +http://arxiv.org/abs/2405.15632v2 +http://arxiv.org/abs/2403.08757v4 +http://arxiv.org/abs/2404.00438v1 +http://arxiv.org/abs/2406.11118v1 +http://arxiv.org/abs/2407.09388v1 +http://arxiv.org/abs/2403.11267v1 +http://arxiv.org/abs/2405.17580v1 +http://arxiv.org/abs/2409.19245v2 
+http://arxiv.org/abs/2407.16154v1 +http://arxiv.org/abs/2402.07437v1 +http://arxiv.org/abs/2405.13712v3 +http://arxiv.org/abs/2403.12553v2 +http://arxiv.org/abs/2402.10403v3 +http://arxiv.org/abs/2403.15796v2 +http://arxiv.org/abs/2407.04970v1 +http://arxiv.org/abs/2405.15234v3 +http://arxiv.org/abs/2406.08527v1 +http://arxiv.org/abs/2406.06158v2 +http://arxiv.org/abs/2406.00392v1 +http://arxiv.org/abs/2408.03330v1 +http://arxiv.org/abs/2405.18942v2 +http://arxiv.org/abs/2407.09941v1 +http://arxiv.org/abs/2402.01355v2 +http://arxiv.org/abs/2407.19985v2 +http://arxiv.org/abs/2310.04415v1 +http://arxiv.org/abs/2407.12709v1 +http://arxiv.org/abs/2406.17830v1 +http://arxiv.org/abs/2406.20081v1 +http://arxiv.org/abs/2308.11129v4 +http://arxiv.org/abs/2310.12963v4 +http://arxiv.org/abs/2204.10888v2 +http://arxiv.org/abs/2312.00170v2 +http://arxiv.org/abs/2407.00382v2 +http://arxiv.org/abs/2405.17767v1 +http://arxiv.org/abs/2409.18859v1 +http://arxiv.org/abs/2405.16731v1 +http://arxiv.org/abs/2402.14254v1 +http://arxiv.org/abs/2408.00113v1 +http://arxiv.org/abs/2408.03361v7 +http://arxiv.org/abs/2401.10371v5 +http://arxiv.org/abs/2410.16152v2 +http://arxiv.org/abs/2405.16714v1 +http://arxiv.org/abs/2408.16862v1 +http://arxiv.org/abs/2404.02827v2 +http://arxiv.org/abs/2405.15182v1 +http://arxiv.org/abs/2406.06527v1 +http://arxiv.org/abs/2407.13281v1 +http://arxiv.org/abs/2404.12404v2 +http://arxiv.org/abs/2405.20236v1 +http://arxiv.org/abs/2404.04269v1 +http://arxiv.org/abs/2406.03052v1 +http://arxiv.org/abs/2404.13686v2 +http://arxiv.org/abs/2410.03276v2 +http://arxiv.org/abs/2311.05436v3 +http://arxiv.org/abs/2405.16785v2 +http://arxiv.org/abs/2405.15926v1 +http://arxiv.org/abs/2401.16198v1 +http://arxiv.org/abs/2305.03712v2 +http://arxiv.org/abs/2410.15020v1 +http://arxiv.org/abs/2405.17871v1 +http://arxiv.org/abs/2404.08335v1 +http://arxiv.org/abs/2407.08946v1 +http://arxiv.org/abs/2410.14069v1 +http://arxiv.org/abs/2402.06160v2 +http://arxiv.org/abs/2405.16034v1 
+http://arxiv.org/abs/2404.06831v3 +http://arxiv.org/abs/2402.16302v1 +http://arxiv.org/abs/2405.14633v1 +http://arxiv.org/abs/2407.01567v1 +http://arxiv.org/abs/2404.03080v3 +http://arxiv.org/abs/2410.00051v1 +http://arxiv.org/abs/2407.10725v1 +http://arxiv.org/abs/2406.13770v1 +http://arxiv.org/abs/2405.14669v1 +http://arxiv.org/abs/2403.17105v2 +http://arxiv.org/abs/2401.18079v4 +http://arxiv.org/abs/2403.09621v1 +http://arxiv.org/abs/2405.13888v1 +http://arxiv.org/abs/2409.03142v1 +http://arxiv.org/abs/2111.06530v2 +http://arxiv.org/abs/2406.14408v2 +http://arxiv.org/abs/2404.13344v1 +http://arxiv.org/abs/2405.14096v1 +http://arxiv.org/abs/2310.01144v3 +http://arxiv.org/abs/2405.18378v2 +http://arxiv.org/abs/2405.20853v2 +http://arxiv.org/abs/2404.14951v2 +http://arxiv.org/abs/2401.13544v2 +http://arxiv.org/abs/2408.07307v1 +http://arxiv.org/abs/2407.13979v1 +http://arxiv.org/abs/2410.04376v2 +http://arxiv.org/abs/2401.10166v3 +http://arxiv.org/abs/2310.10683v2 +http://arxiv.org/abs/2410.08710v2 +http://arxiv.org/abs/2310.08164v5 +http://arxiv.org/abs/2406.11813v1 +http://arxiv.org/abs/2405.13879v1 +http://arxiv.org/abs/2402.15393v2 +http://arxiv.org/abs/2402.09152v2 +http://arxiv.org/abs/2405.17978v1 +http://arxiv.org/abs/2404.13968v3 +http://arxiv.org/abs/2405.19687v2 +http://arxiv.org/abs/2409.16965v2 +http://arxiv.org/abs/2409.00138v2 +http://arxiv.org/abs/2311.14601v1 +http://arxiv.org/abs/2402.02500v3 +http://arxiv.org/abs/2409.17475v1 +http://arxiv.org/abs/2407.06120v1 +http://arxiv.org/abs/2405.18400v5 +http://arxiv.org/abs/2410.00993v1 +http://arxiv.org/abs/2406.03618v3 +http://arxiv.org/abs/2401.10809v2 +http://arxiv.org/abs/2408.08753v1 +http://arxiv.org/abs/2410.02249v1 +http://arxiv.org/abs/2311.06423v3 +http://arxiv.org/abs/2312.00486v1 +http://arxiv.org/abs/2404.03272v1 +http://arxiv.org/abs/2409.10559v1 +http://arxiv.org/abs/2406.07520v1 +http://arxiv.org/abs/2406.03704v1 +http://arxiv.org/abs/2406.13892v2 +http://arxiv.org/abs/2405.12856v4 
+http://arxiv.org/abs/2404.10740v3 +http://arxiv.org/abs/2406.17271v1 +http://arxiv.org/abs/2410.14754v1 +http://arxiv.org/abs/2405.14226v2 +http://arxiv.org/abs/2408.13256v2 +http://arxiv.org/abs/2404.01340v1 +http://arxiv.org/abs/2408.15094v1 +http://arxiv.org/abs/2406.14165v1 +http://arxiv.org/abs/2406.12272v5 +http://arxiv.org/abs/2410.11224v2 +http://arxiv.org/abs/2410.14488v1 +http://arxiv.org/abs/2403.11574v1 +http://arxiv.org/abs/2404.13445v2 +http://arxiv.org/abs/2404.02948v3 +http://arxiv.org/abs/2406.01799v2 +http://arxiv.org/abs/2406.05658v3 +http://arxiv.org/abs/2404.04465v2 +http://arxiv.org/abs/2405.20435v1 +http://arxiv.org/abs/2306.01953v2 +http://arxiv.org/abs/2311.02373v2 +http://arxiv.org/abs/2406.06838v1 +http://arxiv.org/abs/2406.08747v1 +http://arxiv.org/abs/2403.14067v2 +http://arxiv.org/abs/2406.00551v2 +http://arxiv.org/abs/2406.09413v2 +http://arxiv.org/abs/2311.13385v4 +http://arxiv.org/abs/2409.19069v1 +http://arxiv.org/abs/2406.18521v1 +http://arxiv.org/abs/2402.06529v3 +http://arxiv.org/abs/2409.07142v2 +http://arxiv.org/abs/2406.16778v1 +http://arxiv.org/abs/2402.06126v3 +http://arxiv.org/abs/2405.15682v3 +http://arxiv.org/abs/2406.03620v1 +http://arxiv.org/abs/2405.15539v1 +http://arxiv.org/abs/2405.13845v2 +http://arxiv.org/abs/2403.19655v3 +http://arxiv.org/abs/2410.01649v1 +http://arxiv.org/abs/2402.14123v1 +http://arxiv.org/abs/2407.03263v1 +http://arxiv.org/abs/2406.19272v2 +http://arxiv.org/abs/2409.18962v1 +http://arxiv.org/abs/2405.16247v2 +http://arxiv.org/abs/2312.11846v1 +http://arxiv.org/abs/2406.09891v1 +http://arxiv.org/abs/2311.08803v3 +http://arxiv.org/abs/2401.12794v2 +http://arxiv.org/abs/2406.16341v1 +http://arxiv.org/abs/2410.06558v4 +http://arxiv.org/abs/2402.10739v4 +http://arxiv.org/abs/2406.12406v1 +http://arxiv.org/abs/2405.18407v1 +http://arxiv.org/abs/2407.08751v1 +http://arxiv.org/abs/2404.18928v1 +http://arxiv.org/abs/2410.11181v1 +http://arxiv.org/abs/2406.08666v1 +http://arxiv.org/abs/2404.13076v1 
+http://arxiv.org/abs/2406.16623v1 +http://arxiv.org/abs/2410.15010v1 +http://arxiv.org/abs/2405.14064v1 +http://arxiv.org/abs/2406.07472v1 +http://arxiv.org/abs/2405.20390v1 +http://arxiv.org/abs/2409.17687v1 +http://arxiv.org/abs/2406.12382v1 +http://arxiv.org/abs/2406.19073v1 +http://arxiv.org/abs/2404.03139v1 +http://arxiv.org/abs/2406.16964v1 +http://arxiv.org/abs/2406.07302v1 +http://arxiv.org/abs/2408.10189v1 +http://arxiv.org/abs/2403.01327v1 +http://arxiv.org/abs/2406.10580v1 +http://arxiv.org/abs/2407.08447v1 +http://arxiv.org/abs/2409.04792v1 +http://arxiv.org/abs/2405.17694v1 +http://arxiv.org/abs/2406.12849v1 +http://arxiv.org/abs/2406.04299v2 +http://arxiv.org/abs/2405.18822v1 +http://arxiv.org/abs/2311.00094v2 +http://arxiv.org/abs/2407.02518v1 +http://arxiv.org/abs/2404.13591v2 +http://arxiv.org/abs/2405.13865v1 +http://arxiv.org/abs/2405.14569v2 +http://arxiv.org/abs/2402.07193v2 +http://arxiv.org/abs/2402.08547v2 +http://arxiv.org/abs/2406.02532v2 +http://arxiv.org/abs/2405.20671v1 +http://arxiv.org/abs/2410.02527v1 +http://arxiv.org/abs/2406.07449v1 +http://arxiv.org/abs/2406.04801v1 +http://arxiv.org/abs/2410.06535v2 +http://arxiv.org/abs/2407.02680v3 +http://arxiv.org/abs/2405.13800v1 +http://arxiv.org/abs/2405.14974v1 +http://arxiv.org/abs/2405.17992v1 +http://arxiv.org/abs/2405.07976v3 +http://arxiv.org/abs/2405.17149v1 +http://arxiv.org/abs/2402.07426v2 +http://arxiv.org/abs/2404.13895v2 +http://arxiv.org/abs/2407.08906v1 +http://arxiv.org/abs/2310.18955v2 +http://arxiv.org/abs/2402.07510v2 +http://arxiv.org/abs/2409.19212v1 +http://arxiv.org/abs/2405.09220v2 +http://arxiv.org/abs/2410.04847v1 +http://arxiv.org/abs/2405.14681v1 +http://arxiv.org/abs/2410.05626v1 +http://arxiv.org/abs/2406.00048v2 +http://arxiv.org/abs/2406.08300v1 +http://arxiv.org/abs/2406.06425v1 +http://arxiv.org/abs/2406.00535v2 +http://arxiv.org/abs/2402.10429v2 +http://arxiv.org/abs/2405.09719v2 +http://arxiv.org/abs/2405.13805v2 +http://arxiv.org/abs/2307.03288v3 
+http://arxiv.org/abs/2407.06076v1 +http://arxiv.org/abs/2409.01081v1 +http://arxiv.org/abs/2405.14530v1 +http://arxiv.org/abs/2402.00957v3 +http://arxiv.org/abs/2409.18479v2 +http://arxiv.org/abs/2305.12715v3 +http://arxiv.org/abs/2405.14677v2 +http://arxiv.org/abs/2407.15595v1 +http://arxiv.org/abs/2405.03553v3 +http://arxiv.org/abs/2405.13919v1 +http://arxiv.org/abs/2403.04690v2 +http://arxiv.org/abs/2403.06328v2 +http://arxiv.org/abs/2405.14082v1 +http://arxiv.org/abs/2402.07963v2 +http://arxiv.org/abs/2405.08807v1 +http://arxiv.org/abs/2406.00773v2 +http://arxiv.org/abs/2407.19234v1 +http://arxiv.org/abs/2406.08298v4 +http://arxiv.org/abs/2406.07815v2 +http://arxiv.org/abs/2407.05600v1 +http://arxiv.org/abs/2406.02395v1 +http://arxiv.org/abs/2407.10897v1 +http://arxiv.org/abs/2402.10095v2 +http://arxiv.org/abs/2404.09248v1 +http://arxiv.org/abs/2409.08311v1 +http://arxiv.org/abs/2106.16239v9 +http://arxiv.org/abs/2406.09371v1 +http://arxiv.org/abs/2410.13761v1 +http://arxiv.org/abs/2407.09413v1 +http://arxiv.org/abs/2405.00662v2 +http://arxiv.org/abs/2406.09414v2 +http://arxiv.org/abs/2404.10881v1 +http://arxiv.org/abs/2409.17996v2 +http://arxiv.org/abs/2407.12861v1 +http://arxiv.org/abs/2402.16349v1 +http://arxiv.org/abs/2406.06420v1 +http://arxiv.org/abs/2406.18664v4 +http://arxiv.org/abs/2406.13123v2 +http://arxiv.org/abs/2409.18892v2 +http://arxiv.org/abs/2006.09268v3 +http://arxiv.org/abs/2404.08801v2 +http://arxiv.org/abs/2309.01973v1 +http://arxiv.org/abs/2405.19681v1 +http://arxiv.org/abs/2307.01649v2 +http://arxiv.org/abs/2405.14473v1 +http://arxiv.org/abs/2407.07082v1 +http://arxiv.org/abs/2406.12763v2 +http://arxiv.org/abs/2406.19253v1 +http://arxiv.org/abs/2406.07592v1 +http://arxiv.org/abs/2409.19681v1 +http://arxiv.org/abs/2308.03648v2 +http://arxiv.org/abs/2405.14574v1 +http://arxiv.org/abs/2402.12366v1 +http://arxiv.org/abs/2401.06687v2 +http://arxiv.org/abs/2307.16405v1 +http://arxiv.org/abs/2310.17638v2 +http://arxiv.org/abs/2406.09513v1 
+http://arxiv.org/abs/2403.00158v2 +http://arxiv.org/abs/2403.06903v3 +http://arxiv.org/abs/2406.17720v1 +http://arxiv.org/abs/2406.11775v1 +http://arxiv.org/abs/2402.07876v6 +http://arxiv.org/abs/2402.06353v2 +http://arxiv.org/abs/2406.18814v2 +http://arxiv.org/abs/2406.02764v1 +http://arxiv.org/abs/2410.02117v2 +http://arxiv.org/abs/2408.07841v4 +http://arxiv.org/abs/2410.14195v1 +http://arxiv.org/abs/2405.20630v3 +http://arxiv.org/abs/2406.02797v1 +http://arxiv.org/abs/2404.06757v1 +http://arxiv.org/abs/2410.07971v1 +http://arxiv.org/abs/2405.19279v1 +http://arxiv.org/abs/2402.19460v1 +http://arxiv.org/abs/2405.20612v1 +http://arxiv.org/abs/2407.11502v2 +http://arxiv.org/abs/2406.09400v1 +http://arxiv.org/abs/2402.15978v1 +http://arxiv.org/abs/2406.01793v1 +http://arxiv.org/abs/2306.11895v2 +http://arxiv.org/abs/2410.17020v1 +http://arxiv.org/abs/2405.13992v1 +http://arxiv.org/abs/2410.06645v2 +http://arxiv.org/abs/2405.13675v4 +http://arxiv.org/abs/2403.13893v2 +http://arxiv.org/abs/2406.10650v1 +http://arxiv.org/abs/2410.05437v1 +http://arxiv.org/abs/2402.08126v2 +http://arxiv.org/abs/2305.15260v3 +http://arxiv.org/abs/2403.04919v2 +http://arxiv.org/abs/2402.09014v2 +http://arxiv.org/abs/2405.20282v1 +http://arxiv.org/abs/2409.18461v1 +http://arxiv.org/abs/2405.20510v2 +http://arxiv.org/abs/2406.11316v1 +http://arxiv.org/abs/2410.11187v1 +http://arxiv.org/abs/2410.06170v1 +http://arxiv.org/abs/2402.00522v5 +http://arxiv.org/abs/2409.18269v1 +http://arxiv.org/abs/2406.08850v1 +http://arxiv.org/abs/2312.14556v1 +http://arxiv.org/abs/2402.04785v1 +http://arxiv.org/abs/2408.05798v1 +http://arxiv.org/abs/2402.07314v2 +http://arxiv.org/abs/2405.17394v2 +http://arxiv.org/abs/2410.06007v1 +http://arxiv.org/abs/2305.12519v3 +http://arxiv.org/abs/2402.10723v1 +http://arxiv.org/abs/2407.14679v1 +http://arxiv.org/abs/2410.02924v1 +http://arxiv.org/abs/2410.07638v1 +http://arxiv.org/abs/2405.17661v1 +http://arxiv.org/abs/2202.05404v7 +http://arxiv.org/abs/2410.02195v1 
+http://arxiv.org/abs/2110.00744v5 +http://arxiv.org/abs/2406.08475v1 +http://arxiv.org/abs/2405.18137v1 +http://arxiv.org/abs/2409.17504v1 +http://arxiv.org/abs/2402.19072v1 +http://arxiv.org/abs/2406.01959v2 +http://arxiv.org/abs/2407.12831v2 +http://arxiv.org/abs/2406.03802v1 +http://arxiv.org/abs/2405.15509v1 +http://arxiv.org/abs/2410.10892v1 +http://arxiv.org/abs/2405.19690v2 +http://arxiv.org/abs/2410.15859v2 +http://arxiv.org/abs/2405.20272v1 +http://arxiv.org/abs/2406.17711v1 +http://arxiv.org/abs/2407.04694v1 +http://arxiv.org/abs/2405.18836v1 +http://arxiv.org/abs/2405.15124v3 +http://arxiv.org/abs/2410.14067v1 +http://arxiv.org/abs/2410.10924v1 +http://arxiv.org/abs/2406.06484v2 +http://arxiv.org/abs/2406.08993v1 +http://arxiv.org/abs/2405.03991v1 +http://arxiv.org/abs/2409.17331v1 +http://arxiv.org/abs/2406.04329v1 +http://arxiv.org/abs/2407.00401v1 +http://arxiv.org/abs/2410.03581v2 +http://arxiv.org/abs/2406.02176v3 +http://arxiv.org/abs/2312.13236v2 +http://arxiv.org/abs/2405.15125v3 +http://arxiv.org/abs/2410.04368v1 +http://arxiv.org/abs/2406.08164v1 +http://arxiv.org/abs/2106.04255v4 +http://arxiv.org/abs/2407.18035v1 +http://arxiv.org/abs/2404.02837v1 +http://arxiv.org/abs/2406.10127v1 +http://arxiv.org/abs/2403.20233v3 +http://arxiv.org/abs/2410.05578v1 +http://arxiv.org/abs/2405.18877v1 +http://arxiv.org/abs/2406.07020v1 +http://arxiv.org/abs/2406.12303v1 +http://arxiv.org/abs/2309.16965v3 +http://arxiv.org/abs/2410.07157v1 +http://arxiv.org/abs/2407.12979v1 +http://arxiv.org/abs/2405.16387v1 +http://arxiv.org/abs/2406.03852v1 +http://arxiv.org/abs/2410.10674v1 +http://arxiv.org/abs/2408.15099v2 +http://arxiv.org/abs/2409.00328v1 +http://arxiv.org/abs/2405.20008v1 +http://arxiv.org/abs/2405.16802v3 +http://arxiv.org/abs/2401.17789v2 +http://arxiv.org/abs/2407.04693v1 +http://arxiv.org/abs/2406.05405v2 +http://arxiv.org/abs/2406.01175v2 +http://arxiv.org/abs/2409.00729v2 +http://arxiv.org/abs/2406.03175v1 +http://arxiv.org/abs/2310.00526v7 
+http://arxiv.org/abs/2310.05469v3 +http://arxiv.org/abs/2406.04331v1 +http://arxiv.org/abs/2409.17652v1 +http://arxiv.org/abs/2402.10360v2 +http://arxiv.org/abs/2405.18296v1 +http://arxiv.org/abs/2401.00308v1 +http://arxiv.org/abs/2403.03880v2 +http://arxiv.org/abs/2407.13957v1 +http://arxiv.org/abs/2305.18475v2 +http://arxiv.org/abs/2408.15065v1 +http://arxiv.org/abs/2402.04647v2 +http://arxiv.org/abs/2405.16166v1 +http://arxiv.org/abs/2407.02747v1 +http://arxiv.org/abs/2408.04057v3 +http://arxiv.org/abs/2402.08365v2 +http://arxiv.org/abs/2406.06979v1 +http://arxiv.org/abs/2404.13207v3 +http://arxiv.org/abs/2408.01420v1 +http://arxiv.org/abs/2402.08406v2 +http://arxiv.org/abs/2410.06734v2 +http://arxiv.org/abs/2405.16218v1 +http://arxiv.org/abs/2405.18009v1 +http://arxiv.org/abs/2312.15551v4 +http://arxiv.org/abs/2405.19374v1 +http://arxiv.org/abs/2406.14546v1 +http://arxiv.org/abs/2309.15726v2 +http://arxiv.org/abs/2405.18549v1 +http://arxiv.org/abs/2406.02968v1 +http://arxiv.org/abs/2405.16564v2 +http://arxiv.org/abs/2402.07735v2 +http://arxiv.org/abs/2404.07266v1 +http://arxiv.org/abs/2408.01933v2 +http://arxiv.org/abs/2402.18591v1 +http://arxiv.org/abs/2407.10956v1 +http://arxiv.org/abs/2405.14477v1 +http://arxiv.org/abs/2405.18457v2 +http://arxiv.org/abs/2402.15481v4 +http://arxiv.org/abs/2409.19414v1 +http://arxiv.org/abs/2405.15071v2 +http://arxiv.org/abs/2406.08466v1 +http://arxiv.org/abs/2405.14414v1 +http://arxiv.org/abs/2410.04037v1 +http://arxiv.org/abs/2405.14241v1 +http://arxiv.org/abs/2403.03744v5 +http://arxiv.org/abs/2402.02769v2 +http://arxiv.org/abs/2402.05232v1 +http://arxiv.org/abs/2402.02037v5 +http://arxiv.org/abs/2409.19603v1 +http://arxiv.org/abs/2407.02880v1 +http://arxiv.org/abs/2405.17151v1 +http://arxiv.org/abs/2408.08210v1 +http://arxiv.org/abs/2404.04125v2 +http://arxiv.org/abs/2408.11979v1 +http://arxiv.org/abs/2405.13763v1 +http://arxiv.org/abs/2405.15119v1 +http://arxiv.org/abs/2402.01188v3 +http://arxiv.org/abs/2403.09486v4 
+http://arxiv.org/abs/2405.10934v2 +http://arxiv.org/abs/2405.12203v2 +http://arxiv.org/abs/2305.11567v2 +http://arxiv.org/abs/2405.12940v1 +http://arxiv.org/abs/2402.04383v1 +http://arxiv.org/abs/2409.19872v2 +http://arxiv.org/abs/2406.06106v1 +http://arxiv.org/abs/2405.17638v1 +http://arxiv.org/abs/2405.20763v2 +http://arxiv.org/abs/2406.05954v2 +http://arxiv.org/abs/2409.09359v1 +http://arxiv.org/abs/2404.15417v2 +http://arxiv.org/abs/2406.03923v2 +http://arxiv.org/abs/2312.04727v1 +http://arxiv.org/abs/2402.01000v3 +http://arxiv.org/abs/2406.01801v1 +http://arxiv.org/abs/2407.08680v3 +http://arxiv.org/abs/2311.17491v1 +http://arxiv.org/abs/2403.04082v2 +http://arxiv.org/abs/2406.06523v1 +http://arxiv.org/abs/2405.00332v3 +http://arxiv.org/abs/2312.16045v1 +http://arxiv.org/abs/2401.07080v2 +http://arxiv.org/abs/2405.13762v1 +http://arxiv.org/abs/2407.03204v1 +http://arxiv.org/abs/2406.14426v1 +http://arxiv.org/abs/2405.20540v1 +http://arxiv.org/abs/2410.12454v1 +http://arxiv.org/abs/2405.19581v1 +http://arxiv.org/abs/2401.14657v1 +http://arxiv.org/abs/2408.04526v1 +http://arxiv.org/abs/2311.13745v2 +http://arxiv.org/abs/2405.14768v2 +http://arxiv.org/abs/2407.11855v1 +http://arxiv.org/abs/2310.07707v1 +http://arxiv.org/abs/2404.12715v2 +http://arxiv.org/abs/2407.10827v1 +http://arxiv.org/abs/2406.06367v2 +http://arxiv.org/abs/2403.17329v2 +http://arxiv.org/abs/2410.10308v1 +http://arxiv.org/abs/2406.10324v1 +http://arxiv.org/abs/2402.04857v2 +http://arxiv.org/abs/2403.18079v2 +http://arxiv.org/abs/2403.11637v2 +http://arxiv.org/abs/2406.14544v1 +http://arxiv.org/abs/2409.17963v1 +http://arxiv.org/abs/2410.12713v1 +http://arxiv.org/abs/2402.12365v4 +http://arxiv.org/abs/2403.03333v2 +http://arxiv.org/abs/2405.18686v1 +http://arxiv.org/abs/2410.11449v1 +http://arxiv.org/abs/2406.12625v1 +http://arxiv.org/abs/2402.05234v2 +http://arxiv.org/abs/2403.19863v3 +http://arxiv.org/abs/2406.02366v1 +http://arxiv.org/abs/2406.14852v1 +http://arxiv.org/abs/2406.04339v1 
+http://arxiv.org/abs/2403.07721v7 +http://arxiv.org/abs/2405.15719v1 +http://arxiv.org/abs/2410.10931v1 +http://arxiv.org/abs/2406.15568v2 +http://arxiv.org/abs/2405.05409v3 +http://arxiv.org/abs/2410.05441v1 +http://arxiv.org/abs/2409.07414v1 +http://arxiv.org/abs/2405.14813v1 +http://arxiv.org/abs/2406.02040v1 +http://arxiv.org/abs/2312.09608v2 +http://arxiv.org/abs/2405.12523v2 +http://arxiv.org/abs/2402.15166v2 +http://arxiv.org/abs/2405.19101v1 +http://arxiv.org/abs/2404.07724v1 +http://arxiv.org/abs/2406.13173v3 +http://arxiv.org/abs/2410.03558v3 +http://arxiv.org/abs/2406.00775v1 +http://arxiv.org/abs/2311.08376v2 +http://arxiv.org/abs/2305.16269v3 +http://arxiv.org/abs/2409.16756v2 +http://arxiv.org/abs/2404.15199v2 +http://arxiv.org/abs/2404.13046v1 +http://arxiv.org/abs/2405.16112v2 +http://arxiv.org/abs/2406.07284v2 +http://arxiv.org/abs/2406.09347v1 +http://arxiv.org/abs/2305.03136v4 +http://arxiv.org/abs/2405.14630v1 +http://arxiv.org/abs/2406.10252v2 +http://arxiv.org/abs/2409.15317v1 +http://arxiv.org/abs/2403.09613v1 +http://arxiv.org/abs/2402.18540v1 +http://arxiv.org/abs/2406.04333v1 +http://arxiv.org/abs/2402.14585v1 +http://arxiv.org/abs/2311.18760v2 +http://arxiv.org/abs/2405.19325v2 +http://arxiv.org/abs/2409.19422v2 +http://arxiv.org/abs/2406.10023v1 +http://arxiv.org/abs/2410.13914v3 +http://arxiv.org/abs/2406.08414v2 +http://arxiv.org/abs/2404.16767v3 +http://arxiv.org/abs/2406.10019v1 +http://arxiv.org/abs/2406.07457v2 +http://arxiv.org/abs/2402.05639v2 +http://arxiv.org/abs/2402.02827v3 +http://arxiv.org/abs/2409.19952v2 +http://arxiv.org/abs/2406.08920v2 +http://arxiv.org/abs/2406.09417v1 +http://arxiv.org/abs/2406.09714v1 +http://arxiv.org/abs/2406.04981v1 +http://arxiv.org/abs/2406.09948v1 +http://arxiv.org/abs/2405.17809v2 +http://arxiv.org/abs/2409.00421v1 +http://arxiv.org/abs/2406.14991v2 +http://arxiv.org/abs/2406.19824v2 +http://arxiv.org/abs/2410.10384v1 +http://arxiv.org/abs/2407.09141v1 +http://arxiv.org/abs/2404.14408v3 
+http://arxiv.org/abs/2402.11100v2 +http://arxiv.org/abs/2312.06185v5 +http://arxiv.org/abs/2407.11052v1 +http://arxiv.org/abs/2406.02396v1 +http://arxiv.org/abs/2402.02622v2 +http://arxiv.org/abs/2407.00615v1 +http://arxiv.org/abs/2406.07057v1 +http://arxiv.org/abs/2405.02730v2 +http://arxiv.org/abs/2405.16337v2 +http://arxiv.org/abs/2406.18043v1 +http://arxiv.org/abs/2406.01234v1 +http://arxiv.org/abs/2402.05369v2 +http://arxiv.org/abs/2403.14398v1 +http://arxiv.org/abs/2404.15378v3 +http://arxiv.org/abs/2405.19572v1 +http://arxiv.org/abs/2407.08303v1 +http://arxiv.org/abs/2407.00695v1 +http://arxiv.org/abs/2406.15658v1 +http://arxiv.org/abs/2402.00793v2 +http://arxiv.org/abs/2312.11462v4 +http://arxiv.org/abs/2406.05745v1 +http://arxiv.org/abs/2406.16254v1 +http://arxiv.org/abs/2410.07166v1 +http://arxiv.org/abs/2410.04968v1 +http://arxiv.org/abs/2409.19620v2 +http://arxiv.org/abs/2310.07136v3 +http://arxiv.org/abs/2406.13909v1 +http://arxiv.org/abs/2403.02187v3 +http://arxiv.org/abs/2310.18162v2 +http://arxiv.org/abs/2405.14256v1 +http://arxiv.org/abs/2406.02900v1 +http://arxiv.org/abs/2405.18512v1 +http://arxiv.org/abs/2406.06959v1 +http://arxiv.org/abs/2406.10209v1 +http://arxiv.org/abs/2402.18815v2 +http://arxiv.org/abs/2402.06255v3 +http://arxiv.org/abs/2405.14302v1 +http://arxiv.org/abs/2405.18711v1 +http://arxiv.org/abs/2404.09494v4 +http://arxiv.org/abs/2406.13928v1 +http://arxiv.org/abs/2408.07941v1 +http://arxiv.org/abs/2410.07707v1 +http://arxiv.org/abs/2208.12764v3 +http://arxiv.org/abs/2402.19456v1 +http://arxiv.org/abs/2402.12550v4 +http://arxiv.org/abs/2409.19912v3 +http://arxiv.org/abs/2406.14219v1 +http://arxiv.org/abs/2410.14980v2 +http://arxiv.org/abs/2407.11910v1 +http://arxiv.org/abs/2406.13488v1 +http://arxiv.org/abs/2408.09227v1 +http://arxiv.org/abs/2406.04090v1 +http://arxiv.org/abs/2405.15586v1 +http://arxiv.org/abs/2409.17874v1 +http://arxiv.org/abs/2305.01377v3 +http://arxiv.org/abs/2405.05968v2 +http://arxiv.org/abs/2402.08576v4 
+http://arxiv.org/abs/2410.15556v1 +http://arxiv.org/abs/2406.08316v2 +http://arxiv.org/abs/2402.10376v1 +http://arxiv.org/abs/2405.18781v1 +http://arxiv.org/abs/2406.01765v1 +http://arxiv.org/abs/2409.19472v2 +http://arxiv.org/abs/2404.12940v2 +http://arxiv.org/abs/2406.11672v2 +http://arxiv.org/abs/2407.13977v1 +http://arxiv.org/abs/2405.20331v1 +http://arxiv.org/abs/2406.08377v3 +http://arxiv.org/abs/2406.09405v1 +http://arxiv.org/abs/2405.16605v1 +http://arxiv.org/abs/2402.08758v1 +http://arxiv.org/abs/2405.13899v1 +http://arxiv.org/abs/2407.06183v1 +http://arxiv.org/abs/2403.14370v3 +http://arxiv.org/abs/2404.19733v3 +http://arxiv.org/abs/2407.11522v1 +http://arxiv.org/abs/2408.16221v3 +http://arxiv.org/abs/2407.07457v3 +http://arxiv.org/abs/2406.04280v1 +http://arxiv.org/abs/2406.04056v2 +http://arxiv.org/abs/2410.13027v1 +http://arxiv.org/abs/2405.15118v2 +http://arxiv.org/abs/2406.02749v2 +http://arxiv.org/abs/2312.07104v2 +http://arxiv.org/abs/2406.00529v1 +http://arxiv.org/abs/2402.16346v3 +http://arxiv.org/abs/2406.08938v1 +http://arxiv.org/abs/2304.09875v2 +http://arxiv.org/abs/2406.07524v1 +http://arxiv.org/abs/2406.10368v1 +http://arxiv.org/abs/2303.09205v3 +http://arxiv.org/abs/2202.04294v2 +http://arxiv.org/abs/2402.07588v3 +http://arxiv.org/abs/2410.02396v1 +http://arxiv.org/abs/2406.06576v4 +http://arxiv.org/abs/2402.08586v1 +http://arxiv.org/abs/2406.11838v2 +http://arxiv.org/abs/2410.16707v1 +http://arxiv.org/abs/2405.17745v1 +http://arxiv.org/abs/2406.00681v1 +http://arxiv.org/abs/2405.12205v1 +http://arxiv.org/abs/2407.00983v2 +http://arxiv.org/abs/2210.14988v2 +http://arxiv.org/abs/2407.05330v1 +http://arxiv.org/abs/2406.06494v1 +http://arxiv.org/abs/2405.16405v1 +http://arxiv.org/abs/2405.19231v1 +http://arxiv.org/abs/2406.10391v1 +http://arxiv.org/abs/2406.09630v1 +http://arxiv.org/abs/2406.09215v2 +http://arxiv.org/abs/2404.08476v1 +http://arxiv.org/abs/2406.09368v1 +http://arxiv.org/abs/2405.15891v3 +http://arxiv.org/abs/2408.16653v1 
+http://arxiv.org/abs/2402.02030v2 +http://arxiv.org/abs/2404.13752v2 +http://arxiv.org/abs/2408.12489v1 +http://arxiv.org/abs/2406.08398v1 +http://arxiv.org/abs/2406.04325v1 +http://arxiv.org/abs/2410.03936v2 +http://arxiv.org/abs/2405.18489v1 +http://arxiv.org/abs/2405.20320v2 +http://arxiv.org/abs/2308.12970v2 +http://arxiv.org/abs/2401.17263v4 +http://arxiv.org/abs/2406.03949v1 +http://arxiv.org/abs/2406.20087v1 +http://arxiv.org/abs/2405.15583v1 +http://arxiv.org/abs/2406.01326v2 +http://arxiv.org/abs/2401.15866v1 +http://arxiv.org/abs/2405.15894v1 +http://arxiv.org/abs/2405.19509v1 +http://arxiv.org/abs/2405.15383v1 +http://arxiv.org/abs/2409.19407v1 +http://arxiv.org/abs/2409.18946v1 +http://arxiv.org/abs/2406.12952v1 +http://arxiv.org/abs/2406.02258v2 +http://arxiv.org/abs/2306.02071v2 +http://arxiv.org/abs/2405.20838v1 +http://arxiv.org/abs/2407.02240v1 +http://arxiv.org/abs/2407.00077v2 +http://arxiv.org/abs/2405.20231v3 +http://arxiv.org/abs/2406.12723v3 +http://arxiv.org/abs/2406.09291v3 +http://arxiv.org/abs/2405.17462v3 +http://arxiv.org/abs/2407.11004v1 +http://arxiv.org/abs/2410.11208v1 +http://arxiv.org/abs/2403.03493v1 +http://arxiv.org/abs/2402.02370v3 +http://arxiv.org/abs/2405.15767v2 +http://arxiv.org/abs/2406.14183v2 +http://arxiv.org/abs/2407.13168v1 +http://arxiv.org/abs/2406.01478v1 +http://arxiv.org/abs/2403.05327v3 +http://arxiv.org/abs/2409.20222v2 +http://arxiv.org/abs/2302.09160v2 +http://arxiv.org/abs/2405.14392v1 +http://arxiv.org/abs/2402.09712v2 +http://arxiv.org/abs/2404.15269v2 +http://arxiv.org/abs/2306.07951v3 +http://arxiv.org/abs/2312.06071v3 +http://arxiv.org/abs/2405.18750v2 +http://arxiv.org/abs/2407.16364v2 +http://arxiv.org/abs/2405.20971v1 +http://arxiv.org/abs/2312.02224v2 +http://arxiv.org/abs/2406.11831v2 +http://arxiv.org/abs/2405.20824v1 +http://arxiv.org/abs/2409.18158v1 +http://arxiv.org/abs/2405.21074v1 +http://arxiv.org/abs/2409.19659v1 +http://arxiv.org/abs/2402.04033v3 +http://arxiv.org/abs/2407.06115v1 
+http://arxiv.org/abs/2406.15479v2 +http://arxiv.org/abs/2404.04375v1 +http://arxiv.org/abs/2409.18153v1 +http://arxiv.org/abs/2408.15237v1 +http://arxiv.org/abs/2405.12094v1 +http://arxiv.org/abs/2409.18778v1 +http://arxiv.org/abs/2406.02269v1 +http://arxiv.org/abs/2405.16700v2 +http://arxiv.org/abs/2407.14653v1 +http://arxiv.org/abs/2409.18735v1 +http://arxiv.org/abs/2408.17394v1 +http://arxiv.org/abs/2405.18075v1 +http://arxiv.org/abs/2408.00549v1 +http://arxiv.org/abs/2405.15020v1 +http://arxiv.org/abs/2309.00976v4 +http://arxiv.org/abs/2402.01469v1 +http://arxiv.org/abs/2410.13046v1 +http://arxiv.org/abs/2410.06621v1 +http://arxiv.org/abs/2410.03918v1 +http://arxiv.org/abs/2406.06007v1 +http://arxiv.org/abs/2312.00923v2 +http://arxiv.org/abs/2406.04845v1 +http://arxiv.org/abs/2408.02927v1 +http://arxiv.org/abs/2407.12034v1 +http://arxiv.org/abs/2403.14156v1 +http://arxiv.org/abs/2405.16663v2 +http://arxiv.org/abs/2406.06398v1 +http://arxiv.org/abs/2311.16671v1 +http://arxiv.org/abs/2408.03572v1 +http://arxiv.org/abs/2405.20693v1 +http://arxiv.org/abs/2410.12269v1 +http://arxiv.org/abs/2410.06675v1 +http://arxiv.org/abs/2405.10301v2 +http://arxiv.org/abs/2405.19296v1 +http://arxiv.org/abs/2408.08274v2 +http://arxiv.org/abs/2402.13728v5 +http://arxiv.org/abs/2404.05595v2 +http://arxiv.org/abs/2405.20279v2 +http://arxiv.org/abs/2409.20460v1 +http://arxiv.org/abs/2410.15526v1 +http://arxiv.org/abs/2405.17277v1 +http://arxiv.org/abs/2405.08027v4 +http://arxiv.org/abs/2401.10119v3 +http://arxiv.org/abs/2404.03592v3 +http://arxiv.org/abs/2405.15732v1 +http://arxiv.org/abs/2407.09024v1 +http://arxiv.org/abs/2410.13045v1 +http://arxiv.org/abs/2405.17399v1 +http://arxiv.org/abs/2410.08087v1 +http://arxiv.org/abs/2409.19680v1 +http://arxiv.org/abs/2405.14469v2 +http://arxiv.org/abs/2405.19806v1 +http://arxiv.org/abs/2406.10216v2 +http://arxiv.org/abs/2405.05343v1 +http://arxiv.org/abs/2409.17840v1 +http://arxiv.org/abs/2211.06934v1 +http://arxiv.org/abs/2410.14091v1 
+http://arxiv.org/abs/2405.12399v1 +http://arxiv.org/abs/2308.02261v2 +http://arxiv.org/abs/2407.01032v2 +http://arxiv.org/abs/2406.11794v3 +http://arxiv.org/abs/2406.16148v1 +http://arxiv.org/abs/2406.06452v1 +http://arxiv.org/abs/2407.14332v1 +http://arxiv.org/abs/2406.01579v2 +http://arxiv.org/abs/2406.03619v1 +http://arxiv.org/abs/2405.10302v2 +http://arxiv.org/abs/2405.20291v1 +http://arxiv.org/abs/2406.07277v1 +http://arxiv.org/abs/2410.10356v1 +http://arxiv.org/abs/2405.14867v2 +http://arxiv.org/abs/2405.16876v2 +http://arxiv.org/abs/2407.16975v1 +http://arxiv.org/abs/2402.02774v1 +http://arxiv.org/abs/2407.20060v1 +http://arxiv.org/abs/2406.01583v2 +http://arxiv.org/abs/2410.04555v1 +http://arxiv.org/abs/2405.18886v1 +http://arxiv.org/abs/2310.07446v5 +http://arxiv.org/abs/2406.09459v1 +http://arxiv.org/abs/2403.01946v2 +http://arxiv.org/abs/2405.05665v1 +http://arxiv.org/abs/2406.07599v2 +http://arxiv.org/abs/2406.16745v1 +http://arxiv.org/abs/2405.14758v1 +http://arxiv.org/abs/2405.15393v1 +http://arxiv.org/abs/2311.14900v4 +http://arxiv.org/abs/2406.09563v1 +http://arxiv.org/abs/2406.06671v1 +http://arxiv.org/abs/2406.12036v4 +http://arxiv.org/abs/2405.14839v1 +http://arxiv.org/abs/2402.16788v4 +http://arxiv.org/abs/2312.04693v2 +http://arxiv.org/abs/2405.19705v1 +http://arxiv.org/abs/2407.02279v1 +http://arxiv.org/abs/2311.10459v3 +http://arxiv.org/abs/2406.14629v1 +http://arxiv.org/abs/2406.12356v2 +http://arxiv.org/abs/2406.09358v2 +http://arxiv.org/abs/2410.16432v1 +http://arxiv.org/abs/2410.09909v1 +http://arxiv.org/abs/2409.19044v1 +http://arxiv.org/abs/2404.01299v2 +http://arxiv.org/abs/2310.14129v1 +http://arxiv.org/abs/2402.13359v1 +http://arxiv.org/abs/2405.19266v2 +http://arxiv.org/abs/2410.11722v2 +http://arxiv.org/abs/2410.03813v1 +http://arxiv.org/abs/2408.14339v1 +http://arxiv.org/abs/2406.09279v2 +http://arxiv.org/abs/2402.11821v2 +http://arxiv.org/abs/2406.00324v1 +http://arxiv.org/abs/2406.03334v1 +http://arxiv.org/abs/2406.10215v1 
+http://arxiv.org/abs/2311.16054v3 +http://arxiv.org/abs/2406.10227v1 +http://arxiv.org/abs/2405.17164v2 +http://arxiv.org/abs/2405.14728v1 +http://arxiv.org/abs/2405.04669v1 +http://arxiv.org/abs/2406.08506v1 +http://arxiv.org/abs/2407.04622v2 +http://arxiv.org/abs/2407.05516v1 +http://arxiv.org/abs/2407.15792v1 +http://arxiv.org/abs/2405.14903v1 +http://arxiv.org/abs/2405.13994v1 +http://arxiv.org/abs/2405.20860v1 +http://arxiv.org/abs/2404.04931v2 +http://arxiv.org/abs/2406.07550v1 +http://arxiv.org/abs/2407.09522v1 +http://arxiv.org/abs/2405.14606v3 +http://arxiv.org/abs/2406.04843v1 +http://arxiv.org/abs/2405.15943v1 +http://arxiv.org/abs/2406.12592v1 +http://arxiv.org/abs/2407.13623v2 +http://arxiv.org/abs/2405.21070v2 +http://arxiv.org/abs/2405.17922v1 +http://arxiv.org/abs/2407.00002v2 +http://arxiv.org/abs/2405.11473v3 +http://arxiv.org/abs/2405.17700v1 +http://arxiv.org/abs/2306.05726v2 +http://arxiv.org/abs/2406.17414v1 +http://arxiv.org/abs/2410.04013v1 +http://arxiv.org/abs/2405.21064v1 +http://arxiv.org/abs/2311.14934v1 +http://arxiv.org/abs/2409.05539v1 +http://arxiv.org/abs/2407.15007v1 +http://arxiv.org/abs/2406.15916v1 +http://arxiv.org/abs/2405.04776v2 +http://arxiv.org/abs/2407.11619v1 +http://arxiv.org/abs/2409.17372v1 +http://arxiv.org/abs/2410.15926v1 +http://arxiv.org/abs/2405.03548v4 +http://arxiv.org/abs/2407.18232v1 +http://arxiv.org/abs/2406.12616v2 +http://arxiv.org/abs/2406.13175v1 +http://arxiv.org/abs/2407.01171v1 +http://arxiv.org/abs/2404.16022v1 +http://arxiv.org/abs/2409.04095v1 +http://arxiv.org/abs/2406.05061v2 +http://arxiv.org/abs/2405.15599v1 +http://arxiv.org/abs/2405.15673v1 +http://arxiv.org/abs/2402.07240v3 +http://arxiv.org/abs/2405.15589v2 +http://arxiv.org/abs/2405.17311v2 +http://arxiv.org/abs/2406.09324v2 +http://arxiv.org/abs/2406.01486v1 +http://arxiv.org/abs/2409.17500v1 +http://arxiv.org/abs/2403.12181v1 +http://arxiv.org/abs/2404.13733v3 +http://arxiv.org/abs/2303.07988v3 +http://arxiv.org/abs/2405.16009v1 
+http://arxiv.org/abs/2402.17747v3 +http://arxiv.org/abs/2405.12489v4 +http://arxiv.org/abs/2405.14205v2 +http://arxiv.org/abs/2405.18979v2 +http://arxiv.org/abs/2404.11049v3 +http://arxiv.org/abs/2405.16907v3 +http://arxiv.org/abs/2408.10075v1 +http://arxiv.org/abs/2407.00623v1 +http://arxiv.org/abs/2402.10260v2 +http://arxiv.org/abs/2406.07904v1 +http://arxiv.org/abs/2410.02430v1 +http://arxiv.org/abs/2406.18151v2 +http://arxiv.org/abs/2406.15708v1 +http://arxiv.org/abs/2406.01345v1 +http://arxiv.org/abs/2407.06494v3 +http://arxiv.org/abs/2311.00371v1 +http://arxiv.org/abs/2401.04486v2 +http://arxiv.org/abs/2405.12601v1 +http://arxiv.org/abs/2405.12952v1 +http://arxiv.org/abs/2409.09566v1 +http://arxiv.org/abs/2402.03982v1 +http://arxiv.org/abs/2406.15669v1 +http://arxiv.org/abs/2406.02329v1 +http://arxiv.org/abs/2406.09068v2 +http://arxiv.org/abs/2408.08305v1 +http://arxiv.org/abs/2406.19626v1 +http://arxiv.org/abs/2305.11512v2 +http://arxiv.org/abs/2406.11161v1 +http://arxiv.org/abs/2406.04759v1 +http://arxiv.org/abs/2409.11697v1 +http://arxiv.org/abs/2003.02214v3 +http://arxiv.org/abs/2403.16552v2 +http://arxiv.org/abs/2405.18199v1 +http://arxiv.org/abs/2408.05839v2 +http://arxiv.org/abs/2406.01577v1 +http://arxiv.org/abs/2402.04376v2 +http://arxiv.org/abs/2409.18017v1 +http://arxiv.org/abs/2402.06025v6 +http://arxiv.org/abs/2405.15706v2 +http://arxiv.org/abs/2405.18968v1 +http://arxiv.org/abs/2405.18784v1 +http://arxiv.org/abs/2402.03086v2 +http://arxiv.org/abs/2406.06911v3 +http://arxiv.org/abs/2406.17341v1 +http://arxiv.org/abs/2405.19073v1 +http://arxiv.org/abs/2405.14558v1 +http://arxiv.org/abs/2402.07099v2 +http://arxiv.org/abs/2405.20413v1 +http://arxiv.org/abs/2402.03564v1 +http://arxiv.org/abs/2406.10485v1 +http://arxiv.org/abs/2405.13997v2 +http://arxiv.org/abs/2406.00380v2 +http://arxiv.org/abs/2406.17433v1 +http://arxiv.org/abs/2410.05601v1 +http://arxiv.org/abs/2311.07633v4 +http://arxiv.org/abs/2405.13863v1 +http://arxiv.org/abs/2407.07462v1 
+http://arxiv.org/abs/2311.03079v2 +http://arxiv.org/abs/2405.15223v2 +http://arxiv.org/abs/2407.16837v1 +http://arxiv.org/abs/2402.03545v1 +http://arxiv.org/abs/2406.05862v2 +http://arxiv.org/abs/2406.01721v2 +http://arxiv.org/abs/2304.07472v2 +http://arxiv.org/abs/2406.09403v2 +http://arxiv.org/abs/2405.16012v2 +http://arxiv.org/abs/2410.14388v1 +http://arxiv.org/abs/2401.06091v3 diff --git a/research_bench/nv_api.py b/research_bench/nv_api.py new file mode 100644 index 00000000..1d4a2b6a --- /dev/null +++ b/research_bench/nv_api.py @@ -0,0 +1,19 @@ +from openai import OpenAI + +client = OpenAI( + base_url = "https://integrate.api.nvidia.com/v1", + api_key = "nvapi-REDACTED"  # SECURITY(review): a live nvapi-* key was committed here; it is leaked and MUST be rotated. Load it from the environment instead, e.g. os.environ["NVIDIA_API_KEY"]. +) + +completion = client.chat.completions.create( + model="nvidia/nv-embed-v1", + messages=[{"role":"user","content":"Write a limerick about the wonders of GPU computing."}], + temperature=0.2, + top_p=0.7, + max_tokens=1024, + stream=True +) + +for chunk in completion: + if chunk.choices[0].delta.content is not None: + print(chunk.choices[0].delta.content, end="") \ No newline at end of file diff --git a/research_bench/oodbench/oodbench_1202.json b/research_bench/oodbench/oodbench_1202.json new file mode 100644 index 00000000..1dd4b3dd --- /dev/null +++ b/research_bench/oodbench/oodbench_1202.json @@ -0,0 +1,5463 @@ +{ + "1406.2661": { + "paper_data": { + "title": "Generative Adversarial Networks", + "url": "http://arxiv.org/abs/1406.2661v1", + "arxiv_id": "1406.2661", + "authors": [ + "Ian J. 
Goodfellow", + "Jean Pouget-Abadie", + "Mehdi Mirza", + "Bing Xu", + "David Warde-Farley", + "Sherjil Ozair", + "Aaron Courville", + "Yoshua Bengio" + ], + "abstract": "We propose a new framework for estimating generative models via an adversarial process, in which we simultaneously train two models: a generative model G that captures the data distribution, and a discriminative model D that estimates the probability that a sample came from the training data rather than G. The training procedure for G is to maximize the probability of D making a mistake. This framework corresponds to a minimax two-player game. In the space of arbitrary functions G and D, a unique solution exists, with G recovering the training data distribution and D equal to 1/2 everywhere. In the case where G and D are defined by multilayer perceptrons, the entire system can be trained with backpropagation. There is no need for any Markov chains or unrolled approximate inference networks during either training or generation of samples. Experiments demonstrate the potential of the framework through qualitative and quantitative evaluation of the generated samples.", + "introduction": " Introduction The promise of deep learning is to discover rich, hierarchical models [2] that represent probability distributions over the kinds of data encountered in artificial intelligence applications, such as natural images, audio waveforms containing speech, and symbols in natural language corpora. So far, the most striking successes in deep learning have involved discriminative models, usually those that map a high-dimensional, rich sensory input to a class label [14, 22]. These striking successes have primarily been based on the backpropagation and dropout algorithms, using piecewise linear units [19, 9, 10] which have a particularly well-behaved gradient . 
Deep generative models have had less of an impact, due to the difficulty of approximating many intractable probabilistic computations that arise in maximum likelihood estimation and related strategies, and due to difficulty of leveraging the benefits of piecewise linear units in the generative context. We propose a new generative model estimation procedure that sidesteps these difficulties.1 In the proposed adversarial nets framework, the generative model is pitted against an adversary: a discriminative model that learns to determine whether a sample is from the model distribution or the data distribution. The generative model can be thought of as analogous to a team of counterfeiters, trying to produce fake currency and use it without detection, while the discriminative model is analogous to the police, trying to detect the counterfeit currency. Competition in this game drives both teams to improve their methods for coordinating GandDor determining better distributions to sample zfrom during training. This paper has demonstrated the viability of the adversarial modeling framework, suggesting that these research directions could prove useful. Related work An alternative to directed graphical models with latent variables are undirected graphical models with latent variables, such as restricted Boltzmann machines (RBMs) [27, 16], deep Boltzmann machines (DBMs) [26] and their numerous variants. The interactions within such models are represented as the product of unnormalized potential functions, normalized by a global summa- tion/integration over all states of the random variables. This quantity (the partition function ) and its gradient are intractable for all but the most trivial instances, although they can be estimated by Markov chain Monte Carlo (MCMC) results of this section are done in a non- parametric setting, e.g. we represent a model with infinite capacity by studying convergence in the space of probability density functions. 
We will show in section 4.1 that this minimax game has a global optimum for pg=pdata. We will then show in section 4.2 that Algorithm 1 optimizes Eq 1, thus obtaining the desired result. 3Algorithm 1 Minibatch stochastic gradient descent training of generative adversarial nets. The number of steps to apply to the discriminator, k, is a hyperparameter. We used k= 1, the least expensive option, in our Results are reported in Table 1. This method of estimating the likelihood has somewhat high variance and does not perform well in high dimensional spaces but it is the best method available to our knowledge. Advances in generative models that can sample but not estimate likelihood directly motivate further research into how to evaluate such models. In Figures 2 and 3 we show samples drawn from the generator net after training. While we make no claim that these samples are better than samples generated by existing experiments. 4.1 Global Optimality of pg=pdata We first consider the optimal discriminator", + "references": [ + { + "title": "Neural Variational Inference and Learning in Belief Networks", + "abstract": "Highly expressive directed latent variable models, such as sigmoid belief networks, are difficult to train on large datasets because exact inference in them is intractable and none of the approximate inference methods that have been applied to them scale well. We propose a fast non-iterative approximate inference method that uses a feedforward network to implement efficient exact sampling from the variational posterior. The model and this inference network are trained jointly by maximizing a variational lower bound on the log-likelihood. Although the naive estimator of the inference network gradient is too high-variance to be useful, we make it practical by applying several straightforward model-independent variance reduction techniques. 
Applying our approach to training sigmoid belief networks and deep autoregressive networks, we show that it outperforms the wake-sleep algorithm on MNIST and achieves state-of-the-art results on the Reuters RCV1 document dataset." + }, + { + "title": "Stochastic Backpropagation and Approximate Inference in Deep Generative Models", + "abstract": "We marry ideas from deep neural networks and approximate Bayesian inference to derive a generalised class of deep, directed generative models, endowed with a new algorithm for scalable inference and learning. Our algorithm introduces a recognition model to represent approximate posterior distributions, and that acts as a stochastic encoder of the data. We develop stochastic back-propagation -- rules for back-propagation through stochastic variables -- and use this to develop an algorithm that allows for joint optimisation of the parameters of both the generative and recognition model. We demonstrate on several real-world data sets that the model generates realistic samples, provides accurate imputations of missing data and is a useful tool for high-dimensional data visualisation." + }, + { + "title": "Intriguing properties of neural networks", + "abstract": "Deep neural networks are highly expressive models that have recently achieved state of the art performance on speech and visual recognition tasks. While their expressiveness is the reason they succeed, it also causes them to learn uninterpretable solutions that could have counter-intuitive properties. In this paper we report two such properties. \nFirst, we find that there is no distinction between individual high level units and random linear combinations of high level units, according to various methods of unit analysis. It suggests that it is the space, rather than the individual units, that contains of the semantic information in the high layers of neural networks. 
\nSecond, we find that deep neural networks learn input-output mappings that are fairly discontinuous to a significant extend. We can cause the network to misclassify an image by applying a certain imperceptible perturbation, which is found by maximizing the network's prediction error. In addition, the specific nature of these perturbations is not a random artifact of learning: the same perturbation can cause a different network, that was trained on a different subset of the dataset, to misclassify the same input." + }, + { + "title": "Auto-Encoding Variational Bayes", + "abstract": "Abstract: How can we perform efficient inference and learning in directed probabilistic models, in the presence of continuous latent variables with intractable posterior distributions, and large datasets? We introduce a stochastic variational inference and learning algorithm that scales to large datasets and, under some mild differentiability conditions, even works in the intractable case. Our contributions is two-fold. First, we show that a reparameterization of the variational lower bound yields a lower bound estimator that can be straightforwardly optimized using standard stochastic gradient methods. Second, we show that for i.i.d. datasets with continuous latent variables per datapoint, posterior inference can be made especially efficient by fitting an approximate inference model (also called a recognition model) to the intractable posterior using the proposed lower bound estimator. Theoretical advantages are reflected in experimental results." + }, + { + "title": "Multi-Prediction Deep Boltzmann Machines", + "abstract": "We introduce the multi-prediction deep Boltzmann machine (MP-DBM). The MP-DBM can be seen as a single probabilistic model trained to maximize a variational approximation to the generalized pseudolikelihood, or as a family of recurrent nets that share parameters and approximately solve different inference problems. 
Prior methods of training DBMs either do not perform well on classification tasks or require an initial learning pass that trains the DBM greedily, one layer at a time. The MP-DBM does not require greedy layerwise pretraining, and outperforms the standard DBM at classification, classification with missing inputs, and mean field prediction tasks.1" + }, + { + "title": "Deep AutoRegressive Networks", + "abstract": "We introduce a deep, generative autoencoder capable of learning hierarchies of distributed representations from data. Successive deep stochastic hidden layers are equipped with autoregressive connections, which enable the model to be sampled from quickly and exactly via ancestral sampling. We derive an efficient approximate parameter estimation method based on the minimum description length (MDL) principle, which can be seen as maximising a variational lower bound on the log-likelihood, with a feedforward neural network implementing approximate inference. We demonstrate state-of-the-art generative performance on a number of classic data sets, including several UCI data sets, MNIST and Atari 2600 games." + }, + { + "title": "Pylearn2: a machine learning research library", + "abstract": "Pylearn2 is a machine learning research library. This does not just mean that it is a collection of machine learning algorithms that share a common API; it means that it has been designed for flexibility and extensibility in order to facilitate research projects that involve new or unusual use cases. In this paper we give a brief history of the library, an overview of its basic philosophy, a summary of the library's architecture, and a description of how the Pylearn2 community functions socially." + }, + { + "title": "Deep Generative Stochastic Networks Trainable by Backprop", + "abstract": "We introduce a novel training principle for probabilistic models that is an alternative to maximum likelihood. 
The proposed Generative Stochastic Networks (GSN) framework is based on learning the transition operator of a Markov chain whose stationary distribution estimates the data distribution. The transition distribution of the Markov chain is conditional on the previous state, generally involving a small move, so this conditional distribution has fewer dominant modes, being unimodal in the limit of small moves. Thus, it is easier to learn because it is easier to approximate its partition function, more like learning to perform supervised function approximation, with gradients that can be obtained by backprop. We provide theorems that generalize recent work on the probabilistic interpretation of denoising autoencoders and obtain along the way an interesting justification for dependency networks and generalized pseudolikelihood, along with a definition of an appropriate joint distribution and sampling mechanism even when the conditionals are not consistent. GSNs can be used with missing inputs and can be used to sample subsets of variables given the rest. We validate these theoretical results with experiments on two image datasets using an architecture that mimics the Deep Boltzmann Machine Gibbs sampler but allows training to proceed with simple backprop, without the need for layerwise pretraining." + }, + { + "title": "Generalized Denoising Auto-Encoders as Generative Models", + "abstract": "Recent work has shown how denoising and contractive autoencoders implicitly capture the structure of the data-generating density, in the case where the corruption noise is Gaussian, the reconstruction error is the squared error, and the data is continuous-valued. This has led to various proposals for sampling from this implicitly learned density function, using Langevin and Metropolis-Hastings MCMC. 
However, it remained unclear how to connect the training procedure of regularized auto-encoders to the implicit estimation of the underlying data-generating distribution when the data are discrete, or using other forms of corruption process and reconstruction errors. Another issue is the mathematical justification which is only valid in the limit of small corruption noise. We propose here a different attack on the problem, which deals with all these issues: arbitrary (but noisy enough) corruption, arbitrary reconstruction loss (seen as a log-likelihood), handling both discrete and continuous-valued variables, and removing the bias due to non-infinitesimal corruption noise (or non-infinitesimal contractive penalty)." + }, + { + "title": "Maxout Networks", + "abstract": "We consider the problem of designing models to leverage a recently introduced approximate model averaging technique called dropout. We define a simple new model called maxout (so named because its output is the max of a set of inputs, and because it is a natural companion to dropout) designed to both facilitate optimization by dropout and improve the accuracy of dropout's fast approximate model averaging technique. We empirically verify that the model successfully accomplishes both of these tasks. We use maxout and dropout to demonstrate state of the art classification performance on four benchmark datasets: MNIST, CIFAR-10, CIFAR-100, and SVHN." + }, + { + "title": "ImageNet classification with deep convolutional neural networks", + "abstract": "We trained a large, deep convolutional neural network to classify the 1.2 million high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 37.5% and 17.0%, respectively, which is considerably better than the previous state-of-the-art. 
The neural network, which has 60 million parameters and 650,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and three fully connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU implementation of the convolution operation. To reduce overfitting in the fully connected layers we employed a recently developed regularization method called \"dropout\" that proved to be very effective. We also entered a variant of this model in the ILSVRC-2012 competition and achieved a winning top-5 test error rate of 15.3%, compared to 26.2% achieved by the second-best entry." + }, + { + "title": "Theano: new features and speed improvements", + "abstract": "Theano is a linear algebra compiler that optimizes a user's symbolically-specified mathematical computations to produce efficient low-level implementations. In this paper, we present new features and efficiency improvements to Theano, and benchmarks demonstrating Theano's performance relative to Torch7, a recently introduced machine learning library, and to RNNLM, a C++ library targeted at recurrent neural networks." + }, + { + "title": "Deep Neural Networks for Acoustic Modeling in Speech Recognition", + "abstract": "Most current speech recognition systems use hidden Markov models ( HMMs) to deal with the temporal variability of speech and Gaussian mixture models to determine how well each state of each HMM fits a frame or a short window of frames of coefficients that represents the acoustic input. An alternati ve way to evaluate the fit is to use a feedforward neural network that takes several frames of coefficients a s input and produces posterior probabilities over HMM states as output. Deep neural networks with many hidden layers, that are trained using new methods have been shown to outperform Gaussian mixture models on a variety of speech rec ognition benchmarks, sometimes by a large margin. 
This paper provides an overview of this progress and repres nts the shared views of four research groups who have had recent successes in using deep neural networks for a coustic modeling in speech recognition." + }, + { + "title": "Better Mixing via Deep Representations", + "abstract": "It has been hypothesized, and supported with experimental evidence, that deeper representations, when well trained, tend to do a better job at disentangling the underlying factors of variation. We study the following related conjecture: better representations, in the sense of better disentangling, can be exploited to produce Markov chains that mix faster between modes. Consequently, mixing between modes would be more efficient at higher levels of representation. To better understand this, we propose a secondary conjecture: the higher-level samples fill more uniformly the space they occupy and the high-density manifolds tend to unfold when represented at higher levels. The paper discusses these hypotheses and tests them experimentally through visualization and measurements of mixing between modes and interpolating between samples." + }, + { + "title": "Improving neural networks by preventing co-adaptation of feature detectors", + "abstract": "When a large feedforward neural network is trained on a small training set, it typically performs poorly on held-out test data. This \"overfitting\" is greatly reduced by randomly omitting half of the feature detectors on each training case. This prevents complex co-adaptations in which a feature detector is only helpful in the context of several other specific feature detectors. Instead, each neuron learns to detect a feature that is generally helpful for producing the correct answer given the combinatorially large variety of internal contexts in which it must operate. Random \"dropout\" gives big improvements on many benchmark tasks and sets new records for speech and object recognition." 
+ }, + { + "title": "A Generative Process for Contractive Auto-Encoders", + "abstract": "The contractive auto-encoder learns a representation of the input data that captures the local manifold structure around each data point, through the leading singular vectors of the Jacobian of the transformation from input to representation. The corresponding singular values specify how much local variation is plausible in directions associated with the corresponding singular vectors, while remaining in a high-density region of the input space. This paper proposes a procedure for generating samples that are consistent with the local structure captured by a contractive auto-encoder. The associated stochastic process defines a distribution from which one can sample, and which experimentally appears to converge quickly and mix well between modes, compared to Restricted Boltzmann Machines and Deep Belief Networks. The intuitions behind this procedure can also be used to train the second layer of contraction that pools lower-level features and learns to be invariant to the local directions of variation discovered in the first layer. We show that this can help learn and represent invariances present in the data and improve classification error." + }, + { + "title": "Quickly Generating Representative Samples from an RBM-Derived Process", + "abstract": "Two recently proposed learning algorithms, herding and fast persistent contrastive divergence (FPCD), share the following interesting characteristic: they exploit changes in the model parameters while sampling in order to escape modes and mix better during the sampling process that is part of the learning algorithm. We justify such approaches as ways to escape modes while keeping approximately the same asymptotic distribution of the Markov chain. In that spirit, we extend FPCD using an idea borrowed from Herding in order to obtain a pure sampling algorithm, which we call the rates-FPCD sampler. 
Interestingly, this sampler can improve the model as we collect more samples, since it optimizes a lower bound on the log likelihood of the training data. We provide empirical evidence that this new algorithm displays substantially better and more robust mixing than Gibbs sampling." + }, + { + "title": "Deep Sparse Rectifier Neural Networks", + "abstract": "While logistic sigmoid neurons are more biologically plausible than hyperbolic tangent neurons, the latter work better for training multi-layer neural networks. This paper shows that rectifying neurons are an even better model of biological neurons and yield equal or better performance than hyperbolic tangent networks in spite of the hard non-linearity and non-dierentiabil ity" + }, + { + "title": "Noise-contrastive estimation: A new estimation principle for unnormalized statistical models", + "abstract": "We present a new estimation principle for parameterized statistical models. The idea is to perform nonlinear logistic regression to discriminate between the observed data and some artificially generated noise, using the model log-density function in the regression nonlinearity. We show that this leads to a consistent (convergent) estimator of the parameters, and analyze the asymptotic variance. In particular, the method is shown to directly work for unnormalized models, i.e. models where the density function does not integrate to one. The normalization constant can be estimated just like any other parameter. For a tractable ICA model, we compare the method with other estimation methods that can be used to learn unnormalized models, including score matching, contrastive divergence, and maximum-likelihood where the normalization constant is estimated with importance sampling. Simulations show that noise-contrastive estimation offers the best trade-off between computational and statistical efficiency. 
The method is then applied to the modeling of natural images: We show that the method can successfully estimate a large-scale two-layer model and a Markov random field." + }, + { + "title": "What is the best multi-stage architecture for object recognition?", + "abstract": "In many recent object recognition systems, feature extraction stages are generally composed of a filter bank, a non-linear transformation, and some sort of feature pooling layer. Most systems use only one stage of feature extraction in which the filters are hard-wired, or two stages where the filters in one or both stages are learned in supervised or unsupervised mode. This paper addresses three questions: 1. How does the non-linearities that follow the filter banks influence the recognition accuracy? 2. does learning the filter banks in an unsupervised or supervised manner improve the performance over random filters or hardwired filters? 3. Is there any advantage to using an architecture with two stages of feature extraction, rather than one? We show that using non-linearities that include rectification and local contrast normalization is the single most important ingredient for good accuracy on object recognition benchmarks. We show that two stages of feature extraction yield better accuracy than one. Most surprisingly, we show that a two-stage system with random filters can yield almost 63% recognition rate on Caltech-101, provided that the proper non-linearities and pooling layers are used. Finally, we show that with supervised refinement, the system achieves state-of-the-art performance on NORB dataset (5.6%) and unsupervised pre-training followed by supervised refinement produces good accuracy on Caltech-101 (≫ 65%), and the lowest known error rate on the undistorted, unprocessed MNIST dataset (0.53%)." + }, + { + "title": "Deep Boltzmann Machines", + "abstract": "We present a new learning algorithm for Boltzmann machines that contain many layers of hidden variables. 
Data-dependent expectations are estimated using a variational approximation that tends to focus on a single mode, and dataindependent expectations are approximated using persistent Markov chains. The use of two quite different techniques for estimating the two types of expectation that enter into the gradient of the log-likelihood makes it practical to learn Boltzmann machines with multiple hidden layers and millions of parameters. The learning can be made more efficient by using a layer-by-layer “pre-training” phase that allows variational inference to be initialized with a single bottomup pass. We present results on the MNIST and NORB datasets showing that deep Boltzmann machines learn good generative models and perform well on handwritten digit and visual object recognition tasks." + }, + { + "title": "Extracting and composing robust features with denoising autoencoders", + "abstract": "Previous work has shown that the difficulties in learning deep generative or discriminative models can be overcome by an initial unsupervised learning step that maps inputs to useful intermediate representations. We introduce and motivate a new training principle for unsupervised learning of a representation based on the idea of making the learned representations robust to partial corruption of the input pattern. This approach can be used to train autoencoders, and these denoising autoencoders can be stacked to initialize deep architectures. The algorithm can be motivated from a manifold learning and information theoretic perspective or from a generative model perspective. Comparative experiments clearly show the surprising advantage of corrupting the input of autoencoders on a pattern classification benchmark suite." + }, + { + "title": "Training restricted Boltzmann machines using approximations to the likelihood gradient", + "abstract": "A new algorithm for training Restricted Boltzmann Machines is introduced. 
The algorithm, named Persistent Contrastive Divergence, is different from the standard Contrastive Divergence algorithms in that it aims to draw samples from almost exactly the model distribution. It is compared to some standard Contrastive Divergence and Pseudo-Likelihood algorithms on the tasks of modeling and classifying various types of data. The Persistent Contrastive Divergence algorithm outperforms the other algorithms, and is equally fast and simple." + }, + { + "title": "Learning Generative Models via Discriminative Approaches", + "abstract": "Generative model learning is one of the key problems in machine learning and computer vision. Currently the use of generative models is limited due to the difficulty in effectively learning them. A new learning framework is proposed in this paper which progressively learns a target generative distribution through discriminative approaches. This framework provides many interesting aspects to the literature. From the generative model side: (1) A reference distribution is used to assist the learning process, which removes the need for a sampling processes in the early stages. (2) The classification power of discriminative approaches, e.g. boosting, is directly utilized. (3) The ability to select/explore features from a large candidate pool allows us to make nearly no assumptions about the training data. From the discriminative model side: (1) This framework improves the modeling capability of discriminative models. (2) It can start with source training data only and gradually \"invent\" negative samples. (3) We show how sampling schemes can be introduced to discriminative models. (4) The learning procedure helps to tighten the decision boundaries for classification, and therefore, improves robustness. In this paper, we show a variety of applications including texture modeling and classification, non-photorealistic rendering, learning image statistics/denoising, and face modeling. 
The framework handles both homogeneous patterns, e.g. textures, and inhomogeneous patterns, e.g. faces, with nearly an identical parameter setting for all the tasks in the learning stage." + }, + { + "title": "A Fast Learning Algorithm for Deep Belief Nets", + "abstract": "We show how to use complementary priors to eliminate the explaining-away effects that make inference difficult in densely connected belief nets that have many hidden layers. Using complementary priors, we derive a fast, greedy algorithm that can learn deep, directed belief networks one layer at a time, provided the top two layers form an undirected associative memory. The fast, greedy algorithm is used to initialize a slower learning procedure that fine-tunes the weights using a contrastive version of the wake-sleep algorithm. After fine-tuning, a network with three hidden layers forms a very good generative model of the joint distribution of handwritten digit images and their labels. This generative model gives better digit classification than the best discriminative learning algorithms. The low-dimensional manifolds on which the digits lie are modeled by long ravines in the free-energy landscape of the top-level associative memory, and it is easy to explore these ravines by using the directed connections to display what the associative memory has in mind." + }, + { + "title": "Estimation of Non-Normalized Statistical Models by Score Matching", + "abstract": "One often wants to estimate statistical models where the probability density function is known only up to a multiplicative normalization constant. Typically, one then has to resort to Markov Chain Monte Carlo methods, or approximations of the normalization constant. Here, we propose that such models can be estimated by minimizing the expected squared distance between the gradient of the log-density given by the model and the gradient of the log-density of the observed data. 
While the estimation of the gradient of log-density function is, in principle, a very difficult non-parametric problem, we prove a surprising result that gives a simple formula for this objective function. The density function of the observed data does not appear in this formula, which simplifies to a sample average of a sum of some derivatives of the log-density given by the model. The validity of the method is demonstrated on multivariate Gaussian and independent component analysis models, and by estimating an overcomplete filter set for natural image data." + }, + { + "title": "On the convergence of markovian stochastic algorithms with rapidly decreasing ergodicity rates", + "abstract": "We analyse the convergence of stochastic algorithms with Markovian noise when the ergodicity of the Markov chain governing the noise rapidly decreases as the control parameter tends to infinity. In such a case, there may be a positive probability of divergence of the algorithm in the classic Robbins-Monro form. We provide sufficient condition which ensure convergence. Moreover, we analyse the asymptotic behaviour of these algorithms and state a diffusion approximation theorem" + }, + { + "title": "The \"wake-sleep\" algorithm for unsupervised neural networks.", + "abstract": "An unsupervised learning algorithm for a multilayer network of stochastic neurons is described. Bottom-up \"recognition\" connections convert the input into representations in successive hidden layers, and top-down \"generative\" connections reconstruct the representation in one layer from the representation in the layer above. In the \"wake\" phase, neurons are driven by recognition connections, and generative connections are adapted to increase the probability that they would reconstruct the correct activity vector in the layer below. 
In the \"sleep\" phase, neurons are driven by generative connections, and recognition connections are adapted to increase the probability that they would produce the correct activity vector in the layer above." + }, + { + "title": "Learning Factorial Codes by Predictability Minimization", + "abstract": "I propose a novel general principle for unsupervised learning of distributed nonredundant internal representations of input patterns. The principle is based on two opposing forces. For each representational unit there is an adaptive predictor, which tries to predict the unit from the remaining units. In turn, each unit tries to react to the environment such that it minimizes its predictability. This encourages each unit to filter \"abstract concepts\" out of the environmental input such that these concepts are statistically independent of those on which the other units focus. I discuss various simple yet potentially powerful implementations of the principle that aim at finding binary factorial codes (Barlow et al. 1989), i.e., codes where the probability of the occurrence of a particular input is simply the product of the probabilities of the corresponding code symbols. Such codes are potentially relevant for (1) segmentation tasks, (2) speeding up supervised learning, and (3) novelty detection. Methods for finding factorial codes automatically implement Occam's razor for finding codes using a minimal number of units. Unlike previous methods the novel principle has a potential for removing not only linear but also nonlinear output redundancy. Illustrative experiments show that algorithms based on the principle of predictability minimization are practically feasible. The final part of this paper describes an entirely local algorithm that has a potential for learning unique representations of extended input sequences." 
+ }, + { + "title": "Information processing in dynamical systems: foundations of harmony theory", + "abstract": "Abstract : At this early stage in the development of cognitive science, methodological issues are both open and central. There may have been times when developments in neuroscience, artificial intelligence, or cognitive psychology seduced researchers into believing that their discipline was on the verge of discovering the secret of intelligence. But a humbling history of hopes disappointed has produced the realization that understanding the mind will challenge the power of all these methodologies combined. The work reported in this chapter rests on the conviction that a methodology that has a crucial role to play in the development of cognitive science is mathematical analysis. The success of cognitive science, like that of many other sciences, will, I believe, depend upon the construction of a solid body of theoretical results: results that express in a mathematical language the conceptual insights of the field; results that squeeze all possible implications out of those insights by exploiting powerful mathematical techniques. This body of results, which I will call the theory of information processing, exists because information is a concept that lends itself to mathematical formalization. One part of the theory of information processing is already well-developed. The classical theory of computation provides powerful and elegant results about the notion of effective procedure, including languages for precisely expressing them and theoretical machines for realizing them." + }, + { + "title": "The Toronto face dataset", + "abstract": null + }, + { + "title": "Theano: a CPU and GPU math expression compiler", + "abstract": null + }, + { + "title": "Learning Multiple Layers of Features from Tiny Images", + "abstract": "Groups at MIT and NYU have collected a dataset of millions of tiny colour images from the web. 
It is, in principle, an excellent dataset for unsupervised training of deep generative models, but previous researchers who have tried this have found it difficult to learn a good set of filters from the images. We show how to train a multi-layer generative model that learns to extract meaningful features which resemble those found in the human visual cortex. Using a novel parallelization algorithm to distribute the work among multiple machines connected on a network, we show how training such a model can be done in reasonable time. A second problematic aspect of the tiny images dataset is that there are no reliable class labels which makes it hard to use for object recognition experiments. We created two sets of reliable labels. The CIFAR-10 set has 6000 examples of each of 10 classes and the CIFAR-100 set has 600 examples of each of 100 non-overlapping classes. Using these labels, we show that object recognition is significantly improved by pre-training a layer of features on a large set of unlabeled tiny images." + }, + { + "title": "Learning Deep Architectures for AI", + "abstract": "Theoretical results strongly suggest that in order to learn the kind of complicated functions that can represent high-level abstractions (e.g. in vision, language, and other AI-level tasks), one needs deep architectures. Deep architectures are composed of multiple levels of non-linear operations, such as in neural nets with many hidden layers or in complicated propositional formulae re-using many sub-formulae. Searching the parameter space of deep architectures is a difficult optimization task, but learning algorithms such as those for Deep Belief Networks have recently been proposed to tackle this problem with notable success, beating the state-of-the-art in certain areas.
This paper discusses the motivations and principles regarding learning algorithms for deep architectures, in particular those exploiting as building blocks unsupervised learning of single-layer models such as Restricted Boltzmann Machines, used to construct deeper models such as Deep Belief Networks." + }, + { + "title": "Gradient-based learning applied to document recognition", + "abstract": "Multilayer neural networks trained with the back-propagation algorithm constitute the best example of a successful gradient based learning technique. Given an appropriate network architecture, gradient-based learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns, such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional neural networks, which are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. Real-life document recognition systems are composed of multiple modules including field extraction, segmentation recognition, and language modeling. A new learning paradigm, called graph transformer networks (GTN), allows such multimodule systems to be trained globally using gradient-based methods so as to minimize an overall performance measure. Two systems for online handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of graph transformer networks. A graph transformer network for reading a bank cheque is also described. It uses convolutional neural network character recognizers combined with global training techniques to provide record accuracy on business and personal cheques. It is deployed commercially and reads several million cheques per day." 
+ } + ] + }, + "author_data": {}, + "reference_proposal": "**[Question 1] - What is the problem?** \nHow can we effectively estimate generative models in deep learning to overcome the challenges of intractable probabilistic computations and improve the performance of generative adversarial networks (GANs)?\n\n**[Question 2] - Why is it interesting and important?** \nSolving this problem is crucial for advancing the field of deep learning, particularly in the development of generative models that can produce high-quality samples. By improving generative model estimation, we can enhance applications in various domains such as image synthesis, natural language processing, and audio generation. This research could lead to more robust and efficient generative models, fostering further exploration and innovation in machine learning techniques and their practical applications.\n\n**[Question 3] - Why is it hard?** \nThe challenges in solving this problem stem from the complexity of approximating intractable probabilistic computations involved in maximum likelihood estimation. Naive approaches may fail due to the high variance in likelihood estimation and the difficulties in leveraging piecewise linear units in generative contexts. Additionally, the need for a balance between the generative and discriminative models in the adversarial framework adds to the technical complexity, requiring careful tuning and optimization to achieve effective results.\n\n**[Question 4] - Why hasn't it been solved before?** \nPrevious research has been limited by the intractability of the partition function in undirected graphical models and the challenges associated with Markov chain Monte Carlo methods. These barriers have hindered the development of effective generative models. 
Our approach differs by introducing a novel adversarial framework that allows for a more efficient estimation of generative models, sidestepping the difficulties faced by traditional methods and providing a new perspective on model training.\n\n**[Question 5] - What are the key components of my approach and results?** \nOur proposed methodology involves the use of adversarial nets, where a generative model is trained alongside a discriminative model in a minimax game setup. We will utilize a dataset of natural images for training and evaluate the models based on the quality of generated samples and their ability to mimic the data distribution. The expected outcomes include improved sample quality from the generative model and a better understanding of the dynamics between the generator and discriminator, ultimately demonstrating the viability of the adversarial modeling framework." + }, + "1503.02531": { + "paper_data": { + "title": "Distilling the Knowledge in a Neural Network", + "url": "http://arxiv.org/abs/1503.02531v1", + "arxiv_id": "1503.02531", + "authors": [ + "Geoffrey Hinton", + "Oriol Vinyals", + "Jeff Dean" + ], + "abstract": "A very simple way to improve the performance of almost any machine learning algorithm is to train many different models on the same data and then to average their predictions. Unfortunately, making predictions using a whole ensemble of models is cumbersome and may be too computationally expensive to allow deployment to a large number of users, especially if the individual models are large neural nets. Caruana and his collaborators have shown that it is possible to compress the knowledge in an ensemble into a single model which is much easier to deploy and we develop this approach further using a different compression technique. 
We achieve some surprising results on MNIST and we show that we can significantly improve the acoustic model of a heavily used commercial system by distilling the knowledge in an ensemble of models into a single model. We also introduce a new type of ensemble composed of one or more full models and many specialist models which learn to distinguish fine-grained classes that the full models confuse. Unlike a mixture of experts, these specialist models can be trained rapidly and in parallel.", + "introduction": " Introduction Many insects have a larval form that is optimized for extracting energy and nutrients from the environment and a completely different adult form that is optimized for the very different requirements of traveling and reproduction. In large-scale machine learning, we typically use very similar models for the training stage and the deployment stage despite their very different requirements: For tasks like speech and object recognition, training must extract structure from very large, highly redundant datasets but it does not need to operate in real time and it can use a huge amount of computation. Deployment to a large number of users, however, has much more stringent requirements on latency and computational resources. The analogy with insects suggests that we should be willing to train very cumbersome models if that makes it easier to extract structure from the data. The cumbersome model could be an ensemble of separately trained models or a single very large model trained with a very strong regularizer such as dropout [9]. Once the cumbersome model has been trained, we can then use a different kind of training, which we call “distillation” to transfer the knowledge from the cumbersome model to a small model that is more suitable for deployment. A version of this strategy has already been pioneered by Rich Caruana and his collaborators [1]. In their important paper they demonstrate convincingly that the knowledge acquired by a large ensemble of models can be transferred to a single small model.
A conceptual block that may have prevented more investigation of this very promising approach is that we tend to identify the knowledge in a trained model with the learned parameter values and this makes it hard to see how we can change the form of the model but keep the same knowledge. A more abstract view of the knowledge, that frees it from any particular instantiation, is that it is a learned (∗Also affiliated with the University of Toronto and the Canadian Institute for Advanced Research. †Equal contribution.) mapping from input vectors to output vectors. For cumbersome models that learn to discriminate between a large number of classes, the normal training objective is to maximize the average log probability of the correct answer, but a side-effect of the learning is that the trained model assigns probabilities to all of the incorrect answers and even when these probabilities are very small, some of them are much larger than others. The relative probabilities of incorrect answers tell us a lot about how the cumbersome model tends to generalize. An image of a BMW, for example, may only have a very small chance of being mistaken for a garbage truck, but that mistake is still many times more probable than mistaking it for a carrot. It is generally accepted that the objective function used for training should reflect the true objective of the user as closely as possible. Despite this, models are usually trained to optimize performance on the training data when the real objective is to generalize well to new data. It would clearly be better to train models to generalize well, but this requires information about the correct way to generalize and this information is not normally available. When we are distilling the knowledge from a large model into a small one, however, we can train the small model to generalize in the same way as the large model.
If the cumbersome model generalizes well because, for example, it is the average of a large ensemble of different models, a small model trained to generalize in the same way will typically do much better on test data than a small model that is trained in the normal way on the same training set as was used to train the ensemble. An obvious way to transfer the generalization ability of the cumbersome model to a small model is to use the class probabilities produced by the cumbersome model as “soft targets” for training the small model. For this transfer stage, we could use the same training set or a separate “transfer” set. When the cumbersome model is a large ensemble of simpler models, we can use an arithmetic or geometric mean of their individual predictive distributions as the soft targets. When the soft targets have high entropy, they provide much more information per training case than hard targets and much less variance in the gradient between training cases, so the small model can often be trained on much less data than the original cumbersome model and using a much higher learning rate. For tasks like MNIST in which the cumbersome model almost always produces the correct answer with very high confidence, much of the information about the learned function resides in the ratios of very small probabilities in the soft targets. For example, one version of a 2 may be given a probability of 10−6 of being a 3 and 10−9 of being a 7 whereas for another version it may be the other way around. This is valuable", + "references": [ + { + "title": "Large Scale Distributed Deep Networks", + "abstract": "Recent work in unsupervised feature learning and deep learning has shown that being able to train large models can dramatically improve performance. In this paper, we consider the problem of training a deep network with billions of parameters using tens of thousands of CPU cores. We have developed a software framework called DistBelief that can utilize computing clusters with thousands of machines to train large models.
Within this framework, we have developed two algorithms for large-scale distributed training: (i) Downpour SGD, an asynchronous stochastic gradient descent procedure supporting a large number of model replicas, and (ii) Sandblaster, a framework that supports a variety of distributed batch optimization procedures, including a distributed implementation of L-BFGS. Downpour SGD and Sandblaster L-BFGS both increase the scale and speed of deep network training. We have successfully used our system to train a deep network 30x larger than previously reported in the literature, and achieves state-of-the-art performance on ImageNet, a visual object recognition task with 16 million images and 21k categories. We show that these same techniques dramatically accelerate the training of a more modestly- sized deep network for a commercial speech recognition service. Although we focus on and report performance of these methods as applied to training large neural networks, the underlying algorithms are applicable to any gradient-based machine learning algorithm." + }, + { + "title": "ImageNet classification with deep convolutional neural networks", + "abstract": "We trained a large, deep convolutional neural network to classify the 1.2 million high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 37.5% and 17.0%, respectively, which is considerably better than the previous state-of-the-art. The neural network, which has 60 million parameters and 650,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and three fully connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU implementation of the convolution operation. 
To reduce overfitting in the fully connected layers we employed a recently developed regularization method called \"dropout\" that proved to be very effective. We also entered a variant of this model in the ILSVRC-2012 competition and achieved a winning top-5 test error rate of 15.3%, compared to 26.2% achieved by the second-best entry." + }, + { + "title": "Deep Neural Networks for Acoustic Modeling in Speech Recognition: The Shared Views of Four Research Groups", + "abstract": "Most current speech recognition systems use hidden Markov models (HMMs) to deal with the temporal variability of speech and Gaussian mixture models (GMMs) to determine how well each state of each HMM fits a frame or a short window of frames of coefficients that represents the acoustic input. An alternative way to evaluate the fit is to use a feed-forward neural network that takes several frames of coefficients as input and produces posterior probabilities over HMM states as output. Deep neural networks (DNNs) that have many hidden layers and are trained using new methods have been shown to outperform GMMs on a variety of speech recognition benchmarks, sometimes by a large margin. This article provides an overview of this progress and represents the shared views of four research groups that have had recent successes in using DNNs for acoustic modeling in speech recognition." + }, + { + "title": "Improving neural networks by preventing co-adaptation of feature detectors", + "abstract": "When a large feedforward neural network is trained on a small training set, it typically performs poorly on held-out test data. This \"overfitting\" is greatly reduced by randomly omitting half of the feature detectors on each training case. This prevents complex co-adaptations in which a feature detector is only helpful in the context of several other specific feature detectors. 
Instead, each neuron learns to detect a feature that is generally helpful for producing the correct answer given the combinatorially large variety of internal contexts in which it must operate. Random \"dropout\" gives big improvements on many benchmark tasks and sets new records for speech and object recognition." + }, + { + "title": "Model compression", + "abstract": "Often the best performing supervised learning models are ensembles of hundreds or thousands of base-level classifiers. Unfortunately, the space required to store this many classifiers, and the time required to execute them at run-time, prohibits their use in applications where test sets are large (e.g. Google), where storage space is at a premium (e.g. PDAs), and where computational power is limited (e.g. hea-ring aids). We present a method for \"compressing\" large, complex ensembles into smaller, faster models, usually without significant loss in performance." + }, + { + "title": "Multiple Classifier Systems", + "abstract": null + }, + { + "title": "Adaptive Mixtures of Local Experts", + "abstract": "We present a new supervised learning procedure for systems composed of many separate networks, each of which learns to handle a subset of the complete set of training cases. The new procedure can be viewed either as a modular version of a multilayer supervised network, or as an associative version of competitive learning. It therefore provides a new link between these two apparently different approaches. We demonstrate that the learning procedure divides up a vowel discrimination task into appropriate subtasks, each of which can be solved by a very simple expert network." + }, + { + "title": "Dropout: a simple way to prevent neural networks from overfitting", + "abstract": "Deep neural nets with a large number of parameters are very powerful machine learning systems. However, overfitting is a serious problem in such networks. 
Large networks are also slow to use, making it difficult to deal with overfitting by combining the predictions of many different large neural nets at test time. Dropout is a technique for addressing this problem. The key idea is to randomly drop units (along with their connections) from the neural network during training. This prevents units from co-adapting too much. During training, dropout samples from an exponential number of different \"thinned\" networks. At test time, it is easy to approximate the effect of averaging the predictions of all these thinned networks by simply using a single unthinned network that has smaller weights. This significantly reduces overfitting and gives major improvements over other regularization methods. We show that dropout improves the performance of neural networks on supervised learning tasks in vision, speech recognition, document classification and computational biology, obtaining state-of-the-art results on many benchmark data sets." + }, + { + "title": "Learning small-size DNN with output-distribution-based criteria", + "abstract": "Deep neural network (DNN) obtains significant accuracy improvements on many speech recognition tasks and its power comes from the deep and wide network structure with a very large number of parameters. It becomes challenging when we deploy DNN on devices which have limited computational and storage resources. The common practice is to train a DNN with a small number of hidden nodes and a small senone set using the standard training process, leading to significant accuracy loss. In this study, we propose to better address these issues by utilizing the DNN output distribution. To learn a DNN with small number of hidden nodes, we minimize the Kullback–Leibler divergence between the output distributions of the small-size DNN and a standard large-size DNN by utilizing a large number of un-transcribed data. 
For better senone set generation, we cluster the senones in the large set into a small one by directly relating the clustering process to DNN parameters, as opposed to decoupling the senone generation and DNN training process in the standard training. Evaluated on a short message dictation task, the proposed two methods get 5.08% and 1.33% relative word error rate reduction from the standard training method, respectively." + } + ] + }, + "author_data": {}, + "reference_proposal": "**[Question 1] - What is the problem?** \nHow can we effectively transfer the knowledge from cumbersome machine learning models, which are optimized for training, to smaller models that are more suitable for deployment in real-time applications?\n\n---\n\n**[Question 2] - Why is it interesting and important?** \nSolving this problem has significant implications for the research community as it addresses the gap between model training and deployment, which is crucial for real-world applications. By improving the efficiency of model deployment, we can enhance the performance of machine learning systems in various domains, such as speech and object recognition, where latency and computational resources are critical. This research could lead to advancements in knowledge transfer techniques and inspire future studies on model optimization, ultimately resulting in more effective and accessible machine learning solutions.\n\n---\n\n**[Question 3] - Why is it hard?** \nThe challenges in solving this problem stem from the inherent differences in requirements between training and deployment stages. Naive approaches may fail because they do not account for the need to generalize well from a large, complex model to a smaller, efficient one. Technical obstacles include the difficulty in maintaining the generalization capabilities of the cumbersome model while transferring knowledge to a smaller model. 
Additionally, the lack of information on how to generalize effectively complicates the training process, making it hard to ensure that the distilled model retains the necessary knowledge.\n\n---\n\n**[Question 4] - Why hasn't it been solved before?** \nPrevious research has often focused on optimizing models for either training or deployment, but not on the transition between the two. A conceptual barrier has been the tendency to equate knowledge in a model with its learned parameters, which limits the understanding of how to change the model's form while preserving its knowledge. Existing solutions have not adequately addressed the need for a more abstract view of knowledge transfer. Our approach differs by emphasizing the use of soft targets derived from the cumbersome model to guide the training of the smaller model, thereby facilitating a more effective transfer of generalization capabilities.\n\n---\n\n**[Question 5] - What are the key components of my approach and results?** \nOur proposed methodology involves training a cumbersome model, which could be an ensemble of models or a large model with strong regularization, to extract structure from a large dataset. We will then employ a distillation process where the soft targets (class probabilities) produced by the cumbersome model are used to train a smaller model. The dataset will include both the original training set and a separate transfer set, and we will evaluate the performance using metrics" + }, + "1908.01755": { + "paper_data": { + "title": "On the Existence of Simpler Machine Learning Models", + "url": "http://arxiv.org/abs/1908.01755v4", + "arxiv_id": "1908.01755", + "authors": [ + "Lesia Semenova", + "Cynthia Rudin", + "Ronald Parr" + ], + "abstract": "It is almost always easier to find an accurate-but-complex model than an accurate-yet-simple model. 
Finding optimal, sparse, accurate models of various forms (linear models with integer coefficients, decision sets, rule lists, decision trees) is generally NP-hard. We often do not know whether the search for a simpler model will be worthwhile, and thus we do not go to the trouble of searching for one. In this work, we ask an important practical question: can accurate-yet-simple models be proven to exist, or shown likely to exist, before explicitly searching for them? We hypothesize that there is an important reason that simple-yet-accurate models often do exist. This hypothesis is that the size of the Rashomon set is often large, where the Rashomon set is the set of almost-equally-accurate models from a function class. If the Rashomon set is large, it contains numerous accurate models, and perhaps at least one of them is the simple model we desire. In this work, we formally present the Rashomon ratio as a new gauge of simplicity for a learning problem, depending on a function class and a data set. The Rashomon ratio is the ratio of the volume of the set of accurate models to the volume of the hypothesis space, and it is different from standard complexity measures from statistical learning theory. Insight from studying the Rashomon ratio provides an easy way to check whether a simpler model might exist for a problem before finding it, namely whether several different machine learning methods achieve similar performance on the data. In that sense, the Rashomon ratio is a powerful tool for understanding why and when an accurate-yet-simple model might exist. If, as we hypothesize in this work, many real-world data sets admit large Rashomon sets, the implications are vast: it means that simple or interpretable models may often be used for high-stakes decisions without losing accuracy.", + "introduction": " Introduction Following the principle of Occam’s Razor, one should use the simplest model that explains the data well. 
However, finding the simplest model, let alone any simple-yet-accurate model, is hard. As soon as simplicity constraints such as sparsity are introduced, the optimization problem for finding a simpler model typically becomes NP-hard. Thus, practitioners – who have no assurance of finding a simpler model that achieves the performance level of a black box – may not see a reason to attempt such potentially difficult optimization problems. Thus, sadly, what was once the holy grail of finding simpler models, has been, for the most part, abandoned in modern machine learning. In this work, we ask a question that is essential, and potentially game-changing, for this discussion: what if we knew, before attempting a computationally expensive search for a simpler-yet-accurate model, that one was likely to exist? Perhaps knowing this would allow us to justify the time and expense of searching for such a model. If it is true that many data sets have large enough Rashomon sets to admit simple models, then there are important implications for society – it means we may be able to use simpler or interpretable models for many high-stakes problems without losing accuracy. Proving the existence of simpler models before aiming to find them differs from the current approach to machine learning in practice. We generally do not think about going from more complicated spaces to simpler ones; in fact, the reverse is true, where typical statistical learning theory and algorithms allowed us to maintain generalization when handling more complicated model classes (e.g., large margins for support vector machines with complex kernels or large margins for boosted trees) Cortes and Vapnik (1995); Schapire et al. (1998).
However, because simple models are essential for many high-stakes decisions (Rudin, 2019), perhaps we should return to the goal of aiming directly for simpler models. We will need new ideas in order to do this. Decades of study about generalization in machine learning have provided many different mathematical theories. Many of them measure the complexity of classes of functions without considering the data (e.g., VC theory, Vapnik, 1995), or measure properties of specific algorithms (e.g., algorithmic stability, see Bousquet and Elisseeff, 2002). However, none of these theories seems to capture directly a phenomenon that occurs throughout practical machine learning. In particular, there are a vast number of data sets for which many standard machine learning algorithms perform similarly . In these cases, the machine learning models tend to generalize well . Furthermore, in these same cases, there is often a simpler model that performs similarly and also generalizes well . We hypothesize that these three observations can all be explained by the same phenomenon: the “Rashomon effect,” which is the existence of many almost-equally-accurate models (Breiman et al., 2001). Firstly, following a key argument in our work, if there is a large Rashomon set of almost-equally-accurate models, a simple model may also be contained in it. Secondly, if the Rashomon set is large, many different machine learning algorithms", + "references": [ + { + "title": "Characterizing Fairness Over the Set of Good Models Under Selective Labels", + "abstract": "Algorithmic risk assessments are used to inform decisions in a wide variety of high-stakes settings. Often multiple predictive models deliver similar overall performance but differ markedly in their predictions for individual cases, an empirical phenomenon known as the \"Rashomon Effect.\" These models may have different properties over various groups, and therefore have different predictive fairness properties. 
We develop a framework for characterizing predictive fairness properties over the set of models that deliver similar overall performance, or \"the set of good models.\" Our framework addresses the empirically relevant challenge of selectively labelled data in the setting where the selection decision and outcome are unconfounded given the observed data features. Our framework can be used to 1) replace an existing model with one that has better fairness properties; or 2) audit for predictive bias. We illustrate these use cases on a real-world credit-scoring task and a recidivism prediction task." + }, + { + "title": "Exploring the cloud of variable importance for the set of all good models", + "abstract": null + }, + { + "title": "Underspecification Presents Challenges for Credibility in Modern Machine Learning", + "abstract": "ML models often exhibit unexpectedly poor behavior when they are deployed in real-world domains. We identify underspecification as a key reason for these failures. An ML pipeline is underspecified when it can return many predictors with equivalently strong held-out performance in the training domain. Underspecification is common in modern ML pipelines, such as those based on deep learning. Predictors returned by underspecified pipelines are often treated as equivalent based on their training domain performance, but we show here that such predictors can behave very differently in deployment domains. This ambiguity can lead to instability and poor model behavior in practice, and is a distinct failure mode from previously identified issues arising from structural mismatch between training and deployment domains. We show that this problem appears in a wide variety of practical ML pipelines, using examples from computer vision, medical imaging, natural language processing, clinical risk prediction based on electronic health records, and medical genomics. 
Our results show the need to explicitly account for underspecification in modeling pipelines that are intended for real-world deployment in any domain." + }, + { + "title": "Deep double descent: where bigger models and more data hurt", + "abstract": "We show that a variety of modern deep learning tasks exhibit a ‘double-descent’ phenomenon where, as we increase model size, performance first gets worse and then gets better. Moreover, we show that double descent occurs not just as a function of model size, but also as a function of the number of training epochs. We unify the above phenomena by defining a new complexity measure we call the effective model complexity and conjecture a generalized double descent with respect to this measure. Furthermore, our notion of model complexity allows us to identify certain regimes where increasing (even quadrupling) the number of train samples actually hurts test performance." + }, + { + "title": "Detecting Underspecification with Local Ensembles", + "abstract": "We present local ensembles, a method for detecting underspecification -- when many possible predictors are consistent with the training data and model class -- at test time in a pre-trained model. Our method uses local second-order information to approximate the variance of predictions across an ensemble of models from the same class. We compute this approximation by estimating the norm of the component of a test point's gradient that aligns with the low-curvature directions of the Hessian, and provide a tractable method for estimating this quantity. Experimentally, we show that our method is capable of detecting when a pre-trained model is underspecified on test data, with applications to out-of-distribution detection, detecting spurious correlates, and active learning." + }, + { + "title": "Predictive Multiplicity in Classification", + "abstract": "Prediction problems often admit competing models that perform almost equally well. 
This effect challenges key assumptions in machine learning when competing models assign conflicting predictions. In this paper, we define predictive multiplicity as the ability of a prediction problem to admit competing models with conflicting predictions. We introduce formal measures to evaluate the severity of predictive multiplicity and develop integer programming tools to compute them exactly for linear classification problems. We apply our tools to measure predictive multiplicity in recidivism prediction problems. Our results show that real-world datasets may admit competing models that assign wildly conflicting predictions, and motivate the need to measure and report predictive multiplicity in model development." + }, + { + "title": "A study in Rashomon curves and volumes: A new perspective on generalization and model simplicity in machine learning", + "abstract": "The Rashomon effect occurs when many different explanations exist for the same phenomenon. In machine learning, Leo Breiman used this term to characterize problems where many accurate-but-different models exist to describe the same data. In this work, we study how the Rashomon effect can be useful for understanding the relationship between training and test performance, and the possibility that simple-yet-accurate models exist for many problems. We consider the Rashomon set - the set of almost-equally-accurate models for a given problem - and study its properties and the types of models it could contain. We present the Rashomon ratio as a new measure related to simplicity of model classes, which is the ratio of the volume of the set of accurate models to the volume of the hypothesis space; the Rashomon ratio is different from standard complexity measures from statistical learning theory. For a hierarchy of hypothesis spaces, the Rashomon ratio can help modelers to navigate the trade-off between simplicity and accuracy. In particular, we find empirically that a plot of empirical risk vs. 
Rashomon ratio forms a characteristic $\\Gamma$-shaped Rashomon curve, whose elbow seems to be a reliable model selection criterion. When the Rashomon set is large, models that are accurate - but that also have various other useful properties - can often be obtained. These models might obey various constraints such as interpretability, fairness, or monotonicity." + }, + { + "title": "Reconciling modern machine-learning practice and the classical bias–variance trade-off", + "abstract": "Significance While breakthroughs in machine learning and artificial intelligence are changing society, our fundamental understanding has lagged behind. It is traditionally believed that fitting models to the training data exactly is to be avoided as it leads to poor performance on unseen data. However, powerful modern classifiers frequently have near-perfect fit in training, a disconnect that spurred recent intensive research and controversy on whether theory provides practical insights. In this work, we show how classical theory and modern practice can be reconciled within a single unified performance curve and propose a mechanism underlying its emergence. We believe this previously unknown pattern connecting the structure and performance of learning architectures will help shape design and understanding of learning algorithms. Breakthroughs in machine learning are rapidly changing science and society, yet our fundamental understanding of this technology has lagged far behind. Indeed, one of the central tenets of the field, the bias–variance trade-off, appears to be at odds with the observed behavior of methods used in modern machine-learning practice. The bias–variance trade-off implies that a model should balance underfitting and overfitting: Rich enough to express underlying structure in data and simple enough to avoid fitting spurious patterns. However, in modern practice, very rich models such as neural networks are trained to exactly fit (i.e., interpolate) the data. 
Classically, such models would be considered overfitted, and yet they often obtain high accuracy on test data. This apparent contradiction has raised questions about the mathematical foundations of machine learning and their relevance to practitioners. In this paper, we reconcile the classical understanding and the modern practice within a unified performance curve. This “double-descent” curve subsumes the textbook U-shaped bias–variance trade-off curve by showing how increasing model capacity beyond the point of interpolation results in improved performance. We provide evidence for the existence and ubiquity of double descent for a wide spectrum of models and datasets, and we posit a mechanism for its emergence. This connection between the performance and the structure of machine-learning models delineates the limits of classical analyses and has implications for both the theory and the practice of machine learning." + }, + { + "title": "Stop explaining black box machine learning models for high stakes decisions and use interpretable models instead", + "abstract": null + }, + { + "title": "The age of secrecy and unfairness in recidivism prediction", + "abstract": "In our current society, secret algorithms make important decisions about individuals. There has been substantial discussion about whether these algorithms are unfair to groups of individuals. While noble, this pursuit is complex and ultimately stagnating because there is no clear definition of fairness and competing definitions are largely incompatible. We argue that the focus on the question of fairness is misplaced, as these algorithms fail to meet a more important and yet readily obtainable goal: transparency. As a result, creators of secret algorithms can provide incomplete or misleading descriptions about how their models work, and various other kinds of errors can easily go unnoticed. 
By partially reverse engineering the COMPAS algorithm -- a recidivism-risk scoring algorithm used throughout the criminal justice system -- we show that it does not seem to depend linearly on the defendant's age, despite statements to the contrary by the algorithm's creator. Furthermore, by subtracting from COMPAS its (hypothesized) nonlinear age component, we show that COMPAS does not necessarily depend on race, contradicting ProPublica's analysis, which assumed linearity in age. In other words, faulty assumptions about a proprietary algorithm lead to faulty conclusions that go unchecked without careful reverse engineering. Were the algorithm transparent in the first place, this would likely not have occurred. The most important result in this work is that we find that there are many defendants with low risk score but long criminal histories, suggesting that data inconsistencies occur frequently in criminal justice databases. We argue that transparency satisfies a different notion of procedural fairness by providing both the defendants and the public with the opportunity to scrutinize the methodology and calculations behind risk scores for recidivism." + }, + { + "title": "A Theory of Statistical Inference for Ensuring the Robustness of Scientific Results", + "abstract": "Inference is the process of using facts we know to learn about facts we do not know. A theory of inference gives assumptions necessary to get from the former to the latter, along with a definition for and summary of the resulting uncertainty. Any one theory of inference is neither right nor wrong but merely an axiom that may or may not be useful. Each of the many diverse theories of inference can be valuable for certain applications. However, no existing theory of inference addresses the tendency to choose, from the range of plausible data analysis specifications consistent with prior evidence, those that inadvertently favor one’s own hypotheses. 
Because the biases from these choices are a growing concern across scientific fields, and in a sense the reason the scientific community was invented in the first place, we introduce a new theory of inference designed to address this critical problem. We introduce hacking intervals, which are the range of a summary statistic one may obtain given a class of possible endogenous manipulations of the data. Hacking intervals require no appeal to hypothetical data sets drawn from imaginary superpopulations. A scientific result with a small hacking interval is more robust to researcher manipulation than one with a larger interval and is often easier to interpret than a classical confidence interval. Some versions of hacking intervals turn out to be equivalent to classical confidence intervals, which means they may also provide a more intuitive and potentially more useful interpretation of classical confidence intervals. This paper was accepted by J. George Shanthikumar, big data analytics." + }, + { + "title": "All Models are Wrong, but Many are Useful: Learning a Variable's Importance by Studying an Entire Class of Prediction Models Simultaneously", + "abstract": "Variable importance (VI) tools describe how much covariates contribute to a prediction model's accuracy. However, important variables for one well-performing model (for example, a linear model f (x) = x T β with a fixed coefficient vector β) may be unimportant for another model. In this paper, we propose model class reliance (MCR) as the range of VI values across all well-performing model in a prespecified class. Thus, MCR gives a more comprehensive description of importance by accounting for the fact that many prediction models, possibly of different parametric forms, may fit the data well. In the process of deriving MCR, we show several informative results for permutation-based VI estimates, based on the VI measures used in Random Forests. 
Specifically, we derive connections between permutation importance estimates for a single prediction model, U-statistics, conditional variable importance, conditional causal effects, and linear model coefficients. We then give probabilistic bounds for MCR, using a novel, generalizable technique. We apply MCR to a public data set of Broward County criminal records to study the reliance of recidivism prediction models on sex and race. In this application, MCR can be used to help inform VI for unknown, proprietary models." + }, + { + "title": "Sharp Minima Can Generalize For Deep Nets", + "abstract": "Despite their overwhelming capacity to overfit, deep learning architectures tend to generalize relatively well to unseen data, allowing them to be deployed in practice. However, explaining why this is the case is still an open area of research. One standing hypothesis that is gaining popularity, e.g. Hochreiter & Schmidhuber (1997); Keskar et al. (2017), is that the flatness of minima of the loss function found by stochastic gradient based methods results in good generalization. This paper argues that most notions of flatness are problematic for deep models and can not be directly applied to explain generalization. Specifically, when focusing on deep networks with rectifier units, we can exploit the particular geometry of parameter space induced by the inherent symmetries that these architectures exhibit to build equivalent models corresponding to arbitrarily sharper minima. Furthermore, if we allow to reparametrize a function, the geometry of its parameters can change drastically without affecting its generalization properties." + }, + { + "title": "Entropy-SGD: biasing gradient descent into wide valleys", + "abstract": "This paper proposes a new optimization algorithm called Entropy-SGD for training deep neural networks that is motivated by the local geometry of the energy landscape. 
Local extrema with low generalization error have a large proportion of almost-zero eigenvalues in the Hessian with very few positive or negative eigenvalues. We leverage upon this observation to construct a local-entropy-based objective function that favors well-generalizable solutions lying in large flat regions of the energy landscape, while avoiding poorly-generalizable solutions located in the sharp valleys. Conceptually, our algorithm resembles two nested loops of SGD where we use Langevin dynamics in the inner loop to compute the gradient of the local entropy before each update of the weights. We show that the new objective has a smoother energy landscape and show improved generalization over SGD using uniform stability, under certain assumptions. Our experiments on convolutional and recurrent networks demonstrate that Entropy-SGD compares favorably to state-of-the-art techniques in terms of generalization error and training time." + }, + { + "title": "On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima", + "abstract": "The stochastic gradient descent (SGD) method and its variants are algorithms of choice for many Deep Learning tasks. These methods operate in a small-batch regime wherein a fraction of the training data, say $32$-$512$ data points, is sampled to compute an approximation to the gradient. It has been observed in practice that when using a larger batch there is a degradation in the quality of the model, as measured by its ability to generalize. We investigate the cause for this generalization drop in the large-batch regime and present numerical evidence that supports the view that large-batch methods tend to converge to sharp minimizers of the training and testing functions - and as is well known, sharp minima lead to poorer generalization. 
In contrast, small-batch methods consistently converge to flat minimizers, and our experiments support a commonly held view that this is due to the inherent noise in the gradient estimation. We discuss several strategies to attempt to help large-batch methods eliminate this generalization gap." + }, + { + "title": "Prediction uncertainty and optimal experimental design for learning dynamical systems.", + "abstract": "Dynamical systems are frequently used to model biological systems. When these models are fit to data, it is necessary to ascertain the uncertainty in the model fit. Here, we present prediction deviation, a metric of uncertainty that determines the extent to which observed data have constrained the model's predictions. This is accomplished by solving an optimization problem that searches for a pair of models that each provides a good fit for the observed data, yet has maximally different predictions. We develop a method for estimating a priori the impact that additional experiments would have on the prediction deviation, allowing the experimenter to design a set of experiments that would most reduce uncertainty. We use prediction deviation to assess uncertainty in a model of interferon-alpha inhibition of viral infection, and to select a sequence of experiments that reduces this uncertainty. Finally, we prove a theoretical result which shows that prediction deviation provides bounds on the trajectories of the underlying true model. These results show that prediction deviation is a meaningful metric of uncertainty that can be used for optimal experimental design." + }, + { + "title": "Identifying a Minimal Class of Models for High-dimensional Data", + "abstract": "Model selection consistency in the high-dimensional regression setting can be achieved only if strong assumptions are fulfilled. We therefore suggest to pursue a different goal, which we call a minimal class of models. 
The minimal class of models includes models that are similar in their prediction accuracy but not necessarily in their elements. We suggest a random search algorithm to reveal candidate models. The algorithm implements simulated annealing while using a score for each predictor that we suggest to derive using a combination of the Lasso and the Elastic Net. The utility of using a minimal class of models is demonstrated in the analysis of two datasets." + }, + { + "title": "Interpretable classification models for recidivism prediction", + "abstract": "We investigate a long‐debated question, which is how to create predictive models of recidivism that are sufficiently accurate, transparent and interpretable to use for decision making. This question is complicated as these models are used to support different decisions, from sentencing, to determining release on probation to allocating preventative social services. Each case might have an objective other than classification accuracy, such as a desired true positive rate TPR or false positive rate FPR. Each (TPR, FPR) pair is a point on the receiver operator characteristic (ROC) curve. We use popular machine learning methods to create models along the full ROC curve on a wide range of recidivism prediction problems. We show that many methods (support vector machines, stochastic gradient boosting and ridge regression) produce equally accurate models along the full ROC curve. However, methods that are designed for interpretability (classification and regression trees and C5.0) cannot be tuned to produce models that are accurate and/or interpretable. To handle this shortcoming, we use a recent method called supersparse linear integer models to produce accurate, transparent and interpretable scoring systems along the full ROC curve. 
These scoring systems can be used for decision making for many different use cases, since they are just as accurate as the most powerful black box machine learning models for many applications, but completely transparent, and highly interpretable." + }, + { + "title": "Supersparse linear integer models for optimized medical scoring systems", + "abstract": null + }, + { + "title": "Robust Optimization using Machine Learning for Uncertainty Sets", + "abstract": "Our goal is to build robust optimization problems for making decisions based on complex data from the past. In robust optimization (RO) generally, the goal is to create a policy for decision-making that is robust to our uncertainty about the future. In particular, we want our policy to best handle the the worst possible situation that could arise, out of an uncertainty set of possible situations. Classically, the uncertainty set is simply chosen by the user, or it might be estimated in overly simplistic ways with strong assumptions; whereas in this work, we learn the uncertainty set from data collected in the past. The past data are drawn randomly from an (unknown) possibly complicated high-dimensional distribution. We propose a new uncertainty set design and show how tools from statistical learning theory can be employed to provide probabilistic guarantees on the robustness of the policy." + }, + { + "title": "Learning with Noisy Labels", + "abstract": "In this paper, we theoretically study the problem of binary classification in the presence of random classification noise—the learner, instead of seeing the true labels, sees labels that have independently been flipped with some small probability. Moreover, random label noise is class-conditional— the flip probability depends on the class. We provide two approaches to suitably modify any given surrogate loss function. 
First, we provide a simple unbiased estimator of any loss, and obtain performance bounds for empirical risk minimization in the presence of iid data with noisy labels. If the loss function satisfies a simple symmetry condition, we show that the method leads to an efficient algorithm for empirical minimization. Second, by leveraging a reduction of risk minimization under noisy labels to classification with weighted 0-1 loss, we suggest the use of a simple weighted surrogate loss, for which we are able to obtain strong empirical risk bounds. This approach has a very remarkable consequence — methods used in practice such as biased SVM and weighted logistic regression are provably noise-tolerant. On a synthetic non-separable dataset, our methods achieve over 88% accuracy even when 40% of the labels are corrupted, and are competitive with respect to recently proposed methods for dealing with label noise in several benchmark datasets." + }, + { + "title": "Interplay between concentration, complexity and geometry in learning theory with applications to high dimensional data analysis", + "abstract": "In this document I present the works I undertook since the end of my Ph.D. I started my Ph.D in September 2004 at the Laboratoire de Probabilit{e}s et Mod{e}les Al{e}atoires of Universit{e} Paris 6. I was then hired in October 2007 by the CNRS and spent my first two years at the Laboratoire d'Analyse, Topologie et Probabilit{e} in Marseille. In 2009, I moved to the Laboratoire d'Analyse et Math{e}matiques Appliqu{e}es at the Universit{e} Paris-Est Marne-la-vall{e}e. I will also use the opportunity of writing this manuscript to add some remarks and extensions to these works." + }, + { + "title": "Machine learning with operational costs", + "abstract": "This work proposes a way to align statistical modeling with decision making. 
We provide a method that propagates the uncertainty in predictive modeling to the uncertainty in operational cost, where operational cost is the amount spent by the practitioner in solving the problem. The method allows us to explore the range of operational costs associated with the set of reasonable statistical models, so as to provide a useful way for practitioners to understand uncertainty. To do this, the operational cost is cast as a regularization term in a learning algorithm's objective function, allowing either an optimistic or pessimistic view of possible costs, depending on the regularization parameter. From another perspective, if we have prior knowledge about the operational cost, for instance that it should be low, this knowledge can help to restrict the hypothesis space, and can help with generalization. We provide a theoretical generalization bound for this scenario. We also show that learning with operational costs is related to robust optimization." + }, + { + "title": "Noise Tolerance Under Risk Minimization", + "abstract": "In this paper, we explore noise-tolerant learning of classifiers. We formulate the problem as follows. We assume that there is an unobservable training set that is noise free. The actual training set given to the learning algorithm is obtained from this ideal data set by corrupting the class label of each example. The probability that the class label of an example is corrupted is a function of the feature vector of the example. This would account for most kinds of noisy data one encounters in practice. We say that a learning method is noise tolerant if the classifiers learnt with noise-free data and with noisy data, both have the same classification accuracy on the noise-free data. In this paper, we analyze the noise-tolerance properties of risk minimization (under different loss functions). 
We show that risk minimization under 0-1 loss function has impressive noise-tolerance properties and that under squared error loss is tolerant only to uniform noise; risk minimization under other loss functions is not noise tolerant. We conclude this paper with some discussion on the implications of these theoretical results." + }, + { + "title": "On combining machine learning with decision making", + "abstract": null + }, + { + "title": "Smoothness, Low Noise and Fast Rates", + "abstract": "We establish an excess risk bound of O(HR_n^2 + √(HL*)R_n) for ERM with an H-smooth loss function and a hypothesis class with Rademacher complexity R_n, where L* is the best risk achievable by the hypothesis class. For typical hypothesis classes where R_n = √(R/n), this translates to a learning rate of O(RH/n) in the separable (L* = 0) case and O(RH/n + √(L*RH/n)) more generally. We also provide similar guarantees for online and stochastic convex optimization of a smooth non-negative objective." + }, + { + "title": "On the Complexity of Linear Prediction: Risk Bounds, Margin Bounds, and Regularization", + "abstract": "This work characterizes the generalization ability of algorithms whose predictions are linear in the input vector. To this end, we provide sharp bounds for Rademacher and Gaussian complexities of (constrained) linear classes, which directly lead to a number of generalization bounds. This derivation provides simplified proofs of a number of corollaries including: risk bounds for linear prediction (including settings where the weight vectors are constrained by either L2 or L1 constraints), margin bounds (including both L2 and L1 margins, along with more general notions based on relative entropy), a proof of the PAC-Bayes theorem, and upper bounds on L2 covering numbers (with Lp norm constraints and relative entropy constraints). In addition to providing a unified analysis, the results herein provide some of the sharpest risk and margin bounds. 
Interestingly, our results show that the uniform convergence rates of empirical risk minimization algorithms tightly match the regret bounds of online learning algorithms for linear prediction, up to a constant factor of 2." + }, + { + "title": "Stability selection", + "abstract": "Summary.  Estimation of structure, such as in variable selection, graphical modelling or cluster analysis, is notoriously difficult, especially for high dimensional data. We introduce stability selection. It is based on subsampling in combination with (high dimensional) selection algorithms. As such, the method is extremely general and has a very wide range of applicability. Stability selection provides finite sample control for some error rates of false discoveries and hence a transparent principle to choose a proper amount of regularization for structure estimation. Variable selection and structure estimation improve markedly for a range of selection methods if stability selection is applied. We prove for the randomized lasso that stability selection will be variable selection consistent even if the necessary conditions for consistency of the original lasso method are violated. We demonstrate stability selection for variable selection and Gaussian graphical modelling, using real and simulated data." + }, + { + "title": "Local Rademacher complexities", + "abstract": "We propose new bounds on the error of learning algorithms in terms of a data-dependent notion of complexity. The estimates we establish give optimal rates and are based on a local and empirical version of Rademacher averages, in the sense that the Rademacher averages are computed from the data, on a subset of functions with small empirical error. We present some applications to classification and prediction with convex function classes, and with kernel classes in particular." 
+ }, + { + "title": "Complexity regularization via localized random penalties", + "abstract": "In this article, model selection via penalized empirical loss minimization in nonparametric classification problems is studied. Datadependent penalties are constructed, which are based on estimates of the complexity of a small subclass of each model class, containing only those functions with small empirical loss. The penalties are novel since those considered in the literature are typically based on the entire model class. Oracle inequalities using these penalties are established, and the advantage of the new penalties over those based on the complexity of the whole model class is demonstrated. 1. Introduction. In this article, we propose a new complexity-penalized model selection method based on data-dependent penalties. We consider the binary classification problem where, given a random observation X ∈ R d , one has to predict Y ∈ {0,1}. A classifier or classification rule is a function f : R d → {0,1}, with loss" + }, + { + "title": "Rademacher and Gaussian Complexities: Risk Bounds and Structural Results", + "abstract": null + }, + { + "title": "The covering number in learning theory", + "abstract": "The covering number of a ball of a reproducing kernel Hilbert space as a subset of the continuous function space plays an important role in Learning Theory. We give estimates for this covering number by means of the regularity of the Mercer kernel K. For convolution type kernels K(x, t) = k(x - t) on [0, 1]n, we provide estimates depending on the decay of k, the Fourier transform of k. In particular, when k decays exponentially, our estimate for this covering number is better than all the previous results and covers many important Mercer kernels. A counter example is presented to show that the eigenfunctions of the Hilbert-Schmidt operator LK associated with a Mercer kernel K may not be uniformly bounded. 
Hence some previous methods used for estimating the covering number in Learning Theory are not valid. We also provide an example of a Mercer kernel to show that L_K^{1/2} may not be generated by a Mercer kernel." + }, + { + "title": "Empirical margin distributions and bounding the generalization error of combined classifiers", + "abstract": "We prove new probabilistic upper bounds on generalization error of complex classifiers that are combinations of simple classifiers. Such combinations could be implemented by neural networks or by voting methods of combining the classifiers, such as boosting and bagging. The bounds are in terms of the empirical distribution of the margin of the combined classifier. They are based on the methods of the theory of Gaussian and empirical processes (comparison inequalities, symmetrization method, concentration inequalities) and they improve previous results of Bartlett (1998) on bounding the generalization error of neural networks in terms of ℓ1-norms of the weights of neurons and of Schapire, Freund, Bartlett and Lee (1998) on bounding the generalization error of boosting. We also obtain rates of convergence in Levy distance of empirical margin distribution to the true margin distribution uniformly over the classes of classifiers and prove the optimality of these rates." + }, + { + "title": "Statistical Modeling: The Two Cultures (with comments and a rejoinder by the author)", + "abstract": "There are two cultures in the use of statistical modeling to reach conclusions from data. One assumes that the data are generated by a given stochastic data model. The other uses algorithmic models and treats the data mechanism as unknown. The statistical community has been committed to the almost exclusive use of data models. This commitment has led to irrelevant theory, questionable conclusions, and has kept statisticians from working on a large range of interesting current problems. 
Algorithmic modeling, both in theory and practice, has developed rapidly in fields outside statistics. It can be used both on large complex data sets and as a more accurate and informative alternative to data modeling on smaller data sets. If our goal as a field is to use data to solve problems, then we need to move away from exclusive dependence on data models and adopt a more diverse set of tools." + }, + { + "title": "A Tutorial on Support Vector Machines for Pattern Recognition", + "abstract": null + }, + { + "title": "Nonlinear approximation", + "abstract": "This is a survey of nonlinear approximation, especially that part of the subject which is important in numerical computation. Nonlinear approximation means that the approximants do not come from linear spaces but rather from nonlinear manifolds. The central question to be studied is what, if any, are the advantages of nonlinear approximation over the simpler, more established, linear methods. This question is answered by studying the rate of approximation which is the decrease in error versus the number of parameters in the approximant. The number of parameters usually correlates well with computational effort. It is shown that in many settings the rate of nonlinear approximation can be characterized by certain smoothness conditions which are significantly weaker than required in the linear theory. Emphasis in the survey will be placed on approximation by piecewise polynomials and wavelets as well as their numerical implementation. Results on highly nonlinear methods such as optimal basis selection and greedy algorithms (adaptive pursuit) are also given. Applications to image processing, statistical estimation, regularity for PDEs, and adaptive algorithms are discussed." 
+ }, + { + "title": "Boosting the margin: A new explanation for the effectiveness of voting methods", + "abstract": "One of the surprising recurring phenomena observed in experiments with boosting is that the test error of the generated classifier usually does not increase as its size becomes very large, and often is observed to decrease even after the training error reaches zero. In this paper, we show that this phenomenon is related to the distribution of margins of the training examples with respect to the generated voting classification rule, where the margin of an example is simply the difference between the number of correct votes and the maximum number of votes received by any incorrect label. We show that techniques used in the analysis of Vapnik's support vector classifiers and of neural networks with small weights can be applied to voting methods to relate the margin distribution to the test error. We also show theoretically and experimentally that boosting is especially effective at increasing the margins of the training examples. Finally, we compare our explanation to those based on the bias-variance" + }, + { + "title": "Algorithmic Stability and Sanity-Check Bounds for Leave-One-Out Cross-Validation", + "abstract": "In this article we prove sanity-check bounds for the error of the leave-oneout cross-validation estimate of the generalization error: that is, bounds showing that the worst-case error of this estimate is not much worse than that of the training error estimate. The name sanity check refers to the fact that although we often expect the leave-one-out estimate to perform considerably better than the training error estimate, we are here only seeking assurance that its performance will not be considerably worse. Perhaps surprisingly, such assurance has been given only for limited cases in the prior literature on cross-validation. Any nontrivial bound on the error of leave-one-out must rely on some notion of algorithmic stability. 
Previous bounds relied on the rather strong notion of hypothesis stability, whose application was primarily limited to nearest-neighbor and other local algorithms. Here we introduce the new and weaker notion of error stability and apply it to obtain sanity-check bounds for leave-one-out for other classes of learning algorithms, including training error minimization procedures and Bayesian algorithms. We also provide lower bounds demonstrating the necessity of some form of error stability for proving bounds on the error of the leave-one-out estimate, and the fact that for training error minimization algorithms, in the worst case such bounds must still depend on the Vapnik-Chervonenkis dimension of the hypothesis class." + }, + { + "title": "Support-Vector Networks", + "abstract": null + }, + { + "title": "On the degree of polynomials that approximate symmetric Boolean functions (preliminary version)", + "abstract": "In this paper, we provide matching (up to a constant factor) upper and lower bounds on the degree of polynomials that represent symmetric boolean functions with an error 1/3. Let Γ(f)=min{|2kn+1|:fk fk+ 1 and 0 ≤ kn – 1} where fi is the value of f on inputs with exactly i 1's. We prove that the minimum degree over all the approximating polynomials of f is &THgr;((n(n-Γ(f))).5). We apply the techniques and tools from approximation theory to derive this result." + }, + { + "title": "Learning From Noisy Examples", + "abstract": null + }, + { + "title": "A Finite Sample Distribution-Free Performance Bound for Local Discrimination Rules", + "abstract": null + }, + { + "title": "Approximation of monomials by lower degree polynomials", + "abstract": null + }, + { + "title": "An Information Measure for Classification", + "abstract": "1. The class to which each thing belongs. 2. The average properties of each class. 3. The deviations of each thing from the average properties of its parent class. 
If the things are found to be concentrated in a small area of the region of each class in the measurement space then the deviations will be small, and with reference to the average class properties most of the information about a thing is given by naming the class to which it belongs. In this case the information may be recorded much more briefly than if a classification had not been used. We suggest that the best classification is that which results in the briefest recording of all the attribute information. In this context, we will regard the measurements of each thing as being a message about that thing. Shannon (1948) showed that where messages may be regarded as each nominating the occurrence of a particular event among a universe of possible events, the information needed to record a series of such messages is minimised if the messages are encoded so that the length of each message is proportional to minus the logarithm of the relative frequency of occurrence of the event which it nominates. The information required is greatest when all frequencies are equal. The messages here nominate the positions in measurement space of the 5 1 points representing the attributes of the things. If the expected density of points in the measurement space is everywhere uniform, the positions of the points cannot be encoded more briefly than by a simple list of the measured values. However, if the expected density is markedly non-uniform, application" + }, + { + "title": "UCI Machine Learning Repository", + "abstract": null + }, + { + "title": "2019. UCI Machine Learning Repository. University of California, Irvine, School of Information and Computer Sciences", + "abstract": null + }, + { + "title": "Data Set Descriptions We provide a description of the data sets used in our experiments in Table 4. 
All of them were downloaded from the UCI Machine Learning Repository (Dua and Graff, 2019)", + "abstract": null + }, + { + "title": "Algorithms and Error Bounds for Multivariate Piecewise Constant Approximation", + "abstract": null + }, + { + "title": "The estimate for approximation error of neural networks: A constructive approach", + "abstract": null + }, + { + "title": "The prediction uncertainty", + "abstract": "Participating time laboratories provide the evaluation for the prediction uncertainty of [UTCUTC(k)] within the framework of Calibration and Measurement Capabilities (CMCs) and CCTF key comparisons for the key comparison database (KCDB). The time interval for declaring the prediction uncertainty is 20 days and the declared values show a large variation; from 20 ns to 200 ns. A study was initiated as a result of this large variation among the declared values to determine reasonable values for the prediction uncertainty of [UTCUTC(k)] depending on the clock used and the time transfer method. The laboratories should exercise caution when evaluating the prediction uncertainty and to avoid misunderstandings due to the deviation between the values of [UTC-UTC(k)] over 20 days." + }, + { + "title": "Stability and Generalization", + "abstract": "We define notions of stability for learning algorithms and show how to use these notions to derive generalization error bounds based on the empirical error and the leave-one-out error. The methods we use can be applied in the regression framework as well as in the classification one when the classifier is obtained by thresholding a real-valued function. We study the stability properties of large classes of learning algorithms such as regularization based algorithms. In particular we focus on Hilbert space regularization and Kullback-Leibler regularization. We demonstrate how to apply the results to SVM for regression and classification." 
+ }, + { + "title": "A Few Notes on Statistical Learning Theory", + "abstract": null + }, + { + "title": "PAC-Bayes & Margins", + "abstract": "We show two related things: \n \n(1) Given a classifier which consists of a weighted sum of features with a large margin, we can construct a stochastic classifier with negligibly larger training error rate. The stochastic classifier has a future error rate bound that depends on the margin distribution and is independent of the size of the base hypothesis class. \n \n(2) A new true error bound for classifiers with a margin which is simpler, functionally tighter, and more data-dependent than all previous bounds." + }, + { + "title": "The Nature of Statistical Learning Theory", + "abstract": null + }, + { + "title": "Flat Minima", + "abstract": "We present a new algorithm for finding low-complexity neural networks with high generalization capability. The algorithm searches for a flat minimum of the error function. A flat minimum is a large connected region in weight space where the error remains approximately constant. An MDL-based, Bayesian argument suggests that flat minima correspond to simple networks and low expected overfitting. The argument is based on a Gibbs algorithm variant and a novel way of splitting generalization error into underfitting and overfitting error. Unlike many previous approaches, ours does not require gaussian assumptions and does not depend on a good weight prior. Instead we have a prior over input output functions, thus taking into account net architecture and training set. Although our algorithm requires the computation of second-order derivatives, it has backpropagation's order of complexity. Automatically, it effectively prunes units, weights, and input lines. Various experiments with feedforward and recurrent nets are described. In an application to stock market prediction, flat minimum search outperforms conventional backprop, weight decay, and optimal brain surgeon/optimal brain damage." 
+ }, + { + "title": "The Nature of Statistical Learning", + "abstract": null + }, + { + "title": "Chervonenkis: On the uniform convergence of relative frequencies of events to their probabilities", + "abstract": null + } + ] + }, + "author_data": { + "189c8561-cf9e-48db-920d-a21fae9aa663": { + "pk": "189c8561-cf9e-48db-920d-a21fae9aa663", + "project_name": null, + "name": "Cynthia Rudin", + "bio": "I am a researcher dedicated to advancing methodologies in causal inference, machine learning, and data analysis, particularly in the context of observational studies and treatment effect estimation. My work focuses on creating high-quality treatment-control matches for categorical data, utilizing a novel approach based on weighted Hamming distances that accounts for the importance of covariates. This method not only enhances match quality but also effectively handles irrelevant variables and missing data.\n\nIn addition to matching techniques, I have explored the intricacies of variable importance measures, proposing a framework that provides a comprehensive understanding of how different models rely on specific covariates. My research also delves into the challenges of inference, introducing concepts like hacking intervals to address biases in data analysis.\n\nI am passionate about developing interpretable machine learning models, as demonstrated in my work on deep networks that mimic human reasoning in image classification. My recent contributions include the creation of the 2HELPS2B scoring system for predicting seizure risks from continuous EEG monitoring, showcasing my commitment to applying machine learning in clinical settings.\n\nOverall, my research aims to bridge the gap between complex data analysis and practical applications, ensuring that our methodologies are robust, interpretable, and beneficial for real-world decision-making.", + "collaborators": [ + "A. Volfovsky", + "Awa Dieng", + "Yameng Liu", + "Sudeepa Roy", + "Beau Coker", + "Aaron Fisher", + "F. 
Dominici", + "S. Ertekin", + "Yijie Bei", + "Alexandru Damian", + "Shijia Hu", + "Sachit Menon", + "Nikhila Ravi", + "Gary King", + "Chaofan CHEN", + "Chaofan Tao", + "A. Barnett", + "Siong Thye Goh", + "R. Parr", + "Kamesh Munagala", + "Caroline Linjun Wang", + "Yining Wang", + "Marco Morucci", + "Md. Noor-E.-Alam", + "A. Struck", + "Berk Ustun", + "Andres A. Rodriguez Ruiz", + "Jong Woo Lee", + "S. Laroche", + "L. Hirsch", + "E. Gilmore", + "J. Vlachý", + "Hiba A. Haider", + "M. Westover", + "Harsh Parikh", + "John Benhart", + "Tianlin Duan", + "Peter Hase", + "Liuyi Zhu", + "Mai-Anh T. Vu", + "T. Adalı", + "Demba E. Ba", + "G. Buzsáki", + "David Edwin Carlson", + "K. Heller", + "C. Liston", + "V. Sohal", + "A. Widge", + "H. Mayberg", + "G. Sapiro", + "K. Dzirasa" + ], + "pub_titles": [ + "Almost-Exact Matching with Replacement for Causal Inference", + "New Techniques for Preserving Global Structure and Denoising with Low Information Loss in Single-Image Super-Resolution", + "A Theory of Statistical Inference for Ensuring the Robustness of Scientific Results", + "This Looks Like That: Deep Learning for Interpretable Image Recognition", + "A Minimax Surrogate Loss Approach to Conditional Difference Estimation", + "Collapsing-Fast-Large-Almost-Matching-Exactly: A Matching Method for Causal Inference", + "Model Class Reliance: Variable Importance Measures for any Machine Learning Model Class, from the \"Rashomon\" Perspective", + "The age of secrecy and unfairness in recidivism prediction", + "Direct Learning to Rank and Rerank", + "All Models are Wrong, but Many are Useful: Learning a Variable's Importance by Studying an Entire Class of Prediction Models Simultaneously", + "Interpretable Almost Matching Exactly for Causal Inference", + "A Robust Approach to Quantifying Uncertainty in Matching Problems of Causal Inference", + "A Practical Risk Score for EEG Seizures in Hospitalized Patients (S11.002)", + "MALTS: Matching After Learning to Stretch", + "Shall I 
Compare Thee to a Machine-Written Sonnet? An Approach to Algorithmic Sonnet Generation", + "A Shared Vision for Machine Learning in Neuroscience" + ], + "pub_abstracts": [ + "We aim to create the highest possible quality of treatment-control matches for categorical data in the potential outcomes framework. Matching methods are heavily used in the social sciences due to their interpretability, but most matching methods do not pass basic sanity checks: they fail when irrelevant variables are introduced, and tend to be either computationally slow or produce low-quality matches. The method proposed in this work aims to match units on a weighted Hamming distance, taking into account the relative importance of the covariates; the algorithm aims to match units on as many relevant variables as possible. To do this, the algorithm creates a hierarchy of covariate combinations on which to match (similar to downward closure), in the process solving an optimization problem for each unit in order to construct the optimal matches. The algorithm uses a single dynamic program to solve all of the optimization problems simultaneously. Notable advantages of our method over existing matching procedures are its high-quality matches, versatility in handling different data distributions that may have irrelevant variables, and ability to handle missing data by matching on as many available covariates as possible.", + "This work identifies and addresses two important technical challenges in single-image super-resolution: (1) how to upsample an image without magnifying noise and (2) how to preserve large scale structure when upsampling. We summarize the techniques we developed for our second place entry in Track 1 (Bicubic Downsampling), seventh place entry in Track 2 (Realistic Adverse Conditions), and seventh place entry in Track 3 (Realistic difficult) in the 2018 NTIRE Super-Resolution Challenge. 
Furthermore, we present new neural network architectures that specifically address the two challenges listed above: denoising and preservation of large-scale structure.", + "Inference is the process of using facts we know to learn about facts we do not know. A theory of inference gives assumptions necessary to get from the former to the latter, along with a definition for and summary of the resulting uncertainty. Any one theory of inference is neither right nor wrong but merely an axiom that may or may not be useful. Each of the many diverse theories of inference can be valuable for certain applications. However, no existing theory of inference addresses the tendency to choose, from the range of plausible data analysis specifications consistent with prior evidence, those that inadvertently favor one’s own hypotheses. Because the biases from these choices are a growing concern across scientific fields, and in a sense the reason the scientific community was invented in the first place, we introduce a new theory of inference designed to address this critical problem. We introduce hacking intervals, which are the range of a summary statistic one may obtain given a class of possible endogenous manipulations of the data. Hacking intervals require no appeal to hypothetical data sets drawn from imaginary superpopulations. A scientific result with a small hacking interval is more robust to researcher manipulation than one with a larger interval and is often easier to interpret than a classical confidence interval. Some versions of hacking intervals turn out to be equivalent to classical confidence intervals, which means they may also provide a more intuitive and potentially more useful interpretation of classical confidence intervals. This paper was accepted by J. 
George Shanthikumar, big data analytics.", + "When we are faced with challenging image classification tasks, we often explain our reasoning by dissecting the image, and pointing out prototypical aspects of one class or another. The mounting evidence for each of the classes helps us make our final decision. In this work, we introduce a deep network architecture that reasons in a similar way: the network dissects the image by finding prototypical parts, and combines evidence from the prototypes to make a final classification. The model thus reasons in a way that is qualitatively similar to the way ornithologists, physicians, geologists, architects, and others would explain to people on how to solve challenging image classification tasks. The network uses only image-level labels for training, meaning that there are no labels for parts of images. We demonstrate our method on the CUB-200-2011 dataset and the CBIS-DDSM dataset. Our experiments show that our interpretable network can achieve comparable accuracy with its analogous standard non-interpretable counterpart as well as other interpretable deep models.", + "We present a new machine learning approach to estimate personalized treatment effects in the classical potential outcomes framework with binary outcomes. To overcome the problem that both treatment and control outcomes for the same unit are required for supervised learning, we propose surrogate loss functions that incorporate both treatment and control data. The new surrogates yield tighter bounds than the sum of losses for treatment and control groups. A specific choice of loss function, namely a type of hinge loss, yields a minimax support vector machine formulation. The resulting optimization problem requires the solution to only a single convex optimization problem, incorporating both treatment and control units, and it enables the kernel trick to be used to handle nonlinear (also non-parametric) estimation. 
Statistical learning bounds are also presented for the framework, and experimental results.", + "We aim to create the highest possible quality of treatment-control matches for categorical data in the potential outcomes framework. Matching methods are heavily used in the social sciences due to their interpretability, but most matching methods in the past do not pass basic sanity checks in that they fail when irrelevant variables are introduced. Also, past methods tend to be either computationally slow or produce poor matches. The method proposed in this work aims to match units on a weighted Hamming distance, taking into account the relative importance of the covariates; the algorithm aims to match units on as many relevant variables as possible. To do this, the algorithm creates a hierarchy of covariate combinations on which to match (similar to downward closure), in the process solving an optimization problem for each unit in order to construct the optimal matches. The algorithm uses a single dynamic program to solve all of optimization problems simultaneously. Notable advantages of our method over existing matching procedures are its high-quality matches, versatility in handling different data distributions that may have irrelevant variables, and ability to handle missing data by matching on as many available covariates as possible", + "There are serious drawbacks to many current variable importance (VI) methods, in that they tend to not be comparable across model types, can obscure implicit assumptions about the data generating distribution, or can give seemingly incoherent results when multiple prediction models fit the data well. In this paper we propose a framework of VI measures for describing how much any model class (e.g. all linear models of dimension p), any model-fitting algorithm (e.g. Ridge regression with fixed regularization parameter), or any individual prediction model (e.g. 
a single linear model with fixed coefficient vector), relies on covariate(s) of interest. The building block of our approach, Model Reliance (MR), compares a prediction model's expected loss with that model's expected loss on a pair observations in which the value of the covariate of interest has been switched. Expanding on MR, we propose Model Class Reliance (MCR) as the upper and lower bounds on the degree to which any well-performing prediction model within a class may rely on a variable of interest, or set of variables of interest. Thus, MCR describes reliance on a variable while accounting for the fact that many prediction models, possibly of different parametric forms, may fit the data well. We give probabilistic bounds for MR and MCR, leveraging existing results for U-statistics. We also illustrate connections between MR, conditional causal effects, and linear regression coefficients. We outline implementations of our approaches for regularized linear regression, and to regression in a reproducing kernel Hilbert space. We then apply MR & MCR to study the behavior of recidivism prediction models, using a public dataset of Broward County criminal records.", + "In our current society, secret algorithms make important decisions about individuals. There has been substantial discussion about whether these algorithms are unfair to groups of individuals. While noble, this pursuit is complex and ultimately stagnating because there is no clear definition of fairness and competing definitions are largely incompatible. We argue that the focus on the question of fairness is misplaced, as these algorithms fail to meet a more important and yet readily obtainable goal: transparency. As a result, creators of secret algorithms can provide incomplete or misleading descriptions about how their models work, and various other kinds of errors can easily go unnoticed. 
By partially reverse engineering the COMPAS algorithm -- a recidivism-risk scoring algorithm used throughout the criminal justice system -- we show that it does not seem to depend linearly on the defendant's age, despite statements to the contrary by the algorithm's creator. Furthermore, by subtracting from COMPAS its (hypothesized) nonlinear age component, we show that COMPAS does not necessarily depend on race, contradicting ProPublica's analysis, which assumed linearity in age. In other words, faulty assumptions about a proprietary algorithm lead to faulty conclusions that go unchecked without careful reverse engineering. Were the algorithm transparent in the first place, this would likely not have occurred. The most important result in this work is that we find that there are many defendants with low risk score but long criminal histories, suggesting that data inconsistencies occur frequently in criminal justice databases. We argue that transparency satisfies a different notion of procedural fairness by providing both the defendants and the public with the opportunity to scrutinize the methodology and calculations behind risk scores for recidivism.", + "Learning-to-rank techniques have proven to be extremely useful for prioritization problems, where we rank items in order of their estimated probabilities, and dedicate our limited resources to the top-ranked items. This work exposes a serious problem with the state of learning-to-rank algorithms, which is that they are based on convex proxies that lead to poor approximations. We then discuss the possibility of \"exact\" reranking algorithms based on mathematical programming. We prove that a relaxed version of the \"exact\" problem has the same optimal solution, and provide an empirical analysis.", + "Variable importance (VI) tools describe how much covariates contribute to a prediction model's accuracy. 
However, important variables for one well-performing model (for example, a linear model f (x) = x T β with a fixed coefficient vector β) may be unimportant for another model. In this paper, we propose model class reliance (MCR) as the range of VI values across all well-performing model in a prespecified class. Thus, MCR gives a more comprehensive description of importance by accounting for the fact that many prediction models, possibly of different parametric forms, may fit the data well. In the process of deriving MCR, we show several informative results for permutation-based VI estimates, based on the VI measures used in Random Forests. Specifically, we derive connections between permutation importance estimates for a single prediction model, U-statistics, conditional variable importance, conditional causal effects, and linear model coefficients. We then give probabilistic bounds for MCR, using a novel, generalizable technique. We apply MCR to a public data set of Broward County criminal records to study the reliance of recidivism prediction models on sex and race. In this application, MCR can be used to help inform VI for unknown, proprietary models.", + "We aim to create the highest possible quality of treatment-control matches for categorical data in the potential outcomes framework. Matching methods are heavily used in the social sciences due to their interpretability, but most matching methods do not pass basic sanity checks: they fail when irrelevant variables are introduced, and tend to be either computationally slow or produce low-quality matches. The method proposed in this work aims to match units on a weighted Hamming distance, taking into account the relative importance of the covariates; the algorithm aims to match units on as many relevant variables as possible. 
To do this, the algorithm creates a hierarchy of covariate combinations on which to match (similar to downward closure), in the process solving an optimization problem for each unit in order to construct the optimal matches. The algorithm uses a single dynamic program to solve all of the optimization problems simultaneously. Notable advantages of our method over existing matching procedures are its high-quality matches, versatility in handling different data distributions that may have irrelevant variables, and ability to handle missing data by matching on as many available covariates as possible.", + "Unquantified sources of uncertainty in observational causal analyses can break the integrity of the results. One would never want another analyst to repeat a calculation with the same data set, using a seemingly identical procedure, only to find a different conclusion. However, as we show in this work, there is a typical source of uncertainty that is essentially never considered in observational causal studies: the choice of match assignment for matched groups—that is, which unit is matched to which other unit before a hypothesis test is conducted. The choice of match assignment is anything but innocuous and can have a surprisingly large influence on the causal conclusions. Given that a vast number of causal inference studies test hypotheses on treatment effects after treatment cases are matched with similar control cases, we should find a way to quantify how much this extra source of uncertainty impacts results. What we would really like to be able to report is that no matter which match assignment is made, as long as the match is sufficiently good, then the hypothesis test results are still informative. In this paper, we provide methodology based on discrete optimization to create robust tests that explicitly account for this possibility. 
We formulate robust tests for binary and continuous data based on common test statistics as integer linear programs solvable with common methodologies. We study the finite-sample behavior of our test statistic in the discrete-data case. We apply our methods to simulated and real-world data sets and show that they can produce useful results in practical applied settings.", + "Objective: To use seizure risk factors from continuous EEG monitoring (cEEG) to create a simple scoring system for predicting the probability of electrographic seizures in patients with acute illness. Background: cEEG use has expanded, motivated by the high incidence of subclinical seizures in hospitalized encephalopathic patients. EEG features reported as predictors of seizures include epileptiform discharges and periodic discharges. However, no study has examined how these predictors affect seizure risk jointly. Here we propose a simple scoring system for seizure risk that we refer to as the 2HELPS2B score. Design/Methods: We used a prospective database to derive a dataset containing 24 clinical and electrographic variables for 5427 >24hr cEEG sessions. Using this dataset, we created a scoring system model to estimate seizure risk in patients undergoing cEEG. The model was built using a new machine learning method (RiskSLIM) that is designed to produce accurate, risk-calibrated, scoring systems with a limited number of variables and small integer weights. We validated the predictive accuracy and risk-calibration of our model using cross-validation, and compared its performance to models built with state-of-the-art logistic regression methods. Results: Our final model (2HELPS2B) has an AUC of 0.821 and average calibration error of 2.7%. 
It includes 6 variables with the following point assignments: (i) brief potentially ictal rhythmic discharges (B(I)RDs) (2 points); (ii) presence of LPD, LRDA, or BIPDs (1 point); (iii) prior seizure (1 point); (iv) sporadic epileptiform discharges (1 point); (v) frequency >2.0Hz for any periodic or rhythmic pattern (1 point); (vi) presence of “plus” features (superimposed, rhythmic, sharp, or fast activity) (1 point). The predicted seizure risk of each score is: 0: 5%, 1: 12%, 2: 27%, 3: 50%, 4: 73%, 5: 88%, 6–7: >95%. Conclusions: The 2HELPS2B score allows accurate prediction of seizures by adding points for 6 readily available variables from the patient history and initial EEG. Study Supported by: This study was supported by a Research Infrastructure award from the American Epilepsy Society and the Epilepsy Foundation. Disclosure: Dr. Struck has nothing to disclose. Dr. Ustun has nothing to disclose. Dr. Rodriguez-Ruiz has nothing to disclose. Dr. Lee has nothing to disclose. Dr. LaRoche has received royalty, license fees, or contractual rights payments from Demos Publishing. Dr. Hirsch has nothing to disclose. Dr Gilmore has nothing to disclose. Dr. Vlachy has nothing to disclose. Dr. Haider has nothing to disclose. Dr. Rudin has nothing to disclose. Dr. Westover has nothing to disclose.", + "We introduce a flexible framework that produces high-quality almost-exact matches for causal inference. Most prior work in matching uses ad-hoc distance metrics, often leading to poor quality matches, particularly when there are irrelevant covariates. In this work, we learn an interpretable distance metric for matching, which leads to substantially higher quality matches. The learned distance metric stretches the covariates according to their contribution to outcome prediction. The framework is flexible in that the user can choose the form of the distance metric and the type of optimization algorithm. 
Our ability to learn flexible distance metrics leads to matches that are interpretable and useful for the estimation of conditional average treatment effects.", + "We provide an approach for generating beautiful poetry. Our sonnet-generation algorithm includes several novel elements that improve over the state of the art, leading to metrical, rhyming poetry with many human-like qualities. These novel elements include in-line punctuation, part of speech restrictions, and more appropriate training corpora. Our work is the winner of the 2018 PoetiX Literary Turing Test Award for computer-generated poetry.", + "With ever-increasing advancements in technology, neuroscientists are able to collect data in greater volumes and with finer resolution. The bottleneck in understanding how the brain works is consequently shifting away from the amount and type of data we can collect and toward what we actually do with the data. There has been a growing interest in leveraging this vast volume of data across levels of analysis, measurement techniques, and experimental paradigms to gain more insight into brain function. Such efforts are visible at an international scale, with the emergence of big data neuroscience initiatives, such as the BRAIN initiative (Bargmann et al., 2014), the Human Brain Project, the Human Connectome Project, and the National Institute of Mental Health's Research Domain Criteria initiative. With these large-scale projects, much thought has been given to data-sharing across groups (Poldrack and Gorgolewski, 2014; Sejnowski et al., 2014); however, even with such data-sharing initiatives, funding mechanisms, and infrastructure, there still exists the challenge of how to cohesively integrate all the data. At multiple stages and levels of neuroscience investigation, machine learning holds great promise as an addition to the arsenal of analysis tools for discovering how the brain works." 
+ ], + "domain": [ + "Causal Inference", + "Machine Learning", + "Image Processing", + "Variable Importance" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + } + }, + "reference_proposal": "**[Question 1] - What is the problem?** \nIs it possible to determine the existence of simpler yet accurate models in machine learning before engaging in computationally expensive searches for them?\n\n**[Question 2] - Why is it interesting and important?** \nSolving this problem could significantly impact the research community by shifting the focus back to simpler, interpretable models, which are crucial for high-stakes decision-making. If we can establish that simpler models are likely to exist for many datasets, it would encourage researchers and practitioners to invest time in finding these models, potentially leading to advancements in interpretability and trust in machine learning systems. This could also open new avenues for practical applications in fields such as healthcare, finance, and law, where understanding model decisions is essential.\n\n**[Question 3] - Why is it hard?** \nThe challenge lies in the inherent complexity of the optimization problem when introducing simplicity constraints, such as sparsity, which can make the search for simpler models NP-hard. Naive approaches may fail because they do not account for the intricate relationships within the data that could lead to simpler representations. Additionally, there are theoretical obstacles in proving the existence of simpler models within the vast space of potential solutions, as well as practical difficulties in designing algorithms that can efficiently navigate this space.\n\n**[Question 4] - Why hasn't it been solved before?** \nPrevious research has largely focused on complex models that achieve high accuracy without considering the potential for simpler alternatives. 
There has been a lack of theoretical frameworks that directly address the existence of simpler models in the context of the Rashomon effect. Barriers include the prevailing mindset in machine learning that prioritizes model complexity for performance, as well as insufficient exploration of the relationship between model simplicity and generalization. Our approach differs by explicitly investigating the conditions under which simpler models can exist and be effective, thereby providing a new perspective on model selection.\n\n**[Question 5] - What are the key components of my approach and results?** \nOur proposed methodology involves analyzing datasets to identify the presence of large Rashomon sets and their implications for model simplicity. We will utilize a combination of theoretical analysis and empirical validation on benchmark datasets, measuring model performance using metrics such as accuracy and interpretability. The expected outcome is to demonstrate that for many datasets, simpler models not only exist but also perform comparably to more complex models, thereby providing a foundation for future research into model selection and interpretability in machine learning." + }, + "1411.5018": { + "paper_data": { + "title": "Frequentism and Bayesianism: A Python-driven Primer", + "url": "http://arxiv.org/abs/1411.5018v1", + "arxiv_id": "1411.5018", + "authors": [ + "Jake VanderPlas" + ], + "abstract": "This paper presents a brief, semi-technical comparison of the essential features of the frequentist and Bayesian approaches to statistical inference, with several illustrative examples implemented in Python. The differences between frequentism and Bayesianism fundamentally stem from differing definitions of probability, a philosophical divide which leads to distinct approaches to the solution of statistical problems as well as contrasting ways of asking and answering questions about unknown parameters. 
After an example-driven discussion of these differences, we briefly compare several leading Python statistical packages which implement frequentist inference using classical methods and Bayesian inference using Markov Chain Monte Carlo.", + "introduction": " Introduction One of the first things a scientist in a data-intensive field hears about statistics is that there are two different approaches: frequentism and Bayesianism. Despite their importance, many researchers never have opportunity to learn the distinctions between them and the different practical approaches that result. This paper seeks to synthesize the philosophical and pragmatic aspects of this debate, so that scientists who use these approaches might be better prepared to understand the tools available to them. Along the way we will explore the fundamental philosophical disagreement between frequentism and Bayesianism, explore the practical aspects of how this disagreement affects data analysis, and discuss the ways that these practices may affect the interpretation of scientific results. introduction can be found in [Gelman2004]. Below, we will propose a straightforward model and compare a standard frequentist approach with three MCMC implementations available in Python. Application: A Simple Linear Model As an example of a more realistic data-driven analysis, let’s consider a simple three-parameter linear model which fits a straight-line to data with unknown errors. The parameters will be the y-intercept a, the slope b, and the (unknown) normal scatter σ about the line. 
For data D = {x_i, y_i}, the model is ŷ(x_i | a, b) = a + b x_i, and the likelihood is the product of the Gaussian distribution for each point: L(D | a, b, σ) = (2πσ²)^(−N/2) ∏_{i=1}^{N} exp( −[y_i − ŷ(x_i | a, b)]² / (2σ²) ). We will evaluate this model on the following data set: import numpy as np np.random.seed(42) # for repeatability theta_true = (25, 0.5) xdata = 100 *np.random.random(20) ydata = theta_true[0] + theta_true[1] *xdata ydata = np.random.normal(ydata, 10) # add error Below we will consider a frequentist solution to this problem computed with the statsmodels package, as well as a Bayesian solution computed with several MCMC implementations in Python: emcee, PyMC, and PyStan. A full experiments, but this is not fundamental. The probability is a statement of the researcher’s knowledge of what the true flux is. For Bayesians, probabilities are fundamentally related to their own knowledge about an event. This means, for example, that in a Bayesian view, we can meaningfully talk about the probability that the true flux of a star lies in a given range. That probability codifies our knowledge of the value based on prior information and available data. The surprising thing is that this arguably subtle difference in philosophy can lead, in practice, to vastly different approaches to the statistical analysis of data. Below we will explore a few examples chosen to illustrate the differences in approach, along with associated Python code to demonstrate the practical aspects of the frequentist and Bayesian approaches. A Simple Example: Photon Flux Measurements First we will compare the frequentist and Bayesian approaches to the solution of an extremely simple problem. Imagine that we point a telescope to the sky, and observe the light coming from a single star. For simplicity, we will assume that the star’s true photon flux is constant with time, i.e. 
that is it has a fixed value F; we will also ignore effects like sky background systematic errors. We will assume that a series of N measurements are performed, where the i-th measurement reports the observed flux F_i and error e_i. The question is, given this set of measurements D = {F_i, e_i}, what is our best estimate of the true flux F? First we will use Python to generate some toy data to demonstrate the two approaches to the problem. We will draw 50 samples F_i with a", + "references": [ + { + "title": "emcee: The MCMC Hammer", + "abstract": "We introduce a stable, well tested Python implementation of the affine-invariant ensemble sampler for Markov chain Monte Carlo (MCMC) proposed by Goodman & Weare (). The code is open source and has already been used in several published projects in the astrophysics literature. The algorithm behind emcee has several advantages over traditional MCMC sampling methods and it has excellent performance as measured by the autocorrelation time (or function calls per independent sample). One major advantage of the algorithm is that it requires hand-tuning of only 1 or 2 parameters compared to ∼N² for a traditional algorithm in an N-dimensional parameter space. In this document, we describe the algorithm and the details of our implementation. Exploiting the parallelism of the ensemble method, emcee permits any user to take advantage of multiple CPU cores without extra effort. The code is available online at http://dan.iel.fm/emcee under the GNU General Public License v2." + }, + { + "title": "The No-U-turn sampler: adaptively setting path lengths in Hamiltonian Monte Carlo", + "abstract": "Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC) algorithm that avoids the random walk behavior and sensitivity to correlated parameters that plague many MCMC methods by taking a series of steps informed by first-order gradient information. 
These features allow it to converge to high-dimensional target distributions much more quickly than simpler methods such as random walk Metropolis or Gibbs sampling. However, HMC's performance is highly sensitive to two user-specified parameters: a step size {\\epsilon} and a desired number of steps L. In particular, if L is too small then the algorithm exhibits undesirable random walk behavior, while if L is too large the algorithm wastes computation. We introduce the No-U-Turn Sampler (NUTS), an extension to HMC that eliminates the need to set a number of steps L. NUTS uses a recursive algorithm to build a set of likely candidate points that spans a wide swath of the target distribution, stopping automatically when it starts to double back and retrace its steps. Empirically, NUTS perform at least as efficiently as and sometimes more efficiently than a well tuned standard HMC method, without requiring user intervention or costly tuning runs. We also derive a method for adapting the step size parameter {\\epsilon} on the fly based on primal-dual averaging. NUTS can thus be used with no hand-tuning at all. NUTS is also suitable for applications such as BUGS-style automatic inference engines that require efficient \"turnkey\" sampling algorithms." + }, + { + "title": "PyMC: Bayesian Stochastic Modelling in Python.", + "abstract": "This user guide describes a Python package, PyMC, that allows users to efficiently code a probabilistic model and draw samples from its posterior distribution using Markov chain Monte Carlo techniques." + }, + { + "title": "Ensemble samplers with affine invariance", + "abstract": "We propose a family of Markov chain Monte Carlo methods whose performance is unaffected by affine tranformations of space. These algorithms are easy to construct and require little or no additional computational overhead. They should be particularly useful for sampling badly scaled distributions. 
Computational tests show that the affine invariant methods can be significantly faster than standard MCMC methods on highly skewed distributions." + }, + { + "title": "All of Statistics: A Concise Course in Statistical Inference", + "abstract": "WINNER OF THE 2005 DEGROOT PRIZE! This book is for people who want to learn probability and statistics quickly. It brings together many of the main ideas in modern statistics in one place. The book is suitable for students and researchers in statistics, computer science, data mining and machine learning. This book covers a much wider range of topics than a typical introductory text on mathematical statistics. It includes modern topics like nonparametric curve estimation, bootstrapping and classification, topics that are usually relegated to follow-up courses. The reader is assumed to know calculus and a little linear algebra. No previous knowledge of probability and statistics is required. The text can be used at the advanced undergraduate and graduate level." + }, + { + "title": "An Illuminating Counterexample", + "abstract": "Sd+1PI1 occurs exactly as a product (aci . .* aliklaak ... aid)(/llaj), where the index j belongs to {1, 2, ..., n} \\ {il, ..., id) and ik-1 < j < ik. (Of course if j < ii, then aj is the first factor, while if id < j, then aj is the last factor in the bracket.) Therefore each such summand ailai2 ... a d Occurs n d times. Precisely because (n d)sd consists of the monomials y-(n d)ail ... aid where 1 < i1 < i2 < ... < id < n, we see that B contains no positive monomial. The upshot: we see that in the sum A + B there are only negative monomials, whereas in C there are only positive monomials. Therefore no monomial in A + B can be cancelled out by a monomial in C. Thus A + B = 0 = C, and our proof is complete." + }, + { + "title": "Inverse problems as statistics", + "abstract": "What mathematicians, scientists, engineers and statisticians mean by ‘inverse problem’ differs. 
For a statistician, an inverse problem is an inference or estimation problem. The data are finite in number and contain errors, as they do in classical estimation or inference problems, and the unknown typically is infinite dimensional, as it is in nonparametric regression. The additional complication in an inverse problem is that the data are only indirectly related to the unknown. Canonical abstract formulations of statistical estimation problems subsume this complication by allowing probability distributions to be indexed in more-or-less arbitrary ways by parameters, which can be infinite dimensional. Standard statistical concepts, questions and considerations such as bias, variance, mean-squared error, identifiability, consistency, efficiency and various forms of optimality apply to inverse problems. This paper discusses inverse problems as statistical estimation and inference problems, and points to the literature for a variety of techniques and results. It shows how statistical measures of performance apply to techniques used in practical inverse problems, such as regularization, maximum penalized likelihood, Bayes estimation and the Backus–Gilbert method. The paper generalizes results of Backus and Gilbert characterizing parameters in inverse problems that can be estimated with finite bias. It also establishes general conditions under which parameters in inverse problems can be estimated consistently." + }, + { + "title": "Bayesian Data Analysis.", + "abstract": null + }, + { + "title": "An invariant form for the prior probability in estimation problems", + "abstract": "It is shown that a certain differential form depending on the values of the parameters in a law of chance is invariant for all transformations of the parameters when the law is differentiable with regard to all parameters. 
For laws containing a location and a scale parameter a form with a somewhat restricted type of invariance is found even when the law is not everywhere differentiable with regard to the parameters. This form has the properties required to give a general rule for stating the prior probability in a large class of estimation problems." + }, + { + "title": "Frequentism and Bayesianism", + "abstract": null + }, + { + "title": "Statsmodels: Econometric and Statistical Modeling with Python", + "abstract": "Statsmodels is a library for statistical and econometric analysis in Python. This paper discusses the current relationship between statistics and Python and open source more generally, outlining how the statsmodels package fills a gap in this relationship. An overview of statsmodels is provided, including a discussion of the overarching design and philosophy, what can be found in the package, and some usage examples. The paper concludes with a look at what the future holds." + }, + { + "title": "An essay towards solving a problem in the doctrine of chances", + "abstract": null + }, + { + "title": "Confidence Intervals vs Bayesian Intervals", + "abstract": null + } + ] + }, + "author_data": {}, + "reference_proposal": "**[Question 1] - What is the problem?** \nHow do the philosophical and practical differences between frequentist and Bayesian approaches affect the analysis and interpretation of data in scientific research?\n\n**[Question 2] - Why is it interesting and important?** \nUnderstanding the distinctions between frequentism and Bayesianism is crucial for researchers in data-intensive fields, as it influences the choice of statistical methods and the interpretation of results. By addressing this problem, the research community can enhance the rigor and transparency of data analysis, leading to more reliable scientific conclusions. 
This synthesis of philosophical and pragmatic aspects can guide future research methodologies, improve statistical literacy among scientists, and foster better communication of results across disciplines.\n\n**[Question 3] - Why is it hard?** \nThe challenge lies in the deep-rooted philosophical differences between the two approaches, which can lead to fundamentally different interpretations of the same data. Naive approaches may fail because they overlook these philosophical distinctions, leading to inappropriate application of statistical methods. Additionally, the technical complexities of implementing Bayesian methods, such as MCMC, require a solid understanding of both the underlying mathematics and computational techniques, which can be a barrier for many researchers.\n\n**[Question 4] - Why hasn't it been solved before?** \nPrevious research often focused on either frequentist or Bayesian methods in isolation, neglecting a comprehensive comparison that includes both philosophical and practical dimensions. Barriers include a lack of interdisciplinary communication and the dominance of frequentist methods in many scientific fields. This paper aims to bridge this gap by providing a clear comparison and practical examples that highlight the strengths and weaknesses of both approaches, thus improving upon prior work that has not synthesized these perspectives.\n\n**[Question 5] - What are the key components of my approach and results?** \nThe proposed methodology involves a comparative analysis of a simple three-parameter linear model using both frequentist and Bayesian approaches. The dataset will consist of simulated photon flux measurements with added noise. The metrics for evaluation will include parameter estimates and their uncertainties derived from both methods. 
Expected outcomes include a clearer understanding of how each approach influences the results and interpretations, along with practical Python implementations using libraries such as statsmodels for frequentist analysis and emcee, PyMC, and PyStan for Bayesian analysis." + }, + "1811.12808": { + "paper_data": { + "title": "Model Evaluation, Model Selection, and Algorithm Selection in Machine Learning", + "url": "http://arxiv.org/abs/1811.12808v3", + "arxiv_id": "1811.12808", + "authors": [ + "Sebastian Raschka" + ], + "abstract": "The correct use of model evaluation, model selection, and algorithm selection techniques is vital in academic machine learning research as well as in many industrial settings. This article reviews different techniques that can be used for each of these three subtasks and discusses the main advantages and disadvantages of each technique with references to theoretical and empirical studies. Further, recommendations are given to encourage best yet feasible practices in research and applications of machine learning. Common methods such as the holdout method for model evaluation and selection are covered, which are not recommended when working with small datasets. Different flavors of the bootstrap technique are introduced for estimating the uncertainty of performance estimates, as an alternative to confidence intervals via normal approximation if bootstrapping is computationally feasible. Common cross-validation techniques such as leave-one-out cross-validation and k-fold cross-validation are reviewed, the bias-variance trade-off for choosing k is discussed, and practical tips for the optimal choice of k are given based on empirical evidence. Different statistical tests for algorithm comparisons are presented, and strategies for dealing with multiple comparisons such as omnibus tests and multiple-comparison corrections are discussed. 
Finally, alternative methods for algorithm selection, such as the combined F-test 5x2 cross-validation and nested cross-validation, are recommended for comparing machine learning algorithms when datasets are small.", + "introduction": " Introduction to Data Mining . Pearson Addison Wesley, Boston. [Varma and Simon, 2006] Varma, S. and Simon, R. (2006). Bias in error estimation when using cross-validation for model selection. BMC bioinformatics , 7(1):91. [Varoquaux, 2017] Varoquaux, G. (2017). Cross-validation failure: small sample sizes lead to large error bars. Neuroimage . [Westfall et al., 2010] Westfall, P. H., Troendle, J. F., and Pennello, G. (2010). Multiple McNemar tests. Biometrics , 66(4):1185–1191. 49 Conclusions Since \"a picture is worth a thousand words,\" I want to conclude this series on model evaluation, model selection, and algorithm selection with a diagram (Figure 23) that summarizes my personal recommendations based on the concepts and literature that was reviewed. It should be stressed that parametric tests for comparing model performances usually violate one or more independent assumptions (the models are not independent because the same training set was used, and the estimated generalization performances are not independent because the same test set was used.). 
In an ideal world, we would have access to the data generating distribution or at least an 46Performanceestimation Model selection(hyperparameter optimization)and performance estimationLarge dataset▪2-way holdout method (train/test split)▪Confidence interval via normal approximationSmall dataset▪3-way holdout method (train/validation/test split)▪(Repeated) k-fold cross-validation without independent test set▪Leave-one-out cross-validation without independent test set▪Confidence interval via 0.632(+) bootstrap Model & algorithm comparison▪Multiple independent training sets + test sets (algorithm comparison, AC)▪McNemar test (model comparison, MC)▪Cochran’s Q + McNemar test (MC)▪Combined 5x2cv F test (AC)▪Nested cross-validation (AC)Large datasetSmall datasetLarge datasetSmall dataset▪(Repeated) k-fold cross-validation with independent test set▪Leave-one-out cross-validation with independent test set This work by Sebastian Raschka is licensed under aCreative CommonsAttribution 4.0 International License.Figure 23: A recommended subset of techniques to be used to address different aspects of model evaluation in the context of small and large datasets. The abbreviation \"MC\" stands for \"Model Comparison,\" and \"AC\" stands for \"Algorithm Comparison,\" to distinguish these two tasks. almost infinite pool of new data. However, in most practical applications, the size of the dataset is limited; hence, we can use one of the statistical tests discussed in this article as a heuristic to aid our decision making. Note that the recommendations I listed in the figure above are suggestions and depend on the problem at hand. 
For instance, large test datasets (where \"large\" is relative but might refer to thousands or millions of data records), can provide reliable estimates of the generalization performance, whereas using a single training and test set when only a few data records are available can be problematic for several reasons discussed throughout Section 2 and Section 3. If the dataset is very small, it might not be feasible to set aside data for testing, and in such cases, we can use k-fold cross-validation with a large kor Leave-one-out cross-validation as a workaround for evaluating", + "references": [] + }, + "author_data": { + "03db2dab-b073-4474-ae31-a51d4d02484d": { + "pk": "03db2dab-b073-4474-ae31-a51d4d02484d", + "project_name": null, + "name": "Sebastian Raschka", + "bio": "I am a researcher with a diverse background in machine learning, bioinformatics, and computational biology. My work primarily focuses on developing innovative algorithms and models that bridge the gap between technology and real-world applications. One of my notable contributions is the design of a convolutional autoencoder that enhances privacy in facial recognition systems by perturbing input images to obscure gender classification while maintaining recognition accuracy. This project introduced a novel semi-adversarial training scheme, showcasing my ability to integrate complex concepts into practical solutions.\n\nIn addition to my work in privacy-preserving technologies, I have explored the intricacies of protein-ligand interactions through my SiteInterlock approach, which leverages rigidity theory to predict native binding modes. This method not only enhances the accuracy of binding predictions but also provides insights into the cooperative nature of molecular interactions.\n\nMy research extends to sentiment analysis in music, where I developed a recommendation system using a naive Bayes classifier to predict song sentiments based on lyrics. 
This project highlights my interest in applying machine learning techniques to improve emotional well-being in various contexts.\n\nI am passionate about utilizing Python and its powerful libraries for machine learning and data analysis, as evidenced by my contributions to literature on predictive analytics. My goal is to continue exploring the intersections of technology and biology, driving advancements that can lead to meaningful societal impacts.", + "collaborators": [ + "R. Olson", + "Pronojit Saha", + "Nathan", + "Randy J. Carnevale", + "Ted", + "kadarakos", + "ktkirk", + "Daniel", + "derekjanni", + "screwed", + "Vahid Mirjalili", + "F. O'Donovan", + "Grishma Jena", + "A. Namboodiri", + "A. Ross", + "S. Turner", + "Daniel S. Standage", + "Cui Jie", + "Phelim Bradley", + "Daniel E Cook", + "deepstop", + "É. Normandeau", + "HLiang", + "Joseph Bemister-Buffington", + "L. Kuhn", + "Akshay Varik", + "weixuanfu", + "Randal S. Olson" + ], + "pub_titles": [ + "Semi-adversarial Networks: Convolutional Autoencoders for Imparting Privacy to Face Images", + "BioPandas: Working with molecular structures in pandas DataFrames", + "MusicMood: Predicting the mood of music from song lyrics using machine learning", + "Detecting the native ligand orientation by interfacial rigidity: SiteInterlock", + "Python machine learning : unlock deeper insights into machine learning with this vital guide to cutting-edge predictive analytics", + "Naive Bayes and Text Classification I - Introduction and Theory", + "An Overview of General Performance Metrics of Binary Classifier Systems", + "Statistical Identification of Potential CLAVATA2 Interactors by Fluorescence Resonance Energy Transfer Analysis" + ], + "pub_abstracts": [ + "In this paper, we design and evaluate a convolutional autoencoder that perturbs an input face image to impart privacy to a subject. 
Specifically, the proposed autoencoder transforms an input face image such that the transformed image can be successfully used for face recognition but not for gender classification. In order to train this autoencoder, we propose a novel training scheme, referred to as semi-adversarial training in this work. The training is facilitated by attaching a semi-adversarial module consisting of an auxiliary gender classifier and an auxiliary face matcher to the autoencoder. The objective function utilized for training this network has three terms: one to ensure that the perturbed image is a realistic face image; another to ensure that the gender attributes of the face are confounded; and a third to ensure that biometric recognition performance due to the perturbed image is not impacted. Extensive experiments confirm the efficacy of the proposed architecture in extending gender privacy to face images.", + "Furthermore, useful small-molecule related functions are provided for reading and parsing millions of small molecule structures (from multi-MOL2 files (Tripos 2007)) fast and efficiently in virtual screening applications. Inbuilt functions for filtering molecules by the presence of functional groups and their pair-wise distances to each other make BioPandas a particularly attractive utility library for virtual screening and protein-ligand docking applications.", + "Sentiment prediction of contemporary music can have a wide-range of applications in modern society, for instance, selecting music for public institutions such as hospitals or restaurants to potentially improve the emotional well-being of personnel, patients, and customers, respectively. In this project, music recommendation system built upon on a naive Bayes classifier, trained to predict the sentiment of songs based on song lyrics alone. 
The experimental results show that music corresponding to a happy mood can be detected with high precision based on text features obtained from song lyrics.", + "Understanding the physical attributes of protein‐ligand interfaces, the source of most biological activity, is a fundamental problem in biophysics. Knowing the characteristic features of interfaces also enables the design of molecules with potent and selective interactions. Prediction of native protein‐ligand interactions has traditionally focused on the development of physics‐based potential energy functions, empirical scoring functions that are fit to binding data, and knowledge‐based potentials that assess the likelihood of pairwise interactions. Here we explore a new approach, testing the hypothesis that protein‐ligand binding results in computationally detectable rigidification of the protein‐ligand interface. Our SiteInterlock approach uses rigidity theory to efficiently measure the relative interfacial rigidity of a series of small‐molecule ligand orientations and conformations for a number of protein complexes. In the majority of cases, SiteInterlock detects a near‐native binding mode as being the most rigid, with particularly robust performance relative to other methods when the ligand‐free conformation of the protein is provided. The interfacial rigidification of both the protein and ligand prove to be important characteristics of the native binding mode. This measure of rigidity is also sensitive to the spatial coupling of interactions and bond‐rotational degrees of freedom in the interface. While the predictive performance of SiteInterlock is competitive with the best of the five other scoring functions tested, its measure of rigidity encompasses cooperative rather than just additive binding interactions, providing novel information for detecting native‐like complexes. SiteInterlock shows special strength in enhancing the prediction of native complexes by ruling out inaccurate poses. 
Proteins 2016; 84:1888–1901. © 2016 Wiley Periodicals, Inc.", + "Unlock deeper insights into Machine Leaning with this vital guide to cutting-edge predictive analytics About This Book * Leverage Python's most powerful open-source libraries for deep learning, data wrangling, and data visualization * Learn effective strategies and best practices to improve and optimize machine learning systems and algorithms * Ask and answer tough questions of your data with robust statistical models, built for a range of datasets Who This Book Is For If you want to find out how to use Python to start answering critical questions of your data, pick up Python Machine Learning whether you want to get started from scratch or want to extend your data science knowledge, this is an essential and unmissable resource. What You Will Learn * Explore how to use different machine learning models to ask different questions of your data * Learn how to build neural networks using Keras and Theano * Find out how to write clean and elegant Python code that will optimize the strength of your algorithms * Discover how to embed your machine learning model in a web application for increased accessibility * Predict continuous target outcomes using regression analysis * Uncover hidden patterns and structures in data with clustering * Organize data using effective pre-processing techniques * Get to grips with sentiment analysis to delve deeper into textual and social media data In Detail Machine learning and predictive analytics are transforming the way businesses and other organizations operate. Being able to understand trends and patterns in complex data is critical to success, becoming one of the key strategies for unlocking growth in a challenging contemporary marketplace. 
Python can help you deliver key insights into your data its unique capabilities as a language let you build sophisticated algorithms and statistical models that can reveal new perspectives and answer key questions that are vital for success. Python Machine Learning gives you access to the world of predictive analytics and demonstrates why Python is one of the world's leading data science languages. If you want to ask better questions of data, or need to improve and extend the capabilities of your machine learning systems, this practical data science book is invaluable. Covering a wide range of powerful Python libraries, including scikit-learn, Theano, and Keras, and featuring guidance and tips on everything from sentiment analysis to neural networks, you'll soon be able to answer some of the most important questions facing you and your organization. Style and approach Python Machine Learning connects the fundamental theoretical principles behind machine learning to their practical application in a way that focuses you on asking and answering the right questions. It walks you through the key elements of Python and its powerful machine learning libraries, while demonstrating how to get to grips with a range of statistical models.", + "Naive Bayes classifiers, a family of classifiers that are based on the popular Bayes' probability theorem, are known for creating simple yet well performing models, especially in the fields of document classification and disease prediction. In this article, we will look at the main concepts of naive Bayes classification in the context of document categorization.", + "This document provides a brief overview of different metrics and terminology that is used to measure the performance of binary classification systems.", + "The overall goal of this study was to identify potential interactors of the CLAVATA2 (CLV2) membrane receptor, which is participating in the stem cell signaling pathway of the model plant Arabidopsis thaliana. 
In order to investigate the physical interaction between those proteins, a fluorescence resonance energy transfer (FRET) analysis was conducted. Data have been collected all by myself during my undergraduate laboratory experiences in August, 2011, at the Department of Developmental Genetics at Heinrich-Heine University Düsseldorf." + ], + "domain": [ + "Machine Learning", + "Computer Vision", + "Bioinformatics", + "Sentiment Analysis" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + } + }, + "reference_proposal": "### [Question 1] - What is the problem?\nHow can we improve the reliability of model evaluation and selection in machine learning, particularly when dealing with small datasets?\n\n### [Question 2] - Why is it interesting and important?\nSolving this problem is crucial for the research community as it addresses the pervasive issue of bias in error estimation during model selection, which can lead to misleading conclusions about model performance. By developing more robust evaluation techniques, future research can build on more reliable foundations, ultimately advancing knowledge in machine learning and leading to practical applications in fields such as bioinformatics, finance, and healthcare, where data scarcity is common.\n\n### [Question 3] - Why is it hard?\nThe challenges in solving this problem stem from the inherent limitations of small datasets, which can lead to high variance in performance estimates and unreliable model comparisons. Naive approaches, such as simple train/test splits or standard cross-validation, may fail to account for the dependencies between models and datasets, resulting in biased performance metrics. 
Technical obstacles include the need for sophisticated statistical methods to accurately estimate generalization performance and the complexity of implementing these methods in practice.\n\n### [Question 4] - Why hasn't it been solved before?\nPrevious research has often overlooked the specific challenges posed by small sample sizes, leading to a reliance on traditional evaluation methods that do not adequately address the issue of bias. Barriers include a lack of awareness of the limitations of existing techniques and insufficient emphasis on developing tailored solutions for small datasets. My approach differs by proposing a comprehensive framework that incorporates advanced statistical tests and cross-validation strategies specifically designed for small sample sizes, thereby improving upon prior work.\n\n### [Question 5] - What are the key components of my approach and results?\nMy proposed methodology includes the use of repeated k-fold cross-validation and leave-one-out cross-validation for small datasets, combined with statistical tests such as McNemar's test and Cochran’s Q for model comparison. I will utilize a diverse set of datasets from bioinformatics to validate the approach, measuring performance using metrics like accuracy and confidence intervals. The expected outcomes include a set of best practices for model evaluation that enhances reliability and reduces bias, ultimately leading to more trustworthy model selection in machine learning applications." + }, + "1503.04069": { + "paper_data": { + "title": "LSTM: A Search Space Odyssey", + "url": "http://arxiv.org/abs/1503.04069v2", + "arxiv_id": "1503.04069", + "authors": [ + "Klaus Greff", + "Rupesh Kumar Srivastava", + "Jan Koutník", + "Bas R. Steunebrink", + "Jürgen Schmidhuber" + ], + "abstract": "Several variants of the Long Short-Term Memory (LSTM) architecture for recurrent neural networks have been proposed since its inception in 1995. 
In recent years, these networks have become the state-of-the-art models for a variety of machine learning problems. This has led to a renewed interest in understanding the role and utility of various computational components of typical LSTM variants. In this paper, we present the first large-scale analysis of eight LSTM variants on three representative tasks: speech recognition, handwriting recognition, and polyphonic music modeling. The hyperparameters of all LSTM variants for each task were optimized separately using random search, and their importance was assessed using the powerful fANOVA framework. In total, we summarize the results of 5400 experimental runs ($\\approx 15$ years of CPU time), which makes our study the largest of its kind on LSTM networks. Our results show that none of the variants can improve upon the standard LSTM architecture significantly, and demonstrate the forget gate and the output activation function to be its most critical components. We further observe that the studied hyperparameters are virtually independent and derive guidelines for their efficient adjustment.", + "introduction": " introduction of full BPTT training, Gers et al. [27] utilized a training method based on Extended Kalman Filtering which enabled the LSTM to be trained on some pathological cases at the cost of high computational complexity. Schmidhuber et al. [28] proposed using a hybrid evolution-based method instead of BPTT for training but retained the vanilla LSTM architecture. Bayer et al. [29] evolved different LSTM block architectures that maximize fitness on context-sensitive grammars. A largerstudy of this kind was later done by Jozefowicz et al. [30]. Sak et al. [9]introduced a linear projection layer that projects the output of the LSTM layer down before recurrent and forward connections in order to reduce the amount of parameters for LSTM networks with many blocks. 
By introducing a trainable scaling parameter for the slope of the gate activation functions, Doetsch et al. [5]were able to improve the performance of LSTM on an offline handwriting recognition dataset. In what they call Dynamic Cortex Memory , Otte et al. [31] improved convergence speed of LSTM by adding recurrent connections between the gates of a single block (but not between the blocks). Cho et al. [32] proposed a simplified variant of the LSTM architecture called Gated Recurrent Unit (GRU). They used neither peephole connections nor output activation functions, and coupled the input and the forget gate into an update gate . Finally, their output gate (called reset gate ) only gates the recurrent connections to the block input ( Wz). Chung et al. [33] performed an initial comparison between GRU and Vanilla LSTM and reported mixed conclusions we restrict our further analysis to the top 10% performing trials for each combination of dataset and variant (see bottom half of Figure 3). This way our findings will be less dependent on the chosen search space and will be representative for the case of “reasonable hyperparameter tuning efforts.”9 The first important observation based on Figure 3 is that removing the output activation function (NOAF) or the forget gate (NFG) significantly hurt performance on all three datasets. Apart from the CEC, the ability to forget old information and the squashing of the cell state appear to be critical for the LSTM architecture. Indeed, without the output activation function, the block output can in principle grow unbounded. Coupling the input and the forget gate avoids this problem and might render the use of an output non-linearity less important, which could explain why GRU performs well without it. 9How much effort is “reasonable” will still depend on the search space. 
If the ranges are chosen much larger, the search will take much longer to find good hyperparameters.Input and forget gate coupling (CIFG) did not significantly change mean performance on any of the datasets, although the best performance improved slightly on music modeling. Similarly, removing peephole connections (NP) also did not lead to significant changes, but the best performance improved slightly for handwriting recognition. Both of these variants simplify LSTMs and reduce the computational complexity, so it might be worthwhile to incorporate these changes into the architecture. Adding full gate recurrence (FGR) did not significantly change performance on TIMIT or IAM Online, but led to worse background presents the average number of parameters for the top 10% performers of every variant. specific to our choice of search ranges. We have tried to chose reasonable ranges for the hyperparameters that include the best settings for each variant and are still small enough to allow for", + "references": [] + }, + "author_data": { + "3d07dad3-289e-43fc-98ef-e1f6f00ce57f": { + "pk": "3d07dad3-289e-43fc-98ef-e1f6f00ce57f", + "project_name": null, + "name": "Rupesh Kumar Srivastava", + "bio": "I am a researcher dedicated to advancing the fields of machine learning and optimization, with a particular focus on the intersection of neural networks and design optimization under uncertainty. My work spans a variety of innovative approaches, including the development of a novel system for automatically generating image descriptions that leverages visual detectors and language models, achieving state-of-the-art results on the Microsoft COCO benchmark.\n\nI have explored the dynamics of neural network activation functions, demonstrating how local competition among neurons can enhance performance and mitigate issues like catastrophic forgetting. 
My research also delves into evolutionary algorithms (EAs) for design optimization, where I have successfully integrated evidence theory to address uncertainties in design variables. This work has shown promising results in finding optimal solutions across various engineering problems, significantly improving computational efficiency through GPU-based parallelization.\n\nAdditionally, I am intrigued by the concept of curiosity-driven problem-solving, as exemplified by the POWERPLAY framework, which fosters the development of artificial explorers capable of generating and solving novel problems. My recent efforts in Generalized Compressed Network Search (GCNS) aim to enhance neural network compression and generalization capabilities, showcasing my commitment to pushing the boundaries of what is possible in machine learning.\n\nOverall, my research is characterized by a blend of theoretical insights and practical applications, aimed at solving complex problems in innovative ways while contributing to the broader understanding of machine learning and optimization methodologies.", + "collaborators": [ + "J. Schmidhuber", + "K. Deb", + "Faustino J. Gomez", + "Jonathan Masci", + "Bas R. Steunebrink", + "Hao Fang", + "Saurabh Gupta", + "F. Iandola", + "L. Deng", + "Piotr Dollár", + "Jianfeng Gao", + "Xiaodong He", + "Margaret Mitchell", + "John C. Platt", + "C. L. Zitnick", + "G. Zweig", + "Rupesh Tulshyan", + "Sohrob Kazerounian", + "Marijn F. 
Stollenga" + ], + "pub_titles": [ + "From captions to visual concepts and back", + "Understanding Locally Competitive Networks", + "An evolutionary algorithm based approach to design optimization using evidence theory", + "Compete to Compute", + "An evolutionary based Bayesian design optimization approach under incomplete information", + "Continually adding self-invented problems to the repertoire: First experiments with POWERPLAY", + "Generalized compressed network search", + "An EA-based approach to design optimization using evidence theory" + ], + "pub_abstracts": [ + "This paper presents a novel approach for automatically generating image descriptions: visual detectors, language models, and multimodal similarity models learnt directly from a dataset of image captions. We use multiple instance learning to train visual detectors for words that commonly occur in captions, including many different parts of speech such as nouns, verbs, and adjectives. The word detector outputs serve as conditional inputs to a maximum-entropy language model. The language model learns from a set of over 400,000 image descriptions to capture the statistics of word usage. We capture global semantics by re-ranking caption candidates using sentence-level features and a deep multimodal similarity model. Our system is state-of-the-art on the official Microsoft COCO benchmark, producing a BLEU-4 score of 29.1%. When human judges compare the system captions to ones written by other people on our held-out test set, the system captions have equal or better quality 34% of the time.", + "Recently proposed neural network activation functions such as rectified linear, maxout, and local winner-take-all have allowed for faster and more effective training of deep neural architectures on large and complex datasets. 
The common trait among these functions is that they implement local competition between small groups of computational units within a layer, so that only part of the network is activated for any given input pattern. In this paper, we attempt to visualize and understand this self-modularization, and suggest a unified explanation for the beneficial properties of such networks. We also show how our insights can be directly useful for efficiently performing retrieval over large datasets using neural networks.", + "For problems involving uncertainties in design variables and parameters, a bi-objective evolutionary algorithm (EA) based approach to design optimization using evidence theory is proposed and implemented in this paper. In addition to a functional objective, a plausibility measure of failure of constraint satisfaction is minimized. Despite some interests in classical optimization literature, this is the first attempt to use evidence theory with an EA. Due to EA’s flexibility in its operators, non-requirement of any gradient, its ability to handle multiple conflicting objectives, and ease of parallelization, evidence-based design optimization using an EA is promising. Results on a test problem and a couple of engineering design problems show that the modified evolutionary multi-objective optimization (EMO) algorithm is capable of finding a widely distributed trade-off frontier showing different optimal solutions corresponding to different levels of plausibility failure limits. Furthermore, a single-objective evidence based EA is found to produce better optimal solutions than a previously reported classical optimization procedure. The use of a GPU based parallel computing platform demonstrates EA’s performance enhancement around 160 to 700 times in implementing plausibility computations. 
Handling uncertainties of different types are getting increasingly popular in applied optimization studies and this EA based study should motivate further studies in handling uncertainties.", + "Local competition among neighboring neurons is common in biological neural networks (NNs). In this paper, we apply the concept to gradient-based, backprop-trained artificial multilayer NNs. NNs with competing linear units tend to outperform those with non-competing nonlinear units, and avoid catastrophic forgetting when training sets change over time.", + "Design optimization in the absence of complete information about uncertain quantities has been recently gaining consideration, as expensive repetitive computation tasks are becoming tractable due to the invention of faster and parallel computers. This work uses Bayesian inference to quantify design reliability when only sample measurements of the uncertain quantities are available. A generalized Bayesian reliability based design optimization algorithm has been proposed and implemented for numerical as well as engineering design problems. The approach uses an evolutionary algorithm (EA) to obtain a trade-off front between design objectives and reliability. The Bayesian approach provides a well-defined link between the amount of available information and the reliability through a confidence measure, and the EA acts as an efficient optimizer for a discrete and multi-dimensional objective space. Additionally, a GPU-based parallelization study shows computational speed-up of close to 100 times in a simulated scenario wherein the constraint qualification checks may be time consuming and could render a sequential implementation that can be impractical for large sample sets. These results show promise for the use of a parallel implementation of EAs in handling design optimization problems under uncertainties.", + "Pure scientists do not only invent new methods to solve given problems. They also invent new problems. 
The recent POWERPLAY framework formalizes this type of curiosity and creativity in a new, general, yet practical way. To acquire problem solving prowess through playing, POWERPLAY-based artificial explorers by design continually come up with the fastest to find, initially novel, but eventually solvable problems. They also continually simplify or speed up solutions to previous problems. We report on results of first experiments with POWERPLAY. A self-delimiting recurrent neural network (SLIM RNN) is used as a general computational architecture to implement the system's solver. Its weights can encode arbitrary, self-delimiting, halting or non-halting programs affecting both environment (through effectors) and internal states encoding abstractions of event sequences. In open-ended fashion, our POWERPLAY-driven RNNs learn to become increasingly general problem solvers, continually adding new problem solving procedures to the growing repertoire, exhibiting interesting developmental stages.", + "This paper presents initial results of Generalized Compressed Network Search (GCNS), a method for automatically identifying the important frequencies for neural networks encoded as a set of Fourier-type coefficients (i.e. \"compressed\" networks). GCNS achieves better compression than our previous approach, and promises better generalization capabilities. Results for a high-dimensional Octopus arm control problem show that a high fitness 3680-weight network can be encoded using less than 10 coefficients, using the frequencies identified by GCNS.", + "For problems involving uncertainties in design variables and parameters, a bi-objective evolutionary algorithm (EA) based approach to design optimization using evidence theory is proposed and implemented in this paper. In addition to a functional objective, a plausibility measure of failure of constraint satisfaction is minimized. Despite some interests in classical optimization literature, such a consideration in EA is rare. 
Due to EA's flexibility in its operators, non-requirement of any gradient, its ability to handle multiple conflicting objectives, and ease of parallelization, evidence-based design optimization using an EA is promising. Results on a test problem and a couple of engineering design problems show that the modified evolutionary multi-objective optimization (EMO) algorithm is capable of finding a widely distributed trade-off frontier showing different optimal solutions corresponding to different levels of plausibility failure limits. Furthermore, a single-objective evidence based EA is found to produce better optimal solutions than a previously reported classical optimization procedure. Handling uncertainties of different types are getting increasingly popular in applied optimization studies and more such studies using EAs will make EAs more useful and pragmatic in practical optimization problem-solving tasks." + ], + "domain": [ + "Computer Vision", + "Neural Networks", + "Evolutionary Algorithms", + "Design Optimization" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "7876750a-4762-4792-98ad-aa16c75f51bb": { + "pk": "7876750a-4762-4792-98ad-aa16c75f51bb", + "project_name": null, + "name": "Jan Koutník", + "bio": "I am a researcher specializing in the intersection of neuroevolution and reinforcement learning (RL), with a focus on developing efficient neural network architectures for high-dimensional input spaces. My work has pioneered the use of deep learning in evolutionary RL, particularly through the evolution of compact recurrent neural networks (RNNs) that can effectively process visual inputs, such as in the TORCS racing simulator. 
\n\nI have introduced innovative methods like the Clockwork RNN (CW-RNN), which enhances the training of RNNs by partitioning hidden layers to process inputs at varying temporal granularities, significantly improving performance while reducing complexity. My research also explores indirect encoding schemes, such as representing neural network weights in the frequency domain using Fourier coefficients, which allows for a dramatic reduction in search space dimensionality and accelerates convergence in complex tasks.\n\nAdditionally, I have developed benchmarks, such as the Super Mario Bros. RL benchmark, to investigate the capabilities of neural networks in learning sophisticated strategies. My work emphasizes the importance of simplicity and efficiency in neural network design, as demonstrated by my Compressed Network Complexity Search (CNCS) method, which favors parsimonious solutions.\n\nThrough my research, I aim to push the boundaries of what is possible in neuroevolution and RL, making strides toward scalable and effective solutions for complex, high-dimensional problems.", + "collaborators": [ + "J. Schmidhuber", + "Faustino J. Gomez", + "M. Snorek", + "Jan Drchal", + "Giuseppe Cuccu", + "V. Kůrková", + "Roman Neruda", + "Klaus Greff", + "Bas R. Steunebrink", + "K. Thórisson", + "Eric Nivel", + "T. Glasmachers", + "V. Graziano", + "P. Kordík", + "O. Kovárík", + "Miroslav epek", + "J. Togelius", + "S. 
Karakovskiy", + "Ondrej Kapral", + "Zdeněk Buk" + ], + "pub_titles": [ + "Evolving deep unsupervised convolutional networks for vision-based reinforcement learning", + "A Clockwork RNN", + "Evolving large-scale neural networks for vision-based TORCS", + "Evolving large-scale neural networks for vision-based reinforcement learning", + "Complexity search for compressed neural networks", + "A Frequency-Domain Encoding for Neuroevolution", + "Searching for Minimal Neural Networks in Fourier Space", + "Evolving neural networks in compressed weight space", + "Super mario evolution", + "HyperNEAT controlled robots learn how to drive on roads in simulated environment" + ], + "pub_abstracts": [ + "Dealing with high-dimensional input spaces, like visual input, is a challenging task for reinforcement learning (RL). Neuroevolution (NE), used for continuous RL problems, has to either reduce the problem dimensionality by (1) compressing the representation of the neural network controllers or (2) employing a pre-processor (compressor) that transforms the high-dimensional raw inputs into low-dimensional features. In this paper, we are able to evolve extremely small recurrent neural network (RNN) controllers for a task that previously required networks with over a million weights. The high-dimensional visual input, which the controller would normally receive, is first transformed into a compact feature vector through a deep, max-pooling convolutional neural network (MPCNN). Both the MPCNN preprocessor and the RNN controller are evolved successfully to control a car in the TORCS racing simulator using only visual input. This is the first use of deep learning in the context evolutionary RL.", + "Sequence prediction and classification are ubiquitous and challenging problems in machine learning that can require identifying complex dependencies between temporally distant inputs. 
Recurrent Neural Networks (RNNs) have the ability, in theory, to cope with these temporal dependencies by virtue of the short-term memory implemented by their recurrent (feedback) connections. However, in practice they are difficult to train successfully when long-term memory is required. This paper introduces a simple, yet powerful modification to the simple RNN (SRN) architecture, the Clockwork RNN (CW-RNN), in which the hidden layer is partitioned into separate modules, each processing inputs at its own temporal granularity, making computations only at its prescribed clock rate. Rather than making the standard RNN models more complex, CW-RNN reduces the number of SRN parameters, improves the performance significantly in the tasks tested, and speeds up the network evaluation. The network is demonstrated in preliminary experiments involving three tasks: audio signal generation, TIMIT spoken word classification, where it outperforms both SRN and LSTM networks, and online handwriting recognition, where it outperforms SRNs.", + "The TORCS racing simulator has become a standard testbed used in many recent reinforcement learning competitions, where an agent must learn to drive a car around a track using a small set of task-specific features. In this paper, large, recurrent neural networks (with over 1 million weights) are evolved to solve a much more challenging version of the task that instead uses only a stream of images from the driver’s perspective as input. Evolving such large nets is made possible by representing them in the frequency domain as a set of coefficients that are transformed into weight matrices via an inverse Fourier-type transform. 
To our knowledge this is the first attempt to tackle TORCS using vision, and successfully evolve a neural network controllers of this size.", + "The idea of using evolutionary computation to train artificial neural networks, or neuroevolution (NE), for reinforcement learning (RL) tasks has now been around for over 20 years. However, as RL tasks become more challenging, the networks required become larger, as do their genomes. But, scaling NE to large nets (i.e. tens of thousands of weights) is infeasible using direct encodings that map genes one-to-one to network components. In this paper, we scale-up our compressed network encoding where network weight matrices are represented indirectly as a set of Fourier-type coefficients, to tasks that require very-large networks due to the high-dimensionality of their input space. The approach is demonstrated successfully on two reinforcement learning tasks in which the control networks receive visual input: (1) a vision-based version of the octopus control task requiring networks with over 3 thousand weights, and (2) a version of the TORCS driving game where networks with over 1 million weights are evolved to drive a car around a track using video images from the driver's perspective.", + "In this paper, we introduce a method, called Compressed Network Complexity Search (CNCS), for automatically determining the complexity of compressed networks (neural networks encoded indirectly by Fourier-type coefficients) that favors parsimonious solutions. CNCS maintains a probability distribution over complexity classes that it uses to select which class to optimize. Class probabilities are adapted based on their expected fitness, starting with a prior biased toward the simplest networks. Experiments on a challenging non-linear version of the helicopter hovering task, show that the method consistently finds simple solutions.", + "Neuroevolution has yet to scale up to complex reinforcement learning tasks that require large networks. 
Networks with many inputs (e.g. raw video) imply a very high dimensional search space if encoded directly. Indirect methods use a more compact genotype representation that is transformed into networks of potentially arbitrary size. In this paper, we present an indirect method where networks are encoded by a set of Fourier coefficients which are transformed into network weight matrices via an inverse Fourier-type transform. Because there often exist network solutions whose weight matrices contain regularity (i.e. adjacent weights are correlated), the number of coefficients required to represent these networks in the frequency domain is much smaller than the number of weights (in the same way that natural images can be compressed by ignoring high-frequency components). This \"compressed\" encoding is compared to the direct approach where search is conducted in the weight space on the high-dimensional octopus arm task. The results show that representing networks in the frequency domain can reduce the search-space dimensionality by as much as two orders of magnitude, both accelerating convergence and yielding more general solutions.", + "The principle of minimum description length suggests looking for the simplest network that works well on the training examples, where simplicity is measured by network description size based on a reasonable programming language for encoding networks. Previous work used an assembler-like universal network encoding language (NEL) and Speed Prior-based search (related to Levin’s Universal Search) to quickly find low-complexity nets with excellent generalization performance. Here we define a more natural and often more practical NEL whose instructions are frequency domain coefficients. Frequency coefficients may get encoded by few bits, hence huge weight matrices may just be low-complexity superpositions of patterns computed by programs with few elementary instructions. 
On various benchmarks this weight matrix encoding greatly accelerates the search. The scheme was tested on pole-balancing, long-term dependency T-maze, and ball throwing. Some of the solutions turn out to be unexpectedly simple as they are computable by fairly short bit", + "We propose a new indirect encoding scheme for neural networks in which the weight matrices are represented in the frequency domain by sets Fourier coefficients. This scheme exploits spatial regularities in the matrix to reduce the dimensionality of the representation by ignoring high-frequency coefficients, as is done in lossy image compression. We compare the efficiency of searching in this \"compressed\" network space to searching in the space of directly encoded networks, using the CoSyNE neuroevolution algorithm on three benchmark problems: pole-balancing, ball throwing and octopus arm control. The results show that this encoding can dramatically reduce the search space dimensionality such that solutions can be found in significantly fewer evaluations", + "We introduce a new reinforcement learning benchmark based on the classic platform game Super Mario Bros. The benchmark has a high-dimensional input space, and achieving a good score requires sophisticated and varied strategies. However, it has tunable difficulty, and at the lowest difficulty setting decent score can be achieved using rudimentary strategies and a small fraction of the input space. To investigate the properties of the benchmark, we evolve neural network-based controllers using different network architectures and input spaces. We show that it is relatively easy to learn basic strategies capable of clearing individual levels of low difficulty, but that these controllers have problems with generalization to unseen levels and with taking larger parts of the input space into account. 
A number of directions worth exploring for learning better-performing strategies are discussed.", + "In this paper we describe simulation of autonomous robots controlled by recurrent neural networks, which are evolved through indirect encoding using HyperNEAT algorithm. The robots utilize 180 degree wide sensor array. Thanks to the scalability of the neural network generated by HyperNEAT, the sensor array can have various resolution. This would allow to use camera as an input for neural network controller used in real robot. The robots were simulated using software simulation environment. In the experiments the robots were trained to drive with maximum average speed. Such fitness forces them to learn how to drive on roads and avoid collisions. Evolved neural networks show excellent scalability. Scaling of the sensory input breaks performance of the robots, which should be gained back with re-training of the robot with a different sensory input resolution." + ], + "domain": [ + "Reinforcement Learning", + "Neuroevolution", + "Neural Networks", + "Deep Learning" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "aa54eb48-e9ae-4b8e-9fa1-98fe7bc19e07": { + "pk": "aa54eb48-e9ae-4b8e-9fa1-98fe7bc19e07", + "project_name": null, + "name": "Bas R. Steunebrink", + "bio": "I am a researcher dedicated to exploring the intersection of artificial intelligence and human communication, particularly how artificial agents can learn language and social skills through observation. My work has led to the development of the Auto-Catalytic, Endogenous, Reflective Architecture (AERA), which enables agents to learn complex multimodal language by observing human interactions without prior grammatical knowledge. 
Through experiments, I have demonstrated that agents can acquire the pragmatics, semantics, and syntax of language in real-time settings, showcasing the potential for machines to learn in a manner akin to human socialization.\n\nI am also deeply interested in the broader implications of artificial general intelligence (AGI) and the importance of self-reflective systems. My research emphasizes the need for AGI systems to reason about their own programming and improve autonomously, which I believe is crucial for achieving true intelligence. Additionally, I have explored the role of emotions in decision-making, arguing that understanding human emotions can enhance the design of artificial agents, allowing them to interact more effectively with humans.\n\nMy recent work includes the POWERPLAY framework, which formalizes curiosity-driven problem-solving in artificial explorers, and I have proposed a module-based vision for designing multi-agent programming languages that incorporates emotional intelligence. Through these contributions, I aim to bridge the gap between cognitive science and AI, fostering a deeper understanding of intelligence in both humans and machines.", + "collaborators": [ + "J. Schmidhuber", + "M. Dastani", + "K. Thórisson", + "J. Meyer", + "Eric Nivel", + "H. Dindo", + "G. Pezzulo", + "D. Ognibene", + "Helgi Páll Helgason", + "A. Chella", + "R. Sanz", + "Manuel Rodríguez", + "C. Hernández", + "G. Jonsson", + "C. H. Corbato", + "R. Srivastava", + "Nivel Nivel", + "R. S. Bravo", + "Manuel Hernandez", + "Pei Wang", + "Kristinn R. Þórisson", + "J. Koutník", + "M. Rodriguez", + "Marijn F. 
Stollenga" + ], + "pub_titles": [ + "AUTONOMOUS ACQUISITION OF NATURAL LANGUAGE", + "Autonomous Acquisition of Natural Situated Communication", + "What Should AGI Learn From AI & CogSci ?", + "Bounded Recursive Self-Improvement", + "Continually adding self-invented problems to the repertoire: First experiments with POWERPLAY", + "Towards an Actual Gödel Machine Implementation: a Lesson in Self-Reflective Systems", + "Emotions to control agent deliberation", + "The logical structure of emotions", + "Modularity in BDI-Based Multi-agent Programming Languages", + "The OCC Model Revisited", + "A Formal Model of Emotions: Integrating Qualitative and Quantitative Aspects" + ], + "pub_abstracts": [ + "An important part of human intelligence is the ability to use language. Humans learn how to use language in a society of language users, which is probably the most effective way to learn a language from the ground up. Principles that might allow an artificial agents to learn language this way are not known at present. Here we present a framework which begins to address this challenge. Our auto-catalytic, endogenous, reflective architecture (AERA) supports the creation of agents that can learn natural language by observation. We present results from two experiments where our S1 agent learns human communication by observing two humans interacting in a realtime mock television interview, using gesture and situated language. Results show that S1 can learn multimodal complex language and multimodal communicative acts, using a vocabulary of 100 words with numerous sentence formats, by observing unscripted interaction between the humans, with no grammar being provided to it a priori, and only high-level information about the format of the human interaction in the form of high-level goals of the interviewer and interviewee and a small ontology. 
The agent learns both the pragmatics, semantics, and syntax of complex sentences spoken by the human subjects on the topic of recycling of objects such as aluminum cans, glass bottles, plastic, and wood, as well as use of manual deictic reference and anaphora.", + "An important part of human intelligence, both historically and operationally, is our ability to communicate. We learn how to communicate, and maintain our communicative skills, in a society of communicators – a highly effective way to reach and maintain proficiency in this complex skill. Principles that might allow artificial agents to learn language this way are incompletely known at present – the multi-dimensional nature of socio-communicative skills are beyond every machine learning framework so far proposed. Our work begins to address the challenge of proposing a way for observation-based machine learning of natural language and communication. Our framework can learn complex communicative skills with minimal up-front knowledge. The system learns by incrementally producing predictive models of causal relationships in observed data, guided by goal-inference and reasoning using forward-inverse models. We present results from two experiments where our S1 agent learns human communication by observing two humans interacting in a realtime TV-style interview, using multimodal communicative gesture and situated language to talk about recycling of various materials and objects. S1 can learn multimodal complex language and multimodal communicative acts, a vocabulary of 100 words forming natural sentences with relatively complex sentence structure, including manual deictic reference and anaphora. S1 is seeded only with high-level information about goals of the interviewer and interviewee, and a small ontology; no grammar or other information is provided to S1 a priori. 
The agent learns the pragmatics, semantics, and syntax of complex utterances spoken and gestures from scratch, by observing the humans compare and contrast the cost and pollution related to recycling aluminum cans, glass bottles, newspaper, plastic, and wood. After 20 hours of observation S1 can perform an unscripted TV interview with a human, in the same style, without making mistakes.", + "While the fields of artificial intelligence (AI) and cognitive science (CogSci) both originated from a deep interest in the same phenomenon – intelligence – and both setting themselves high aims in their early days, each has since greatly narrowed its focus, and all but abandoned their core subject for a more limited version of the phenomenon. The many non-obvious causes for this change over the decades are perhaps understandable, but they have significantly reduced the potential of both fields to impact our understanding of the fundamentals of intelligence – in the wild and in the laboratory. This position paper argues that researchers in the field of artificial general intelligence (AGI) should carefully posit their research objectives and methodology to avoid repeating the same mistakes. 1 The Big Picture of Intelligence and Cognition Roughly speaking, artificial intelligence (AI) and cognitive science (CogSci) come from the same observation and imagination, namely that in a certain sense, the human mind and the electronic computer are – or can become – similar to each other. The similarities (and differences) have been suggested by many people, including Wiener [26], Turing [16], von Neumann [9], McCulloch and Pitt [7], though each from a different perspective. Initiated in this atmosphere, AI and CogSci can be seen as two sides of the same coin: while the former attempts to build a mind-like machine [11], the latter tries to study the mind as a machine [1]. Their relation is like that between engineering and science in general, that is, there is a strong mutual dependence. 
It is obvious that, to build an intelligent system, one has to have a clear idea about how intelligence works, and most of our knowledge on that topic comes from the study of the human mind. On the other hand, to evaluate the correctness of a theory of cognition, a straightforward way is to model it in an artifact to see if it produces the expected results. Given this relation, it is natural for AI to get inspiration from CogSci, as well as for CogSci to use AI models. Various theories have been proposed both to explain the phenomena observed in human cognition and to guide the design of machine intelligence (cf. [8, 10]). However, as the difficulties in this research became more and more clear, the mainstream in both fields gradually departed from the original objective to", + "We have designed a machine that becomes increasingly better at behaving in underspecified circumstances, in a goal-directed way, on the job, by modeling itself and its environment as experience accumulates. Based on principles of autocatalysis, endogeny, and reflectivity, the work provides an architectural blueprint for constructing systems with high levels of operational autonomy in underspecified circumstances, starting from a small seed. Through value-driven dynamic priority scheduling controlling the parallel execution of a vast number of reasoning threads, the system achieves recursive self-improvement after it leaves the lab, within the boundaries imposed by its designers. A prototype system has been implemented and demonstrated to learn a complex real-world task, real-time multimodal dialogue with humans, by on-line observation. Our work presents solutions to several challenges that must be solved for achieving artificial general intelligence.", + "Pure scientists do not only invent new methods to solve given problems. They also invent new problems. The recent POWERPLAY framework formalizes this type of curiosity and creativity in a new, general, yet practical way. 
To acquire problem solving prowess through playing, POWERPLAY-based artificial explorers by design continually come up with the fastest to find, initially novel, but eventually solvable problems. They also continually simplify or speed up solutions to previous problems. We report on results of first experiments with POWERPLAY. A self-delimiting recurrent neural network (SLIM RNN) is used as a general computational architecture to implement the system's solver. Its weights can encode arbitrary, self-delimiting, halting or non-halting programs affecting both environment (through effectors) and internal states encoding abstractions of event sequences. In open-ended fashion, our POWERPLAY-driven RNNs learn to become increasingly general problem solvers, continually adding new problem solving procedures to the growing repertoire, exhibiting interesting developmental stages.", + "Recently, interest has been revived in self-reflective systems in the context of Artificial General Intelligence (AGI). An AGI system should be intelligent enough to be able to reason about its own program code, and make modifications where it sees fit, improving on the initial code written by human programmers. A pertinent example is the Gödel Machine, which employs a proof searcher—in parallel to its regular problem solving duties—to find a self-rewrite of which it can prove that it will be beneficial.", + "The execution of an artificial agent is usually implemented with a sense--reason--act cycle. This cycle includes tasks such as event processing, generating and revising plans, and selecting actions to execute. However, there are typically many choices in the design of such a cycle, which are often hard-coded in the cycle in an ad hoc way. The question of this paper is how one decides, in a principled way, how often and which reasoning rules to apply, how to interleave the execution of plans, or when to start replanning. 
This paper proposes and formalizes the eliciting conditions of hope, fear, joy, and distress according to a well-known psychological model of human emotion. These conditions are then used to reduce the choices an agent can make in each state. They formalize the idea that emotions focus an agent's attention on what is important in each state.", + "Even though emotions sometimes lead us astray, there is mounting evidence from psychology and neurology that emotions have---on the whole---a positive effect on intelligent decision making and acting. Emotions help both overtly and covertly by focusing a person's attention to what is important and pruning unpromising directions of reasoning. Like humans, artificial agents---such as robots and virtual characters---have to act intelligently under resource constraints. A deep understanding of how human emotions function as innate and learned heuristics can help us in designing more effective artificial agents. Even if one does not want artificial agents to behave emotionally, it will still be useful to make these agents have knowledge of human emotions, so that they can take these into account when interacting or cooperating with humans. In order to incorporate emotions in artificial agents, a bridge must be built from psychological models of human emotions to computer science. This is done in this dissertation by capturing an emotion theory in a formal agent specification language. This formalization both serves as a foundation for the implementation of emotions in artificial agents, and enables us to formally analyze properties of the psychological model, leading to a more precise understanding of the workings of human emotions", + "This paper proposes a module-based vision for designing BDI-based multi-agent programming languages. 
The introduced concept of modules enables common programming techniques such as encapsulation and information hiding for BDI-based programs, and facilitates the implementation of agent roles and profiles. This vision is applied to a BDI-based agent programming language to which specific programming constructs are added to allow the implementation of modules. The syntax and intuitive semantics of module based programming constructs are explained. An example is presented to illustrate how modules can be used to implement BDI-based multi-agent systems.", + "Although popular among computer scientists, the OCC model of emotions contains a number of ambiguities that stand in the way of a truthful formalization or implementation. This paper aims to identify and clarify several of these ambiguities. Furthermore, a new inheritance-based view of the logical structure of emotions of the OCC model is proposed and discussed.", + "When constructing a formal model of emotions for intelligent agents, two types of aspects have to be taken into account. First, qualitative aspects pertain to the conditions that elicit emotions. Second, quantitative aspects pertain to the actual experience and intensity of elicited emotions. In this presentation, we show how the qualitative aspects of a well-known psychological model of human emotions can be formalized in an agent specification language and how its quantitative aspects can be integrated into this model. Furthermore, we discuss several unspecified details and implicit assumptions in the psychological model that are explicated by this effort." 
+ ], + "domain": [ + "Natural Language Processing", + "Artificial Intelligence", + "Cognitive Science", + "Emotion Modeling" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "e4b987e5-23ac-4b32-bfd3-92461bd590c9": { + "pk": "e4b987e5-23ac-4b32-bfd3-92461bd590c9", + "project_name": null, + "name": "Jürgen Schmidhuber", + "bio": "I am a researcher deeply engaged in the intersection of cognitive architectures, robotics, and machine learning. My work explores how neural dynamics can inform both the modeling of human behavior and the control of robotic systems. Recently, I have integrated reinforcement learning with shaping techniques to enhance the learning efficiency of robotic agents, enabling them to perform complex tasks like pick-and-place operations more effectively.\n\nMy research also delves into the development of object representations through manipulation actions, utilizing Cartesian Genetic Programming to improve visual detection capabilities in robots. I have pioneered the use of deep learning in neuroevolution, successfully evolving compact recurrent neural networks that can process high-dimensional visual inputs for real-time control tasks.\n\nA significant focus of my work is on language acquisition in artificial agents. I have developed a framework that allows agents to learn natural language through observation, demonstrating that they can acquire complex communicative skills by watching human interactions without prior grammatical knowledge.\n\nAdditionally, I have contributed to the understanding of deep neural networks, exploring activation functions that enhance training efficiency and performance. 
My work on artificial curiosity in embodied agents has led to the first real-time, curious humanoid robot capable of intelligent exploration and learning.\n\nThrough my research, I aim to bridge the gap between cognitive science and robotics, creating systems that not only mimic human learning but also adapt and thrive in dynamic environments.", + "collaborators": [ + "Faustino J. Gomez", + "L. Gambardella", + "J. Koutník", + "J. Leitner", + "A. Förster", + "Marijn F. Stollenga", + "K. Thórisson", + "Bas R. Steunebrink", + "H. Dindo", + "G. Pezzulo", + "D. Ognibene", + "Helgi Páll Helgason", + "A. Chella", + "D. Ciresan", + "A. Giusti", + "Sohrob Kazerounian", + "Eric Nivel", + "Manuel Rodríguez", + "C. Hernández", + "R. Sanz", + "G. Jonsson", + "Jonathan Masci", + "Mikhail Frank", + "M. Luciw", + "Yulia Sandamirskaya", + "G. Schöner", + "Klaus Greff", + "Jan Funke", + "Julien N. P. Martel", + "S. Gerhard", + "Björn Andres", + "H. Pfister", + "Albert Cardona", + "Matthew Cook", + "Nivel Nivel", + "R. S. Bravo", + "Manuel Hernandez", + "C. H. Corbato", + "R. Srivastava", + "Varun Raj Kompella", + "J. Nagi", + "H. Ngo", + "G. D. Caro", + "Claudio Caccia", + "M. Veta", + "P. Diest", + "S. Willems", + "Haibo Wang", + "A. Madabhushi", + "Angel Cruz-Roa", + "F. González", + "A. Larsen", + "Jacob S. Vestergaard", + "A. Dahl", + "F. B. Tek", + "Thomas Walter", + "Ching-Wei Wang", + "Satoshi Kondo", + "B. Matuszewski", + "F. Precioso", + "V. Snell", + "J. Kittler", + "Teófilo Emídio de Campos", + "A. Khan", + "Nasir M. Rajpoot", + "E. Arkoumani", + "M. Laclé", + "M. Viergever", + "J. 
Pluim" + ], + "pub_titles": [ + "Reinforcement and shaping in learning action sequences with neural dynamics", + "Improving robot vision models for object detection through interaction", + "Evolving deep unsupervised convolutional networks for vision-based reinforcement learning", + "AUTONOMOUS ACQUISITION OF NATURAL LANGUAGE", + "Draft : Deep Learning in Neural Networks : An Overview", + "A Clockwork RNN", + "Deep Networks with Internal Selective Attention through Feedback Connections", + "Autonomous Acquisition of Natural Situated Communication", + "Understanding Locally Competitive Networks", + "Reactive reaching and grasping on a humanoid: Towards closing the action-perception loop on the iCub", + "Curiosity driven reinforcement learning for motion planning on humanoids", + "Human-Robot Cooperation: Fast, Interactive Learning from Binary Feedback", + "A comparison of algorithms and humans for mitosis detection" + ], + "pub_abstracts": [ + "Neural dynamics offer a theoretical and computational framework, in which cognitive architectures may be developed, which are suitable both to model psychophysics of human behaviour and to control robotic behaviour. Recently, we have introduced reinforcement learning in this framework, which allows an agent to learn goal-directed sequences of behaviours based on a reward signal, perceived at the end of a sequence. Although stability of the dynamic neural fields and behavioural organisation allowed to demonstrate autonomous learning in the robotic system, learning of longer sequences was taking prohibitedly long time. 
Here, we combine the neural dynamic reinforcement learning with shaping, which consists in providing intermediate rewards and accelerates learning.We have implemented the new learning algorithm on a simulated Kuka YouBot robot and evaluated robustness and efficacy of learning in a pick-and-place task.", + "We propose a method for learning specific object representations that can be applied (and reused) in visual detection and identification tasks. A machine learning technique called Cartesian Genetic Programming (CGP) is used to create these models based on a series of images. Our research investigates how manipulation actions might allow for the development of better visual models and therefore better robot vision. This paper describes how visual object representations can be learned and improved by performing object manipulation actions, such as, poke, push and pick-up with a humanoid robot. The improvement can be measured and allows for the robot to select and perform the `right' action, i.e. the action with the best possible improvement of the detector.", + "Dealing with high-dimensional input spaces, like visual input, is a challenging task for reinforcement learning (RL). Neuroevolution (NE), used for continuous RL problems, has to either reduce the problem dimensionality by (1) compressing the representation of the neural network controllers or (2) employing a pre-processor (compressor) that transforms the high-dimensional raw inputs into low-dimensional features. In this paper, we are able to evolve extremely small recurrent neural network (RNN) controllers for a task that previously required networks with over a million weights. The high-dimensional visual input, which the controller would normally receive, is first transformed into a compact feature vector through a deep, max-pooling convolutional neural network (MPCNN). 
Both the MPCNN preprocessor and the RNN controller are evolved successfully to control a car in the TORCS racing simulator using only visual input. This is the first use of deep learning in the context evolutionary RL.", + "An important part of human intelligence is the ability to use language. Humans learn how to use language in a society of language users, which is probably the most effective way to learn a language from the ground up. Principles that might allow an artificial agents to learn language this way are not known at present. Here we present a framework which begins to address this challenge. Our auto-catalytic, endogenous, reflective architecture (AERA) supports the creation of agents that can learn natural language by observation. We present results from two experiments where our S1 agent learns human communication by observing two humans interacting in a realtime mock television interview, using gesture and situated language. Results show that S1 can learn multimodal complex language and multimodal communicative acts, using a vocabulary of 100 words with numerous sentence formats, by observing unscripted interaction between the humans, with no grammar being provided to it a priori, and only high-level information about the format of the human interaction in the form of high-level goals of the interviewer and interviewee and a small ontology. The agent learns both the pragmatics, semantics, and syntax of complex sentences spoken by the human subjects on the topic of recycling of objects such as aluminum cans, glass bottles, plastic, and wood, as well as use of manual deictic reference and anaphora.", + "In recent years, deep artificial neural networks (including recurrent ones) have won numerous contests in pattern recognition and machine learning. This historical survey compactly summarises relevant work, much of it from the previous millennium. 
Shallow and deep learners are distinguished by the depth of their credit assignment paths, which are chains of possibly learnable, causal links between actions and effects. I review deep supervised learning (also recapitulating the history of backpropagation), unsupervised learning, reinforcement learning & evolutionary computation, and indirect search for short programs encoding deep and large networks. PDF of earlier draft (v1): http://www.idsia.ch/∼juergen/DeepLearning30April2014.pdf LATEX source: http://www.idsia.ch/∼juergen/DeepLearning30April2014.tex Complete BIBTEX file: http://www.idsia.ch/∼juergen/bib.bib Preface This is the draft of an invited Deep Learning (DL) overview. One of its goals is to assign credit to those who contributed to the present state of the art. I acknowledge the limitations of attempting to achieve this goal. The DL research community itself may be viewed as a continually evolving, deep network of scientists who have influenced each other in complex ways. Starting from recent DL results, I tried to trace back the origins of relevant ideas through the past half century and beyond, sometimes using “local search” to follow citations of citations backwards in time. Since not all DL publications properly acknowledge earlier relevant work, additional global search strategies were employed, aided by consulting numerous neural network experts. As a result, the present draft mostly consists of references (about 800 entries so far). Nevertheless, through an expert selection bias I may have missed important work. A related bias was surely introduced by my special familiarity with the work of my own DL research group in the past quarter-century. For these reasons, the present draft should be viewed as merely a snapshot of an ongoing credit assignment process. 
To help improve it, please do not hesitate to send corrections and suggestions to juergen@idsia.ch.", + "Sequence prediction and classification are ubiquitous and challenging problems in machine learning that can require identifying complex dependencies between temporally distant inputs. Recurrent Neural Networks (RNNs) have the ability, in theory, to cope with these temporal dependencies by virtue of the short-term memory implemented by their recurrent (feedback) connections. However, in practice they are difficult to train successfully when long-term memory is required. This paper introduces a simple, yet powerful modification to the simple RNN (SRN) architecture, the Clockwork RNN (CW-RNN), in which the hidden layer is partitioned into separate modules, each processing inputs at its own temporal granularity, making computations only at its prescribed clock rate. Rather than making the standard RNN models more complex, CW-RNN reduces the number of SRN parameters, improves the performance significantly in the tasks tested, and speeds up the network evaluation. The network is demonstrated in preliminary experiments involving three tasks: audio signal generation, TIMIT spoken word classification, where it outperforms both SRN and LSTM networks, and online handwriting recognition, where it outperforms SRNs.", + "Traditional convolutional neural networks (CNN) are stationary and feedforward. They neither change their parameters during evaluation nor use feedback from higher to lower layers. Real brains, however, do. So does our Deep Attention Selective Network (dasNet) architecture. DasNets feedback structure can dynamically alter its convolutional filter sensitivities during classification. It harnesses the power of sequential processing to improve classification performance, by allowing the network to iteratively focus its internal attention on some of its convolutional filters. 
Feedback is trained through direct policy search in a huge million-dimensional parameter space, through scalable natural evolution strategies (SNES). On the CIFAR-10 and CIFAR-100 datasets, dasNet outperforms the previous state-of-the-art model.", + "An important part of human intelligence, both historically and operationally, is our ability to communicate. We learn how to communicate, and maintain our communicative skills, in a society of communicators – a highly effective way to reach and maintain proficiency in this complex skill. Principles that might allow artificial agents to learn language this way are incompletely known at present – the multi-dimensional nature of socio-communicative skills are beyond every machine learning framework so far proposed. Our work begins to address the challenge of proposing a way for observation-based machine learning of natural language and communication. Our framework can learn complex communicative skills with minimal up-front knowledge. The system learns by incrementally producing predictive models of causal relationships in observed data, guided by goal-inference and reasoning using forward-inverse models. We present results from two experiments where our S1 agent learns human communication by observing two humans interacting in a realtime TV-style interview, using multimodal communicative gesture and situated language to talk about recycling of various materials and objects. S1 can learn multimodal complex language and multimodal communicative acts, a vocabulary of 100 words forming natural sentences with relatively complex sentence structure, including manual deictic reference and anaphora. S1 is seeded only with high-level information about goals of the interviewer and interviewee, and a small ontology; no grammar or other information is provided to S1 a priori. 
The agent learns the pragmatics, semantics, and syntax of complex utterances spoken and gestures from scratch, by observing the humans compare and contrast the cost and pollution related to recycling aluminum cans, glass bottles, newspaper, plastic, and wood. After 20 hours of observation S1 can perform an unscripted TV interview with a human, in the same style, without making mistakes.", + "Recently proposed neural network activation functions such as rectified linear, maxout, and local winner-take-all have allowed for faster and more effective training of deep neural architectures on large and complex datasets. The common trait among these functions is that they implement local competition between small groups of computational units within a layer, so that only part of the network is activated for any given input pattern. In this paper, we attempt to visualize and understand this self-modularization, and suggest a unified explanation for the beneficial properties of such networks. We also show how our insights can be directly useful for efficiently performing retrieval over large datasets using neural networks.", + "We propose a system incorporating a tight integration between computer vision and robot control modules on a complex, high-DOF humanoid robot. Its functionality is showcased by having our iCub humanoid robot pick-up objects from a table in front of it. An important feature is that the system can avoid obstacles - other objects detected in the visual stream - while reaching for the intended target object. Our integration also allows for non-static environments, i.e. the reaching is adapted on-the-fly from the visual feedback received, e.g. when an obstacle is moved into the trajectory. Furthermore we show that this system can be used both in autonomous and tele-operation scenarios.", + "Most previous work on artificial curiosity (AC) and intrinsic motivation focuses on basic concepts and theory. 
Experimental results are generally limited to toy scenarios, such as navigation in a simulated maze, or control of a simple mechanical system with one or two degrees of freedom. To study AC in a more realistic setting, we embody a curious agent in the complex iCub humanoid robot. Our novel reinforcement learning (RL) framework consists of a state-of-the-art, low-level, reactive control layer, which controls the iCub while respecting constraints, and a high-level curious agent, which explores the iCub's state-action space through information gain maximization, learning a world model from experience, controlling the actual iCub hardware in real-time. To the best of our knowledge, this is the first ever embodied, curious agent for real-time motion planning on a humanoid. We demonstrate that it can learn compact Markov models to represent large regions of the iCub's configuration space, and that the iCub explores intelligently, showing interest in its physical constraints as well as in objects it finds in its environment.", + "Categories and Subject DescriptorsCategories and Subject DescriptorsI.2.9 [Robotics]; I.4 [Image Processing and Computer Vision]; I.5 [Pattern Recognition]: Interactive SystemsGeneral TermsAlgorithms, Methods, Design", + "We consider the problem of detecting mitotic figures in breast cancer histology slides. We investigate whether the performance of state-of-the-art detection algorithms is comparable to the performance of humans, when they are compared under fair conditions: our test subjects were not previously exposed to the task, and were required to learn their own classification criteria solely by studying the same training set available to algorithms. We designed and implemented a standardized web-based test based on the publicly-available MITOS dataset, and compared results with the performance of the 6 top-scoring algorithms in the ICPR 2012 Mitosis Detection Contest. 
The problem is presented as a classification task on a balanced dataset. 45 different test subjects produced a total of 3009 classifications. The best individual (accuracy = 0.859 ± 0.012), is outperformed by the most accurate algorithm (accuracy = 0.873 ± 0.004). This suggests that state-of-the-art detection algorithms are likely limited by the size of the training set, rather than by lack of generalization ability." + ], + "domain": [ + "Reinforcement Learning", + "Neural Networks", + "Robotics", + "Computer Vision" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + } + }, + "reference_proposal": "**[Question 1] - What is the problem?** \nHow can we improve the training efficiency and performance of Long Short-Term Memory (LSTM) networks while maintaining or reducing their computational complexity?\n\n**[Question 2] - Why is it interesting and important?** \nSolving this problem is crucial for the research community as it addresses the ongoing challenge of optimizing LSTM architectures, which are widely used in various applications such as natural language processing, speech recognition, and time series forecasting. By enhancing training efficiency and performance, this research could lead to more effective models that require less computational power, making advanced machine learning techniques more accessible. Furthermore, it could inspire future research into novel architectures and training methods, potentially leading to breakthroughs in deep learning applications.\n\n**[Question 3] - Why is it hard?** \nThe challenges in solving this problem stem from the inherent complexity of LSTM networks, which involve multiple gates and parameters that must be finely tuned. Naive approaches may fail because they do not account for the intricate interactions between the gates, which can lead to issues such as exploding gradients or overfitting. 
Additionally, the computational burden of training LSTMs on large datasets can hinder experimentation with different architectures and hyperparameters. Overcoming these technical obstacles requires a deep understanding of both the theoretical underpinnings of LSTMs and practical strategies for efficient training.\n\n**[Question 4] - Why hasn't it been solved before?** \nPrevious research has often focused on specific modifications to LSTM architectures or training methods without fully addressing the trade-offs between performance and computational efficiency. Limitations in computational resources and the complexity of hyperparameter tuning have also posed significant barriers. Many existing solutions have not systematically evaluated the impact of various architectural changes on performance across diverse datasets. Our approach aims to fill these gaps by conducting a comprehensive analysis of LSTM variants and their performance metrics, thereby providing clearer insights into effective modifications.\n\n**[Question 5] - What are the key components of my approach and results?** \nOur proposed methodology involves a systematic evaluation of various LSTM architectures, including modifications such as the removal of output activation functions and coupling of input and forget gates. We will utilize benchmark datasets such as TIMIT and IAM Online for training and testing. The performance will be measured using metrics such as accuracy and computational efficiency (e.g., training time and number of parameters). We expect to identify specific architectural changes that lead to improved performance while reducing computational complexity, ultimately contributing to more efficient LSTM training practices." + }, + "1807.02811": { + "paper_data": { + "title": "A Tutorial on Bayesian Optimization", + "url": "http://arxiv.org/abs/1807.02811v1", + "arxiv_id": "1807.02811", + "authors": [ + "Peter I. 
Frazier" + ], + "abstract": "Bayesian optimization is an approach to optimizing objective functions that take a long time (minutes or hours) to evaluate. It is best-suited for optimization over continuous domains of less than 20 dimensions, and tolerates stochastic noise in function evaluations. It builds a surrogate for the objective and quantifies the uncertainty in that surrogate using a Bayesian machine learning technique, Gaussian process regression, and then uses an acquisition function defined from this surrogate to decide where to sample. In this tutorial, we describe how Bayesian optimization works, including Gaussian process regression and three common acquisition functions: expected improvement, entropy search, and knowledge gradient. We then discuss more advanced techniques, including running multiple function evaluations in parallel, multi-fidelity and multi-information source optimization, expensive-to-evaluate constraints, random environmental conditions, multi-task Bayesian optimization, and the inclusion of derivative information. We conclude with a discussion of Bayesian optimization software and future research directions in the field. Within our tutorial material we provide a generalization of expected improvement to noisy evaluations, beyond the noise-free setting where it is more commonly applied. This generalization is justified by a formal decision-theoretic argument, standing in contrast to previous ad hoc modifications.", + "introduction": " Introduction . MIT press Cambridge. Swersky, K., Snoek, J., and Adams, R. P. (2013). Multi-task Bayesian optimization. In Advances in Neural Information Processing Systems , pages 2004{2012. Swersky, K., Snoek, J., and Adams, R. P. (2014). Freeze-thaw bayesian optimization. arXiv preprint arXiv:1406.3896 . Toscano-Palmerin, S. and Frazier, P. I. (2018). Bayesian optimization with expensive integrands. arXiv preprint arXiv:1803.08661 . Ueno, T., Rhone, T. D., Hou, Z., Mizoguchi, T., and Tsuda, K. 
(2016). COMBO: An efficient Bayesian optimization library for materials science. Materials Discovery, 4:18–21. Žilinskas, A. (1975). Single-step Bayesian search method for an extremum of functions of a single variable. Cybernetics and Systems Analysis, 11(1):160–166. Waeber, R., Frazier, P. I., and Henderson, S. G. (2013). Bisection search with noisy responses. SIAM Journal on Control and Optimization, 51(3):2261–2279. Wang, J., Clark, S. C., Liu, E., and Frazier, P. I. (2016a). Parallel Bayesian global optimization of expensive functions. arXiv preprint arXiv:1602.05149. Wang, Z., Hutter, F., Zoghi, M., Matheson, D., and de Freitas, N. (2016b). Bayesian optimization in a billion dimensions via random embeddings. Journal of Artificial Intelligence Research, 55:361–387. Wang, Z., Zoghi, M., Hutter, F., Matheson, D., De Freitas, N., et al. (2013). Bayesian optimization in high dimensions via random embeddings. In IJCAI, pages 1778–1784. Williams, B. J., Santner, T. J., and Notz, W. I. (2000). Sequential design of computer methods for the global optimization of expensive functions. European Journal of Operational Research, 182(2):514–535. Robbins, H. and Monro, S. (1951). A stochastic approximation method. The Annals of Mathematical Statistics, 22(3):400–407. Roustant, O., Ginsbourger, D., and Deville, Y. (2012). DiceKriging, DiceOptim: Two R packages for the analysis of computer experiments to minimize integrated response functions. Statistica Sinica, 10(4):1133–1152. Wu, J. and Frazier, P. (2016). The parallel knowledge gradient method for batch Bayesian optimization. In Advances in Neural Information Processing Systems, pages 3126–3134. Wu, J., Poloczek, M., Wilson, A. G., and Frazier, P. (2017). Bayesian optimization with gradients. In Advances in Neural Information Processing Systems, pages 5273–5284. Xie, J. and Frazier, P. I. (2013). Sequential Bayes-optimal policies for multiple comparisons with a known standard. 
Operations Research, 61(5):1174–1189. Xie, J., Frazier, P. I., Sankaran, S., Marsden, A., and Elmohamed, S. (2012). Optimization of computationally expensive simulations with Gaussian processes and parameter uncertainty: Application to cardiovascular surgery. In 50th Annual Allerton Conference on Communication, Control, and Computing, pages 406–413. IEEE. We offer a brief introduction here. A more complete treatment may be found in Rasmussen and Williams (2006). We first describe GP regression, focusing on f's values at a finite collection of points x_1, ..., x_k ∈ R^d. It is convenient to collect the function's values at these points together into a vector [f(x_1), ..., f(x_k)]. Whenever we have a quantity that is unknown in Bayesian statistics, like this vector, we suppose that it was drawn at random by nature from some prior probability distribution. GP regression takes this prior distribution to be multivariate normal, with a particular mean vector and covariance matrix. Figure 1: Illustration of BayesOpt, maximizing an objective function f with a 1-dimensional continuous input. The top panel shows: noise-free observations of the objective function f at 3 points, in blue; an estimate of f(x) (solid red line); and Bayesian credible intervals (similar to confidence intervals) for f(x) (dashed red line). These estimates and credible intervals are obtained using GP regression. The bottom panel shows the", + "references": [ + { + "title": "Bayesian Optimization for Materials Science", + "abstract": null + }, + { + "title": "Practical Bayesian Optimization for Variable Cost Objectives", + "abstract": "We propose a novel Bayesian Optimization approach for black-box functions with an environmental variable whose value determines the tradeoff between evaluation cost and the fidelity of the evaluations. Further, we use a novel approach to sampling support points, allowing faster construction of the acquisition function. 
This allows us to achieve optimization with lower overheads than previous approaches and is implemented for a more general class of problem. We show this approach to be effective on synthetic and real world benchmark problems." + }, + { + "title": "Bayesian Optimization with Gradients", + "abstract": "Bayesian optimization has been successful at global optimization of expensive-to-evaluate multimodal objective functions. However, unlike most optimization methods, Bayesian optimization typically does not use derivative information. In this paper we show how Bayesian optimization can exploit derivative information to decrease the number of objective function evaluations required for good performance. In particular, we develop a novel Bayesian optimization algorithm, the derivative-enabled knowledge-gradient (dKG), for which we show one-step Bayes-optimality, asymptotic consistency, and greater one-step value of information than is possible in the derivative-free setting. Our procedure accommodates noisy and incomplete derivative information, comes in both sequential and batch forms, and can optionally reduce the computational cost of inference through automatically selected retention of a single directional derivative. We also compute the d-KG acquisition function and its gradient using a novel fast discretization-free technique. We show d-KG provides state-of-the-art performance compared to a wide range of optimization procedures with and without gradients, on benchmarks including logistic regression, deep learning, kernel learning, and k-nearest neighbors." + }, + { + "title": "Bayesian Optimization with a Finite Budget: An Approximate Dynamic Programming Approach", + "abstract": "We consider the problem of optimizing an expensive objective function when a finite budget of total evaluations is prescribed. In that context, the optimal solution strategy for Bayesian optimization can be formulated as a dynamic programming instance. 
This results in a complex problem with uncountable, dimension-increasing state space and an uncountable control space. We show how to approximate the solution of this dynamic programming problem using rollout, and propose rollout heuristics specifically designed for the Bayesian optimization setting. We present numerical experiments showing that the resulting algorithm for optimization with a finite budget outperforms several popular Bayesian optimization algorithms." + }, + { + "title": "Designing Nanostructures for Phonon Transport via Bayesian Optimization", + "abstract": "Phonon transport---the movement of vibrational wave packets in a solid---in nanostructures is a key element in controlling solid heat conduction, but it remains a complex design challenge. A new framework uses informatics and phonon transport calculations to greatly accelerate the design process and reveals nonintuitive structures that are more effective than their traditional counterparts." + }, + { + "title": "Multi-Step Bayesian Optimization for One-Dimensional Feasibility Determination", + "abstract": "Bayesian optimization methods allocate limited sampling budgets to maximize expensive-to-evaluate functions. One-step-lookahead policies are often used, but computing optimal multi-step-lookahead policies remains a challenge. We consider a specialized Bayesian optimization problem: finding the superlevel set of an expensive one-dimensional function, with a Markov process prior. We compute the Bayes-optimal sampling policy efficiently, and characterize the suboptimality of one-step lookahead. Our numerical experiments demonstrate that the one-step lookahead policy is close to optimal in this problem, performing within 98% of optimal in the experimental settings considered." + }, + { + "title": "The Parallel Knowledge Gradient Method for Batch Bayesian Optimization", + "abstract": "In many applications of black-box optimization, one can evaluate multiple points simultaneously, e.g. 
when evaluating the performances of several different neural network architectures in a parallel computing environment. In this paper, we develop a novel batch Bayesian optimization algorithm --- the parallel knowledge gradient method. By construction, this method provides the one-step Bayes-optimal batch of points to sample. We provide an efficient strategy for computing this Bayes-optimal batch of points, and we demonstrate that the parallel knowledge gradient method finds global optima significantly faster than previous batch Bayesian optimization algorithms on both synthetic test functions and when tuning hyperparameters of practical machine learning algorithms, especially when function evaluations are noisy." + }, + { + "title": "COMBO: An efficient Bayesian optimization library for materials science", + "abstract": null + }, + { + "title": "Fast Bayesian Optimization of Machine Learning Hyperparameters on Large Datasets", + "abstract": "Bayesian optimization has become a successful tool for hyperparameter optimization of machine learning algorithms, such as support vector machines or deep neural networks. Despite its success, for large datasets, training and validating a single configuration often takes hours, days, or even weeks, which limits the achievable performance. To accelerate hyperparameter optimization, we propose a generative model for the validation error as a function of training set size, which is learned during the optimization process and allows exploration of preliminary configurations on small subsets, by extrapolating to the full dataset. We construct a Bayesian optimization procedure, dubbed Fabolas, which models loss and training time as a function of dataset size and automatically trades off high information gain about the global optimum against computational cost. 
Experiments optimizing support vector machines and deep neural networks show that Fabolas often finds high-quality solutions 10 to 100 times faster than other state-of-the-art Bayesian optimization methods or the recently proposed bandit strategy Hyperband." + }, + { + "title": "Multi-Information Source Optimization", + "abstract": "We consider Bayesian optimization of an expensive-to-evaluate black-box objective function, where we also have access to cheaper approximations of the objective. In general, such approximations arise in applications such as reinforcement learning, engineering, and the natural sciences, and are subject to an inherent, unknown bias. This model discrepancy is caused by an inadequate internal model that deviates from reality and can vary over the domain, making the utilization of these approximations a non-trivial task. \nWe present a novel algorithm that provides a rigorous mathematical treatment of the uncertainties arising from model discrepancies and noisy observations. Its optimization decisions rely on a value of information analysis that extends the Knowledge Gradient factor to the setting of multiple information sources that vary in cost: each sampling decision maximizes the predicted benefit per unit cost. \nWe conduct an experimental evaluation that demonstrates that the method consistently outperforms other state-of-the-art techniques: it finds designs of considerably higher objective value and additionally inflicts less cost in the exploration process." + }, + { + "title": "Parallel Bayesian Global Optimization of Expensive Functions", + "abstract": "Large-Scale Parallel Bayesian Optimization" + }, + { + "title": "GLASSES: Relieving The Myopia Of Bayesian Optimisation", + "abstract": "We present GLASSES: Global optimisation with Look-Ahead through Stochastic Simulation and Expected-loss Search. 
The majority of global optimisation approaches in use are myopic, in only considering the impact of the next function value; the non-myopic approaches that do exist are able to consider only a handful of future evaluations. Our novel algorithm, GLASSES, permits the consideration of dozens of evaluations into the future. This is done by approximating the ideal look-ahead loss function, which is expensive to evaluate, by a cheaper alternative in which the future steps of the algorithm are simulated beforehand. An Expectation Propagation algorithm is used to compute the expected value of the loss.We show that the far-horizon planning thus enabled leads to substantive performance gains in empirical tests." + }, + { + "title": "Efficient Global Optimization for Black-Box Simulation Via Sequential Intrinsic Kriging", + "abstract": "Efficient Global Optimization (EGO) is a popular method that searches sequentially for the global optimum of a simulated system. EGO treats the simulation model as a black-box, and balances local and global searches. In deterministic simulation, EGO uses ordinary Kriging (OK), which is a special case of universal Kriging (UK). In our EGO variant we use intrinsic Kriging (IK), which eliminates the need to estimate the parameters that quantify the trend in UK. In random simulation, EGO uses stochastic Kriging (SK), but we use stochastic IK (SIK). Moreover, in random simulation, EGO needs to select the number of replications per simulated input combination, accounting for the heteroscedastic variances of the simulation outputs. A popular selection method uses optimal computer budget allocation (OCBA), which allocates the available total number of replications over simulated combinations. We derive a new allocation algorithm. We perform several numerical experiments with deterministic simulations and random simulations. 
These experiments suggest that (1) in deterministic simulations, EGO with IK outperforms classic EGO; (2) in random simulations, EGO with SIK and our allocation rule does not differ significantly from EGO with SK combined with the OCBA allocation rule." + }, + { + "title": "Prediction of Low-Thermal-Conductivity Compounds with First-Principles Anharmonic Lattice-Dynamics Calculations and Bayesian Optimization.", + "abstract": "Compounds of low lattice thermal conductivity (LTC) are essential for seeking thermoelectric materials with high conversion efficiency. Some strategies have been used to decrease LTC. However, such trials have yielded successes only within a limited exploration space. Here, we report the virtual screening of a library containing 54,779 compounds. Our strategy is to search the library through Bayesian optimization using for the initial data the LTC obtained from first-principles anharmonic lattice-dynamics calculations for a set of 101 compounds. We discovered 221 materials with very low LTC. Two of them even have an electronic band gap <1 eV, which makes them exceptional candidates for thermoelectric applications. In addition to those newly discovered thermoelectric materials, the present strategy is believed to be powerful for many other applications in which the chemistry of materials is required to be optimized." + }, + { + "title": "Bayesian optimization for materials design", + "abstract": null + }, + { + "title": "High Dimensional Bayesian Optimisation and Bandits via Additive Models", + "abstract": "Bayesian Optimisation (BO) is a technique used in optimising a $D$-dimensional function which is typically expensive to evaluate. While there have been many successes for BO in low dimensions, scaling it to high dimensions has been notoriously difficult. Existing literature on the topic are under very restrictive settings. In this paper, we identify two key challenges in this endeavour. 
We tackle these challenges by assuming an additive structure for the function. This setting is substantially more expressive and contains a richer class of functions than previous work. We prove that, for additive functions the regret has only linear dependence on $D$ even though the function depends on all $D$ dimensions. We also demonstrate several other statistical and computational benefits in our framework. Via synthetic examples, a scientific simulation and a face detection problem we demonstrate that our method outperforms naive BO on additive functions and on several examples where the function is not additive." + }, + { + "title": "Predictive Entropy Search for Bayesian Optimization with Unknown Constraints", + "abstract": "Unknown constraints arise in many types of expensive black-box optimization problems. Several methods have been proposed recently for performing Bayesian optimization with constraints, based on the expected improvement (EI) heuristic. However, EI can lead to pathologies when used with constraints. For example, in the case of decoupled constraints--i.e., when one can independently evaluate the objective or the constraints--EI can encounter a pathology that prevents exploration. Additionally, computing EI requires a current best solution, which may not exist if none of the data collected so far satisfy the constraints. By contrast, informationbased approaches do not suffer from these failure modes. In this paper, we present a new information-based method called Predictive Entropy Search with Constraints (PESC). We analyze the performance of PESC and show that it compares favorably to EI-based approaches on synthetic and benchmark problems, as well as several real-world examples. We demonstrate that PESC is an effective algorithm that provides a promising direction towards a unified solution for constrained Bayesian optimization." 
+ }, + { + "title": "Multifidelity Optimization using Statistical Surrogate Modeling for Non-Hierarchical Information Sources", + "abstract": "United States. Air Force. Office of Scientific Research. Multidisciplinary University Research Initiative (Grant FA9550- 09-0613)" + }, + { + "title": "Discrete optimization via simulation using Gaussian Markov random fields", + "abstract": "We construct a discrete optimization via simulation (DOvS) procedure using discrete Gaussian Markov random fields (GMRFs). Gaussian random fields (GRFs) are used in DOvS to balance exploration and exploitation. They enable computation of the expected improvement (EI) due to running the simulation to evaluate a feasible point of the optimization problem. Existing methods use GRFs with a continuous domain, which leads to dense covariance matrices, and therefore can be ill-suited for large-scale problems due to slow and ill-conditioned numerical computations. The use of GMRFs leads to sparse precision matrices, on which several sparse matrix techniques can be applied. To allocate the simulation effort throughout the procedure, we introduce a new EI criterion that incorporates the uncertainty in stochastic simulation by treating the value at the current optimal point as a random variable." + }, + { + "title": "Bayesian Optimization with Inequality Constraints", + "abstract": "Bayesian optimization is a powerful framework for minimizing expensive objective functions while using very few function evaluations. It has been successfully applied to a variety of problems, including hyperparameter tuning and experimental design. However, this framework has not been extended to the inequality-constrained optimization setting, particularly the setting in which evaluating feasibility is just as expensive as evaluating the objective. Here we present constrained Bayesian optimization, which places a prior distribution on both the objective and the constraint functions. 
We evaluate our method on simulated and real data, demonstrating that constrained Bayesian optimization can quickly find optimal and feasible points, even when small feasible regions cause standard methods to fail." + }, + { + "title": "Freeze-Thaw Bayesian Optimization", + "abstract": "In this paper we develop a dynamic form of Bayesian optimization for machine learning models with the goal of rapidly finding good hyperparameter settings. Our method uses the partial information gained during the training of a machine learning model in order to decide whether to pause training and start a new model, or resume the training of a previously-considered model. We specifically tailor our method to machine learning problems by developing a novel positive-definite covariance kernel to capture a variety of training curves. Furthermore, we develop a Gaussian process prior that scales gracefully with additional temporal observations. Finally, we provide an information-theoretic framework to automate the decision process. Experiments on several common machine learning models show that our approach is extremely effective in practice." + }, + { + "title": "Predictive Entropy Search for Efficient Global Optimization of Black-box Functions", + "abstract": "We propose a novel information-theoretic approach for Bayesian optimization called Predictive Entropy Search (PES). At each iteration, PES selects the next evaluation point that maximizes the expected information gained with respect to the global maximum. PES codifies this intractable acquisition function in terms of the expected reduction in the differential entropy of the predictive distribution. This reformulation allows PES to obtain approximations that are both more accurate and efficient than other alternatives such as Entropy Search (ES). Furthermore, PES can easily perform a fully Bayesian treatment of the model hyperparameters while ES cannot. 
We evaluate PES in both synthetic and real-world applications, including optimization problems in machine learning, finance, biotechnology, and robotics. We show that the increased accuracy of PES leads to significant gains in optimization performance." + }, + { + "title": "Input Warping for Bayesian Optimization of Non-Stationary Functions", + "abstract": "Bayesian optimization has proven to be a highly effective methodology for the global optimization of unknown, expensive and multimodal functions. The ability to accurately model distributions over functions is critical to the effectiveness of Bayesian optimization. Although Gaussian processes provide a flexible prior over functions, there are various classes of functions that remain difficult to model. One of the most frequently occurring of these is the class of non-stationary functions. The optimization of the hyperparameters of machine learning algorithms is a problem domain in which parameters are often manually transformed a priori, for example by optimizing in \"log-space,\" to mitigate the effects of spatially-varying length scale. We develop a methodology for automatically learning a wide family of bijective transformations or warpings of the input space using the Beta cumulative distribution function. We further extend the warping framework to multi-task Bayesian optimization so that multiple tasks can be warped into a jointly stationary space. On a set of challenging benchmark optimization tasks, we observe that the inclusion of warping greatly improves on the state-of-the-art, producing better results faster and more reliably." + }, + { + "title": "Multi-Task Bayesian Optimization", + "abstract": "Bayesian optimization has recently been proposed as a framework for automatically tuning the hyperparameters of machine learning models and has been shown to yield state-of-the-art performance with impressive ease and efficiency. 
In this paper, we explore whether it is possible to transfer the knowledge gained from previous optimizations to new tasks in order to find optimal hyperparameter settings more efficiently. Our approach is based on extending multi-task Gaussian processes to the framework of Bayesian optimization. We show that this method significantly speeds up the optimization process when compared to the standard single-task approach. We further propose a straightforward extension of our algorithm in order to jointly minimize the average error across multiple tasks and demonstrate how this can be used to greatly speed up k-fold cross-validation. Lastly, we propose an adaptation of a recently developed acquisition function, entropy search, to the cost-sensitive, multi-task setting. We demonstrate the utility of this new acquisition function by leveraging a small dataset to explore hyper-parameter settings for a large dataset. Our algorithm dynamically chooses which dataset to query in order to yield the most information per unit cost." + }, + { + "title": "Sequential Bayes-Optimal Policies for Multiple Comparisons with a Known Standard", + "abstract": "We consider the problem of efficiently allocating simulation effort to determine which of several simulated systems have mean performance exceeding a threshold of known value. Within a Bayesian formulation of this problem, the optimal fully sequential policy for allocating simulation effort is the solution to a dynamic program. When sampling is limited by probabilistic termination or sampling costs, we show that this dynamic program can be solved efficiently, providing a tractable way to compute the Bayes-optimal policy. The solution uses techniques from optimal stopping and multiarmed bandits. We then present further theoretical results characterizing this Bayes-optimal policy, compare it numerically to several approximate policies, and apply it to applications in emergency services and manufacturing." 
+ }, + { + "title": "Bisection Search with Noisy Responses", + "abstract": "Bisection search is the most efficient algorithm for locating a unique point X ∗ ∈ (0, 1) when we are able to query an oracle only about whether X ∗ lies to the left or right of a point x of our choosing. We study a noisy version of this classic problem, where the oracle's response is correct only with probability p. The probabilistic bisection algorithm (PBA) introduced by Horstein (IEEE Trans. Inform. Theory, 9 (1963), pp. 136-143) can be used to locate X ∗ in this setting. While the method works extremely well in practice, very little is known about its theoretical properties. In this paper, we provide several key findings about the PBA, which lead to the main conclusion that the expected absolute residuals of successive search results, i.e., E(|X ∗ − Xn|), converge to 0 at a geometric rate." + }, + { + "title": "Bayesian Optimization in a Billion Dimensions via Random Embeddings", + "abstract": "Bayesian optimization techniques have been successfully applied to robotics, planning, sensor placement, recommendation, advertising, intelligent user interfaces and automatic algorithm configuration. Despite these successes, the approach is restricted to problems of moderate dimension, and several workshops on Bayesian optimization have identified its scaling to high-dimensions as one of the holy grails of the field. In this paper, we introduce a novel random embedding idea to attack this problem. The resulting Random EMbedding Bayesian Optimization (REMBO) algorithm is very simple, has important invariance properties, and applies to domains with both categorical and continuous variables. We present a thorough theoretical analysis of REMBO. Empirical results confirm that REMBO can effectively solve problems with billions of dimensions, provided the intrinsic dimensionality is low. 
They also show that REMBO achieves state-of-the-art performance in optimizing the 47 discrete parameters of a popular mixed integer linear programming solver." + }, + { + "title": "Tutorial: Optimization via simulation with Bayesian statistics and dynamic programming", + "abstract": "Bayesian statistics comprises a powerful set of methods for analyzing simulated systems. Combined with dynamic programming and other methods for sequential decision making under uncertainty, Bayesian methods have been used to design algorithms for finding the best of several simulated systems. When the dynamic program can be solved exactly, these algorithms have optimal average-case performance. In other situations, this dynamic programming analysis supports the development of approximate methods with sub-optimal but nevertheless good average-case performance. These methods with good average-case performance are particularly useful when the cost of simulation prevents the use of procedures with worst-case statistical performance guarantees. We provide an overview of Bayesian methods used for selecting the best, providing an in-depth treatment of the simpler case of ranking and selection with independent priors appropriate for smaller-scale problems, and then discussing how these same ideas can be applied to correlated priors appropriate for large-scale problems." + }, + { + "title": "DiceKriging, DiceOptim: Two R Packages for the Analysis of Computer Experiments by Kriging-Based Metamodeling and Optimization", + "abstract": "We present two recently released R packages, DiceKriging and DiceOptim, for the approximation and the optimization of expensive-to-evaluate deterministic functions. Following a self-contained mini tutorial on Kriging-based approximation and optimization, the functionalities of both packages are detailed and demonstrated in two distinct sections. 
In particular, the versatility of DiceKriging with respect to trend and noise specifications, covariance parameter estimation, as well as conditional and unconditional simulations are illustrated on the basis of several reproducible numerical experiments. We then put to the fore the implementation of sequential and parallel optimization strategies relying on the expected improvement criterion on the occasion of DiceOptim’s presentation. An appendix is dedicated to complementary mathematical and computational details." + }, + { + "title": "Optimization of computationally expensive simulations with Gaussian processes and parameter uncertainty: Application to cardiovascular surgery", + "abstract": "In many applications of simulation-based optimization, the random output variable whose expectation is being optimized is a deterministic function of a low-dimensional random vector. This deterministic function is often expensive to compute, making simulation-based optimization difficult. Motivated by an application in the design of bypass grafts for cardiovascular surgery with uncertainty about input parameters, we use Bayesian methods to design an algorithm that exploits this random vector's low-dimensionality to improve performance." + }, + { + "title": "Practical Bayesian Optimization of Machine Learning Algorithms", + "abstract": "The use of machine learning algorithms frequently involves careful tuning of learning parameters and model hyperparameters. Unfortunately, this tuning is often a \"black art\" requiring expert experience, rules of thumb, or sometimes brute-force search. There is therefore great appeal for automatic approaches that can optimize the performance of any given learning algorithm to the problem at hand. In this work, we consider this problem through the framework of Bayesian optimization, in which a learning algorithm's generalization performance is modeled as a sample from a Gaussian process (GP). 
We show that certain choices for the nature of the GP, such as the type of kernel and the treatment of its hyperparameters, can play a crucial role in obtaining a good optimizer that can achieve expert-level performance. We describe new algorithms that take into account the variable cost (duration) of learning algorithm experiments and that can leverage the presence of multiple cores for parallel experimentation. We show that these proposed algorithms improve on previous automatic procedures and can reach or surpass human expert-level optimization for many algorithms including latent Dirichlet allocation, structured SVMs and convolutional neural networks." + }, + { + "title": "Twenty Questions with Noise: Bayes Optimal Policies for Entropy Loss", + "abstract": "We consider the problem of twenty questions with noisy answers, in which we seek to find a target by repeatedly choosing a set, asking an oracle whether the target lies in this set, and obtaining an answer corrupted by noise. Starting with a prior distribution on the target's location, we seek to minimize the expected entropy of the posterior distribution. We formulate this problem as a dynamic program and show that any policy optimizing the one-step expected reduction in entropy is also optimal over the full horizon. Two such Bayes optimal policies are presented: one generalizes the probabilistic bisection policy due to Horstein and the other asks a deterministic set of questions. We study the structural properties of the latter, and illustrate its use in a computer vision application." + }, + { + "title": "Entropy Search for Information-Efficient Global Optimization", + "abstract": "Contemporary global optimization algorithms are based on local measures of utility, rather than a probability measure over location and value of the optimum. They thus attempt to collect low function values, not to learn about the optimum. 
The reason for the absence of probabilistic global optimizers is that the corresponding inference problem is intractable in several ways. This paper develops desiderata for probabilistic optimization algorithms, then presents a concrete algorithm which addresses each of the computational intractabilities with a sequence of approximations and explicitly addresses the decision problem of maximizing information gain from each evaluation." + }, + { + "title": "The Correlated Knowledge Gradient for Simulation Optimization of Continuous Parameters using Gaussian Process Regression", + "abstract": "We extend the concept of the correlated knowledge-gradient policy for the ranking and selection of a finite set of alternatives to the case of continuous decision variables. We propose an approximate knowledge gradient for problems with continuous decision variables in the context of a Gaussian process regression model in a Bayesian setting, along with an algorithm to maximize the approximate knowledge gradient. In the problem class considered, we use the knowledge gradient for continuous parameters to sequentially choose where to sample an expensive noisy function in order to find the maximum quickly. We show that the knowledge gradient for continuous decisions is a generalization of the efficient global optimization algorithm proposed in [D. R. Jones, M. Schonlau and W. J. Welch, J. Global Optim., 13 (1998), pp. 455–492]." + }, + { + "title": "The Knowledge-Gradient Algorithm for Sequencing Experiments in Drug Discovery", + "abstract": "We present a new technique for adaptively choosing the sequence of molecular compounds to test in drug discovery. Beginning with a base compound, we consider the problem of searching for a chemical derivative of the molecule that best treats a given disease. 
The problem of choosing molecules to test to maximize the expected quality of the best compound discovered may be formulated mathematically as a ranking-and-selection problem in which each molecule is an alternative. We apply a recently developed algorithm, known as the knowledge-gradient algorithm, that uses correlations in our Bayesian prior distribution between the performance of different alternatives (molecules) to dramatically reduce the number of molecular tests required, but it has heavy computational requirements that limit the number of possible alternatives to a few thousand. We develop computational improvements that allow the knowledge-gradient method to consider much larger sets of alternatives, and we demonstrate the method on a problem with 87,120 alternatives." + }, + { + "title": "Convergence Rates of Efficient Global Optimization Algorithms", + "abstract": "Efficient global optimization is the problem of minimizing an unknown function f, using as few evaluations f(x) as possible. It can be considered as a continuum-armed bandit problem, with noiseless data and simple regret. Expected improvement is perhaps the most popular method for solving this problem; the algorithm performs well in experiments, but little is known about its theoretical properties. Implementing expected improvement requires a choice of Gaussian process prior, which determines an associated space of functions, its reproducing-kernel Hilbert space (RKHS). When the prior is fixed, expected improvement is known to converge on the minimum of any function in the RKHS. We begin by providing convergence rates for this procedure. The rates are optimal for functions of low smoothness, and we modify the algorithm to attain optimal rates for smoother functions. For practitioners, however, these results are somewhat misleading. Priors are typically not held fixed, but depend on parameters estimated from the data. 
For standard estimators, we show this procedure may never discover the minimum of f. We then propose alternative estimators, chosen to minimize the constants in the rate of convergence, and show these estimators retain the convergence rates of a fixed prior." + }, + { + "title": "A Tutorial on Bayesian Optimization of Expensive Cost Functions, with Application to Active User Modeling and Hierarchical Reinforcement Learning", + "abstract": "We present a tutorial on Bayesian optimization, a method of finding the maximum of expensive cost functions. Bayesian optimization employs the Bayesian technique of setting a prior over the objective function and combining it with evidence to get a posterior function. This permits a utility-based selection of the next observation to make on the objective function, which must take into account both exploration (sampling from areas of high uncertainty) and exploitation (sampling areas likely to offer improvement over the current best observation). We also present two detailed extensions of Bayesian optimization, with experiments---active user modelling with preferences, and hierarchical reinforcement learning---and a discussion of the pros and cons of Bayesian optimization based on our experiences." + }, + { + "title": "Bayesian data analysis.", + "abstract": "Bayesian methods have garnered huge interest in cognitive science as an approach to models of cognition and perception. On the other hand, Bayesian methods for data analysis have not yet made much headway in cognitive science against the institutionalized inertia of 20th century null hypothesis significance testing (NHST). Ironically, specific Bayesian models of cognition and perception may not long endure the ravages of empirical verification, but generic Bayesian methods for data analysis will eventually dominate. It is time that Bayesian data analysis became the norm for empirical methods in cognitive science. 
This article reviews a fatal flaw of NHST and introduces the reader to some benefits of Bayesian data analysis. The article presents illustrative examples of multiple comparisons in Bayesian analysis of variance and Bayesian approaches to statistical power. Copyright © 2010 John Wiley & Sons, Ltd. For further resources related to this article, please visit the WIREs website." + }, + { + "title": "Bayesian Monte Carlo for the Global Optimization of Expensive Functions", + "abstract": "In the last decades enormous advances have been made possible for modelling complex (physical) systems by mathematical equations and computer algorithms. To deal with very long running times of such models a promising approach has been to replace them by stochastic approximations based on a few model evaluations. In this paper we focus on the often occuring case that the system modelled has two types of inputs x = (xc, xe) with xc representing control variables and xe representing environmental variables. Typically, xc needs to be optimised, whereas xe are uncontrollable but are assumed to adhere to some distribution. In this paper we use a Bayesian approach to address this problem: we specify a prior distribution on the underlying function using a Gaussian process and use Bayesian Monte Carlo to obtain the objective function by integrating out environmental variables. Furthermore, we empirically evaluate several active learning criteria that were developed for the deterministic case (i.e., no environmental variables) and show that the ALC criterion appears significantly better than expected improvement and random selection." 
+ }, + { + "title": "Survey of modeling and optimization strategies to solve high-dimensional design problems with computationally-expensive black-box functions", + "abstract": null + }, + { + "title": "The Knowledge-Gradient Policy for Correlated Normal Beliefs", + "abstract": "We consider a Bayesian ranking and selection problem with independent normal rewards and a correlated multivariate normal belief on the mean values of these rewards. Because this formulation of the ranking and selection problem models dependence between alternatives' mean values, algorithms may use this dependence to perform efficiently even when the number of alternatives is very large. We propose a fully sequential sampling policy called the knowledge-gradient policy, which is provably optimal in some special cases and has bounded suboptimality in all others. We then demonstrate how this policy may be applied to efficiently maximize a continuous function on a continuous domain while constrained to a fixed number of noisy measurements." + }, + { + "title": "Approximate dynamic programming: solving the curses of dimensionality", + "abstract": "Approximate dynamic programming: solving the curses of dimensionality, by Warren B. Powell, Wiley Series in Probability and Statistics, Hoboken, NJ, J. Wiley & Sons, 2007, 488 pp., US$83.60 (hardcover), ISBN 978-0-470-17155-4 Dynamic programming introduced by Bellman back in the 1950s offers a unified approach to solving problems arising in various applications, such as stochastic control or managing entire economies. The principles of dynamic programming have also been widely used in solving largescale stochastic reservoir models. However, due to the three curses of dimensionality, the sizes of a state space as well as the size of the outcome and action space typically grow exponentially in the number of state variables. Therefore, dynamic programming has limited practical applicability when used to find the exact solution of large-scale problems. 
One approach dealing with the curses of dimensionality is approximate dynamic programming. This book provides a unified and insightful treatment of approximate dynamic programming by integrating four distinct disciplines: Markov design processes, optimization, simulation, and statistics. Approximate Dynamic Programming is a comprehensive book that is designed to offer an introduction to the field. The book provides detailed coverage of modelling decision processes under uncertainty, robustness, designing and estimating value function approximations, choosing effective step-size rules, and convergence issues. In summary, the book Approximate Dynamic Programming is well written, highly pedagogical, with a clear and precise presentation of the material. Each chapter ends with bibliographic notes and well-selected problems that make this material an excellent textbook for advanced undergraduate and beginning graduate students. In addition, the author offers a companion webpage that includes additional problems and their solutions, as well as data sets to reinforce the book’s main concepts. I highly recommend this book for everyone interested in learning the theory and applications of approximate dynamic programming." + }, + { + "title": "Engineering Design via Surrogate Modelling - A Practical Guide", + "abstract": "Preface. About the Authors. Foreword. Prologue. Part I: Fundamentals. 1. Sampling Plans. 1.1 The 'Curse of Dimensionality' and How to Avoid It. 1.2 Physical versus Computational Experiments. 1.3 Designing Preliminary Experiments (Screening). 1.3.1 Estimating the Distribution of Elementary Effects. 1.4 Designing a Sampling Plan. 1.4.1 Stratification. 1.4.2 Latin Squares and Random Latin Hypercubes. 1.4.3 Space-filling Latin Hypercubes. 1.4.4 Space-filling Subsets. 1.5 A Note on Harmonic Responses. 1.6 Some Pointers for Further Reading. References. 2. Constructing a Surrogate. 2.1 The Modelling Process. 
2.1.1 Stage One: Preparing the Data and Choosing a Modelling Approach. 2.1.2 Stage Two: Parameter Estimation and Training. 2.1.3 Stage Three: Model Testing. 2.2 Polynomial Models. 2.2.1 Example One: Aerofoil Drag. 2.2.2 Example Two: a Multimodal Testcase. 2.2.3 What About the k -variable Case? 2.3 Radial Basis Function Models. 2.3.1 Fitting Noise-Free Data. 2.3.2 Radial Basis Function Models of Noisy Data. 2.4 Kriging. 2.4.1 Building the Kriging Model. 2.4.2 Kriging Prediction. 2.5 Support Vector Regression. 2.5.1 The Support Vector Predictor. 2.5.2 The Kernel Trick. 2.5.3 Finding the Support Vectors. 2.5.4 Finding . 2.5.5 Choosing C and epsilon. 2.5.6 Computing epsilon : v -SVR 71. 2.6 The Big(ger) Picture. References. 3. Exploring and Exploiting a Surrogate. 3.1 Searching the Surrogate. 3.2 Infill Criteria. 3.2.1 Prediction Based Exploitation. 3.2.2 Error Based Exploration. 3.2.3 Balanced Exploitation and Exploration. 3.2.4 Conditional Likelihood Approaches. 3.2.5 Other Methods. 3.3 Managing a Surrogate Based Optimization Process. 3.3.1 Which Surrogate for What Use? 3.3.2 How Many Sample Plan and Infill Points? 3.3.3 Convergence Criteria. 3.3.4 Search of the Vibration Isolator Geometry Feasibility Using Kriging Goal Seeking. References. Part II: Advanced Concepts. 4. Visualization. 4.1 Matrices of Contour Plots. 4.2 Nested Dimensions. Reference. 5. Constraints. 5.1 Satisfaction of Constraints by Construction. 5.2 Penalty Functions. 5.3 Example Constrained Problem. 5.3.1 Using a Kriging Model of the Constraint Function. 5.3.2 Using a Kriging Model of the Objective Function. 5.4 Expected Improvement Based Approaches. 5.4.1 Expected Improvement With Simple Penalty Function. 5.4.2 Constrained Expected Improvement. 5.5 Missing Data. 5.5.1 Imputing Data for Infeasible Designs. 5.6 Design of a Helical Compression Spring Using Constrained Expected Improvement. 5.7 Summary. References. 6. Infill Criteria With Noisy Data. 6.1 Regressing Kriging. 
6.2 Searching the Regression Model. 6.2.1 Re-Interpolation. 6.2.2 Re-Interpolation With Conditional Likelihood Approaches. 6.3 A Note on Matrix Ill-Conditioning. 6.4 Summary. References. 7. Exploiting Gradient Information. 7.1 Obtaining Gradients. 7.1.1 Finite Differencing. 7.1.2 Complex Step Approximation. 7.1.3 Adjoint Methods and Algorithmic Differentiation. 7.2 Gradient-enhanced Modelling. 7.3 Hessian-enhanced Modelling. 7.4 Summary. References. 8. Multi-fidelity Analysis. 8.1 Co-Kriging. 8.2 One-variable Demonstration. 8.3 Choosing X c and X e . 8.4 Summary. References. 9. Multiple Design Objectives. 9.1 Pareto Optimization. 9.2 Multi-objective Expected Improvement. 9.3 Design of the Nowacki Cantilever Beam Using Multi-objective, Constrained Expected Improvement. 9.4 Design of a Helical Compression Spring Using Multi-objective, Constrained Expected Improvement. 9.5 Summary. References. Appendix: Example Problems. A.1 One-Variable Test Function. A.2 Branin Test Function. A.3 Aerofoil Design. A.4 The Nowacki Beam. A.5 Multi-objective, Constrained Optimal Design of a Helical Compression Spring. A.6 Novel Passive Vibration Isolator Feasibility. References. Index." + }, + { + "title": "A Knowledge-Gradient Policy for Sequential Information Collection", + "abstract": "In a sequential Bayesian ranking and selection problem with independent normal populations and common known variance, we study a previously introduced measurement policy which we refer to as the knowledge-gradient policy. This policy myopically maximizes the expected increment in the value of information in each time period, where the value is measured according to the terminal utility function. We show that the knowledge-gradient policy is optimal both when the horizon is a single time period and in the limit as the horizon extends to infinity. We show furthermore that, in some special cases, the knowledge-gradient policy is optimal regardless of the length of any given fixed total sampling horizon. 
We bound the knowledge-gradient policy's suboptimality in the remaining cases, and show through simulations that it performs competitively with or significantly better than other policies." + }, + { + "title": "Multi-fidelity optimization via surrogate modelling", + "abstract": "This paper demonstrates the application of correlated Gaussian process based approximations to optimization where multiple levels of analysis are available, using an extension to the geostatistical method of co-kriging. An exchange algorithm is used to choose which points of the search space to sample within each level of analysis. The derivation of the co-kriging equations is presented in an intuitive manner, along with a new variance estimator to account for varying degrees of computational ‘noise’ in the multiple levels of analysis. A multi-fidelity wing optimization is used to demonstrate the methodology." + }, + { + "title": "Parallel radial basis function methods for the global optimization of expensive functions", + "abstract": null + }, + { + "title": "Most likely heteroscedastic Gaussian process regression", + "abstract": "This paper presents a novel Gaussian process (GP) approach to regression with input-dependent noise rates. We follow Goldberg et al.'s approach and model the noise variance using a second GP in addition to the GP governing the noise-free output value. In contrast to Goldberg et al., however, we do not use a Markov chain Monte Carlo method to approximate the posterior noise variance but a most likely noise approach. The resulting model is easy to implement and can directly be used in combination with various existing extensions of the standard GPs such as sparse approximations. Extensive experiments on both synthetic and real-world data, including a challenging perception problem in robotics, show the effectiveness of most likely heteroscedastic GP regression." 
+ }, + { + "title": "Watershed calibration using multistart local optimization and evolutionary optimization with radial basis function approximation", + "abstract": "Abstract Calibration of computationally expensive watershed models is more feasible with algorithms that require fewer simulations. This paper compares the performance of seven global optimization algorithms on a 14-parameter and an 8-parameter watershed calibration problem. The optimization algorithms include Shuffled Complex Evolution (SCE), Differential Evolution (DE), an evolutionary algorithm that uses Radial Basis Function (RBF) approximation (ESGRBF), and four types of local optimization methods coupled with the Multi-Level Single Linkage (MLSL) multistart procedure. The four local optimization algorithms are: Sequential Quadratic Programming, which is a derivative-based method; Unconstrained Optimization by Quadratic Approximation (UOBYQA), which is a derivativefree trust-region method; Pattern Search; and Implicit Filtering. The results indicate that ESGRBF is the most effective algorithm on the two calibration problems, followed by Implicit Filtering coupled with the MLSL multistart approach. Hence, this study provides some promising alternatives to the currently most widely used methods in watershed calibration, which did not perform as well." + }, + { + "title": "Automatic Gait Optimization with Gaussian Process Regression", + "abstract": "Gait optimization is a basic yet challenging problem for both quadrupedal and bipedal robots. Although techniques for automating the process exist, most involve local function optimization procedures that suffer from three key drawbacks. Local optimization techniques are naturally plagued by local optima, make no use of the expensive gait evaluations once a local step is taken, and do not explicitly model noise in gait evaluation. 
These drawbacks increase the need for a large number of gait evaluations, making optimization slow, data inefficient, and manually intensive. We present a Bayesian approach based on Gaussian process regression that addresses all three drawbacks. It uses a global search strategy based on a posterior model inferred from all of the individual noisy evaluations. We demonstrate the technique on a quadruped robot, using it to optimize two different criteria: speed and smoothness. We show in both cases our technique requires dramatically fewer gait evaluations than state-of-the-art local gradient approaches." + }, + { + "title": "Sequential kriging optimization using multiple-fidelity evaluations", + "abstract": null + }, + { + "title": "Statistical Improvement Criteria for Use in Multiobjective Design Optimization", + "abstract": "Design of experiment and response surface modeling methods are applied to the problem of constructing Pareto fronts for computationally expensive multiobjective design optimization problems. The work presented combines design of experiment methods with kriging (Gaussian process) models to enable the parallel evolution of multiobjective Pareto sets. This is achieved via the use of updating schemes based on new extensions of the expected improvement criterion commonly applied in single-objective searches. The approaches described provide a statistically coherent means of solving expensive multiobjective design problems using single-objective search tools. They are compared to the use of nondominated sorting genetic algorithm (NSGA-ii) based multiobjective searches, both with and without response surface support. The new approaches are shown to give more exact, wider ranging, and more evenly populated Pareto fronts than the genetic algorithm based searches at reduced or similar cost." 
+ }, + { + "title": "ParEGO: a hybrid algorithm with on-line landscape approximation for expensive multiobjective optimization problems", + "abstract": "This paper concerns multiobjective optimization in scenarios where each solution evaluation is financially and/or temporally expensive. We make use of nine relatively low-dimensional, nonpathological, real-valued functions, such as arise in many applications, and assess the performance of two algorithms after just 100 and 250 (or 260) function evaluations. The results show that NSGA-II, a popular multiobjective evolutionary algorithm, performs well compared with random search, even within the restricted number of evaluations used. A significantly better performance (particularly, in the worst case) is, however, achieved on our test set by an algorithm proposed herein-ParEGO-which is an extension of the single-objective efficient global optimization (EGO) algorithm of Jones et al. ParEGO uses a design-of-experiments inspired initialization procedure and learns a Gaussian processes model of the search landscape, which is updated after every function evaluation. Overall, ParEGO exhibits a promising performance for multiobjective optimization problems where evaluations are expensive or otherwise restricted in number." + }, + { + "title": "Elements of Information Theory", + "abstract": "Preface to the Second Edition. Preface to the First Edition. Acknowledgments for the Second Edition. Acknowledgments for the First Edition. 1. Introduction and Preview. 1.1 Preview of the Book. 2. Entropy, Relative Entropy, and Mutual Information. 2.1 Entropy. 2.2 Joint Entropy and Conditional Entropy. 2.3 Relative Entropy and Mutual Information. 2.4 Relationship Between Entropy and Mutual Information. 2.5 Chain Rules for Entropy, Relative Entropy, and Mutual Information. 2.6 Jensen's Inequality and Its Consequences. 2.7 Log Sum Inequality and Its Applications. 2.8 Data-Processing Inequality. 2.9 Sufficient Statistics. 
2.10 Fano's Inequality. Summary. Problems. Historical Notes. 3. Asymptotic Equipartition Property. 3.1 Asymptotic Equipartition Property Theorem. 3.2 Consequences of the AEP: Data Compression. 3.3 High-Probability Sets and the Typical Set. Summary. Problems. Historical Notes. 4. Entropy Rates of a Stochastic Process. 4.1 Markov Chains. 4.2 Entropy Rate. 4.3 Example: Entropy Rate of a Random Walk on a Weighted Graph. 4.4 Second Law of Thermodynamics. 4.5 Functions of Markov Chains. Summary. Problems. Historical Notes. 5. Data Compression. 5.1 Examples of Codes. 5.2 Kraft Inequality. 5.3 Optimal Codes. 5.4 Bounds on the Optimal Code Length. 5.5 Kraft Inequality for Uniquely Decodable Codes. 5.6 Huffman Codes. 5.7 Some Comments on Huffman Codes. 5.8 Optimality of Huffman Codes. 5.9 Shannon-Fano-Elias Coding. 5.10 Competitive Optimality of the Shannon Code. 5.11 Generation of Discrete Distributions from Fair Coins. Summary. Problems. Historical Notes. 6. Gambling and Data Compression. 6.1 The Horse Race. 6.2 Gambling and Side Information. 6.3 Dependent Horse Races and Entropy Rate. 6.4 The Entropy of English. 6.5 Data Compression and Gambling. 6.6 Gambling Estimate of the Entropy of English. Summary. Problems. Historical Notes. 7. Channel Capacity. 7.1 Examples of Channel Capacity. 7.2 Symmetric Channels. 7.3 Properties of Channel Capacity. 7.4 Preview of the Channel Coding Theorem. 7.5 Definitions. 7.6 Jointly Typical Sequences. 7.7 Channel Coding Theorem. 7.8 Zero-Error Codes. 7.9 Fano's Inequality and the Converse to the Coding Theorem. 7.10 Equality in the Converse to the Channel Coding Theorem. 7.11 Hamming Codes. 7.12 Feedback Capacity. 7.13 Source-Channel Separation Theorem. Summary. Problems. Historical Notes. 8. Differential Entropy. 8.1 Definitions. 8.2 AEP for Continuous Random Variables. 8.3 Relation of Differential Entropy to Discrete Entropy. 8.4 Joint and Conditional Differential Entropy. 8.5 Relative Entropy and Mutual Information. 
8.6 Properties of Differential Entropy, Relative Entropy, and Mutual Information. Summary. Problems. Historical Notes. 9. Gaussian Channel. 9.1 Gaussian Channel: Definitions. 9.2 Converse to the Coding Theorem for Gaussian Channels. 9.3 Bandlimited Channels. 9.4 Parallel Gaussian Channels. 9.5 Channels with Colored Gaussian Noise. 9.6 Gaussian Channels with Feedback. Summary. Problems. Historical Notes. 10. Rate Distortion Theory. 10.1 Quantization. 10.2 Definitions. 10.3 Calculation of the Rate Distortion Function. 10.4 Converse to the Rate Distortion Theorem. 10.5 Achievability of the Rate Distortion Function. 10.6 Strongly Typical Sequences and Rate Distortion. 10.7 Characterization of the Rate Distortion Function. 10.8 Computation of Channel Capacity and the Rate Distortion Function. Summary. Problems. Historical Notes. 11. Information Theory and Statistics. 11.1 Method of Types. 11.2 Law of Large Numbers. 11.3 Universal Source Coding. 11.4 Large Deviation Theory. 11.5 Examples of Sanov's Theorem. 11.6 Conditional Limit Theorem. 11.7 Hypothesis Testing. 11.8 Chernoff-Stein Lemma. 11.9 Chernoff Information. 11.10 Fisher Information and the Cram-er-Rao Inequality. Summary. Problems. Historical Notes. 12. Maximum Entropy. 12.1 Maximum Entropy Distributions. 12.2 Examples. 12.3 Anomalous Maximum Entropy Problem. 12.4 Spectrum Estimation. 12.5 Entropy Rates of a Gaussian Process. 12.6 Burg's Maximum Entropy Theorem. Summary. Problems. Historical Notes. 13. Universal Source Coding. 13.1 Universal Codes and Channel Capacity. 13.2 Universal Coding for Binary Sequences. 13.3 Arithmetic Coding. 13.4 Lempel-Ziv Coding. 13.5 Optimality of Lempel-Ziv Algorithms. Compression. Summary. Problems. Historical Notes. 14. Kolmogorov Complexity. 14.1 Models of Computation. 14.2 Kolmogorov Complexity: Definitions and Examples. 14.3 Kolmogorov Complexity and Entropy. 14.4 Kolmogorov Complexity of Integers. 14.5 Algorithmically Random and Incompressible Sequences. 
14.6 Universal Probability. 14.7 Kolmogorov complexity. 14.9 Universal Gambling. 14.10 Occam's Razor. 14.11 Kolmogorov Complexity and Universal Probability. 14.12 Kolmogorov Sufficient Statistic. 14.13 Minimum Description Length Principle. Summary. Problems. Historical Notes. 15. Network Information Theory. 15.1 Gaussian Multiple-User Channels. 15.2 Jointly Typical Sequences. 15.3 Multiple-Access Channel. 15.4 Encoding of Correlated Sources. 15.5 Duality Between Slepian-Wolf Encoding and Multiple-Access Channels. 15.6 Broadcast Channel. 15.7 Relay Channel. 15.8 Source Coding with Side Information. 15.9 Rate Distortion with Side Information. 15.10 General Multiterminal Networks. Summary. Problems. Historical Notes. 16. Information Theory and Portfolio Theory. 16.1 The Stock Market: Some Definitions. 16.2 Kuhn-Tucker Characterization of the Log-Optimal Portfolio. 16.3 Asymptotic Optimality of the Log-Optimal Portfolio. 16.4 Side Information and the Growth Rate. 16.5 Investment in Stationary Markets. 16.6 Competitive Optimality of the Log-Optimal Portfolio. 16.7 Universal Portfolios. 16.8 Shannon-McMillan-Breiman Theorem (General AEP). Summary. Problems. Historical Notes. 17. Inequalities in Information Theory. 17.1 Basic Inequalities of Information Theory. 17.2 Differential Entropy. 17.3 Bounds on Entropy and Relative Entropy. 17.4 Inequalities for Types. 17.5 Combinatorial Bounds on Entropy. 17.6 Entropy Rates of Subsets. 17.7 Entropy and Fisher Information. 17.8 Entropy Power Inequality and Brunn-Minkowski Inequality. 17.9 Inequalities for Determinants. 17.10 Inequalities for Ratios of Determinants. Summary. Problems. Historical Notes. Bibliography. List of Symbols. Index." 
+ }, + { + "title": "One-Dimensional global optimization for observations with noise", + "abstract": null + }, + { + "title": "A parallel updating scheme for approximating and optimizing high fidelity computer simulations", + "abstract": null + }, + { + "title": "Gaussian Processes For Machine Learning", + "abstract": "Gaussian processes (GPs) are natural generalisations of multivariate Gaussian random variables to infinite (countably or continuous) index sets. GPs have been applied in a large number of fields to a diverse range of ends, and very many deep theoretical analyses of various properties are available. This paper gives an introduction to Gaussian processes on a fairly elementary level with special emphasis on characteristics relevant in machine learning. It draws explicit connections to branches such as spline smoothing models and support vector machines in which similar ideas have been investigated. Gaussian process models are routinely used to solve hard machine learning problems. They are attractive because of their flexible non-parametric nature and computational simplicity. Treated within a Bayesian framework, very powerful statistical methods can be implemented which offer valid estimates of uncertainties in our predictions and generic model selection procedures cast as nonlinear optimization problems. Their main drawback of heavy computational scaling has recently been alleviated by the introduction of generic sparse approximations.13,78,31 The mathematical literature on GPs is large and often uses deep concepts which are not required to fully understand most machine learning applications. In this tutorial paper, we aim to present characteristics of GPs relevant to machine learning and to show up precise connections to other \"kernel machines\" popular in the community. Our focus is on a simple presentation, but references to more detailed sources are provided." 
+ }, + { + "title": "Envelope Theorems for Arbitrary Choice Sets", + "abstract": "The standard envelope theorems apply to choice sets with convex and topological structure, providing sufficient conditions for the value function to be differentiable in a parameter and characterizing its derivative. This paper studies optimization with arbitrary choice sets and shows that the traditional envelope formula holds at any differentiability point of the value function. We also provide conditions for the value function to be, variously, absolutely continuous, left- and right-differentiable, or fully differentiable. These results are applied to mechanism design, convex programming, continuous optimization problems, saddle-point problems, problems with parameterized constraints, and optimal stopping problems." + }, + { + "title": "New Two-Stage and Sequential Procedures for Selecting the Best Simulated System", + "abstract": "Standard \"indifference-zone\" procedures that allocate computer resources to infer the best of a finite set of simulated systems are designed with a statistically conservative, least favorable configuration assumption consider the probability of correct selection (but not the opportunity cost) and assume that the cost of simulating each system is the same. Recent Bayesian work considers opportunity cost and shows that an average case analysis may be less conservative but assumes a known output variance, an assumption that typically is violated in simulation. This paper presents new two-stage and sequential selection procedures that integrate attractive features of both lines of research. They are derived assuming that the simulation output is normally distributed with unknown mean and variance that may differ for each system. We permit the reduction of either opportunity cost loss or the probability of incorrect selection and allow for different replication costs for each system. 
The generality of our formulation comes at the expense of difficulty in obtaining exact closed-form solutions. We therefore derive a bound for the expected loss associated potentially incorrect selections, then asymptotically minimize that bound. Theoretical and empirical results indicate that our approach compares favorably with indifference-zone procedures." + }, + { + "title": "Design and analysis of robust total joint replacements: finite element model experiments with environmental variables.", + "abstract": "Computer simulation of orthopaedic devices can be prohibitively time consuming, particularly when assessing multiple design and environmental factors. Chang et al. (1999) address these computational challenges using an efficient statistical predictor to optimize a flexible hip implant, defined by a midstem reduction, subjected to multiple environmental conditions. Here, we extend this methodology by: (1) explicitly considering constraint equations in the optimization formulation, (2) showing that the optimal design for one environmental distribution is robust to alternate distributions, and (3) illustrating a sensitivity analysis technique to determine influential design and environmental factors. A thin midstem diameter with a short stabilizing distal tip minimized the bone remodeling signal while maintaining satisfactory stability. Hip joint force orientation was more influential than the effect of the controllable design variables on bone remodeling and the cancellous bone elastic modulus had the most influence on relative motion, both results indicating the importance of including uncontrollable environmental factors. The optimal search indicated that only 16 to 22 computer simulations were necessary to predict the optimal design, a significant savings over traditional search techniques." 
+ }, + { + "title": "On the Convergence of the P-Algorithm for One-Dimensional Global Optimization of Smooth Functions", + "abstract": null + }, + { + "title": "Efficient Global Optimization of Expensive Black-Box Functions", + "abstract": null + }, + { + "title": "A rigorous framework for optimization of expensive functions by surrogates", + "abstract": null + }, + { + "title": "Average performance of a class of adaptive algorithms for global optimization", + "abstract": "We describe a class of adaptive algorithms for approximating the global minimum of a continuous function on the unit interval. The limiting distribution of the error is derived under the assumption of Wiener measure on the objective functions. For any 8 > 0, we construct an algorithm which has error converging to zero at rate n\"\"*- 1 ~ ^ in the number of function evaluations n. This convergence rate contrasts with the 7i~ 1/2 rate of previously studied nonadaptive methods." + }, + { + "title": "Reinforcement Learning: A Survey", + "abstract": "This paper surveys the field of reinforcement learning from a computer-science perspective. It is written to be accessible to researchers familiar with machine learning. Both the historical basis of the field and a broad selection of current work are summarized. Reinforcement learning is the problem faced by an agent that learns behavior through trial-and-error interactions with a dynamic environment. The work described here has a resemblance to work in psychology, but differs considerably in the details and in the use of the word \"reinforcement.\" The paper discusses central issues of reinforcement learning, including trading off exploration and exploitation, establishing the foundations of the field via Markov decision theory, learning from delayed reinforcement, constructing empirical models to accelerate learning, making use of generalization and hierarchy, and coping with hidden state. 
It concludes with a survey of some implemented systems and an assessment of the practical utility of current methods for reinforcement learning." + }, + { + "title": "Bayesian approach to global optimization and application to multiobjective and constrained problems", + "abstract": null + }, + { + "title": "On the limited memory BFGS method for large scale optimization", + "abstract": null + }, + { + "title": "Bayesian Approach to Global Optimization: Theory and Applications", + "abstract": "1 Global optimization and the Bayesian approach.- 1.1 What is global optimization?.- 1.2 Advantages of the Bayesian approach to global optimization.- 2 The conditions of Bayesian optimality.- 2.1 Introduction.- 2.2 Reduction to dynamic programming equations.- 2.3 The existence of a measurable solution.- 2.4 The calculation of conditional expectations.- 2.5 The one-step approximation.- 2.6 The adaptive Bayesian approach.- 3 The axiomatic non-probabilistic justification of Bayesian optimality conditions.- 3.1 Introduction.- 3.2 The linearity of the loss function.- 3.3 The existence of the unique a priori probability corresponding to subjective preferences.- 3.4 Optimal method under uncertainty.- 3.5 Nonlinear loss functions.- 4 Stochastic models.- 4.1 Introduction.- 4.2 Sufficient convergence conditions.- 4.3 The Gaussian field.- 4.4 Homogeneous Wiener field.- 4.5 A case of noisy observations.- 4.6 Estimation of parameters from dependent observations.- 5 Bayesian methods for global optimization in the Gaussian case.- 5.1 The one-step approximation.- 5.2 Adaptive models.- 5.3 Extrapolation models.- 5.4 Maximum likelihood models.- 5.5 The comparison of algorithms.- 5.6 The Bayesian approach to global optimization with linear constraints.- 5.7 The Bayesian approach to global optimization with nonlinear constraints.- 5.8 The Bayesian approach to multi-objective optimization.- 5.9 Interactive procedures and the Bayesian approach to global optimization.- 5.10 The reduction of 
multi-dimensional data.- 5.11 The stopping rules.- 6 The analysis of structure and the simplification of the optimization problems.- 6.1 Introduction.- 6.2 Structural characteristics and the optimization problem.- 6.3 The estimation of structural characteristics.- 6.4 The estimation of a simplification error.- 6.5 Examples of the estimates.- 7 The Bayesian approach to local optimization.- 7.1 Introduction.- 7.2 The one-dimensional Bayesian model.- 7.3 Convergence of the local Bayesian algorithm.- 7.4 Generalization of a multi-dimensional case.- 7.5 Convergence in the multi-dimensional case.- 7.6 The local Bayesian algorithm.- 7.7 Results of computer simulation.- 8 The application of Bayesian methods.- 8.1 Introduction.- 8.2 The optimization of an electricity meter.- 8.3 The optimization of vibromotors.- 8.4 The optimization of a shock-absorber.- 8.5 The optimization of a magnetic beam deflection system.- 8.6 The optimization of small aperture coupling between a rectangular waveguide and a microstrip line.- 8.7 The maximization of LSI yield by optimization of parameters of differential amplifier functional blocks.- 8.8 Optimization of technology to avoid waste in the wet-etching of printed circuit boards in iron-copper-chloride solutions.- 8.9 The optimization of pigment compounds.- 8.10 The least square estimation of electrochemical adsorption using observations of the magnitude of electrode impedance.- 8.11 Estimation of parameters of the immunological model.- 8.12 The optimization of nonstationary queuing systems.- 8.13 The analysis of structure of the Steiner problem.- 8.14 The estimation of decision making by intuition on the example of the Steiner problem.- 9 Portable FORTRAN software for global optimization.- 9.1 Introduction.- 9.2 Parameters.- 9.3 Methods available.- 9.4 Common blocks.- 9.5 The function.- 9.6 The main program.- 9.7 The example of the main program.- 9.8 Description of routines.- 9.9 BAYES1, the global Bayesian method by Mockus.- 9.10 UNT, the 
global method of extrapolation type by Zilinskas.- 9.11 LPMIN, the global method of uniform search by Sobolj, Shaltenis and Dzemyda.- 9.12 GLOPT, the global method of clustering type by Torn.- 9.13 MIG1, the global method of Monte Carlo (uniform random search).- 9.14 MIG2, the modified version of MIG 1.- 9.15 EXTR, the global one-dimensional method by Zilinskas.- 9.16 MIVAR4, the local method of variable metrics by Tieshis.- 9.17 REQP, the local method of recursive quadratic programming by Biggs.- 9.18 FLEXI, the local simplex method by Nelder and Mead.- 9.19 LBAYES, the local Bayesian method by Mockus.- 9.20 ANAL1, the method of analysis by structure by Shaltenis.- 9.21 Portability routines.- References.- Appendix 1 The software for global optimization for IMB/PC/XT/AT and compatibles.- Appendix 2 How the global optimization software can improve the performance of your CAD system.- Appendix 3 Machine dependent constants of portable FORTRAN." + }, + { + "title": "Statistical Decision Theory and Bayesian Analysis", + "abstract": "An overview of statistical decision theory, which emphasizes the use and application of the philosophical ideas and mathematical structure of decision theory. The text assumes a knowledge of basic probability theory and some advanced calculus is also required." + }, + { + "title": "Infinitesimal and finite perturbation analysis for queueing networks", + "abstract": null + }, + { + "title": "On Bayesian Methods for Seeking the Extremum", + "abstract": null + }, + { + "title": "A New Method of Locating the Maximum Point of an Arbitrary Multipeak Curve in the Presence of Noise", + "abstract": null + }, + { + "title": "The Greatest of a Finite Set of Random Variables", + "abstract": "The variables ξ1,..., ξn have a joint normal distribution. We are concerned with the calculation or approximation of maxξ1,..., ξn. 
Current analyses and tables handle the case in which the ξi are independently distributed with common expected values and common variances. This paper presents formulas and tables for the most general case with n = 2. When n > 2, the problem becomes cumbersome. This paper presents formulas and tables that permit approximations to the moments in case n > 2. The moments are approximated by iteration of a three-parameter computation or, alternatively, through successive use of a three-parameter table, which is given. Recent applications of the theory are described." + }, + { + "title": "Multidimensional Stochastic Approximation Methods", + "abstract": null + }, + { + "title": "A Stochastic Approximation Method", + "abstract": "Let M(x) denote the expected value at level x of the response to a certain experiment. M(x) is assumed to be a monotone function of x but is unknown tot he experiment, and it is desire to find the solution x=0 of the equation M(x) = a, where x is a given constant. we give a method for making successive experiments at levels x1, x2,... in such a way that x, will tend to 0 in probability." + }, + { + "title": "Multi-start Methods", + "abstract": null + }, + { + "title": "Gaussian Process Bandit Optimisation with Multi-fidelity Evaluations", + "abstract": "In many scientific and engineering applications, we are tasked with the optimisation of an expensive to evaluate black box function $\\func$. Traditional methods for this problem assume just the availability of this single function. However, in many cases, cheap approximations to $\\func$ may be obtainable. For example, the expensive real world behaviour of a robot can be approximated by a cheap computer simulation. We can use these approximations to eliminate low function value regions cheaply and use the expensive evaluations of $\\func$ in a small but promising region and speedily identify the optimum. 
We formalise this task as a \\emph{multi-fidelity} bandit problem where the target function and its approximations are sampled from a Gaussian process. We develop \\mfgpucb, a novel method based on upper confidence bound techniques. In our theoretical analysis we demonstrate that it exhibits precisely the above behaviour, and achieves better regret than strategies which ignore multi-fidelity information. \\mfgpucbs outperforms such naive strategies and other multi-fidelity methods on several synthetic and real experiments." + }, + { + "title": "Taking the Human Out of the Loop: A Review of Bayesian Optimization", + "abstract": "Big Data applications are typically associated with systems involving large numbers of users, massive complex software systems, and large-scale heterogeneous computing and storage architectures. The construction of such systems involves many distributed design choices. The end products (e.g., recommendation systems, medical analysis tools, real-time game engines, speech recognizers) thus involve many tunable configuration parameters. These parameters are often specified and hard-coded into the software by various developers or teams. If optimized jointly, these parameters can result in significant improvements. Bayesian optimization is a powerful tool for the joint optimization of design choices that is gaining great popularity in recent years. It promises greater automation so as to increase both product quality and human productivity. This review paper introduces Bayesian optimization, highlights some of its methodological aspects, and showcases a wide range of applications." + }, + { + "title": "Do we need “ Harmless ” Bayesian Optimization and “ First-Order ” Bayesian Optimization ?", + "abstract": "A recent empirical study highlighted the shocking result that, for many hyperparameter tuning problems, Bayesian optimization methods can be outperformed by random guessing run for twice as many iterations [1]. 
This is supported by theoretical results showing the optimality of random search under certain assumptions, but disagrees with other theoretical and empirical results showing that Bayesian optimization can lead to large gains in some situations. In light of this fact, we propose two research directions that we believe the community should pursue. First, we should focus on developing “harmless” Bayesian optimization methods that do no worse than random, and we propose a very simple “harmless” algorithm. Second, we should focus on developing first-order Bayesian optimization algorithms that use gradient information to improve performance for situations where Bayesian optimization already beats random. We empirically show the advantage of both of these ideas in simple simulations. We also propose a simple strategy for reducing the memory and computational requirements of existing first-order Bayesian optimization methods by using directional derivatives instead of full gradients, which can be obtained from analytic functions even when gradient code is not available." + }, + { + "title": "Twitter acquires machine learning startup whetlab", + "abstract": null + }, + { + "title": "Stochastic Gradient Descent Tricks", + "abstract": null + }, + { + "title": "Kriging is well-suited to parallelize optimization", + "abstract": null + }, + { + "title": "Towards Gaussian Process-based Optimization with Finite Time Horizon", + "abstract": null + }, + { + "title": "Multi-Armed Bandit Problems", + "abstract": null + }, + { + "title": "Gaussian Processes for Global Optimization", + "abstract": "We introduce a novel Bayesian approach to global optimization using Gaussian processes. We frame the optimization of both noisy and noiseless functions as sequential decision problems, and introduce myopic and non-myopic solutions to them. Here our solutions can be tailored to exactly the degree of confidence we require of them. 
The use of Gaussian processes allows us to benefit from the incorporation of prior knowledge about our objective function, and also from any derivative observations. Using this latter fact, we introduce an innovative method to combat conditioning problems. Our algorithm demonstrates a significant improvement over its competitors in overall performance across a wide range of canonical test problems." + }, + { + "title": "Practical bayesian optimization", + "abstract": "Global optimization of non-convex functions over real vector spaces is a problem of widespread theoretical and practical interest. In the past fifty years, research in global optimization has produced many important approaches including Lipschitz optimization, simulated annealing, homotopy methods, genetic algorithms, and Bayesian response-surface methods. This work examines the last of these approaches. The Bayesian response-surface approach to global optimization maintains a posterior model of the function being optimized by combining a prior over functions with accumulating function evaluations. The model is then used to compute which point the method should acquire next in its search for the optimum of the function. Bayesian methods can be some of the most efficient approaches to optimization in terms of the number of function evaluations required, but they have significant drawbacks: Current approaches are needlessly data-inefficient, approximations to the Bayes-optimal acquisition criterion are poorly studied, and current approaches do not take advantage of the small-scale properties of differentiable functions near local optima. This work addresses each of these problems to make Bayesian methods more widely applicable." 
+ }, + { + "title": "A Multi-points Criterion for Deterministic Parallel Global Optimization based on Kriging", + "abstract": null + }, + { + "title": "Improved Strategies for Radial basis Function Methods for Global Optimization", + "abstract": null + }, + { + "title": "Constrained Global Optimization of Expensive Black Box Functions Using Radial Basis Functions", + "abstract": null + }, + { + "title": "A ONE-DIMENSIONAL", + "abstract": ". Algorithms based on statistical models compete favorably with other global optimization algorithms as shown by extensive testing results. A theoretical inadequacy of previously used statistical models for smooth ob-jective functions was eliminated by the authors who in a recent paper have constructed a P-algorithm for a statistical model of smooth functions. In the present note a modification of that P-algorithm with an improved convergence rate is described." + }, + { + "title": "Slice Sampling", + "abstract": "Markov chain sampling methods that adapt to characteristics of the distribution being sampled can be constructed using the principle that one can ample from a distribution by sampling uniformly from the region under the plot of its density function. A Markov chain that converges to this uniform distribution can be constructed by alternating uniform sampling in the vertical direction with uniform sampling from the horizontal \"slice\" defined by the current vertical position, or more generally, with some update that leaves the uniform distribution over this slice invariant. Such \"slice sampling\" methods are easily implemented for univariate distributions, and can be used to sample from a multivariate distribution by updating each variable in turn. This approach is often easier to implement than Gibbs sampling and more efficient than simple Metropolis updates, due to the ability of slice sampling to adaptively choose the magnitude of changes made. It is therefore attractive for routine and automated use. 
Slice sampling methods that update all variables simultaneously are also possible. These methods can adaptively choose the magnitudes of changes made to each variable, based on the local properties of the density function. More ambitiously, such methods could potentially adapt to the dependencies between variables by constructing local quadratic approximations. Another approach is to improve sampling efficiency by suppressing random walks. This can be done for univariate slice sampling by \"overrelaxation,\" and for multivariate slice sampling by \"reflection\" from the edges of the slice." + }, + { + "title": "Flexibility and efficiency enhancements for constrained global design optimization with kriging approximations.", + "abstract": null + }, + { + "title": "Thesis Supervisor Accepted by.......................................................................", + "abstract": null + }, + { + "title": "Sequential design of computer experiments to minimize integrated response functions", + "abstract": "In the last ten to fifteen years many phenomena that could be studied only using physical experiments can now be studied by computer experiments. Advances in the mathematical modeling of many physical processes, in algorithms for solving mathematical systems, and in computer speeds, have combined to make it possible to augment or replace physical experiments with computer experiments. In a computer experiment, a response z( x), usually deterministic, is computed for each set of input variables, x, according to an experimental design strategy. This strategy is determined by the goal of the experiment and depends, for example, on whether response prediction at unsampled input sites or response optimization is of primary interest. \nWe are concerned with the commonly occuring situation in which there are two types of input variables: suppose x = ( xc, x e) where xc is a set of “control” (manufacturing) variables and xe is a set of “environmental” (noise) variables. 
Manufacturing variables can be controlled while noise variables are not controllable but have values governed by some probability distribution. \nFor single response settings, we introduce a sequential experimental design for finding the optimum of e(x c) = E[z(x c, Xe)], where the expectation is taken over the distribution of the environmental variables. For bivariate response settings, we introduce a sequential experimental design for finding the constrained optimum of e1( xc)) = E[z( xc, X e)], subject to e2 (x c) = E[z2(x c, Xe)] ≤ U. The approach is Bayesian; the prior information is that the responses are a draw from a stationary Gaussian stochastic process with correlation function belonging to a parametric family with unknown parameters. The idea of the methods is to compute the posterior expected “improvement” over the current optimum for each untested site; the design selects the next site to maximize the expected improvement. Both procedures are illustrated by examples utilizing test functions from the numerical optimization literature." + }, + { + "title": "Global versus local search in constrained optimization of computer models", + "abstract": null + }, + { + "title": "Reinforcement Learning: An Introduction", + "abstract": "Reinforcement learning, one of the most active research areas in artificial intelligence, is a computational approach to learning whereby an agent tries to maximize the total amount of reward it receives when interacting with a complex, uncertain environment. In Reinforcement Learning, Richard Sutton and Andrew Barto provide a clear and simple account of the key ideas and algorithms of reinforcement learning. Their discussion ranges from the history of the field's intellectual foundations to the most recent developments and applications. The only necessary mathematical background is familiarity with elementary concepts of probability. The book is divided into three parts. 
Part I defines the reinforcement learning problem in terms of Markov decision processes. Part II provides basic solution methods: dynamic programming, Monte Carlo methods, and temporal-difference learning. Part III presents a unified view of the solution methods and incorporates artificial neural networks, eligibility traces, and planning; the two final chapters present case studies and consider the future of reinforcement learning." + }, + { + "title": "Design and analysis of simulation experiments", + "abstract": "We define simulation as the process of designing a model of a real system and conducting experiments with this model for the purpose either of understanding the behavior of the system or of evaluating various strategies for the operation of the system [25]. It then follows that we must concern ourselves with the strategic and tactical planning of how to design and run an experiment that will yield the desired information." + }, + { + "title": "Single-step Bayesian search method for an extremum of functions of a single variable", + "abstract": null + }, + { + "title": "Controlled Markov Processes", + "abstract": null + }, + { + "title": "Proceedings of the Twenty-Third International Joint Conference on Artificial Intelligence Bayesian Optimization in High Dimensions via Random Embeddings", + "abstract": null + } + ] + }, + "author_data": {}, + "reference_proposal": "**[Question 1] - What is the problem?** \nHow can we effectively optimize expensive and high-dimensional functions using Bayesian optimization techniques?\n\n**[Question 2] - Why is it interesting and important?** \nSolving this problem is crucial for the research community as it can significantly enhance the efficiency of optimization processes in various fields, including materials science, engineering, and machine learning. 
By improving Bayesian optimization methods, researchers can tackle complex problems that involve expensive evaluations, leading to advancements in knowledge and practical applications such as optimizing hyperparameters in machine learning models or designing new materials. This could pave the way for more efficient algorithms and methodologies that can handle high-dimensional spaces, ultimately influencing future research directions and applications.\n\n**[Question 3] - Why is it hard?** \nThe challenges in solving this problem stem from the high dimensionality of the function space and the computational expense associated with evaluating the objective functions. Naive approaches may fail because they do not adequately account for the uncertainty and noise in the function evaluations, leading to suboptimal exploration and exploitation strategies. Additionally, the complexity of modeling the underlying function with Gaussian processes can introduce theoretical and practical obstacles, such as the need for effective kernel selection and managing computational costs associated with large datasets.\n\n**[Question 4] - Why hasn't it been solved before?** \nPrevious research has often focused on specific aspects of Bayesian optimization, such as single-task optimization or low-dimensional problems, leaving a gap in addressing high-dimensional and expensive function optimization comprehensively. Barriers include limitations in existing algorithms' scalability and their inability to efficiently handle noise and uncertainty in evaluations. 
My approach differs by integrating advanced techniques such as random embeddings and parallel optimization strategies, which can enhance the performance and applicability of Bayesian optimization in high-dimensional settings.\n\n**[Question 5] - What are the key components of my approach and results?** \nMy proposed methodology involves utilizing a combination of Gaussian process regression and random embeddings to optimize high-dimensional functions. I will employ a benchmark dataset that includes various expensive functions to evaluate the performance of the optimization process. The key metrics for success will include the number of function evaluations required to reach a specified accuracy and the overall computational efficiency. The expected outcomes are improved optimization performance in terms of convergence speed and accuracy, demonstrating the effectiveness of the proposed approach in real-world applications." + }, + "1808.03867": { + "paper_data": { + "title": "Pervasive Attention: 2D Convolutional Neural Networks for Sequence-to-Sequence Prediction", + "url": "http://arxiv.org/abs/1808.03867v3", + "arxiv_id": "1808.03867", + "authors": [ + "Maha Elbayad", + "Laurent Besacier", + "Jakob Verbeek" + ], + "abstract": "Current state-of-the-art machine translation systems are based on encoder-decoder architectures, that first encode the input sequence, and then generate an output sequence based on the input encoding. Both are interfaced with an attention mechanism that recombines a fixed encoding of the source tokens based on the decoder state. We propose an alternative approach which instead relies on a single 2D convolutional neural network across both sequences. Each layer of our network re-codes source tokens on the basis of the output sequence produced so far. Attention-like properties are therefore pervasive throughout the network. 
Our model yields excellent results, outperforming state-of-the-art encoder-decoder systems, while being conceptually simpler and having fewer parameters.", + "introduction": " Introduction Deep neural networks have made a profound im- pact on natural language processing technology in general, and machine translation in particu- lar (Kalchbrenner and Blunsom, 2013; Sutskever et al., 2014; Cho et al., 2014; Jean et al., 2015; LeCun et al., 2015). Machine translation (MT) can be seen as a sequence-to-sequence prediction problem, where the source and target sequences are of different and variable length. Current state-of-the-art approaches are based on encoder- decoder architectures (Kalchbrenner and Blun- som, 2013; Sutskever et al., 2014; Cho et al., 2014; Bahdanau et al., 2015). The encoder “reads” the variable-length source sequence and maps it into a vector representation. The decoder takes this vector as input and “writes” the target sequence, updating its state each step with the most recent word that it generated. The basic encoder-decoder model is generally equipped with an attention model (Bahdanau et al., 2015), which repetitivelyre-accesses the source sequence during the decod- ing process. Given the current state of the decoder, a probability distribution over the elements in the source sequence is computed, which is then used to select or aggregate features of these elements into a single “context” vector that is used by the decoder. Rather than relying on the global rep- resentation of the source sequence, the attention mechanism allows the decoder to “look back” into the source sequence and focus on salient positions. Besides this inductive bias, the attention mecha- nism bypasses the problem of vanishing gradients that most recurrent architectures encounter. 
However, the current attention mechanisms have limited modeling abilities and are generally a simple weighted sum of the source representations (Bahdanau et al., 2015; Luong et al., 2015), where the weights are the result of a shallow matching between source and target elements. The atten- tion module re-combines the same source token codes and is unable to re-encode or re-interpret the source sequence while decoding. To address these limitations, we propose an al- ternative neural MT architecture, based on deep 2D convolutional neural networks (CNNs). The product space of the positions in source and tar- get sequences defines the 2D grid over which the network is defined. The convolutional filters are masked to prohibit accessing information derived from future tokens in the target sequence, obtain- ing an autoregressive model akin to generative models for images and audio waveforms (Oord et al., 2016a,b). See Figure 1 for an illustration. This approach allows us to learn deep feature hierarchies based on a stack of 2D convolutional layers, and benefit from parallel computation dur- ing training. Every layer of our network computes features of the the source tokens, based on the tar- get sequence produced so far, and uses these to predict the next output token. Our model therefore 1arXiv:1808.03867v3 [cs.CL] 1 Nov 2018Figure 1 : Convolutional layers in our model use masked 3\u00023filters so that features are only com- puted from previous output symbols. Illustration of the receptive fields after one (dark blue) and two layers (light blue), together with the masked part of the field of view of a normal 3\u00023filter (gray). has attention-like capabilities by construction, that are pervasive throughout the layers of the network, rather than using an “add-on” attention model. We validate our model with experiments we only use max-pooling for simplicity, unless stated otherwise. 
In Figure 4 we consider the effect of the token embedding size, the growth rate of the network, and its depth. The token embedding size together with the growth rate gcontrol the dimension of the final feature used for estimating the", + "references": [ + { + "title": "Towards Two-Dimensional Sequence to Sequence Model in Neural Machine Translation", + "abstract": "This work investigates an alternative model for neural machine translation (NMT) and proposes a novel architecture, where we employ a multi-dimensional long short-term memory (MDLSTM) for translation modelling. In the state-of-the-art methods, source and target sentences are treated as one-dimensional sequences over time, while we view translation as a two-dimensional (2D) mapping using an MDLSTM layer to define the correspondence between source and target words. We extend beyond the current sequence to sequence backbone NMT models to a 2D structure in which the source and target sentences are aligned with each other in a 2D grid. Our proposed topology shows consistent improvements over attention-based sequence to sequence model on two WMT 2017 tasks, German<->English." + }, + { + "title": "Latent Alignment and Variational Attention", + "abstract": "Neural attention has become central to many state-of-the-art models in natural language processing and related domains. Attention networks are an easy-to-train and effective method for softly simulating alignment; however, the approach does not marginalize over latent alignments in a probabilistic sense. This property makes it difficult to compare attention to other alignment approaches, to compose it with probabilistic models, and to perform posterior inference conditioned on observed data. A related latent approach, hard attention, fixes these issues, but is generally harder to train and less accurate. 
This work considers variational attention networks, alternatives to soft and hard attention for learning latent variable alignment models, with tighter approximation bounds based on amortized variational inference. We further propose methods for reducing the variance of gradients to make these approaches computationally feasible. Experiments show that for machine translation and visual question answering, inefficient exact latent variable models outperform standard neural attention, but these gains go away when using hard attention based training. On the other hand, variational attention retains most of the performance gain but with training speed comparable to neural attention." + }, + { + "title": "Weaver: Deep Co-Encoding of Questions and Documents for Machine Reading", + "abstract": "This paper aims at improving how machines can answer questions directly from text, with the focus of having models that can answer correctly multiple types of questions and from various types of texts, documents or even from large collections of them. To that end, we introduce the Weaver model that uses a new way to relate a question to a textual context by weaving layers of recurrent networks, with the goal of making as few assumptions as possible as to how the information from both question and context should be combined to form the answer. We show empirically on six datasets that Weaver performs well in multiple conditions. For instance, it produces solid results on the very popular SQuAD dataset (Rajpurkar et al., 2016), solves almost all bAbI tasks (Weston et al., 2015) and greatly outperforms state-of-the-art methods for open domain question answering from text (Chen et al., 2017)." + }, + { + "title": "Classical Structured Prediction Losses for Sequence to Sequence Learning", + "abstract": "There has been much recent work on training neural attention models at the sequence-level using either reinforcement learning-style methods or by optimizing the beam. 
In this paper, we survey a range of classical objective functions that have been widely used to train linear models for structured prediction and apply them to neural sequence to sequence models. Our experiments show that these losses can perform surprisingly well by slightly outperforming beam search optimization in a like for like setup. We also report new state of the art results on both IWSLT’14 German-English translation as well as Gigaword abstractive summarization. On the large WMT’14 English-French task, sequence-level training achieves 41.5 BLEU which is on par with the state of the art." + }, + { + "title": "Automatic differentiation in PyTorch", + "abstract": "In this article, we describe an automatic differentiation module of PyTorch — a library designed to enable rapid research on machine learning models. It builds upon a few projects, most notably Lua Torch, Chainer, and HIPS Autograd [4], and provides a high performance environment with easy access to automatic differentiation of models executed on different devices (CPU and GPU). To make prototyping easier, PyTorch does not follow the symbolic approach used in many other deep learning frameworks, but focuses on differentiation of purely imperative programs, with a focus on extensibility and low overhead. Note that this preprint is a draft of certain sections from an upcoming paper covering all PyTorch features." + }, + { + "title": "Learning to Align the Source Code to the Compiled Object Code", + "abstract": "We propose a new neural network architecture and use it for the task of statement-by-statement alignment of source code and its compiled object code. Our architecture learns the alignment between the two sequences – one being the translation of the other – by mapping each statement to a context-dependent representation vector and aligning such vectors using a grid of the two sequence domains. 
Our experiments include short C functions, both artificial and human-written, and show that our neural network architecture is able to predict the alignment with high accuracy, outperforming known baselines. We also demonstrate that our model is general and can learn to solve graph problems such as the Traveling Salesman Problem." + }, + { + "title": "Towards Neural Phrase-based Machine Translation", + "abstract": "In this paper, we present Neural Phrase-based Machine Translation (NPMT). Our method explicitly models the phrase structures in output sequences using Sleep-WAke Networks (SWAN), a recently proposed segmentation-based sequence modeling method. To mitigate the monotonic alignment requirement of SWAN, we introduce a new layer to perform (soft) local reordering of input sequences. Different from existing neural machine translation (NMT) approaches, NPMT does not use attention-based decoding mechanisms. Instead, it directly outputs phrases in a sequential order and can decode in linear time. Our experiments show that NPMT achieves superior performances on IWSLT 2014 German-English/English-German and IWSLT 2015 English-Vietnamese machine translation tasks compared with strong NMT baselines. We also observe that our method produces meaningful phrases in output languages." + }, + { + "title": "Attention is All you Need", + "abstract": "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. 
Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data." + }, + { + "title": "Convolutional Sequence to Sequence Learning", + "abstract": "The prevalent approach to sequence to sequence learning maps an input sequence to a variable length output sequence via recurrent neural networks. We introduce an architecture based entirely on convolutional neural networks. Compared to recurrent models, computations over all elements can be fully parallelized during training and optimization is easier since the number of non-linearities is fixed and independent of the input length. Our use of gated linear units eases gradient propagation and we equip each decoder layer with a separate attention module. We outperform the accuracy of the deep LSTM setup of Wu et al. (2016) on both WMT'14 English-German and WMT'14 English-French translation at an order of magnitude faster speed, both on GPU and CPU." + }, + { + "title": "Adversarial Neural Machine Translation", + "abstract": "In this paper, we study a new learning paradigm for Neural Machine Translation (NMT). Instead of maximizing the likelihood of the human translation as in previous works, we minimize the distinction between human translation and the translation given by an NMT model. To achieve this goal, inspired by the recent success of generative adversarial networks (GANs), we employ an adversarial training architecture and name it as Adversarial-NMT. 
In Adversarial-NMT, the training of the NMT model is assisted by an adversary, which is an elaborately designed Convolutional Neural Network (CNN). The goal of the adversary is to differentiate the translation result generated by the NMT model from that by human. The goal of the NMT model is to produce high quality translations so as to cheat the adversary. A policy gradient method is leveraged to co-train the NMT model and the adversary. Experimental results on English$\\rightarrow$French and German$\\rightarrow$English translation tasks show that Adversarial-NMT can achieve significantly better translation quality than several strong baselines." + }, + { + "title": "Parallel Multiscale Autoregressive Density Estimation", + "abstract": "PixelCNN achieves state-of-the-art results in density estimation for natural images. Although training is fast, inference is costly, requiring one network evaluation per pixel; O(N) for N pixels. This can be sped up by caching activations, but still involves generating each pixel sequentially. In this work, we propose a parallelized PixelCNN that allows more efficient inference by modeling certain pixel groups as conditionally independent. Our new PixelCNN model achieves competitive density estimation and orders of magnitude speedup - O(log N) sampling instead of O(N) - enabling the practical generation of 512x512 images. We evaluate the model on class-conditional image generation, text-to-image synthesis, and action-conditional video generation, showing that our model achieves the best results among non-pixel-autoregressive density models that allow efficient sampling." + }, + { + "title": "A Structured Self-attentive Sentence Embedding", + "abstract": "This paper proposes a new model for extracting an interpretable sentence embedding by introducing self-attention. Instead of using a vector, we use a 2-D matrix to represent the embedding, with each row of the matrix attending on a different part of the sentence. 
We also propose a self-attention mechanism and a special regularization term for the model. As a side effect, the embedding comes with an easy way of visualizing what specific parts of the sentence are encoded into the embedding. We evaluate our model on 3 different tasks: author profiling, sentiment classification, and textual entailment. Results show that our model yields a significant performance gain compared to other sentence embedding methods in all of the 3 tasks." + }, + { + "title": "Sequence Modeling via Segmentations", + "abstract": "Segmental structure is a common pattern in many types of sequences such as phrases in human languages. In this paper, we present a probabilistic model for sequences via their segmentations. The probability of a segmented sequence is calculated as the product of the probabilities of all its segments, where each segment is modeled using existing tools such as recurrent neural networks. Since the segmentation of a sequence is usually unknown in advance, we sum over all valid segmentations to obtain the final probability for the sequence. An efficient dynamic programming algorithm is developed for forward and backward computations without resorting to any approximation. We demonstrate our approach on text segmentation and speech recognition tasks. In addition to quantitative results, we also show that our approach can discover meaningful segments in their respective application contexts." + }, + { + "title": "PixelCNN++: Improving the PixelCNN with Discretized Logistic Mixture Likelihood and Other Modifications", + "abstract": "PixelCNNs are a recently proposed class of powerful generative models with tractable likelihood. Here we discuss our implementation of PixelCNNs which we make available at this https URL Our implementation contains a number of modifications to the original model that both simplify its structure and improve its performance. 
1) We use a discretized logistic mixture likelihood on the pixels, rather than a 256-way softmax, which we find to speed up training. 2) We condition on whole pixels, rather than R/G/B sub-pixels, simplifying the model structure. 3) We use downsampling to efficiently capture structure at multiple resolutions. 4) We introduce additional short-cut connections to further speed up optimization. 5) We regularize the model using dropout. Finally, we present state-of-the-art log likelihood results on CIFAR-10 to demonstrate the usefulness of these modifications." + }, + { + "title": "Language Modeling with Gated Convolutional Networks", + "abstract": "The pre-dominant approach to language modeling to date is based on recurrent neural networks. Their success on this task is often linked to their ability to capture unbounded context. In this paper we develop a finite context approach through stacked convolutions, which can be more efficient since they allow parallelization over sequential tokens. We propose a novel simplified gating mechanism that outperforms Oord et al (2016) and investigate the impact of key architectural decisions. The proposed approach achieves state-of-the-art on the WikiText-103 benchmark, even though it features long-term dependencies, as well as competitive results on the Google Billion Words benchmark. Our model reduces the latency to score a sentence by an order of magnitude compared to a recurrent baseline. To our knowledge, this is the first time a non-recurrent approach is competitive with strong recurrent models on these large scale language tasks." + }, + { + "title": "A Convolutional Encoder Model for Neural Machine Translation", + "abstract": "The prevalent approach to neural machine translation relies on bi-directional LSTMs to encode the source sentence. We present a faster and simpler architecture based on a succession of convolutional layers. 
This allows to encode the source sentence simultaneously compared to recurrent networks for which computation is constrained by temporal dependencies. On WMT’16 English-Romanian translation we achieve competitive accuracy to the state-of-the-art and on WMT’15 English-German we outperform several recently published results. Our models obtain almost the same accuracy as a very deep LSTM setup on WMT’14 English-French translation. We speed up CPU decoding by more than two times at the same or higher accuracy as a strong bi-directional LSTM." + }, + { + "title": "Neural Machine Translation in Linear Time", + "abstract": "We present a novel neural network for processing sequences. The ByteNet is a one-dimensional convolutional neural network that is composed of two parts, one to encode the source sequence and the other to decode the target sequence. The two network parts are connected by stacking the decoder on top of the encoder and preserving the temporal resolution of the sequences. To address the differing lengths of the source and the target, we introduce an efficient mechanism by which the decoder is dynamically unfolded over the representation of the encoder. The ByteNet uses dilation in the convolutional layers to increase its receptive field. The resulting network has two core properties: it runs in time that is linear in the length of the sequences and it sidesteps the need for excessive memorization. The ByteNet decoder attains state-of-the-art performance on character-level language modelling and outperforms the previous best results obtained with recurrent networks. The ByteNet also achieves state-of-the-art performance on character-to-character machine translation on the English-to-German WMT translation task, surpassing comparable neural translation models that are based on recurrent networks with attentional pooling and run in quadratic time. 
We find that the latent alignment structure contained in the representations reflects the expected alignment between the tokens." + }, + { + "title": "Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation", + "abstract": "Neural Machine Translation (NMT) is an end-to-end learning approach for automated translation, with the potential to overcome many of the weaknesses of conventional phrase-based translation systems. Unfortunately, NMT systems are known to be computationally expensive both in training and in translation inference. Also, most NMT systems have difficulty with rare words. These issues have hindered NMT's use in practical deployments and services, where both accuracy and speed are essential. In this work, we present GNMT, Google's Neural Machine Translation system, which attempts to address many of these issues. Our model consists of a deep LSTM network with 8 encoder and 8 decoder layers using attention and residual connections. To improve parallelism and therefore decrease training time, our attention mechanism connects the bottom layer of the decoder to the top layer of the encoder. To accelerate the final translation speed, we employ low-precision arithmetic during inference computations. To improve handling of rare words, we divide words into a limited set of common sub-word units (\"wordpieces\") for both input and output. This method provides a good balance between the flexibility of \"character\"-delimited models and the efficiency of \"word\"-delimited models, naturally handles translation of rare words, and ultimately improves the overall accuracy of the system. Our beam search technique employs a length-normalization procedure and uses a coverage penalty, which encourages generation of an output sentence that is most likely to cover all the words in the source sentence. On the WMT'14 English-to-French and English-to-German benchmarks, GNMT achieves competitive results to state-of-the-art. 
Using a human side-by-side evaluation on a set of isolated simple sentences, it reduces translation errors by an average of 60% compared to Google's phrase-based production system." + }, + { + "title": "WaveNet: A Generative Model for Raw Audio", + "abstract": "This paper introduces WaveNet, a deep neural network for generating raw audio waveforms. The model is fully probabilistic and autoregressive, with the predictive distribution for each audio sample conditioned on all previous ones; nonetheless we show that it can be efficiently trained on data with tens of thousands of samples per second of audio. When applied to text-to-speech, it yields state-of-the-art performance, with human listeners rating it as significantly more natural sounding than the best parametric and concatenative systems for both English and Mandarin. A single WaveNet can capture the characteristics of many different speakers with equal fidelity, and can switch between them by conditioning on the speaker identity. When trained to model music, we find that it generates novel and often highly realistic musical fragments. We also show that it can be employed as a discriminative model, returning promising results for phoneme recognition." + }, + { + "title": "Densely Connected Convolutional Networks", + "abstract": "Recent work has shown that convolutional networks can be substantially deeper, more accurate, and efficient to train if they contain shorter connections between layers close to the input and those close to the output. In this paper, we embrace this observation and introduce the Dense Convolutional Network (DenseNet), which connects each layer to every other layer in a feed-forward fashion. Whereas traditional convolutional networks with L layers have L connections—one between each layer and its subsequent layer—our network has L(L+1)/2 direct connections. 
For each layer, the feature-maps of all preceding layers are used as inputs, and its own feature-maps are used as inputs into all subsequent layers. DenseNets have several compelling advantages: they alleviate the vanishing-gradient problem, strengthen feature propagation, encourage feature reuse, and substantially reduce the number of parameters. We evaluate our proposed architecture on four highly competitive object recognition benchmark tasks (CIFAR-10, CIFAR-100, SVHN, and ImageNet). DenseNets obtain significant improvements over the state-of-the-art on most of them, whilst requiring less memory and computation to achieve high performance. Code and pre-trained models are available at https://github.com/liuzhuang13/DenseNet." + }, + { + "title": "An Actor-Critic Algorithm for Sequence Prediction", + "abstract": "We present an approach to training neural networks to generate sequences using actor-critic methods from reinforcement learning (RL). Current log-likelihood training methods are limited by the discrepancy between their training and testing modes, as models must generate tokens conditioned on their previous guesses rather than the ground-truth tokens. We address this problem by introducing a \\textit{critic} network that is trained to predict the value of an output token, given the policy of an \\textit{actor} network. This results in a training procedure that is much closer to the test phase, and allows us to directly optimize for a task-specific score such as BLEU. Crucially, since we leverage these techniques in the supervised learning setting rather than the traditional RL setting, we condition the critic network on the ground-truth output. We show that our method leads to improved performance on both a synthetic task, and for German-English machine translation. Our analysis paves the way for such methods to be applied in natural language generation tasks, such as machine translation, caption generation, and dialogue modelling." 
+ }, + { + "title": "Conditional Image Generation with PixelCNN Decoders", + "abstract": "This work explores conditional image generation with a new image density model based on the PixelCNN architecture. The model can be conditioned on any vector, including descriptive labels or tags, or latent embeddings created by other networks. When conditioned on class labels from the ImageNet database, the model is able to generate diverse, realistic scenes representing distinct animals, objects, landscapes and structures. When conditioned on an embedding produced by a convolutional network given a single image of an unseen face, it generates a variety of new portraits of the same person with different facial expressions, poses and lighting conditions. We also show that conditional PixelCNN can serve as a powerful decoder in an image autoencoder. Additionally, the gated convolutional layers in the proposed model improve the log-likelihood of PixelCNN to match the state-of-the-art performance of PixelRNN on ImageNet, with greatly reduced computational cost." + }, + { + "title": "A Decomposable Attention Model for Natural Language Inference", + "abstract": "We propose a simple neural architecture for natural language inference. Our approach uses attention to decompose the problem into subproblems that can be solved separately, thus making it trivially parallelizable. On the Stanford Natural Language Inference (SNLI) dataset, we obtain state-of-the-art results with almost an order of magnitude fewer parameters than previous work and without relying on any word-order information. Adding intra-sentence attention that takes a minimum amount of order into account yields further improvements." + }, + { + "title": "Pairwise Word Interaction Modeling with Deep Neural Networks for Semantic Similarity Measurement", + "abstract": "Textual similarity measurement is a challenging problem, as it requires understanding the semantics of input sentences. 
Most previous neural network models use coarse-grained sentence modeling, which has difficulty capturing fine-grained word-level information for semantic comparisons. As an alternative, we propose to explicitly model pairwise word interactions and present a novel similarity focus mechanism to identify important correspondences for better similarity measurement. Our ideas are implemented in a novel neural network architecture that demonstrates state-of-the-art accuracy on three SemEval tasks and two answer selection tasks." + }, + { + "title": "Pixel Recurrent Neural Networks", + "abstract": "Modeling the distribution of natural images is a landmark problem in unsupervised learning. This task requires an image model that is at once expressive, tractable and scalable. We present a deep neural network that sequentially predicts the pixels in an image along the two spatial dimensions. Our method models the discrete probability of the raw pixel values and encodes the complete set of dependencies in the image. Architectural novelties include fast two-dimensional recurrent layers and an effective use of residual connections in deep recurrent networks. We achieve log-likelihood scores on natural images that are considerably better than the previous state of the art. Our main results also provide benchmarks on the diverse ImageNet dataset. Samples generated from the model appear crisp, varied and globally coherent." + }, + { + "title": "A Deep Architecture for Semantic Matching with Multiple Positional Sentence Representations", + "abstract": "\n \n Matching natural language sentences is central for many applications such as information retrieval and question answering. Existing deep models rely on a single sentence representation or multiple granularity representations for matching. However, such methods cannot well capture the contextualized local information in the matching process. 
To tackle this problem, we present a new deep architecture to match two sentences with multiple positional sentence representations. Specifically, each positional sentence representation is a sentence representation at this position, generated by a bidirectional long short term memory (Bi-LSTM). The matching score is finally produced by aggregating interactions between these different positional sentence representations, through k-Max pooling and a multi-layer perceptron. Our model has several advantages: (1) By using Bi-LSTM, rich context of the whole sentence is leveraged to capture the contextualized local information in each positional sentence representation; (2) By matching with multiple positional sentence representations, it is flexible to aggregate different important contextualized local information in a sentence to support the matching; (3) Experiments on different tasks such as question answering and sentence completion demonstrate the superiority of our model.\n \n" + }, + { + "title": "Sequence Level Training with Recurrent Neural Networks", + "abstract": "Many natural language processing applications use language models to generate text. These models are typically trained to predict the next word in a sequence, given the previous words and some context such as an image. However, at test time the model is expected to generate the entire sequence from scratch. This discrepancy makes generation brittle, as errors may accumulate along the way. We address this issue by proposing a novel sequence level training algorithm that directly optimizes the metric used at test time, such as BLEU or ROUGE. On three different tasks, our approach outperforms several strong baselines for greedy generation. The method is also competitive when these baselines employ beam search, while being several times faster." 
+ }, + { + "title": "Neural Machine Translation of Rare Words with Subword Units", + "abstract": "Neural machine translation (NMT) models typically operate with a fixed vocabulary, but translation is an open-vocabulary problem. Previous work addresses the translation of out-of-vocabulary words by backing off to a dictionary. In this paper, we introduce a simpler and more effective approach, making the NMT model capable of open-vocabulary translation by encoding rare and unknown words as sequences of subword units. This is based on the intuition that various word classes are translatable via smaller units than words, for instance names (via character copying or transliteration), compounds (via compositional translation), and cognates and loanwords (via phonological and morphological transformations). We discuss the suitability of different word segmentation techniques, including simple character n-gram models and a segmentation based on the byte pair encoding compression algorithm, and empirically show that subword models improve over a back-off dictionary baseline for the WMT 15 translation tasks English-German and English-Russian by 1.1 and 1.3 BLEU, respectively." + }, + { + "title": "Effective Approaches to Attention-based Neural Machine Translation", + "abstract": "An attentional mechanism has lately been used to improve neural machine translation (NMT) by selectively focusing on parts of the source sentence during translation. However, there has been little work exploring useful architectures for attention-based NMT. This paper examines two simple and effective classes of attentional mechanism: a global approach which always attends to all source words and a local one that only looks at a subset of source words at a time. We demonstrate the effectiveness of both approaches on the WMT translation tasks between English and German in both directions. 
With local attention, we achieve a significant gain of 5.0 BLEU points over non-attentional systems that already incorporate known techniques such as dropout. Our ensemble model using different attention architectures yields a new state-of-the-art result in the WMT’15 English to German translation task with 25.9 BLEU points, an improvement of 1.0 BLEU points over the existing best system backed by NMT and an n-gram reranker." + }, + { + "title": "Grid Long Short-Term Memory", + "abstract": "This paper introduces Grid Long Short-Term Memory, a network of LSTM cells arranged in a multidimensional grid that can be applied to vectors, sequences or higher dimensional data such as images. The network differs from existing deep LSTM architectures in that the cells are connected between network layers as well as along the spatiotemporal dimensions of the data. The network provides a unified way of using LSTM for both deep and sequential computation. We apply the model to algorithmic tasks such as 15-digit integer addition and sequence memorization, where it is able to significantly outperform the standard LSTM. We then give results for two empirical tasks. We find that 2D Grid LSTM achieves 1.47 bits per character on the Wikipedia character prediction benchmark, which is state-of-the-art among neural approaches. In addition, we use the Grid LSTM to define a novel two-dimensional translation model, the Reencoder, and show that it outperforms a phrase-based reference system on a Chinese-to-English translation task." + }, + { + "title": "Encoding Source Language with Convolutional Neural Network for Machine Translation", + "abstract": "The recently proposed neural network joint model (NNJM) (Devlin et al., 2014) augments the n-gram target language model with a heuristically chosen source context window, achieving state-of-the-art performance in SMT. 
In this paper, we give a more systematic treatment by summarizing the relevant source information through a convolutional architecture guided by the target information. With different guiding signals during decoding, our specifically designed convolution+gating architectures can pinpoint the parts of a source sentence that are relevant to predicting a target word, and fuse them with the context of entire source sentence to form a unified representation. This representation, together with target language words, are fed to a deep neural network (DNN) to form a stronger NNJM. Experiments on two NIST Chinese-English translation tasks show that the proposed model can achieve significant improvements over the previous NNJM by up to +1.08 BLEU points on average" + }, + { + "title": "Show, Attend and Tell: Neural Image Caption Generation with Visual Attention", + "abstract": "Inspired by recent work in machine translation and object detection, we introduce an attention based model that automatically learns to describe the content of images. We describe how we can train this model in a deterministic manner using standard backpropagation techniques and stochastically by maximizing a variational lower bound. We also show through visualization how the model is able to automatically learn to fix its gaze on salient objects while generating the corresponding words in the output sequence. We validate the use of attention with state-of-the-art performance on three benchmark datasets: Flickr9k, Flickr30k and MS COCO." + }, + { + "title": "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift", + "abstract": "Training Deep Neural Networks is complicated by the fact that the distribution of each layer's inputs changes during training, as the parameters of the previous layers change. 
This slows down the training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. We refer to this phenomenon as internal covariate shift, and address the problem by normalizing layer inputs. Our method draws its strength from making normalization a part of the model architecture and performing the normalization for each training mini-batch. Batch Normalization allows us to use much higher learning rates and be less careful about initialization, and in some cases eliminates the need for Dropout. Applied to a state-of-the-art image classification model, Batch Normalization achieves the same accuracy with 14 times fewer training steps, and beats the original model by a significant margin. Using an ensemble of batch-normalized networks, we improve upon the best published result on ImageNet classification: reaching 4.82% top-5 test error, exceeding the accuracy of human raters." + }, + { + "title": "Adam: A Method for Stochastic Optimization", + "abstract": "We introduce Adam, an algorithm for first-order gradient-based optimization of stochastic objective functions, based on adaptive estimates of lower-order moments. The method is straightforward to implement, is computationally efficient, has little memory requirements, is invariant to diagonal rescaling of the gradients, and is well suited for problems that are large in terms of data and/or parameters. The method is also appropriate for non-stationary objectives and problems with very noisy and/or sparse gradients. The hyper-parameters have intuitive interpretations and typically require little tuning. Some connections to related algorithms, on which Adam was inspired, are discussed. We also analyze the theoretical convergence properties of the algorithm and provide a regret bound on the convergence rate that is comparable to the best known results under the online convex optimization framework. 
Empirical results demonstrate that Adam works well in practice and compares favorably to other stochastic optimization methods. Finally, we discuss AdaMax, a variant of Adam based on the infinity norm." + }, + { + "title": "Convolutional Neural Network Architectures for Matching Natural Language Sentences", + "abstract": "Semantic matching is of central importance to many natural language tasks [2,28]. A successful matching algorithm needs to adequately model the internal structures of language objects and the interaction between them. As a step toward this goal, we propose convolutional neural network models for matching two sentences, by adapting the convolutional strategy in vision and speech. The proposed models not only nicely represent the hierarchical structures of sentences with their layer-by-layer composition and pooling, but also capture the rich matching patterns at different levels. Our models are rather generic, requiring no prior knowledge on language, and can hence be applied to matching tasks of different nature and in different languages. The empirical study on a variety of matching tasks demonstrates the efficacy of the proposed model on a variety of matching tasks and its superiority to competitor models." + }, + { + "title": "On Using Very Large Target Vocabulary for Neural Machine Translation", + "abstract": "Neural machine translation, a recently proposed approach to machine translation based purely on neural networks, has shown promising results compared to the existing approaches such as phrase-based statistical machine translation. Despite its recent success, neural machine translation has its limitation in handling a larger vocabulary, as training complexity as well as decoding complexity increase proportionally to the number of target words. In this paper, we propose a method based on importance sampling that allows us to use a very large target vocabulary without increasing training complexity. 
We show that decoding can be efficiently done even with the model having a very large target vocabulary by selecting only a small subset of the whole target vocabulary. The models trained by the proposed approach are empirically found to outperform the baseline models with a small vocabulary as well as the LSTM-based neural machine translation models. Furthermore, when we use the ensemble of a few models with very large target vocabularies, we achieve the state-of-the-art translation performance (measured by BLEU) on the English→German translation and almost as high performance as state-of-the-art English→French translation system." + }, + { + "title": "Sequence to Sequence Learning with Neural Networks", + "abstract": "Deep Neural Networks (DNNs) are powerful models that have achieved excellent performance on difficult learning tasks. Although DNNs work well whenever large labeled training sets are available, they cannot be used to map sequences to sequences. In this paper, we present a general end-to-end approach to sequence learning that makes minimal assumptions on the sequence structure. Our method uses a multilayered Long Short-Term Memory (LSTM) to map the input sequence to a vector of a fixed dimensionality, and then another deep LSTM to decode the target sequence from the vector. Our main result is that on an English to French translation task from the WMT-14 dataset, the translations produced by the LSTM achieve a BLEU score of 34.8 on the entire test set, where the LSTM's BLEU score was penalized on out-of-vocabulary words. Additionally, the LSTM did not have difficulty on long sentences. For comparison, a phrase-based SMT system achieves a BLEU score of 33.3 on the same dataset. When we used the LSTM to rerank the 1000 hypotheses produced by the aforementioned SMT system, its BLEU score increases to 36.5, which is close to the previous state of the art. 
The LSTM also learned sensible phrase and sentence representations that are sensitive to word order and are relatively invariant to the active and the passive voice. Finally, we found that reversing the order of the words in all source sentences (but not target sentences) improved the LSTM's performance markedly, because doing so introduced many short term dependencies between the source and the target sentence which made the optimization problem easier." + }, + { + "title": "Neural Machine Translation by Jointly Learning to Align and Translate", + "abstract": "Neural machine translation is a recently proposed approach to machine translation. Unlike the traditional statistical machine translation, the neural machine translation aims at building a single neural network that can be jointly tuned to maximize the translation performance. The models proposed recently for neural machine translation often belong to a family of encoder-decoders and consists of an encoder that encodes a source sentence into a fixed-length vector from which a decoder generates a translation. In this paper, we conjecture that the use of a fixed-length vector is a bottleneck in improving the performance of this basic encoder-decoder architecture, and propose to extend this by allowing a model to automatically (soft-)search for parts of a source sentence that are relevant to predicting a target word, without having to form these parts as a hard segment explicitly. With this new approach, we achieve a translation performance comparable to the existing state-of-the-art phrase-based system on the task of English-to-French translation. Furthermore, qualitative analysis reveals that the (soft-)alignments found by the model agree well with our intuition." 
+ }, + { + "title": "Convolutional Neural Networks for Sentence Classification", + "abstract": "We report on a series of experiments with convolutional neural networks (CNN) trained on top of pre-trained word vectors for sentence-level classification tasks. We show that a simple CNN with little hyperparameter tuning and static vectors achieves excellent results on multiple benchmarks. Learning task-specific vectors through fine-tuning offers further gains in performance. We additionally propose a simple modification to the architecture to allow for the use of both task-specific and static vectors. The CNN models discussed herein improve upon the state of the art on 4 out of 7 tasks, which include sentiment analysis and question classification." + }, + { + "title": "Learning Phrase Representations using RNN Encoder–Decoder for Statistical Machine Translation", + "abstract": "In this paper, we propose a novel neural network model called RNN Encoder‐ Decoder that consists of two recurrent neural networks (RNN). One RNN encodes a sequence of symbols into a fixedlength vector representation, and the other decodes the representation into another sequence of symbols. The encoder and decoder of the proposed model are jointly trained to maximize the conditional probability of a target sequence given a source sequence. The performance of a statistical machine translation system is empirically found to improve by using the conditional probabilities of phrase pairs computed by the RNN Encoder‐Decoder as an additional feature in the existing log-linear model. Qualitatively, we show that the proposed model learns a semantically and syntactically meaningful representation of linguistic phrases." + }, + { + "title": "A Convolutional Neural Network for Modelling Sentences", + "abstract": "The ability to accurately represent sentences is central to language understanding. 
We describe a convolutional architecture dubbed the Dynamic Convolutional Neural Network (DCNN) that we adopt for the semantic modelling of sentences. The network uses Dynamic k-Max Pooling, a global pooling operation over linear sequences. The network handles input sentences of varying length and induces a feature graph over the sentence that is capable of explicitly capturing short and long-range relations. The network does not rely on a parse tree and is easily applicable to any language. We test the DCNN in four experiments: small scale binary and multi-class sentiment prediction, six-way question classification and Twitter sentiment prediction by distant supervision. The network achieves excellent performance in the first three tasks and a greater than 25% error reduction in the last task with respect to the strongest baseline." + }, + { + "title": "Recurrent Continuous Translation Models", + "abstract": "We introduce a class of probabilistic continuous translation models called Recurrent Continuous Translation Models that are purely based on continuous representations for words, phrases and sentences and do not rely on alignments or phrasal translation units. The models have a generation and a conditioning aspect. The generation of the translation is modelled with a target Recurrent Language Model, whereas the conditioning on the source sentence is modelled with a Convolutional Sentence Model. Through various experiments, we show first that our models obtain a perplexity with respect to gold translations that is > 43% lower than that of stateof-the-art alignment-based translation models. Secondly, we show that they are remarkably sensitive to the word order, syntax, and meaning of the source sentence despite lacking alignments. Finally we show that they match a state-of-the-art system when rescoring n-best lists of translations." 
+ }, + { + "title": "Sequence Transduction with Recurrent Neural Networks", + "abstract": "Many machine learning tasks can be expressed as the transformation---or \\emph{transduction}---of input sequences into output sequences: speech recognition, machine translation, protein secondary structure prediction and text-to-speech to name but a few. One of the key challenges in sequence transduction is learning to represent both the input and output sequences in a way that is invariant to sequential distortions such as shrinking, stretching and translating. Recurrent neural networks (RNNs) are a powerful sequence learning architecture that has proven capable of learning such representations. However RNNs traditionally require a pre-defined alignment between the input and output sequences to perform transduction. This is a severe limitation since \\emph{finding} the alignment is the most difficult aspect of many sequence transduction problems. Indeed, even determining the length of the output sequence is often challenging. This paper introduces an end-to-end, probabilistic sequence transduction system, based entirely on RNNs, that is in principle able to transform any input sequence into any finite, discrete output sequence. Experimental results for phoneme recognition are provided on the TIMIT speech corpus." + }, + { + "title": "Rectified Linear Units Improve Restricted Boltzmann Machines", + "abstract": "Restricted Boltzmann machines were developed using binary stochastic hidden units. These can be generalized by replacing each binary unit by an infinite number of copies that all have the same weights but have progressively more negative biases. The learning and inference rules for these \"Stepped Sigmoid Units\" are unchanged. They can be approximated efficiently by noisy, rectified linear units. 
Compared with binary units, these units learn features that are better for object recognition on the NORB dataset and face verification on the Labeled Faces in the Wild dataset. Unlike binary units, rectified linear units preserve information about relative intensities as information travels through multiple layers of feature detectors." + }, + { + "title": "A unified architecture for natural language processing: deep neural networks with multitask learning", + "abstract": "We describe a single convolutional neural network architecture that, given a sentence, outputs a host of language processing predictions: part-of-speech tags, chunks, named entity tags, semantic roles, semantically similar words and the likelihood that the sentence makes sense (grammatically and semantically) using a language model. The entire network is trained jointly on all these tasks using weight-sharing, an instance of multitask learning. All the tasks use labeled data except the language model which is learnt from unlabeled text and represents a novel form of semi-supervised learning for the shared tasks. We show how both multitask learning and semi-supervised learning improve the generalization of the shared tasks, resulting in state-of-the-art performance." + }, + { + "title": "Moses: Open Source Toolkit for Statistical Machine Translation", + "abstract": "We describe an open-source toolkit for statistical machine translation whose novel contributions are (a) support for linguistically motivated factors, (b) confusion network decoding, and (c) efficient data formats for translation models and language models. In addition to the SMT decoder, the toolkit also includes a wide variety of tools for training, tuning and applying the system to many translation tasks." + }, + { + "title": "Bleu: a Method for Automatic Evaluation of Machine Translation", + "abstract": "Human evaluations of machine translation are extensive but expensive. 
Human evaluations can take months to finish and involve human labor that can not be reused. We propose a method of automatic machine translation evaluation that is quick, inexpensive, and language-independent, that correlates highly with human evaluation, and that has little marginal cost per run. We present this method as an automated understudy to skilled human judges which substitutes for them when there is need for quick or frequent evaluations." + }, + { + "title": "Bidirectional recurrent neural networks", + "abstract": "In the first part of this paper, a regular recurrent neural network (RNN) is extended to a bidirectional recurrent neural network (BRNN). The BRNN can be trained without the limitation of using input information just up to a preset future frame. This is accomplished by training it simultaneously in positive and negative time direction. Structure and training procedure of the proposed network are explained. In regression and classification experiments on artificial data, the proposed structure gives better results than other approaches. For real data, classification experiments for phonemes from the TIMIT database show the same tendency. In the second part of this paper, it is shown how the proposed bidirectional structure can be easily modified to allow efficient estimation of the conditional posterior probability of complete symbol sequences without making any explicit assumption about the shape of the distribution. For this part, experiments on real data are reported." + }, + { + "title": "Long Short-Term Memory", + "abstract": "Learning to store information over extended time intervals by recurrent backpropagation takes a very long time, mostly because of insufficient, decaying error backflow. We briefly review Hochreiter's (1991) analysis of this problem, then address it by introducing a novel, efficient, gradient based method called long short-term memory (LSTM). 
Truncating the gradient where this does not do harm, LSTM can learn to bridge minimal time lags in excess of 1000 discrete-time steps by enforcing constant error flow through constant error carousels within special units. Multiplicative gate units learn to open and close access to the constant error flow. LSTM is local in space and time; its computational complexity per time step and weight is O. 1. Our experiments with artificial data involve local, distributed, real-valued, and noisy pattern representations. In comparisons with real-time recurrent learning, back propagation through time, recurrent cascade correlation, Elman nets, and neural sequence chunking, LSTM leads to many more successful runs, and learns much faster. LSTM also solves complex, artificial long-time-lag tasks that have never been solved by previous recurrent network algorithms." + }, + { + "title": "Deep Learning", + "abstract": null + }, + { + "title": "Dropout: a simple way to prevent neural networks from overfitting", + "abstract": "Deep neural nets with a large number of parameters are very powerful machine learning systems. However, overfitting is a serious problem in such networks. Large networks are also slow to use, making it difficult to deal with overfitting by combining the predictions of many different large neural nets at test time. Dropout is a technique for addressing this problem. The key idea is to randomly drop units (along with their connections) from the neural network during training. This prevents units from co-adapting too much. During training, dropout samples from an exponential number of different \"thinned\" networks. At test time, it is easy to approximate the effect of averaging the predictions of all these thinned networks by simply using a single unthinned network that has smaller weights. This significantly reduces overfitting and gives major improvements over other regularization methods. 
We show that dropout improves the performance of neural networks on supervised learning tasks in vision, speech recognition, document classification and computational biology, obtaining state-of-the-art results on many benchmark data sets." + }, + { + "title": "Report on the 11th IWSLT evaluation campaign", + "abstract": "The paper overviews the 11th evaluation campaign organized by the IWSLT workshop. The 2014 evaluation offered multiple tracks on lecture transcription and translation based on the TED Talks corpus. In particular, this year IWSLT included three automatic speech recognition tracks, on English, German and Italian, five speech translation tracks, from English to French, English to German, German to English, English to Italian, and Italian to English, and five text translation track, also from English to French, English to German, German to English, English to Italian, and Italian to English. In addition to the official tracks, speech and text translation optional tracks were offered, globally involving 12 other languages: Arabic, Spanish, Portuguese (B), Hebrew, Chinese, Polish, Persian, Slovenian, Turkish, Dutch, Romanian, Russian. Overall, 21 teams participated in the evaluation, for a total of 76 primary runs submitted. Participants were also asked to submit runs on the 2013 test set (progress test set), in order to measure the progress of systems with respect to the previous year. All runs were evaluated with objective metrics, and submissions for two of the official text translation tracks were also evaluated with human post-editing." + } + ] + }, + "author_data": { + "13b7b523-05c5-42e6-91e2-7cddf7af52a1": { + "pk": "13b7b523-05c5-42e6-91e2-7cddf7af52a1", + "project_name": null, + "name": "Jakob Verbeek", + "bio": "I am a researcher with a strong focus on advancing the field of computer vision through innovative neural network architectures. 
My work has primarily revolved around extending the capabilities of convolutional neural networks (CNNs) to non-traditional data types, such as 3D shape meshes and graph-structured data. I developed novel graph-convolutional network architectures that dynamically compute filter shapes based on learned features, achieving state-of-the-art results in tasks like shape correspondence and digit recognition.\n\nIn addition to my contributions to graph-based models, I have explored the concept of a \"fabric\" of architectures, which allows for the embedding of numerous architectures within a single framework, enabling efficient training and competitive performance across various tasks. My research also delves into the challenges of weakly supervised learning, where I have proposed methods that leverage multiple-instance learning to improve object localization and image classification.\n\nI am particularly passionate about addressing the limitations of existing models in real-world applications, such as face recognition in uncontrolled settings, where I introduced techniques to generate synthetic training datasets. My work on attention-based models for automatic image captioning and adversarial training for semantic segmentation further exemplifies my commitment to pushing the boundaries of what is possible in visual recognition.\n\nOverall, my research aims to bridge the gap between theoretical advancements and practical applications, contributing to the development of robust, efficient, and scalable models that can tackle complex visual recognition challenges.", + "collaborators": [ + "C. Schmid", + "Dan Oneaţă", + "Thomas Lucas", + "S. Saxena", + "Matthijs Douze", + "Danila Potapov", + "Zaïd Harchaoui", + "Nitika Verma", + "Edmond Boyer", + "M. Pedersoli", + "Robin Aly", + "Relja Arandjelović", + "Ken Chatfield", + "N. O’Connor", + "Jérôme Revaud", + "T. 
Tuytelaars", + "Heng Wang", + "Andrew Zisserman", + "Thomas Mensink", + "Guosheng Hu", + "Xiaojiang Peng", + "Yongxin Yang", + "Timothy M. Hospedales", + "Pauline Luc", + "C. Couprie", + "Soumith Chintala", + "R. G. Cinbis", + "Mattis Paulin", + "Clement Leray", + "N. Chesneau", + "Alahari Karteek", + "L. Lamel", + "J. Gauvain", + "C. Schmidt", + "Basura Fernando", + "Kevin McGuinness", + "Omar M. Parkhi", + "J. Schwenninger", + "David Scott", + "Schwenninger Jochen", + "K. Mcguiness", + "Shu Chen", + "Omkar M. Parkhi", + "Fernando Basura", + "Florent Perronnin", + "Oksana Yakhnenko", + "G. Csurka", + "Josip Krapac", + "F. Jurie" + ], + "pub_titles": [ + "Dynamic Filters in Graph Convolutional Networks", + "Areas of Attention for Image Captioning — Supplementary Material —", + "FeaStNet: Feature-Steered Graph Convolutions for 3D Shape Analysis", + "Convolutional Neural Fabrics", + "Machine learning solutions to visual recognition problems", + "Frankenstein: Learning Deep Face Representations Using Small Data", + "Areas of Attention for Image Captioning", + "Semantic Segmentation using Adversarial Networks", + "Weakly Supervised Object Localization with Multi-Fold Multiple Instance Learning", + "Coordinated Local Metric Learning", + "The LEAR submission at Thumos 2014", + "The INRIA-LIM-VocR and AXES submissions to TrecVid 2014 Multimedia Event Detection", + "AXES at TRECVid 2013", + "AXES at TRECVID 2012: KIS, INS, and MED", + "Region-Based Image Classification with a Latent SVM Model", + "Weighted Transmedia Relevance Feedback for Image Retrieval and Auto-annotation", + "Learning Tree-structured Quantizers for Image Categorization" + ], + "pub_abstracts": [ + "Convolutional neural networks (CNNs) have massively impacted visual recognition in 2D images, and are now ubiquitous in state-of-the-art approaches. 
While CNNs naturally extend to other domains, such as audio and video, where data is also organized in rectangular grids, they do not easily generalize to other types of data such as 3D shape meshes, social network graphs or molecular graphs. To handle such data, we propose a novel graph-convolutional network architecture that builds on a generic formulation that relaxes the 1-to-1 correspondence between filter weights and data elements around the center of the convolution. The main novelty of our architecture is that the shape of the filter is a function of the features in the previous network layer, which is learned as an integral part of the neural network. Experimental evaluations on digit recognition, semi-supervised document classification, and 3D shape correspondence yield state-of-the-art results, significantly improving over previous work for shape correspondence.", + "For sake of brevity we reported only the three metrics that are most commonly used in the recent captioning literature in the main paper, the same three as in e.g . [1, 8, 11]. While the evaluation of caption quality remains a challenging issue, the CIDEr-D metric [7] is generally considered to be correlating the best to human judgement. The BLEU metrics [6] are based on N-gram matching statistics. In particular, the BLEU1 metric completely disregards word ordering, and is thus of little interest to measure sentence quality. From the BLEU measures, BLEU4 (based on 4-grams) is most commonly used [7]. In tables 1, 2, and 3 provide the evaluation results including the BLEU 1–3 metrics. The conclusion of the comparisons among the variants of our model and to the state of the art remain unchanged. The tables here correspond to those with the same numbers in the main paper. 
We refer to the main paper for a full description of the experimental setup.", + "Convolutional neural networks (CNNs) have massively impacted visual recognition in 2D images, and are now ubiquitous in state-of-the-art approaches. CNNs do not easily extend, however, to data that are not represented by regular grids, such as 3D shape meshes or other graph-structured data, to which traditional local convolution operators do not directly apply. To address this problem, we propose a novel graph-convolution operator to establish correspondences between filter weights and graph neighborhoods with arbitrary connectivity. The key novelty of our approach is that these correspondences are dynamically computed from features learned by the network, rather than relying on predefined static coordinates over the graph as in previous work. We obtain excellent experimental results that significantly improve over previous state-of-the-art shape correspondence results. This shows that our approach can learn effective shape representations from raw input coordinates, without relying on shape descriptors.", + "Despite the success of CNNs, selecting the optimal architecture for a given task remains an open problem. Instead of aiming to select a single optimal architecture, we propose a \"fabric\" that embeds an exponentially large number of architectures. The fabric consists of a 3D trellis that connects response maps at different layers, scales, and channels with a sparse homogeneous local connectivity pattern. The only hyper-parameters of a fabric are the number of channels and layers. While individual architectures can be recovered as paths, the fabric can in addition ensemble all embedded architectures together, sharing their weights where their paths overlap. Parameters can be learned using standard methods based on back-propagation, at a cost that scales linearly in the fabric size. 
We present benchmark results competitive with the state of the art for image classification on MNIST and CIFAR10, and for semantic segmentation on the Part Labels dataset.", + "This thesis gives an overview of my research since my arrival in December 2005 as a postdoctoral fellow at the in the LEAR team at INRIA Rhone-Alpes. After a general introduction in Chapter 1, the contributions are presented in chapters 2–4 along three themes. In each chapter we describe the contributions, their relation to related work, and highlight two contributions with more detail. Chapter 2 is concerned with contributions related to the Fisher vector representation. We highlight an extension of the representation based on modeling dependencies among local descriptors (Cinbis et al., 2012, 2016a). The second highlight is on an approximate normalization scheme which speeds-up applications for object and action localization (Oneata et al., 2014b). In Chapter 3 we consider the contributions related to metric learning. The first contribution we highlight is a nearest-neighbor based image annotation method that learns weights over neighbors, and effectively determines the number of neighbors to use (Guillaumin et al., 2009a). The second contribution we highlight is an image classification method based on metric learning for the nearest class mean classifier that can efficiently generalize to new classes (Mensink et al., 2012, 2013b). The third set of contributions, presented in Chapter 4, is related to learning visual recognition models from incomplete supervision. The first highlighted contribution is an interactive image annotation method that exploits dependencies across different image labels, to improve predictions and to identify the most informative user input (Mensink et al., 2011, 2013a). 
The second highlighted contribution is a multi-fold multiple instance learning method for learning object localization models from training images where we only know if the object is present in the image or not (Cinbis et al., 2014, 2016b). Finally, Chapter 5 summarizes the contributions, and presents future research directions.", + "Deep convolutional neural networks have recently proven extremely effective for difficult face recognition problems in uncontrolled settings. To train such networks, very large training sets are needed with millions of labeled images. For some applications, such as near-infrared (NIR) face recognition, such large training data sets are not publicly available and difficult to collect. In this paper, we propose a method to generate very large training data sets of synthetic images by compositing real face images in a given data set. We show that this method enables to learn models from as few as 10 000 training images, which perform on par with models trained from 500 000 images. Using our approach, we also obtain state-of-the-art results on the CASIA NIR-VIS2.0 heterogeneous face recognition data set.", + "We propose “Areas of Attention”, a novel attentionbased model for automatic image captioning. Our approach models the dependencies between image regions, caption words, and the state of an RNN language model, using three pairwise interactions. In contrast to previous attentionbased approaches that associate image regions only to the RNN state, our method allows a direct association between caption words and image regions. During training these associations are inferred from image-level captions, akin to weakly-supervised object detector training. These associations help to improve captioning by localizing the corresponding regions during testing. We also propose and compare different ways of generating attention areas: CNN activation grids, object proposals, and spatial transformers nets applied in a convolutional fashion. 
Spatial transformers give the best results. They allow for image specific attention areas, and can be trained jointly with the rest of the network. Our attention mechanism and spatial transformer attention areas together yield state-of-the-art results on the MSCOCO dataset.", + "Adversarial training has been shown to produce state of the art results for generative image modeling. In this paper we propose an adversarial training approach to train semantic segmentation models. We train a convolutional semantic segmentation network along with an adversarial network that discriminates segmentation maps coming either from the ground truth or from the segmentation network. The motivation for our approach is that it can detect and correct higher-order inconsistencies between ground truth segmentation maps and the ones produced by the segmentation net. Our experiments show that our adversarial training approach leads to improved accuracy on the Stanford Background and PASCAL VOC 2012 datasets.", + "Object category localization is a challenging problem in computer vision. Standard supervised training requires bounding box annotations of object instances. This time-consuming annotation process is sidestepped in weakly supervised learning. In this case, the supervised information is restricted to binary labels that indicate the absence/presence of object instances in the image, without their locations. We follow a multiple-instance learning approach that iteratively trains the detector and infers the object locations in the positive training images. Our main contribution is a multi-fold multiple instance learning procedure, which prevents training from prematurely locking onto erroneous object locations. This procedure is particularly important when using high-dimensional representations, such as Fisher vectors and convolutional neural network features. We also propose a window refinement method, which improves the localization accuracy by incorporating an objectness prior. 
We present a detailed experimental evaluation using the PASCAL VOC 2007 dataset, which verifies the effectiveness of our approach.", + "Mahalanobis metric learning amounts to learning a linear data projection, after which the ℓ2 metric is used to compute distances. To allow more flexible metrics, not restricted to linear projections, local metric learning techniques have been developed. Most of these methods partition the data space using clustering, and for each cluster a separate metric is learned. Using local metrics, however, it is not clear how to measure distances between data points assigned to different clusters. In this paper we propose to embed the local metrics in a global low-dimensional representation, in which the ℓ2 metric can be used. With each cluster we associate a linear mapping that projects the data to the global representation. This global representation directly allows computing distances between points regardless to which local cluster they belong. Moreover, it also enables data visualization in a single view, and the use of ℓ2-based efficient retrieval methods. Experiments on the Labeled Faces in the Wild dataset show that our approach improves over previous global and local metric learning approaches.", + "We describe the submission of the INRIA LEAR team to the THU-MOS workshop in conjunction with ECCV 2014. Our system is based on Fisher vector (FV) encoding of dense trajectory features (DTF), which we also used in our 2013 submission. This year's submission additionally incorporates static-image features (SIFT, Color, and CNN) and audio features (ASR and MFCC) for the classification task. For the detection task, we combine scores from the clas-sification task with FV-DTF features extracted from video slices. We found that these additional visual and audio feature significantly improve the classification results. 
For localization we found that using the classification scores as a contex-tual feature besides local motion features leads to significant improvements.", + "This paper describes our participation to the 2014 edition of the TrecVid Multimedia Event Detection task. Our system is based on a collection of local visual and audio descriptors, which are aggregated to global descriptors, one for each type of low-level descriptor, using Fisher vectors. Besides these features, we use two features based on convolutional networks: one for the visual channel, and one for the audio channel. Additional high-level featuresare extracted using ASR and OCR features. Finally, we used mid-level attribute features based on object and action detectors trained on external datasets. Our two submissions (INRIA-LIM-VocR and AXES) are identical interms of all the components, except for the ASR system that is used. We present an overview of the features andthe classification techniques, and experimentally evaluate our system on TrecVid MED 2011 data.", + "The AXES project participated in the interactive instance search task (INS), the semantic indexing task (SIN) the multimedia event recounting task (MER), and the multimedia event detection task (MED) for TRECVid 2013. Our interactive INS focused this year on using classifiers trained at query time with positive examples collected from external search engines. Participants in our INS experiments were carried out by students and researchers at Dublin City University. Our best INS runs performed on par with the top ranked INS runs in terms of P@10 and P@30, and around the median in terms of mAP. For SIN, MED and MER, we use systems based on state- of-the-art local low-level descriptors for motion, image, and sound, as well as high-level features to capture speech and text and the visual and audio stream respectively. 
The low-level descriptors were aggregated by means of Fisher vectors into high- dimensional video-level signatures, the high-level features are aggregated into bag-of-word histograms. Using these features we train linear classifiers, and use early and late-fusion to combine the different features. Our MED system achieved the best score of all submitted runs in the main track, as well as in the ad-hoc track. This paper describes in detail our INS, MER, and MED systems and the results and findings of our experiments.", + "The AXES project participated in the interactive instance search task (INS), the known-item search task (KIS), and the multimedia event detection task (MED) for TRECVid 2012. As in our TRECVid 2011 system, we used nearly identical search systems and user interfaces for both INS and KIS. Our interactive INS and KIS systems focused this year on using classifiers trained at query time with positive examples collected from external search engines. Participants in our KIS experiments were media professionals from the BBC; our INS experiments were carried out by students and researchers at Dublin City University. We performed comparatively well in both experiments. Our best KIS run found 13 of the 25 topics, and our best INS runs outperformed all other submitted runs in terms of P@100. For MED, the system presented was based on a minimal number of low-level descriptors, which we chose to be as large as computationally feasible. These descriptors are aggregated to produce high-dimensional video-level signatures, which are used to train a set of linear classifiers. Our MED system achieved the second-best score of all submitted runs in the main track, and best score in the ad-hoc track, suggesting that a simple system based on state-of-the-art low-level descriptors can give relatively high performance. 
This paper describes in detail our KIS, INS, and MED systems and the results and findings of our experiments.", + "Image classification is a challenging problem due to intra-class appearance variation, background clutter, occlusion, and photometric variability. Current state-of-the-art methods do not explicitly handle background clutter, but rely on global image representations, such as bag-of-word (BoW) models. Multiple-instance learning has been used to explicitly deal with clutter, classifying an image positively as soon as at least one image region is classified positively. In this paper, we propose a more robust latent-SVM model that, unlike multiple-instance learning, does not rely on a single image region to trigger a positive image classification. Rather, our model scores an images using all regions, and associates with each region a latent variable that indicates whether the region represents the object of interest or its background. Background and foreground regions are each scored by a different appearance model, and an additional term in the score function ensures that neighboring regions tend to take the same background/foreground label. We learn the parameters of our latent SVM model using an iterative procedure that alternates between inferring the latent variables, and updating the parameters. We compare the performance of our approach on the PASCAL VOC'07 dataset to that of SVMs trained on global BoW representations, and to a multiple-instance SVM trained on BoW representations of image regions. We show that our approach outperforms multiple-instance learning by a large margin on all classes, and outperforms global BoW models in 17 out of the 20 classes.", + "Currently large scale multimodal image databases have become widely available, for example via photo sharing sites where images come along with textual descriptions and keyword annotations. 
Most existing work on image retrieval and image auto-annotation has considered uni-modal techniques, either focusing on query-by-example systems or query-by-text systems for image retrieval, and mono modal classification for image auto-annotation. However recent state-of-the-art multimodal image retrieval and image auto-annotation systems combine different uni-modal models using late-fusion techniques. In addition, significant advances have been made by using pseudo-relevance feedback techniques, as well as using transmedia relevance models that swap modalities in the query expansion step of pseudo-relevance methods. While these techniques are promising it is not trivial to set the parameters that control the late fusion and pseudo/cross relevance models. In this paper, we therefore propose approaches to learn these parameters from a labeled training set: queries with relevant and non-relevant documents, or images with relevant and non-relevant keywords. Three additional contributions are the introduction of (i) two new parameterizations of transmedia and pseudo-relevance models, (ii) correction parameters for inter-query variations in the distribution of retrieval scores for both relevant and non-relevant documents, and (iii) the extension of TagProp, a nearest neighbor based image annotation method to exploit transmedia relevance feedback. We evaluate our models using public benchmark data sets for image retrieval and annotation. Using the data set of the ImageClef 2008 Photo Retrieval task, our retrieval experiments show that our learned models lead to significant improvements of retrieval performance over the current state-of-the-art. 
In our experiments on image annotation we use the COREL and IAPR data sets, and also here we observe annotation accuracies that improve over the current state-of-the-art results on these data sets.", + "Current state-of-the-art image categorization systems rely on bag-of-words representations that model image content as a histogram of quantization indices that code local image appearance. In this context, randomized tree-structured quantizers have been shown to be both computationally efficient and yielding discriminative visual words for a given categorization task. This paper presents a new algorithm that builds tree-structured quantizers not to optimize patch classification ‐ as it is done by approaches such as [18] ‐ but to directly optimize the image classification performance. This approach is experimentally validated on several challenging data sets for which it outperforms other patch quantizers such as standard decision trees or k-means." + ], + "domain": [ + "Computer Vision", + "Deep Learning", + "Graph Neural Network", + "Image Processing" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + } + }, + "reference_proposal": "**[Question 1] - What is the problem?** \nHow can we improve the modeling capabilities of attention mechanisms in neural machine translation (MT) to better capture the relationships between source and target sequences?\n\n**[Question 2] - Why is it interesting and important?** \nSolving this problem is crucial for advancing the field of machine translation, as it addresses the limitations of current attention mechanisms that rely on shallow matching and simple weighted sums. By enhancing the modeling capabilities, we can improve translation quality, leading to more accurate and contextually relevant translations. 
This research could pave the way for future innovations in MT and related areas, potentially impacting applications in real-time translation, cross-lingual communication, and multilingual content generation.\n\n**[Question 3] - Why is it hard?** \nThe challenges in solving this problem stem from the inherent complexity of accurately modeling the relationships between variable-length source and target sequences. Naive approaches may fail because they do not adequately capture the nuanced dependencies and contextual information necessary for high-quality translation. Additionally, the limitations of existing attention mechanisms, which do not allow for re-encoding or re-interpretation of the source sequence during decoding, pose significant technical obstacles that need to be addressed.\n\n**[Question 4] - Why hasn't it been solved before?** \nPrevious research has primarily focused on improving attention mechanisms through incremental adjustments rather than fundamentally rethinking their architecture. Existing solutions have been constrained by the reliance on recurrent architectures and the limitations of shallow matching techniques. Our approach differs by introducing a deep 2D convolutional neural network architecture that inherently incorporates attention-like capabilities throughout its layers, allowing for a more sophisticated and comprehensive modeling of the relationships between source and target sequences.\n\n**[Question 5] - What are the key components of my approach and results?** \nOur proposed methodology involves developing a neural machine translation model based on deep 2D convolutional neural networks (CNNs) that utilize masked convolutional filters to maintain autoregressive properties. We will evaluate our model using standard machine translation datasets, employing metrics such as BLEU scores to assess translation quality. 
The expected outcomes include improved translation accuracy and the demonstration of the model's ability to learn deep feature hierarchies, ultimately showcasing the advantages of our approach over traditional attention-based models." + }, + "1502.03167": { + "paper_data": { + "title": "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift", + "url": "http://arxiv.org/abs/1502.03167v3", + "arxiv_id": "1502.03167", + "authors": [ + "Sergey Ioffe", + "Christian Szegedy" + ], + "abstract": "Training Deep Neural Networks is complicated by the fact that the distribution of each layer's inputs changes during training, as the parameters of the previous layers change. This slows down the training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. We refer to this phenomenon as internal covariate shift, and address the problem by normalizing layer inputs. Our method draws its strength from making normalization a part of the model architecture and performing the normalization for each training mini-batch. Batch Normalization allows us to use much higher learning rates and be less careful about initialization. It also acts as a regularizer, in some cases eliminating the need for Dropout. Applied to a state-of-the-art image classification model, Batch Normalization achieves the same accuracy with 14 times fewer training steps, and beats the original model by a significant margin. Using an ensemble of batch-normalized networks, we improve upon the best published result on ImageNet classification: reaching 4.9% top-5 validation error (and 4.8% test error), exceeding the accuracy of human raters.", + "introduction": " Introduction Deep learning has dramatically advanced the state of the art in vision, speech, and many other areas. 
Stochastic gradient descent (SGD) has proved to be an effective way of training deep networks, and SGD variants such as momentum (Sutskever et al., 2013) and Adagrad (Duchi et al., 2011) have been used to achieve state of the art performance. SGD optimizes the parameters Θ of the network, so as to minimize the loss Θ = argmin_Θ (1/N) Σ_{i=1}^{N} ℓ(x_i, Θ), where x_{1...N} is the training dataset. With SGD, the training proceeds in steps, and at each step we consider a mini-batch x_{1...m} of size m. The mini-batch is used to approximate the gradient of the loss function with respect to the parameters, by computing (1/m) Σ_{i=1}^{m} ∂ℓ(x_i, Θ)/∂Θ. Using mini-batches of examples, as opposed to one example at a time, is helpful in several ways. First, the gradient of the loss over a mini-batch is an estimate of the gradient over the training set, whose quality improves as the batch size increases. Second, computation over a batch can be much more efficient than m computations for individual examples, due to the parallelism afforded by the modern computing platforms. While stochastic gradient is simple and effective, it requires careful tuning of the model hyper-parameters, specifically the learning rate used in optimization, as well as the initial values for the model parameters. The training is complicated by the fact that the inputs to each layer are affected by the parameters of all preceding layers – so that small changes to the network parameters amplify as the network becomes deeper. The change in the distributions of layers' inputs presents a problem because the layers need to continuously adapt to the new distribution. When the input distribution to a learning system changes, it is said to experience covariate shift (Shimodaira, 2000). This is typically handled via domain adaptation (Jiang, 2008). However, the notion of covariate shift can be extended beyond the learning system as a whole, to apply to its parts, such as a sub-network or a layer. 
Consider a network computing ℓ = F2(F1(u, Θ1), Θ2) where F1 and F2 are arbitrary transformations, and the parameters Θ1, Θ2 are to be learned so as to minimize the loss ℓ. Learning Θ2 can be viewed as if the inputs x = F1(u, Θ1) are fed into the sub-network ℓ = F2(x, Θ2). For example, a gradient descent step Θ2 ← Θ2 − (α/m) Σ_{i=1}^{m} ∂F2(x_i, Θ2)/∂Θ2 (for batch size m and learning rate α) is exactly equivalent to that for a stand-alone network F2 with input x. Therefore, the input distribution properties that make training more efficient – such as having the same distribution between the training and test data – apply to training the sub-network as well. As such it is advantageous for the distribution of x to remain fixed over time. Then, Θ2 does not have to readjust to compensate for the change in the distribution of x. Fixed distribution of inputs to a sub-network would have positive consequences for the layers outside the sub-network, as well. Consider a layer with a sigmoid activation function z = g(Wu + b) where u is the layer input, the weight matrix W and bias vector b are the layer parameters to be learned, and g(x) = 1/(1 + exp(−x)). As |x| increases, g′(x) tends to zero. This means that for all dimensions of x = Wu + b except those with small absolute values, the gradient flowing down to u will vanish and the model will train slowly. However, since x is affected by W, b and the parameters of all the layers below, changes to those parameters during training will likely move many dimensions of x into the saturated regime of the nonlinearity and slow down the convergence. This effect is amplified as the network depth increases. In practice, the saturation problem and the resulting vanishing gradients are usually addressed by using Rectified Linear Units (Nair & Hinton, 2010), ReLU(x) = max(x, 0), careful initialization (Bengio & Glorot, 2010; Saxe et al., 2013), and small learning rates. 
If, however, we could ensure that the distribution of nonlinearity inputs remains more stable as the network trains, then the optimizer would be less likely to get stuck in the saturated regime, and the training would accelerate. We refer to the change in the distributions of internal nodes of a deep network, in the course of training, as Internal Covariate Shift. Eliminating it offers a promise of faster training. We propose a new mechanism, which we call Batch Normalization, that takes a step towards reducing internal covariate shift, and in doing so dramatically accelerates the training of deep neural nets. It accomplishes this via a normalization step that fixes the means and variances of layer inputs. Batch Normalization also has a beneficial effect on the gradient flow through the network, by reducing the dependence of gradients on the scale of the parameters or of their initial values. This allows us to use much higher learning rates without the risk of divergence. Furthermore, batch normalization regularizes the model and", "references": [ { "title": "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications", "abstract": "We present a class of efficient models called MobileNets for mobile and embedded vision applications. MobileNets are based on a streamlined architecture that uses depth-wise separable convolutions to build light weight deep neural networks. We introduce two simple global hyper-parameters that efficiently trade off between latency and accuracy. These hyper-parameters allow the model builder to choose the right sized model for their application based on the constraints of the problem. We present extensive experiments on resource and accuracy tradeoffs and show strong performance compared to other popular models on ImageNet classification. 
We then demonstrate the effectiveness of MobileNets across a wide range of applications and use cases including object detection, finegrain classification, face attributes and large scale geo-localization." + }, + { + "title": "Xception: Deep Learning with Depthwise Separable Convolutions", + "abstract": "We present an interpretation of Inception modules in convolutional neural networks as being an intermediate step in-between regular convolution and the depthwise separable convolution operation (a depthwise convolution followed by a pointwise convolution). In this light, a depthwise separable convolution can be understood as an Inception module with a maximally large number of towers. This observation leads us to propose a novel deep convolutional neural network architecture inspired by Inception, where Inception modules have been replaced with depthwise separable convolutions. We show that this architecture, dubbed Xception, slightly outperforms Inception V3 on the ImageNet dataset (which Inception V3 was designed for), and significantly outperforms Inception V3 on a larger image classification dataset comprising 350 million images and 17,000 classes. Since the Xception architecture has the same number of parameters as Inception V3, the performance gains are not due to increased capacity but rather to a more efficient use of model parameters." + }, + { + "title": "Densely Connected Convolutional Networks", + "abstract": "Recent work has shown that convolutional networks can be substantially deeper, more accurate, and efficient to train if they contain shorter connections between layers close to the input and those close to the output. In this paper, we embrace this observation and introduce the Dense Convolutional Network (DenseNet), which connects each layer to every other layer in a feed-forward fashion. 
Whereas traditional convolutional networks with L layers have L connections—one between each layer and its subsequent layer—our network has L(L+1)/2 direct connections. For each layer, the feature-maps of all preceding layers are used as inputs, and its own feature-maps are used as inputs into all subsequent layers. DenseNets have several compelling advantages: they alleviate the vanishing-gradient problem, strengthen feature propagation, encourage feature reuse, and substantially reduce the number of parameters. We evaluate our proposed architecture on four highly competitive object recognition benchmark tasks (CIFAR-10, CIFAR-100, SVHN, and ImageNet). DenseNets obtain significant improvements over the state-of-the-art on most of them, whilst requiring less memory and computation to achieve high performance. Code and pre-trained models are available at https://github.com/liuzhuang13/DenseNet." + }, + { + "title": "FractalNet: Ultra-Deep Neural Networks without Residuals", + "abstract": "We introduce a design strategy for neural network macro-architecture based on self-similarity. Repeated application of a simple expansion rule generates deep networks whose structural layouts are precisely truncated fractals. These networks contain interacting subpaths of different lengths, but do not include any pass-through or residual connections; every internal signal is transformed by a filter and nonlinearity before being seen by subsequent layers. In experiments, fractal networks match the excellent performance of standard residual networks on both CIFAR and ImageNet classification tasks, thereby demonstrating that residual representations may not be fundamental to the success of extremely deep convolutional neural networks. Rather, the key may be the ability to transition, during training, from effectively shallow to deep. 
We note similarities with student-teacher behavior and develop drop-path, a natural extension of dropout, to regularize co-adaptation of subpaths in fractal architectures. Such regularization allows extraction of high-performance fixed-depth subnetworks. Additionally, fractal networks exhibit an anytime property: shallow subnetworks provide a quick answer, while deeper subnetworks, with higher latency, provide a more accurate answer." + }, + { + "title": "Understanding and Improving Convolutional Neural Networks via Concatenated Rectified Linear Units", + "abstract": "Recently, convolutional neural networks (CNNs) have been used as a powerful tool to solve many problems of machine learning and computer vision. In this paper, we aim to provide insight on the property of convolutional neural networks, as well as a generic method to improve the performance of many CNN architectures. Specifically, we first examine existing CNN models and observe an intriguing property that the filters in the lower layers form pairs (i.e., filters with opposite phase). Inspired by our observation, we propose a novel, simple yet effective activation scheme called concatenated ReLU (CRelu) and theoretically analyze its reconstruction property in CNNs. We integrate CRelu into several state-of-the-art CNN architectures and demonstrate improvement in their recognition performance on CIFAR-10/100 and ImageNet datasets with fewer trainable parameters. Our results suggest that better understanding of the properties of CNNs can lead to significant performance improvement with a simple modification." + }, + { + "title": "Deep Residual Learning for Image Recognition", + "abstract": "Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. 
We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers - 8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions1, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation." + }, + { + "title": "Multi-Scale Context Aggregation by Dilated Convolutions", + "abstract": "State-of-the-art models for semantic segmentation are based on adaptations of convolutional networks that had originally been designed for image classification. However, dense prediction and image classification are structurally different. In this work, we develop a new convolutional network module that is specifically designed for dense prediction. The presented module uses dilated convolutions to systematically aggregate multi-scale contextual information without losing resolution. The architecture is based on the fact that dilated convolutions support exponential expansion of the receptive field without loss of resolution or coverage. We show that the presented context module increases the accuracy of state-of-the-art semantic segmentation systems. 
In addition, we examine the adaptation of image classification networks to dense prediction and show that simplifying the adapted network can increase accuracy." + }, + { + "title": "Natural Neural Networks", + "abstract": "We introduce Natural Neural Networks, a novel family of algorithms that speed up convergence by adapting their internal representation during training to improve conditioning of the Fisher matrix. In particular, we show a specific example that employs a simple and efficient reparametrization of the neural network weights by implicitly whitening the representation obtained at each layer, while preserving the feed-forward computation of the network. Such networks can be trained efficiently via the proposed Projected Natural Gradient Descent algorithm (PRONG), which amortizes the cost of these reparametrizations over many parameter updates and is closely related to the Mirror Descent online learning algorithm. We highlight the benefits of our method on both unsupervised and supervised learning tasks, and showcase its scalability by training on the large-scale ImageNet Challenge dataset." + }, + { + "title": "You Only Look Once: Unified, Real-Time Object Detection", + "abstract": "We present YOLO, a new approach to object detection. Prior work on object detection repurposes classifiers to perform detection. Instead, we frame object detection as a regression problem to spatially separated bounding boxes and associated class probabilities. A single neural network predicts bounding boxes and class probabilities directly from full images in one evaluation. Since the whole detection pipeline is a single network, it can be optimized end-to-end directly on detection performance. Our unified architecture is extremely fast. Our base YOLO model processes images in real-time at 45 frames per second. A smaller version of the network, Fast YOLO, processes an astounding 155 frames per second while still achieving double the mAP of other real-time detectors. 
Compared to state-of-the-art detection systems, YOLO makes more localization errors but is less likely to predict false positives on background. Finally, YOLO learns very general representations of objects. It outperforms other detection methods, including DPM and R-CNN, when generalizing from natural images to other domains like artwork." + }, + { + "title": "Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification", + "abstract": "Rectified activation units (rectifiers) are essential for state-of-the-art neural networks. In this work, we study rectifier neural networks for image classification from two aspects. First, we propose a Parametric Rectified Linear Unit (PReLU) that generalizes the traditional rectified unit. PReLU improves model fitting with nearly zero extra computational cost and little overfitting risk. Second, we derive a robust initialization method that particularly considers the rectifier nonlinearities. This method enables us to train extremely deep rectified models directly from scratch and to investigate deeper or wider network architectures. Based on the learnable activation and advanced initialization, we achieve 4.94% top-5 test error on the ImageNet 2012 classification dataset. This is a 26% relative improvement over the ILSVRC 2014 winner (GoogLeNet, 6.66% [33]). To our knowledge, our result is the first to surpass the reported human-level performance (5.1%, [26]) on this dataset." + }, + { + "title": "Deep Image: Scaling up Image Recognition", + "abstract": "We present a state-of-the-art image recognition system, Deep Image, developed using end-to-end deep learning. The key components are a custom-built supercomputer dedicated to deep learning, a highly optimized parallel algorithm using new strategies for data partitioning and communication, larger deep neural network models, novel data augmentation approaches, and usage of multi-scale high-resolution images. 
Our method achieves excellent results on multiple challenging computer vision benchmarks." + }, + { + "title": "Striving for Simplicity: The All Convolutional Net", + "abstract": "Most modern convolutional neural networks (CNNs) used for object recognition are built using the same principles: Alternating convolution and max-pooling layers followed by a small number of fully connected layers. We re-evaluate the state of the art for object recognition from small images with convolutional networks, questioning the necessity of different components in the pipeline. We find that max-pooling can simply be replaced by a convolutional layer with increased stride without loss in accuracy on several image recognition benchmarks. Following this finding -- and building on other recent work for finding simple network structures -- we propose a new architecture that consists solely of convolutional layers and yields competitive or state of the art performance on several object recognition datasets (CIFAR-10, CIFAR-100, ImageNet). To analyze the network we introduce a new variant of the \"deconvolution approach\" for visualizing features learned by CNNs, which can be applied to a broader range of network structures than existing approaches." + }, + { + "title": "Parallel training of Deep Neural Networks with Natural Gradient and Parameter Averaging", + "abstract": "We describe the neural-network training framework used in the Kaldi speech recognition toolkit, which is geared towards training DNNs with large amounts of training data using multiple GPU-equipped or multicore machines. In order to be as hardwareagnostic as possible, we needed a way to use multiple machines without generating excessive network traffic. Our method is to average the neural network parameters periodically (typically every minute or two), and redistribute the averaged parameters to the machines for further training. Each machine sees different data. By itself, this method does not work very well. 
However, we have another method, an approximate and efficient implementation of Natural Gradient for Stochastic Gradient Descent (NG-SGD), which seems to allow our periodic-averaging method to work well, as well as substantially improving the convergence of SGD on a single machine." + }, + { + "title": "Going deeper with convolutions", + "abstract": "We propose a deep convolutional neural network architecture codenamed Inception that achieves the new state of the art for classification and detection in the ImageNet Large-Scale Visual Recognition Challenge 2014 (ILSVRC14). The main hallmark of this architecture is the improved utilization of the computing resources inside the network. By a carefully crafted design, we increased the depth and width of the network while keeping the computational budget constant. To optimize quality, the architectural decisions were based on the Hebbian principle and the intuition of multi-scale processing. One particular incarnation used in our submission for ILSVRC14 is called GoogLeNet, a 22 layers deep network, the quality of which is assessed in the context of classification and detection." + }, + { + "title": "Very Deep Convolutional Networks for Large-Scale Image Recognition", + "abstract": "In this work we investigate the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting. Our main contribution is a thorough evaluation of networks of increasing depth using an architecture with very small (3x3) convolution filters, which shows that a significant improvement on the prior-art configurations can be achieved by pushing the depth to 16-19 weight layers. These findings were the basis of our ImageNet Challenge 2014 submission, where our team secured the first and the second places in the localisation and classification tracks respectively. We also show that our representations generalise well to other datasets, where they achieve state-of-the-art results. 
We have made our two best-performing ConvNet models publicly available to facilitate further research on the use of deep visual representations in computer vision." + }, + { + "title": "ImageNet Large Scale Visual Recognition Challenge", + "abstract": null + }, + { + "title": "Mean-normalized stochastic gradient for large-scale deep learning", + "abstract": "Deep neural networks are typically optimized with stochastic gradient descent (SGD). In this work, we propose a novel second-order stochastic optimization algorithm. The algorithm is based on analytic results showing that a non-zero mean of features is harmful for the optimization. We prove convergence of our algorithm in a convex setting. In our experiments we show that our proposed algorithm converges faster than SGD. Further, in contrast to earlier work, our algorithm allows for training models with a factorized structure from scratch. We found this structure to be very useful not only because it accelerates training and decoding, but also because it is a very effective means against overfitting. Combining our proposed optimization algorithm with this model structure, model size can be reduced by a factor of eight and still improvements in recognition error rate are obtained. Additional gains are obtained by improving the Newbob learning rate strategy." + }, + { + "title": "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks", + "abstract": "Despite the widespread practical success of deep learning methods, our theoretical understanding of the dynamics of learning in deep neural networks remains quite sparse. We attempt to bridge the gap between the theory and practice of deep learning by systematically analyzing learning dynamics for the restricted case of deep linear neural networks. Despite the linearity of their input-output map, such networks have nonlinear gradient descent dynamics on weights that change with the addition of each new hidden layer. 
We show that deep linear networks exhibit nonlinear learning phenomena similar to those seen in simulations of nonlinear networks, including long plateaus followed by rapid transitions to lower error solutions, and faster convergence from greedy unsupervised pretraining initial conditions than from random initial conditions. We provide an analytical description of these phenomena by finding new exact solutions to the nonlinear dynamics of deep learning. Our theoretical analysis also reveals the surprising finding that as the depth of a network approaches infinity, learning speed can nevertheless remain finite: for a special class of initial conditions on the weights, very deep networks incur only a finite, depth independent, delay in learning speed relative to shallow networks. We show that, under certain conditions on the training data, unsupervised pretraining can find this special class of initial conditions, while scaled random Gaussian initializations cannot. We further exhibit a new class of random orthogonal initial conditions on weights that, like unsupervised pre-training, enjoys depth independent learning times. We further show that these initial conditions also lead to faithful propagation of gradients even in deep nonlinear networks, as long as they operate in a special regime known as the edge of chaos." + }, + { + "title": "On the importance of initialization and momentum in deep learning", + "abstract": "Deep and recurrent neural networks (DNNs and RNNs respectively) are powerful models that were considered to be almost impossible to train using stochastic gradient descent with momentum. In this paper, we show that when stochastic gradient descent with momentum uses a well-designed random initialization and a particular type of slowly increasing schedule for the momentum parameter, it can train both DNNs and RNNs (on datasets with long-term dependencies) to levels of performance that were previously achievable only with Hessian-Free optimization. 
We find that both the initialization and the momentum are crucial since poorly initialized networks cannot be trained with momentum and well-initialized networks perform markedly worse when the momentum is absent or poorly tuned. \n \nOur success training these models suggests that previous attempts to train deep and recurrent neural networks from random initializations have likely failed due to poor initialization schemes. Furthermore, carefully tuned momentum methods suffice for dealing with the curvature issues in deep and recurrent network training objectives without the need for sophisticated second-order methods." + }, + { + "title": "Knowledge Matters: Importance of Prior Information for Optimization", + "abstract": "We explore the effect of introducing prior information into the intermediate level of neural networks for a learning task on which all the state-of-the-art machine learning algorithms tested failed to learn. We motivate our work from the hypothesis that humans learn such intermediate concepts from other individuals via a form of supervision or guidance using a curriculum. The experiments we have conducted provide positive evidence in favor of this hypothesis. In our experiments, a two-tiered MLP architecture is trained on a dataset with 64x64 binary inputs images, each image with three sprites. The final task is to decide whether all the sprites are the same or one of them is different. Sprites are pentomino tetris shapes and they are placed in an image with different locations using scaling and rotation transformations. The first part of the two-tiered MLP is pre-trained with intermediate-level targets being the presence of sprites at each location, while the second part takes the output of the first part as input and predicts the final task's target binary event. 
The two-tiered MLP architecture, with a few tens of thousand examples, was able to learn the task perfectly, whereas all other algorithms (include unsupervised pre-training, but also traditional algorithms like SVMs, decision trees and boosting) all perform no better than chance. We hypothesize that the optimization difficulty involved when the intermediate pre-training is not performed is due to the {\\em composition} of two highly non-linear tasks. Our findings are also consistent with hypotheses on cultural learning inspired by the observations of optimization problems with deep learning, presumably because of effective local minima." + }, + { + "title": "Large Scale Distributed Deep Networks", + "abstract": "Recent work in unsupervised feature learning and deep learning has shown that being able to train large models can dramatically improve performance. In this paper, we consider the problem of training a deep network with billions of parameters using tens of thousands of CPU cores. We have developed a software framework called DistBelief that can utilize computing clusters with thousands of machines to train large models. Within this framework, we have developed two algorithms for large-scale distributed training: (i) Downpour SGD, an asynchronous stochastic gradient descent procedure supporting a large number of model replicas, and (ii) Sandblaster, a framework that supports a variety of distributed batch optimization procedures, including a distributed implementation of L-BFGS. Downpour SGD and Sandblaster L-BFGS both increase the scale and speed of deep network training. We have successfully used our system to train a deep network 30x larger than previously reported in the literature, and achieves state-of-the-art performance on ImageNet, a visual object recognition task with 16 million images and 21k categories. 
We show that these same techniques dramatically accelerate the training of a more modestly- sized deep network for a commercial speech recognition service. Although we focus on and report performance of these methods as applied to training large neural networks, the underlying algorithms are applicable to any gradient-based machine learning algorithm." + }, + { + "title": "ImageNet classification with deep convolutional neural networks", + "abstract": "We trained a large, deep convolutional neural network to classify the 1.2 million high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 37.5% and 17.0%, respectively, which is considerably better than the previous state-of-the-art. The neural network, which has 60 million parameters and 650,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and three fully connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU implementation of the convolution operation. To reduce overfitting in the fully connected layers we employed a recently developed regularization method called \"dropout\" that proved to be very effective. We also entered a variant of this model in the ILSVRC-2012 competition and achieved a winning top-5 test error rate of 15.3%, compared to 26.2% achieved by the second-best entry." + }, + { + "title": "On the difficulty of training recurrent neural networks", + "abstract": "There are two widely known issues with properly training recurrent neural networks, the vanishing and the exploding gradient problems detailed in Bengio et al. (1994). In this paper we attempt to improve the understanding of the underlying issues by exploring these problems from an analytical, a geometric and a dynamical systems perspective. Our analysis is used to justify a simple yet effective solution. 
We propose a gradient norm clipping strategy to deal with exploding gradients and a soft constraint for the vanishing gradients problem. We validate empirically our hypothesis and proposed solutions in the experimental section." + }, + { + "title": "Large Scale Visual Recognition", + "abstract": "Abstract : Visual recognition remains one of the grand goals of artificial intelligence research. One major challenge is endowing machines with human ability to recognize tens of thousands of categories. Moving beyond previous work that is mostly focused on hundreds of categories we make progress toward human scale visual recognition. Specifically, our contributions are as follows First, we have constructed ImageNet, a large scale image ontology. The Fall 2011 version consists of 22 thousand categories and 14 million images; it depicts each category by an average of 650 images collected from the Internet and verified by multiple humans. To the best of our knowledge this is currently the largest human-verified dataset in terms of both the number of categories and the number of images. Given the large amount of human effort required, the traditional approach to dataset collection, involving in-house annotation by a small number of human subjects, becomes infeasible. In this dissertation we describe how ImageNet has been built through quality controlled, cost effective, large scale online crowdsourcing. Next, we use ImageNet to conduct the first benchmarking study of state of the art recognition algorithms at the human scale. By experimenting on 10 thousand categories, we discover that the previous state of the art performance is still low (6.4%). We further observe that the confusion among categories is hierarchically structured at large scale, a key insight that leads to our subsequent contributions. Third, we study how to efficiently classify tens of thousands of categories by exploiting the structure of visual confusion among categories. 
We propose a novel learning technique that scales logarithmically with the number of classes in both training and testing, improving both accuracy and efficiency of the previous state of the art while reducing training time by 31 fold on 10 thousand classes." + }, + { + "title": "Deep Learning Made Easier by Linear Transformations in Perceptrons", + "abstract": "We transform the outputs of each hidden neuron in a multi-layer perceptron network to have zero output and zero slope on average, and use separate shortcut connections to model the linear dependencies instead. This transformation aims at separating the problems of learning the linear and nonlinear parts of the whole input-output mapping, which has many benefits. We study the theoretical properties of the transformation by noting that they make the Fisher information matrix closer to a diagonal matrix, and thus standard gradient closer to the natural gradient. We experimentally confirm the usefulness of the transformations by noting that they make basic stochastic gradient learning competitive with state-of-the-art learning algorithms in speed, and that they seem also to help find solutions that generalize better. The experiments include both classification of small images and learning a lowdimensional representation for images by using a deep unsupervised auto-encoder network. The transformations were beneficial in all cases, with and without regularization and with networks from two to five hidden layers." + }, + { + "title": "A Convergence Analysis of Log-Linear Training", + "abstract": "Log-linear models are widely used probability models for statistical pattern recognition. Typically, log-linear models are trained according to a convex criterion. In recent years, the interest in log-linear models has greatly increased. The optimization of log-linear model parameters is costly and therefore an important topic, in particular for large-scale applications. 
Different optimization algorithms have been evaluated empirically in many papers. In this work, we analyze the optimization problem analytically and show that the training of log-linear models can be highly ill-conditioned. We verify our findings on two handwriting tasks. By making use of our convergence analysis, we obtain good results on a large-scale continuous handwriting recognition task with a simple and generic approach." + }, + { + "title": "Adaptive Subgradient Methods for Online Learning and Stochastic Optimization", + "abstract": "We present a new family of subgradient methods that dynamically incorporate knowledge of the geometry of the data observed in earlier iterations to perform more informative gradient-based learning. Metaphorically, the adaptation allows us to find needles in haystacks in the form of very predictive but rarely seen features. Our paradigm stems from recent advances in stochastic optimization and online learning which employ proximal functions to control the gradient steps of the algorithm. We describe and analyze an apparatus for adaptively modifying the proximal function, which significantly simplifies setting a learning rate and results in regret guarantees that are provably as good as the best proximal function that can be chosen in hindsight. We give several efficient algorithms for empirical risk minimization problems with common and important regularization functions and domain constraints. We experimentally study our theoretical analysis and show that adaptive subgradient methods outperform state-of-the-art, yet non-adaptive, subgradient algorithms." + }, + { + "title": "Rectified Linear Units Improve Restricted Boltzmann Machines", + "abstract": "Restricted Boltzmann machines were developed using binary stochastic hidden units. These can be generalized by replacing each binary unit by an infinite number of copies that all have the same weights but have progressively more negative biases. 
The learning and inference rules for these \"Stepped Sigmoid Units\" are unchanged. They can be approximated efficiently by noisy, rectified linear units. Compared with binary units, these units learn features that are better for object recognition on the NORB dataset and face verification on the Labeled Faces in the Wild dataset. Unlike binary units, rectified linear units preserve information about relative intensities as information travels through multiple layers of feature detectors." + }, + { + "title": "Understanding the difficulty of training deep feedforward neural networks", + "abstract": "Whereas before 2006 it appears that deep multilayer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experimental results showing the superiority of deeper vs less deep architectures. All these experimental results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We first observe the influence of the non-linear activations functions. We find that the logistic sigmoid activation is unsuited for deep networks with random initialization because of its mean value, which can drive especially the top hidden layer into saturation. Surprisingly, we find that saturated units can move out of saturation by themselves, albeit slowly, and explaining the plateaus sometimes seen when training neural networks. We find that a new non-linearity that saturates less can often be beneficial. Finally, we study how activations and gradients vary across layers and during training, with the idea that training may be more difficult when the singular values of the Jacobian associated with each layer are far from 1. 
Based on these considerations, we propose a new initialization scheme that brings substantially faster convergence. 1 Deep Neural Networks Deep learning methods aim at learning feature hierarchies with features from higher levels of the hierarchy formed by the composition of lower level features. They include Appearing in Proceedings of the 13 International Conference on Artificial Intelligence and Statistics (AISTATS) 2010, Chia Laguna Resort, Sardinia, Italy. Volume 9 of JMLR: WC Weston et al., 2008). Much attention has recently been devoted to them (see (Bengio, 2009) for a review), because of their theoretical appeal, inspiration from biology and human cognition, and because of empirical success in vision (Ranzato et al., 2007; Larochelle et al., 2007; Vincent et al., 2008) and natural language processing (NLP) (Collobert & Weston, 2008; Mnih & Hinton, 2009). Theoretical results reviewed and discussed by Bengio (2009), suggest that in order to learn the kind of complicated functions that can represent high-level abstractions (e.g. in vision, language, and other AI-level tasks), one may need deep architectures. Most of the recent experimental results with deep architecture are obtained with models that can be turned into deep supervised neural networks, but with initialization or training schemes different from the classical feedforward neural networks (Rumelhart et al., 1986). Why are these new algorithms working so much better than the standard random initialization and gradient-based optimization of a supervised training criterion? Part of the answer may be found in recent analyses of the effect of unsupervised pretraining (Erhan et al., 2009), showing that it acts as a regularizer that initializes the parameters in a “better” basin of attraction of the optimization procedure, corresponding to an apparent local minimum associated with better generalization. 
But earlier work (Bengio et al., 2007) had shown that even a purely supervised but greedy layer-wise procedure would give better results. So here instead of focusing on what unsupervised pre-training or semi-supervised criteria bring to deep architectures, we focus on analyzing what may be going wrong with good old (but deep) multilayer neural networks. Our analysis is driven by investigative experiments to monitor activations (watching for saturation of hidden units) and gradients, across layers and across training iterations. We also evaluate the effects on these of choices of activation function (with the idea that it might affect saturation) and initialization procedure (since unsupervised pretraining is a particular form of initialization and it has a drastic impact)." + }, + { + "title": "Nonlinear image representation using divisive normalization", + "abstract": "In this paper, we describe a nonlinear image representation based on divisive normalization that is designed to match the statistical properties of photographic images, as well as the perceptual sensitivity of biological visual systems. We decompose an image using a multi-scale oriented representation, and use studentpsilas t as a model of the dependencies within local clusters of coefficients. We then show that normalization of each coefficient by the square root of a linear combination of the amplitudes of the coefficients in the cluster reduces statistical dependencies. We further show that the resulting divisive normalization transform is invertible and provide an efficient iterative inversion algorithm. Finally, we probe the statistical and perceptual advantages of this image representation by examining its robustness to added noise, and using it to enhance image contrast." + }, + { + "title": "Optimization", + "abstract": "Optimization is generally a process of learning and unlearning, and of trials and errors, precisely like a mindsponge process." 
+ }, + { + "title": "Critical Points of the Singular Value Decomposition", + "abstract": "The singular value decomposition (SVD) is a factorization that is discontinuous on the subset of matrices having repeated singular values. In this paper the SVD is studied in the vicinity of this critical set. Each one-parameter $C^k$ perturbation transversal to the critical set is shown to uniquely determine an SVD at the critical point that extends to an SVD along the perturbation path that is $C^{k-1}$ in the perturbation parameter. Derivatives of the singular vectors at the critical point are found explicitly. Application is made to the effect on the singular vectors of perturbations from a matrix in the critical set and compared to the information provided by the $\\sin (\\theta)$ theorem. Estimates of the derivative of the singular vectors are applied to inequalities involving the matrix absolute value, such as the generalized Araki--Yamagami inequality." + }, + { + "title": "Improving predictive inference under covariate shift by weighting the log-likelihood function", + "abstract": null + }, + { + "title": "Independent component analysis: algorithms and applications", + "abstract": null + }, + { + "title": "Batch Normalization Presentation", + "abstract": null + }, + { + "title": "Dropout: a simple way to prevent neural networks from overfitting", + "abstract": "Deep neural nets with a large number of parameters are very powerful machine learning systems. However, overfitting is a serious problem in such networks. Large networks are also slow to use, making it difficult to deal with overfitting by combining the predictions of many different large neural nets at test time. Dropout is a technique for addressing this problem. The key idea is to randomly drop units (along with their connections) from the neural network during training. This prevents units from co-adapting too much. 
During training, dropout samples from an exponential number of different \"thinned\" networks. At test time, it is easy to approximate the effect of averaging the predictions of all these thinned networks by simply using a single unthinned network that has smaller weights. This significantly reduces overfitting and gives major improvements over other regularization methods. We show that dropout improves the performance of neural networks on supervised learning tasks in vision, speech recognition, document classification and computational biology, obtaining state-of-the-art results on many benchmark data sets." + }, + { + "title": "Figure 5 documents the changes that were performed compared to the architecture with respect to the GoogleNet archictecture. For the interpretation of this table, please consult (", + "abstract": null + }, + { + "title": "Efficient BackProp", + "abstract": null + }, + { + "title": "J. Mach. Learn. Res", + "abstract": null + }, + { + "title": "A Literature Survey on Domain Adaptation of Statistical Classifiers", + "abstract": "The domain adaptation problem, especially domain adaptation in natural language processing, started gaining much attention very recently [Daumé III and Marcu, 2006, Blitzer et al., 2006, Ben-David et al., 2007, Daumé III, 2007, Satpal and Sarawagi, 2007]. However, some special kinds of domain adaptation problems have been studied before under different names such as class imbalance [Japkowicz and Stephen, 2002], covariate shift [Shimodaira, 2000], and sample selection bias [Heckman, 1979]. There are also some well-studied machine learning problems that are closely related but not equivalent to domain adaptation, including multi-task learning [Caruana, 1997] and semi-supervised learning [Chapelle et al., 2006]. In this literature survey, we review existing work in both the machine learning and the natural language processing communities related to domain adaptation. 
Because this relatively new topic is constantly attracting attention, our survey is necessarily incomplete. Nevertheless, we try to cover the major lines of work that we are aware of up to the date this survey is written. This survey will also be constantly updated. The goal of this literature survey is twofold. First, existing studies on domain adaptation seem very different from each other, and different terms are used to refer to the problem. There has not been any survey that connects these different studies. This survey thus tries to organize the existing work in a systematic way and draw a big picture of the domain adaptation problem with its possible solutions. Second, a systematic literature survey shows the limitations of current work and points out promising directions that should be explored." + }, + { + "title": "GradientBased Learning Applied to Document Recognition", + "abstract": "Multilayer Neural Networks trained with the backpropagation algorithm constitute the best example of a successful Gradient-Based Learning technique. Given an appropriate network architecture, Gradient-Based Learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional Neural Networks, that are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. Real-life document recognition systems are composed of multiple modules including field extraction, segmentation, recognition, and language modeling. A new learning paradigm, called Graph Transformer Networks (GTN), allows such multi-module systems to be trained globally using Gradient-Based methods so as to minimize an overall performance measure. 
Two systems for on-line handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of Graph Transformer Networks. A Graph Transformer Network for reading bank check is also described. It uses Convolutional Neural Network character recognizers combined with global training techniques to provides record accuracy on business and personal checks. It is deployed commercially and reads several million checks per day." + }, + { + "title": "Gradient-based learning applied to document recognition", + "abstract": "Multilayer neural networks trained with the back-propagation algorithm constitute the best example of a successful gradient based learning technique. Given an appropriate network architecture, gradient-based learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns, such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional neural networks, which are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. Real-life document recognition systems are composed of multiple modules including field extraction, segmentation recognition, and language modeling. A new learning paradigm, called graph transformer networks (GTN), allows such multimodule systems to be trained globally using gradient-based methods so as to minimize an overall performance measure. Two systems for online handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of graph transformer networks. A graph transformer network for reading a bank cheque is also described. 
It uses convolutional neural network character recognizers combined with global training techniques to provide record accuracy on business and personal cheques. It is deployed commercially and reads several million cheques per day." + }, + { + "title": "Our model employed separable convolution with depth multiplier 8 on the first convolutional layer. This reduces the computational cost while increasing the memory consumption at training time", + "abstract": null + }, + { + "title": "Appendix Variant of the Inception Model Used", + "abstract": null + }, + { + "title": "комбинация свёрток 5 × 1 и 1 × 5 вычисляется почти так же обычная свёртка 3 × 3", + "abstract": null + }, + { + "title": "Особенности входных данных имеют большое значение. Для однородных данных даже очень простая сеть будет работать хорошо (например, 6-10 слоёв по 32 или даже 16 каналов)", + "abstract": null + }, + { + "title": "Instance segmentation: Самая сложная задача - объектов может быть несколь-ко, причём надо отличать друг от друга разные объекты с совпадающим классом", + "abstract": null + }, + { + "title": "Object recognition: исходная сеть получает на вход изображение с ровно одним объектом и возвращает координаты содержащего объект прямо-угольника и его класс", + "abstract": null + }, + { + "title": "Создание более коротких путей внутри сети (Fractal net [11], Dense net [12])", + "abstract": null + }, + { + "title": "Inception-v4, Inception-ResNet", + "abstract": null + }, + { + "title": "Разработанная архитектура сети позволяет распознавать элементы изоб-ражения на телефоне c ОС Android за время 200-500 мс в зависимости от модели телефона", + "abstract": null + }, + { + "title": "Если не требуется инвариантность к повороту, можно использовать свёрт-ки 1 × 7 и 7 × 1 - они позволяют за один слой захватить сразу большую область изображения (особенно в сочетании с dilations)", + "abstract": null + }, + { + "title": "По сравнению с float значениями в четыре раза снижается потребление памяти и 
количество читаемой-записываемой информации", + "abstract": null + }, + { + "title": "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift", + "abstract": null + } + ] + }, + "author_data": { + "266ff30b-47e7-47af-a6f6-4c85a99667cd": { + "pk": "266ff30b-47e7-47af-a6f6-4c85a99667cd", + "project_name": null, + "name": "Sergey Ioffe", + "bio": "I am a researcher with a strong focus on computer vision and image processing, particularly in enhancing user experiences through automated techniques. My work at Google on image auto-rectification has led to the development of a system that corrects involuntary camera rotations, improving the aesthetic quality of user photos. I have also pioneered methods for noise estimation in personal photo collections, leveraging facial information to enhance image quality through advanced denoising techniques.\n\nMy research extends to multilabel image annotation, where I have successfully integrated deep learning features to significantly boost performance on benchmark datasets. I introduced a hierarchical CRF model for image segmentation and labeling, which has proven effective in classifying street scenes. Additionally, I have tackled large-scale web image annotation by creating visual synsets, leading to improved performance in predicting annotations for vast image databases.\n\nI am passionate about developing efficient algorithms, such as the Consistent Weighted Sampling method for retrieval and compression, and have explored innovative approaches to red-eye detection and correction in photographs. My work on articulated object recognition, particularly in detecting people in images, showcases my commitment to advancing the field of computer vision through robust, automated solutions.\n\nOverall, my research is driven by a desire to enhance image quality and accessibility, making technology more intuitive and user-friendly. 
I continuously seek to push the boundaries of what is possible in computer vision, contributing to both academic knowledge and practical applications.", + "collaborators": [ + "D. Forsyth", + "J. Haddon", + "Qi-Xing Huang", + "Mei Han", + "Bo Wu", + "K. Chaudhury", + "S. DiVerdi", + "Yichang Shih", + "Vivek Kwatra", + "Troy T. Chinen", + "Hui Fang", + "Yunchao Gong", + "Yangqing Jia", + "Thomas Leung", + "Alexander Toshev", + "David Tsai", + "Yushi Jing", + "Yi Liu", + "H. Rowley", + "James M. Rehg", + "S. Baluja", + "Michele Covell", + "B. S. Manjunath", + "Yining Deng", + "C. Carson", + "Serge J. Belongie", + "Wei-Ying Ma", + "Jitendra Malik", + "D. Bertsekas" + ], + "pub_titles": [ + "Auto-rectification of user photos", + "Joint Noise Level Estimation from Personal Photo Collections", + "Deep Convolutional Ranking for Multilabel Image Annotation", + "A hierarchical conditional random field model for labeling and segmenting images of street scenes", + "Large-scale image annotation using visual synset", + "Improved Consistent Sampling, Weighted Minhash and L1 Sketching", + "Permutation grouping: intelligent Hash function design for audio & image retrieval", + "Red eye detection with machine learning", + "Mixtures of trees for object recognition", + "Human tracking with mixtures of trees", + "Bayesian structure from motion", + "Finding people by sampling", + "Learning to Find Pictures of People", + "Temporal Differences-Based Policy Iteration and Applications in Neuro-Dynamic Programming" + ], + "pub_abstracts": [ + "The image auto rectification project at Google aims to create a pleasanter version of user photos by correcting the small, involuntary camera rotations (roll / pitch/ yaw) that often occur in non-professional photographs. Our system takes the image closer to the fronto-parallel view by performing an affine rectification on the image that restores parallelism of lines that are parallel in the fronto-parallel image view. 
This partially corrects perspective distortions, but falls short of full metric rectification which also restores angles between lines. On the other hand the 2D homography for our rectification can be computed from only two (as opposed to three) estimated vanishing points, allowing us to fire upon many more images. A new RANSAC based approach to vanishing point estimation has been developed. The main strength of our vanishing point detector is that it is line-less, thereby avoiding the hard, binary (line/no-line) upstream decisions that cause traditional algorithm to ignore much supporting evidence and/or admit noisy evidence for vanishing points. A robust RANSAC based technique for detecting horizon lines in an image is also proposed for analyzing correctness of the estimated rectification. We post-multiply our affine rectification homography with a 2D rotation which aligns the closer vanishing point with the image Y axis.", + "Personal photo albums are heavily biased towards faces of people, but most state-of-the-art algorithms for image denoising and noise estimation do not exploit facial information. We propose a novel technique for jointly estimating noise levels of all face images in a photo collection. Photos in a personal album are likely to contain several faces of the same people. While some of these photos would be clean and high quality, others may be corrupted by noise. Our key idea is to estimate noise levels by comparing multiple images of the same content that differ predominantly in their noise content. Specifically, we compare geometrically and photo metrically aligned face images of the same person. Our estimation algorithm is based on a probabilistic formulation that seeks to maximize the joint probability of estimated noise levels across all images. We propose an approximate solution that decomposes this joint maximization into a two-stage optimization. 
The first stage determines the relative noise between pairs of images by pooling estimates from corresponding patch pairs in a probabilistic fashion. The second stage then jointly optimizes for all absolute noise parameters by conditioning them upon relative noise levels, which allows for a pair wise factorization of the probability distribution. We evaluate our noise estimation method using quantitative experiments to measure accuracy on synthetic data. Additionally, we employ the estimated noise levels for automatic denoising using \"BM3D\", and evaluate the quality of denoising on real-world photos through a user study.", + "Multilabel image annotation is one of the most important challenges in computer vision with many real-world applications. While existing work usually use conventional visual features for multilabel annotation, features based on Deep Neural Networks have shown potential to significantly boost performance. In this work, we propose to leverage the advantage of such features and analyze key components that lead to better performances. Specifically, we show that a significant performance gain could be obtained by combining convolutional architectures with approximate top-$k$ ranking objectives, as thye naturally fit the multilabel tagging problem. Our experiments on the NUS-WIDE dataset outperforms the conventional visual features by about 10%, obtaining the best reported performance in the literature.", + "Simultaneously segmenting and labeling images is a fundamental problem in Computer Vision. In this paper, we introduce a hierarchical CRF model to deal with the problem of labeling images of street scenes by several distinctive object classes. In addition to learning a CRF model from all the labeled images, we group images into clusters of similar images and learn a CRF model from each cluster separately. When labeling a new image, we pick the closest cluster and use the associated CRF model to label this image. 
Experimental results show that this hierarchical image labeling method is comparable to, and in many cases superior to, previous methods on benchmark data sets. In addition to segmentation and labeling results, we also showed how to apply the image labeling result to rerank Google similar images.", + "We address the problem of large-scale annotation of web images. Our approach is based on the concept of visual synset, which is an organization of images which are visually-similar and semantically-related. Each visual synset represents a single prototypical visual concept, and has an associated set of weighted annotations. Linear SVM's are utilized to predict the visual synset membership for unseen image examples, and a weighted voting rule is used to construct a ranked list of predicted annotations from a set of visual synsets. We demonstrate that visual synsets lead to better performance than standard methods on a new annotation database containing more than 200 million im- ages and 300 thousand annotations, which is the largest ever reported", + "We propose a new Consistent Weighted Sampling method, where the probability of drawing identical samples for a pair of inputs is equal to their Jaccard similarity. Our method takes deterministic constant time per non-zero weight, improving on the best previous approach which takes expected constant time. The samples can be used as Weighted Minhash for efficient retrieval and compression (sketching) under Jaccard or L1 metric. A method is presented for using simple data statistics to reduce the running time of hash computation by two orders of magnitude. We compare our method with the random projection method and show that it has better characteristics for retrieval under L1. We present a novel method of mapping hashes to short bit-strings, apply it to Weighted Minhash, and achieve more accurate distance estimates from sketches than existing methods, as long as the inputs are sufficiently distinct. 
We show how to choose the optimal number of bits per hash for sketching, and demonstrate experimental results which agree with the theoretical analysis.", + "The combination of MinHash-based signatures and locality- sensitive hashing (LSH) schemes has been effectively used for finding approximate matches in very large audio and image retrieval systems. In this study, we introduce the idea of permutation-grouping to intelligently design the hash functions that are used to index the LSH tables. This helps to overcome the inefficiencies introduced by hashing real-world data that is noisy, structured, and most importantly is not independently and identically distributed. Through extensive tests, we find that permutation-grouping dramatically increases the efficiency of the overall retrieval system by lowering the number of low-probability candidates that must be examined by 30-50%.", + "Red-eye is a problem in photography that occurs when a photograph is taken with a flash, and the bright flash light is reflected from the blood vessels in the eye, giving the eye an unnatural red hue. Most red-eye reduction systems need the user to outline the red eyes by hand, but this approach doesn't scale up. Instead, we propose an automatic red-eye detection system. The system contains a red-eye detector that finds red eye-like candidate image patches; a state of the art face detector used to eliminate most false positives (image regions that look but red eyes but are not); and a red-eye outline detector. All three detectors are automatically learned from data, using Boosting. Our system can be combined with a red-eye reduction module to yield a fully automatic red eye corrector.", + "Efficient detection of objects in images is complicated by variations of object appearance due to intra-class object differences, articulation, lighting, occlusions, and aspect variations. 
To reduce the search required for detection, we employ the bottom-up approach where we find candidate image features and associate some of them with parts of the object model. We represent objects as collections of local features, and would like to allow any of them to be absent, with only a small subset sufficient for detection;furthermore, our model should allow efficient correspondence search. We propose a model, Mixture of Trees, that achieves these goals. With a mixture of trees, we can model the individual appearances of the features, relationships among them, and the aspect, and handle occlusions. Independences captured in the model make efficient inference possible. In our earlier work, we have shown that mixtures of trees can be used to model objects with a natural tree structure, in the context of human tracking. Now we show that a natural tree structure is not required, and use a mixture of trees for both frontal and view-invariant face detection. We also show that by modeling faces as collections of features we can establish an intrinsic coordinate frame for a face, and estimate the out-of-plane rotation of a face.", + "Tree-structured probabilistic models admit simple, fast inference. However they are not well suited to phenonena such as occlusion, where multiple components of an object may disappear simultaneously. We address this problem with mixtures of trees, and demonstrate an efficient and compact representation of this mixture, which admits simple learning and inference algorithms. We use this method to build an automated tracker for Muybridge sequences of a variety of human activities. Tracking is difficult, because the temporal dependencies rule out simple inference methods. We show how to use our model for efficient inference, using a method that employs alternate spatial and temporal inference. 
The result is a cracker that (a) uses a very loose motion model, and so can track many different activities at a variable frame rate and (b) is entirely, automatic.", + "Formulates structure from motion as a Bayesian inference problem and uses a Markov-chain Monte Carlo sampler to sample the posterior on this problem. This results in a method that can identify both small and large tracker errors and yields reconstructions that are stable in the presence of these errors. Furthermore, the method gives detailed information on the range of ambiguities in structure given a particular data set and requires no special geometric formulation to cope with degenerate situations. Motion segmentation is obtained by a layer of discrete variables associating a point with an object. We demonstrate a sampler that successfully samples an approximation to the marginal on this domain, producing a relatively unambiguous segmentation.", + "We show how to use a sampling method to find sparsely clad people in static images. People are modeled as an assembly of nine cylindrical segments. Segments are found using an EM algorithm and then assembled into hypotheses incrementally, using a learned likelihood model. Each assembly step passes on a set of samples of its likelihood to the next; this yields effective pruning of the space of hypotheses. The collection of available nine-segment hypotheses is then represented by a set of equivalence classes, which yield an efficient pruning process. The posterior for the number of people is obtained from the class representatives. People are counted quite accurately in images of real scenes using a MAP estimate. We show the method allows top-down as well as bottom up reasoning. While the method can be overwhelmed by very large numbers of segments, we show that this problem can be avoided by quite simple pruning steps.", + "Finding articulated objects, like people, in pictures presents a particularly difficult object recognition problem. 
We show how to find people by finding putative body segments, and then constructing assemblies of those segments that are consistent with the constraints on the appearance of a person that result from kinematic properties. Since a reasonable model of a person requires at least nine segments, it is not possible to present every group to a classifier. Instead, the search can be pruned by using projected versions of a classifier that accepts groups corresponding to people. We describe an efficient projection algorithm for one popular classifier, and demonstrate that our approach can be used to determine whether images of real scenes contain people.", + "We introduce a new policy iteration method for dynamic programming problems with discounted and undiscounted cost. The method is based on the notion of temporal differences, and is primarily geared to the case of large and complex problems where the use of approximations is essential. We develop the theory of the method without approximation, we describe how to embed it within a neuro-dynamic programming/reinforcement learning context where feature-based approximation architectures are used, we relate it to TD(λ) methods, and we illustrate its use in the training of a tetris playing program. 1 Supported by the National Science Foundation under Grant DDM-8903385 and Grant CCR9103804. Thanks are due to John Tsitsiklis for several helpful discussions and to Dimitris Papaioannou, who assisted with some of the experiments. 2 Department of Electrical Engineering and Computer Science, M. I. T., Cambridge, Mass., 02139. 3 Department of Electrical Engineering and Computer Science, M. I. T., Cambridge, Mass., 02139. 
1" + ], + "domain": [ + "Computer Vision", + "Image Processing", + "Machine Learning", + "Object Detection" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "3e538439-0cbd-480c-b359-2ac90d2e9491": { + "pk": "3e538439-0cbd-480c-b359-2ac90d2e9491", + "project_name": null, + "name": "Christian Szegedy", + "bio": "I am a researcher deeply engaged in the intersection of deep learning and computer vision, with a particular focus on object detection and recognition. My work has significantly advanced the state-of-the-art in these fields, particularly through the development of innovative architectures and methodologies. One of my notable contributions is the Inception architecture, also known as GoogLeNet, which optimizes resource utilization while achieving remarkable performance in the ImageNet Large-Scale Visual Recognition Challenge.\n\nI have also explored the challenges posed by noisy and incomplete labeling in supervised learning, proposing a consistency-based approach that enhances robustness across various datasets. My research into adversarial examples has shed light on the vulnerabilities of neural networks, leading to improved adversarial training techniques.\n\nIn addition to my work on object detection, I have developed methods for human pose estimation using deep neural networks, achieving high precision through a cascade of regressors. My approach to modular code generation from hierarchical block diagrams has addressed critical trade-offs in embedded systems, emphasizing the balance between modularity, reusability, and code size.\n\nOverall, my research aims to push the boundaries of what is possible with deep learning, providing practical solutions that enhance the performance and reliability of machine learning systems in real-world applications. 
I am passionate about continuing to explore new methodologies that bridge theoretical insights with practical implementations in computer vision and beyond.", + "collaborators": [ + "D. Erhan", + "D. Rautenbach", + "Dragomir Anguelov", + "Jürgen Werber", + "Scott E. Reed", + "Alexander Toshev", + "Andrew Rabinovich", + "I. Goodfellow", + "Honglak Lee", + "Jonathon Shlens", + "Wei Liu", + "Yangqing Jia", + "P. Sermanet", + "Vincent Vanhoucke", + "Wojciech Zaremba", + "I. Sutskever", + "Joan Bruna", + "R. Fergus", + "Roberto Lublinerman", + "S. Tripakis", + "P. Chong", + "Balázs Szegedy" + ], + "pub_titles": [ + "Scalable, High-Quality Object Detection", + "Training Deep Neural Networks on Noisy Labels with Bootstrapping", + "Explaining and Harnessing Adversarial Examples", + "Going deeper with convolutions", + "DeepPose: Human Pose Estimation via Deep Neural Networks", + "Scalable Object Detection Using Deep Neural Networks", + "Deep Neural Networks for Object Detection", + "Intriguing properties of neural networks", + "Modular code generation from synchronous block diagrams: modularity vs. code size", + "Timing optimization by restructuring long combinatorial paths", + "A morphing approach to address placement stability", + "On the Number of 3-Edge Colorings of Cubic Graphs", + "A Linear Representation of the Ear Matroid" + ], + "pub_abstracts": [ + "Current high-quality object detection approaches use the scheme of salience-based object proposal methods followed by post-classification using deep convolutional features. This spurred recent research in improving object proposal methods. However, domain agnostic proposal generation has the principal drawback that the proposals come unranked or with very weak ranking, making it hard to trade-off quality for running time. This raises the more fundamental question of whether high-quality proposal generation requires careful engineering or can be derived just from data alone. 
We demonstrate that learning-based proposal methods can effectively match the performance of hand-engineered methods while allowing for very efficient runtime-quality trade-offs. Using the multi-scale convolutional MultiBox (MSC-MultiBox) approach, we substantially advance the state-of-the-art on the ILSVRC 2014 detection challenge data set, with $0.5$ mAP for a single model and $0.52$ mAP for an ensemble of two models. MSC-Multibox significantly improves the proposal quality over its predecessor MultiBox~method: AP increases from $0.42$ to $0.53$ for the ILSVRC detection challenge. Finally, we demonstrate improved bounding-box recall compared to Multiscale Combinatorial Grouping with less proposals on the Microsoft-COCO data set.", + "Current state-of-the-art deep learning systems for visual object recognition and detection use purely supervised training with regularization such as dropout to avoid overfitting. The performance depends critically on the amount of labeled examples, and in current practice the labels are assumed to be unambiguous and accurate. However, this assumption often does not hold; e.g. in recognition, class labels may be missing; in detection, objects in the image may not be localized; and in general, the labeling may be subjective. In this work we propose a generic way to handle noisy and incomplete labeling by augmenting the prediction objective with a notion of consistency. We consider a prediction consistent if the same prediction is made given similar percepts, where the notion of similarity is between deep network features computed from the input data. In experiments we demonstrate that our approach yields substantial robustness to label noise on several datasets. On MNIST handwritten digits, we show that our model is robust to label corruption. 
On the Toronto Face Database, we show that our model handles well the case of subjective labels in emotion recognition, achieving state-of-the- art results, and can also benefit from unlabeled face images with no modification to our method. On the ILSVRC2014 detection challenge data, we show that our approach extends to very deep networks, high resolution images and structured outputs, and results in improved scalable detection.", + "Several machine learning models, including neural networks, consistently misclassify adversarial examples---inputs formed by applying small but intentionally worst-case perturbations to examples from the dataset, such that the perturbed input results in the model outputting an incorrect answer with high confidence. Early attempts at explaining this phenomenon focused on nonlinearity and overfitting. We argue instead that the primary cause of neural networks' vulnerability to adversarial perturbation is their linear nature. This explanation is supported by new quantitative results while giving the first explanation of the most intriguing fact about them: their generalization across architectures and training sets. Moreover, this view yields a simple and fast method of generating adversarial examples. Using this approach to provide examples for adversarial training, we reduce the test set error of a maxout network on the MNIST dataset.", + "We propose a deep convolutional neural network architecture codenamed Inception that achieves the new state of the art for classification and detection in the ImageNet Large-Scale Visual Recognition Challenge 2014 (ILSVRC14). The main hallmark of this architecture is the improved utilization of the computing resources inside the network. By a carefully crafted design, we increased the depth and width of the network while keeping the computational budget constant. To optimize quality, the architectural decisions were based on the Hebbian principle and the intuition of multi-scale processing. 
One particular incarnation used in our submission for ILSVRC14 is called GoogLeNet, a 22 layers deep network, the quality of which is assessed in the context of classification and detection.", + "We propose a method for human pose estimation based on Deep Neural Networks (DNNs). The pose estimation is formulated as a DNN-based regression problem towards body joints. We present a cascade of such DNN regres- sors which results in high precision pose estimates. The approach has the advantage of reasoning about pose in a holistic fashion and has a simple but yet powerful formula- tion which capitalizes on recent advances in Deep Learn- ing. We present a detailed empirical analysis with state-of- art or better performance on four academic benchmarks of diverse real-world images.", + "Deep convolutional neural networks have recently achieved state-of-the-art performance on a number of image recognition benchmarks, including the ImageNet Large-Scale Visual Recognition Challenge (ILSVRC-2012). The winning model on the localization sub-task was a network that predicts a single bounding box and a confidence score for each object category in the image. Such a model captures the whole-image context around the objects but cannot handle multiple instances of the same object in the image without naively replicating the number of outputs for each instance. In this work, we propose a saliency-inspired neural network model for detection, which predicts a set of class-agnostic bounding boxes along with a single score for each box, corresponding to its likelihood of containing any object of interest. The model naturally handles a variable number of instances for each class and allows for cross-class generalization at the highest levels of the network. 
We are able to obtain competitive recognition performance on VOC2007 and ILSVRC2012, while using only the top few predicted locations in each image and a small number of neural network evaluations.", + "Deep Neural Networks (DNNs) have recently shown outstanding performance on image classification tasks [14]. In this paper we go one step further and address the problem of object detection using DNNs, that is not only classifying but also precisely localizing objects of various classes. We present a simple and yet powerful formulation of object detection as a regression problem to object bounding box masks. We define a multi-scale inference procedure which is able to produce high-resolution object detections at a low cost by a few network applications. State-of-the-art performance of the approach is shown on Pascal VOC.", + "Deep neural networks are highly expressive models that have recently achieved state of the art performance on speech and visual recognition tasks. While their expressiveness is the reason they succeed, it also causes them to learn uninterpretable solutions that could have counter-intuitive properties. In this paper we report two such properties. First, we find that there is no distinction between individual high level units and random linear combinations of high level units, according to various methods of unit analysis. It suggests that it is the space, rather than the individual units, that contains of the semantic information in the high layers of neural networks. Second, we find that deep neural networks learn input-output mappings that are fairly discontinuous to a significant extend. We can cause the network to misclassify an image by applying a certain imperceptible perturbation, which is found by maximizing the network's prediction error. 
In addition, the specific nature of these perturbations is not a random artifact of learning: the same perturbation can cause a different network, that was trained on a different subset of the dataset, to misclassify the same input.", + "We study modular, automatic code generation from hierarchical block diagrams with synchronous semantics. Such diagrams are the fundamental model behind widespread tools in the embedded software domain, such as Simulink and SCADE. Code is modular in the sense that it is generated for a given composite block independently from context (i.e., without knowing in which diagrams the block is to be used) and using minimal information about the internals of the block. In previous work, we have shown how modular code can be generated by computing a set of interface functions for each block and a set of dependencies between these functions that is exported along with the interface. We have also introduced a quantified notion of modularity in terms of the number of interface functions generated per block, and showed how to minimize this number, which is essential for scalability. Finally, we have exposed the fundamental trade-off between modularity and reusability (set of diagrams the block can be used in). In this paper we explore another trade-off: modularity vs. code size. We show that our previous technique, although it achieves maximal reusability and is optimal in terms of modularity, may result in code replication and therefore large code sizes, something often unacceptable in an embedded system context. We propose to remedy this by generating code with no replication, and show that this generally results in some loss of modularity. We show that optimizing modularity while maintaining maximal reusability and zero replication is an intractable problem (NP-complete). We also show that this problem can be solved using a simple iterative procedure that checks satisfiability of a sequence of propositional formulas. 
We report on a new prototype implementation and experimental results. The latter demonstrate the practical interest in our methods.", + "We present an implementation of an algorithm for constructing provably fast circuits for a class of Boolean functions with input signals that have individual starting times. We show how to adapt this algorithm to logic optimization for timing correction at late stages of VLSI physical design and report experimental results on recent industrial chips. By restructuring long critical paths, our code achieves worst-slack improvements of up to several hundred picoseconds on top of traditional timing optimization techniques.", + "Traditionally, research in global placement has focused on relatively few simple metrics, such as pure wirelength or routability estimates. However, in the real world today, designs are driven by not-so-simple issues such as timing and crosstalk. The future holds even more difficulties as physical models for devices and interconnects become increasingly complex and unpredictable. Adoption of an iterative methodology, where one incrementally fixes design errors, is a basic approach to tackling these problems. However, developers of placement algorithms have long neglected the need for an tool which can be easily adopted into an incremental design flow. We propose a novel placement approach called grid morphing, which is specifically tailored for an incremental approach to placement. In particular, our technique focuses on the stability of the placement, which is critical for minimization of perturbation of the final placement under changes applied to the input netlist. We comparethe stability of our approach to existing placement tools, and show through experiments that our approach still delivers good results under traditional placement metrics.", + "In this paper we present a short algebraic proof for a generalization of a formula of R. 
Penrose, Some applications of negative dimensional tensors, in: Combinatorial Mathematics and its Applications Welsh (ed.), Academic Press, 1971, pp. 221?244 on the number of 3-edge colorings of a plane cubic graph. We also show that the number of 3-edge colorings of cubic graphs can be computed (up to a factor of 2| E |/3?1) by evaluating the Penrose polynomial of their cycle space at 4.", + "The family of bases of the ear matroid of a 2-edge-connected graph G = (V; E) consists of the minimal edge sets S E such that G=S is factor-critical. We give a linear representation of this matroid and new algebraic characterizations of factor-criticality in graphs." + ], + "domain": [ + "Computer Vision", + "Deep Learning", + "Object Detection", + "Adversarial Learning" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + } + }, + "reference_proposal": "**[Question 1] - What is the problem?** \nHow can we effectively reduce internal covariate shift in deep neural networks to accelerate training and improve performance?\n\n**[Question 2] - Why is it interesting and important?** \nSolving the problem of internal covariate shift is crucial for the research community as it can lead to significant advancements in the training efficiency of deep learning models. By addressing this issue, we can enable the use of higher learning rates, reduce the need for careful hyperparameter tuning, and improve convergence rates. This could lead to faster training times and better performance across various applications, ultimately advancing the state of the art in machine learning and enabling more complex models to be trained effectively.\n\n**[Question 3] - Why is it hard?** \nThe challenge in reducing internal covariate shift lies in the complex interactions between the parameters of deep networks and the distributions of inputs to each layer. 
Naive approaches may fail because they do not account for the dynamic nature of these distributions during training, which can lead to saturation and vanishing gradients. Additionally, the need for careful initialization and tuning of learning rates complicates the training process. Overcoming these technical obstacles requires a robust mechanism that can stabilize the input distributions across layers throughout the training process.\n\n**[Question 4] - Why hasn't it been solved before?** \nPrevious research has largely focused on external covariate shift and has not adequately addressed the internal dynamics of deep networks. Existing solutions often overlook the importance of maintaining stable input distributions for sub-networks or layers. Barriers such as the lack of effective normalization techniques and the complexity of deep network architectures have prevented this problem from being solved. Our approach, Batch Normalization, differs by introducing a normalization step that directly addresses internal covariate shift, thereby improving training efficiency and model performance.\n\n**[Question 5] - What are the key components of my approach and results?** \nOur proposed methodology involves implementing Batch Normalization, which normalizes the inputs to each layer by fixing their means and variances. We will evaluate this approach using standard deep learning datasets, measuring performance through metrics such as training time, convergence rates, and model accuracy. The expected outcomes include accelerated training processes, improved gradient flow, and enhanced model performance, demonstrating the effectiveness of Batch Normalization in reducing internal covariate shift." 
+ }, + "1707.07998": { + "paper_data": { + "title": "Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering", + "url": "http://arxiv.org/abs/1707.07998v3", + "arxiv_id": "1707.07998", + "authors": [ + "Peter Anderson", + "Xiaodong He", + "Chris Buehler", + "Damien Teney", + "Mark Johnson", + "Stephen Gould", + "Lei Zhang" + ], + "abstract": "Top-down visual attention mechanisms have been used extensively in image captioning and visual question answering (VQA) to enable deeper image understanding through fine-grained analysis and even multiple steps of reasoning. In this work, we propose a combined bottom-up and top-down attention mechanism that enables attention to be calculated at the level of objects and other salient image regions. This is the natural basis for attention to be considered. Within our approach, the bottom-up mechanism (based on Faster R-CNN) proposes image regions, each with an associated feature vector, while the top-down mechanism determines feature weightings. Applying this approach to image captioning, our results on the MSCOCO test server establish a new state-of-the-art for the task, achieving CIDEr / SPICE / BLEU-4 scores of 117.9, 21.5 and 36.9, respectively. Demonstrating the broad applicability of the method, applying the same approach to VQA we obtain first place in the 2017 VQA Challenge.", + "introduction": " Introduction Problems combining image and language understand- ing such as image captioning [4] and visual question an- swering (VQA) [12] continue to inspire considerable re- search at the boundary of computer vision and natural lan- guage processing. In both these tasks it is often necessary to perform some fine-grained visual processing, or even multiple steps of reasoning to generate high quality out- puts. As a result, visual attention mechanisms have been widely adopted in both image captioning [34, 27, 48, 46] and VQA [11, 28, 45, 47, 51]. 
These mechanisms improve performance by learning to focus on the regions of the im- age that are salient and are currently based on deep neural network architectures. \u0003Work performed while interning at Microsoft. Figure 1. Typically, attention models operate on CNN features cor- responding to a uniform grid of equally-sized image regions (left). Our approach enables attention to be calculated at the level of ob- jects and other salient image regions (right). In the human visual system, attention can be focused volitionally by top-down signals determined by the cur- rent task (e.g., looking for something), and automatically by bottom-up signals associated with unexpected, novel or salient stimuli [3, 6]. In this paper we adopt similar termi- nology and refer to attention mechanisms driven by non- visual or task-specific context as ‘top-down’, and purely vi- sual feed-forward attention mechanisms as ‘bottom-up’. Most conventional visual attention mechanisms used in image captioning and VQA are of the top-down variety. Taking as context a representation of a partially-completed caption output, or a question relating to the image, these mechanisms are typically trained to selectively attend to the output of one or more layers of a convolutional neural net (CNN). However, this approach gives little consideration to how the image regions that are subject to attention are deter- mined. As illustrated conceptually in Figure 1, the resulting 1arXiv:1707.07998v3 [cs.CV] 14 Mar 2018input regions correspond to a uniform grid of equally sized and shaped neural receptive fields – irrespective of the con- tent of the image. To generate more human-like captions and question answers, objects and other salient image re- gions are a much more natural basis for attention [10, 36]. In this paper we propose a combined bottom-up and top- down visual attention mechanism. 
The bottom-up mech- anism proposes a set of salient image regions, with each region represented by a pooled convolutional feature vec- tor. Practically, we implement bottom-up attention using Faster R-CNN [33], which represents a natural expression of a bottom-up attention mechanism. The top-down mecha- nism uses task-specific context to predict an attention distri- bution over the image regions. The attended feature vector is then computed as a weighted average of image features over all regions. We evaluate the impact of combining bottom-up and top- down attention on two tasks. We first present an image cap- tioning model that takes multiple glimpses of salient im- age regions during caption generation. Empirically, we find that the inclusion of bottom-up attention has a significant positive benefit for image captioning. Our Related Work A large number of attention-based deep neural networks have been proposed for image captioning and VQA. Typ- ically, these models can be characterized as top-down ap- proaches, with context provided by a representation of a partially-completed caption in the case of image caption- ing [34, 27, 48, 46], or a representation of the question in the case of VQA [11, 28, 45, 47, 51]. In each case", + "references": [ + { + "title": "Tips and Tricks for Visual Question Answering: Learnings from the 2017 Challenge", + "abstract": "Deep Learning has had a transformative impact on Computer Vision, but for all of the success there is also a significant cost. This is that the models and procedures used are so complex and intertwined that it is often impossible to distinguish the impact of the individual design and engineering choices each model embodies. This ambiguity diverts progress in the field, and leads to a situation where developing a state-of-the-art model is as much an art as a science. 
As a step towards addressing this problem we present a massive exploration of the effects of the myriad architectural and hyperparameter choices that must be made in generating a state-of-the-art model. The model is of particular interest because it won the 2017 Visual Question Answering Challenge. We provide a detailed analysis of the impact of each choice on model performance, in the hope that it will inform others in developing models, but also that it might set a precedent that will accelerate scientific progress in the field." + }, + { + "title": "Show, Ask, Attend, and Answer: A Strong Baseline For Visual Question Answering", + "abstract": "This paper presents a new baseline for visual question answering task. Given an image and a question in natural language, our model produces accurate answers according to the content of the image. Our model, while being architecturally simple and relatively small in terms of trainable parameters, sets a new state of the art on both unbalanced and balanced VQA benchmark. On VQA 1.0 open ended challenge, our model achieves 64.6% accuracy on the test-standard set without using additional data, an improvement of 0.4% over state of the art, and on newly released VQA 2.0, our model scores 59.7% on validation set outperforming best previously reported results by 0.5%. The results presented in this paper are especially interesting because very similar models have been tried before but significantly lower performance were reported. In light of the new results we hope to see more meaningful research on visual question answering in the future." + }, + { + "title": "Language Modeling with Gated Convolutional Networks", + "abstract": "The pre-dominant approach to language modeling to date is based on recurrent neural networks. Their success on this task is often linked to their ability to capture unbounded context. 
In this paper we develop a finite context approach through stacked convolutions, which can be more efficient since they allow parallelization over sequential tokens. We propose a novel simplified gating mechanism that outperforms Oord et al (2016) and investigate the impact of key architectural decisions. The proposed approach achieves state-of-the-art on the WikiText-103 benchmark, even though it features long-term dependencies, as well as competitive results on the Google Billion Words benchmark. Our model reduces the latency to score a sentence by an order of magnitude compared to a recurrent baseline. To our knowledge, this is the first time a non-recurrent approach is competitive with strong recurrent models on these large scale language tasks." + }, + { + "title": "Knowing When to Look: Adaptive Attention via a Visual Sentinel for Image Captioning", + "abstract": "Attention-based neural encoder-decoder frameworks have been widely adopted for image captioning. Most methods force visual attention to be active for every generated word. However, the decoder likely requires little to no visual information from the image to predict non-visual words such as the and of. Other words that may seem visual can often be predicted reliably just from the language model e.g., sign after behind a red stop or phone following talking on a cell. In this paper, we propose a novel adaptive attention model with a visual sentinel. At each time step, our model decides whether to attend to the image (and if so, to which regions) or to the visual sentinel. The model decides whether to attend to the image and where, in order to extract meaningful information for sequential word generation. We test our method on the COCO image captioning 2015 challenge dataset and Flickr30K. Our approach sets the new state-of-the-art by a significant margin." 
+ }, + { + "title": "Areas of Attention for Image Captioning", + "abstract": "We propose “Areas of Attention”, a novel attentionbased model for automatic image captioning. Our approach models the dependencies between image regions, caption words, and the state of an RNN language model, using three pairwise interactions. In contrast to previous attentionbased approaches that associate image regions only to the RNN state, our method allows a direct association between caption words and image regions. During training these associations are inferred from image-level captions, akin to weakly-supervised object detector training. These associations help to improve captioning by localizing the corresponding regions during testing. We also propose and compare different ways of generating attention areas: CNN activation grids, object proposals, and spatial transformers nets applied in a convolutional fashion. Spatial transformers give the best results. They allow for image specific attention areas, and can be trained jointly with the rest of the network. Our attention mechanism and spatial transformer attention areas together yield state-of-the-art results on the MSCOCO dataset." + }, + { + "title": "Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering", + "abstract": null + }, + { + "title": "Self-Critical Sequence Training for Image Captioning", + "abstract": "Recently it has been shown that policy-gradient methods for reinforcement learning can be utilized to train deep end-to-end systems directly on non-differentiable metrics for the task at hand. In this paper we consider the problem of optimizing image captioning systems using reinforcement learning, and show that by carefully optimizing our systems using the test metrics of the MSCOCO task, significant gains in performance can be realized. Our systems are built using a new optimization approach that we call self-critical sequence training (SCST). 
SCST is a form of the popular REINFORCE algorithm that, rather than estimating a baseline to normalize the rewards and reduce variance, utilizes the output of its own test-time inference algorithm to normalize the rewards it experiences. Using this approach, estimating the reward signal (as actor-critic methods must do) and estimating normalization (as REINFORCE algorithms typically do) is avoided, while at the same time harmonizing the model with respect to its test-time inference procedure. Empirically we find that directly optimizing the CIDEr metric with SCST and greedy decoding at test-time is highly effective. Our results on the MSCOCO evaluation sever establish a new state-of-the-art on the task, improving the best result in terms of CIDEr from 104.9 to 114.7." + }, + { + "title": "Improved Image Captioning via Policy Gradient optimization of SPIDEr", + "abstract": "Current image captioning methods are usually trained via maximum likelihood estimation. However, the log-likelihood score of a caption does not correlate well with human assessments of quality. Standard syntactic evaluation metrics, such as BLEU, METEOR and ROUGE, are also not well correlated. The newer SPICE and CIDEr metrics are better correlated, but have traditionally been hard to optimize for. In this paper, we show how to use a policy gradient (PG) method to directly optimize a linear combination of SPICE and CIDEr (a combination we call SPIDEr): the SPICE score ensures our captions are semantically faithful to the image, while CIDEr score ensures our captions are syntactically fluent. The PG method we propose improves on the prior MIXER approach, by using Monte Carlo rollouts instead of mixing MLE training with PG. We show empirically that our algorithm leads to easier optimization and improved results compared to MIXER. 
Finally, we show that using our PG method we can optimize any of the metrics, including the proposed SPIDEr metric which results in image captions that are strongly preferred by human raters compared to captions generated by the same model but trained to optimize MLE or the COCO metrics." + }, + { + "title": "Zero-Shot Visual Question Answering", + "abstract": "Part of the appeal of Visual Question Answering (VQA) is its promise to answer new questions about previously unseen images. Most current methods demand training questions that illustrate every possible concept, and will therefore never achieve this capability, since the volume of required training data would be prohibitive. Answering general questions about images requires methods capable of Zero-Shot VQA, that is, methods able to answer questions beyond the scope of the training questions. We propose a new evaluation protocol for VQA methods which measures their ability to perform Zero-Shot VQA, and in doing so highlights significant practical deficiencies of current approaches, some of which are masked by the biases in current datasets. We propose and evaluate several strategies for achieving Zero-Shot VQA, including methods based on pretrained word embeddings, object classifiers with semantic embeddings, and test-time retrieval of example images. Our extensive experiments are intended to serve as baselines for Zero-Shot VQA, and they also achieve state-of-the-art performance in the standard VQA evaluation setting." + }, + { + "title": "Boosting Image Captioning with Attributes", + "abstract": "Automatically describing an image with a natural language has been an emerging challenge in both fields of computer vision and natural language processing. 
In this paper, we present Long Short-Term Memory with Attributes (LSTM-A) - a novel architecture that integrates attributes into the successful Convolutional Neural Networks (CNNs) plus Recurrent Neural Networks (RNNs) image captioning framework, by training them in an end-to-end manner. Particularly, the learning of attributes is strengthened by integrating inter-attribute correlations into Multiple Instance Learning (MIL). To incorporate attributes into captioning, we construct variants of architectures by feeding image representations and attributes into RNNs in different ways to explore the mutual but also fuzzy relationship between them. Extensive experiments are conducted on COCO image captioning dataset and our framework shows clear improvements when compared to state-of-the-art deep models. More remarkably, we obtain METEOR/CIDEr-D of 25.5%/100.2% on testing data of widely used and publicly available splits in [10] when extracting image representations by GoogleNet and achieve superior performance on COCO captioning Leaderboard." + }, + { + "title": "End-to-End Concept Word Detection for Video Captioning, Retrieval, and Question Answering", + "abstract": "We propose a high-level concept word detector that can be integrated with any video-to-language models. It takes a video as input and generates a list of concept words as useful semantic priors for language generation models. The proposed word detector has two important properties. First, it does not require any external knowledge sources for training. Second, the proposed word detector is trainable in an end-to-end manner jointly with any video-to-language models. To effectively exploit the detected words, we also develop a semantic attention mechanism that selectively focuses on the detected concept words and fuse them with the word encoding and decoding in the language model. 
In order to demonstrate that the proposed approach indeed improves the performance of multiple video-to-language tasks, we participate in all the four tasks of LSMDC 2016 [18]. Our approach has won three of them, including fill-in-the-blank, multiple-choice test, and movie retrieval." + }, + { + "title": "SPICE: Semantic Propositional Image Caption Evaluation", + "abstract": null + }, + { + "title": "Revisiting Visual Question Answering Baselines", + "abstract": null + }, + { + "title": "Multimodal Compact Bilinear Pooling for Visual Question Answering and Visual Grounding", + "abstract": "Modeling textual or visual information with vector representations trained from large language or visual datasets has been successfully explored in recent years. However, tasks such as visual question answering require combining these vector representations with each other. Approaches to multimodal pooling include element-wise product or sum, as well as concatenation of the visual and textual representations. We hypothesize that these methods are not as expressive as an outer product of the visual and textual vectors. As the outer product is typically infeasible due to its high dimensionality, we instead propose utilizing Multimodal Compact Bilinear pooling (MCB) to efficiently and expressively combine multimodal features. We extensively evaluate MCB on the visual question answering and grounding tasks. We consistently show the benefit of MCB over ablations without MCB. For visual question answering, we present an architecture which uses MCB twice, once for predicting attention over spatial features and again to combine the attended representation with the question representation. This model outperforms the state-of-the-art on the Visual7W dataset and the VQA challenge." 
+ }, + { + "title": "Hierarchical Question-Image Co-Attention for Visual Question Answering", + "abstract": "A number of recent works have proposed attention models for Visual Question Answering (VQA) that generate spatial maps highlighting image regions relevant to answering the question. In this paper, we argue that in addition to modeling \"where to look\" or visual attention, it is equally important to model \"what words to listen to\" or question attention. We present a novel co-attention model for VQA that jointly reasons about image and question attention. In addition, our model reasons about the question (and consequently the image via the co-attention mechanism) in a hierarchical fashion via a novel 1-dimensional convolution neural networks (CNN). Our model improves the state-of-the-art on the VQA dataset from 60.3% to 60.5%, and from 61.6% to 63.3% on the COCO-QA dataset. By using ResNet, the performance is further improved to 62.1% for VQA and 65.4% for COCO-QA." + }, + { + "title": "Review Networks for Caption Generation", + "abstract": "We propose a novel extension of the encoder-decoder framework, called a review network. The review network is generic and can enhance any existing encoder- decoder model: in this paper, we consider RNN decoders with both CNN and RNN encoders. The review network performs a number of review steps with attention mechanism on the encoder hidden states, and outputs a thought vector after each review step; the thought vectors are used as the input of the attention mechanism in the decoder. We show that conventional encoder-decoders are a special case of our framework. Empirically, we show that our framework improves over state-of- the-art encoder-decoder systems on the tasks of image captioning and source code captioning." 
+ }, + { + "title": "Identity Mappings in Deep Residual Networks", + "abstract": null + }, + { + "title": "Image Captioning with Semantic Attention", + "abstract": "Automatically generating a natural language description of an image has attracted interests recently both because of its importance in practical applications and because it connects two major artificial intelligence fields: computer vision and natural language processing. Existing approaches are either top-down, which start from a gist of an image and convert it into words, or bottom-up, which come up with words describing various aspects of an image and then combine them. In this paper, we propose a new algorithm that combines both approaches through a model of semantic attention. Our algorithm learns to selectively attend to semantic concept proposals and fuse them into hidden states and outputs of recurrent neural networks. The selection and fusion form a feedback connecting the top-down and bottom-up computation. We evaluate our algorithm on two public benchmarks: Microsoft COCO and Flickr30K. Experimental results show that our algorithm significantly outperforms the state-of-the-art approaches consistently across different evaluation metrics." + }, + { + "title": "Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations", + "abstract": null + }, + { + "title": "Deep Residual Learning for Image Recognition", + "abstract": "Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. 
On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers - 8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions1, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation." + }, + { + "title": "SSD: Single Shot MultiBox Detector", + "abstract": null + }, + { + "title": "DenseCap: Fully Convolutional Localization Networks for Dense Captioning", + "abstract": "We introduce the dense captioning task, which requires a computer vision system to both localize and describe salient regions in images in natural language. The dense captioning task generalizes object detection when the descriptions consist of a single word, and Image Captioning when one predicted region covers the full image. To address the localization and description task jointly we propose a Fully Convolutional Localization Network (FCLN) architecture that processes an image with a single, efficient forward pass, requires no external regions proposals, and can be trained end-to-end with a single round of optimization. The architecture is composed of a Convolutional Network, a novel dense localization layer, and Recurrent Neural Network language model that generates the label sequences. We evaluate our network on the Visual Genome dataset, which comprises 94,000 images and 4,100,000 region-grounded captions. 
We observe both speed and accuracy improvements over baselines based on current state of the art approaches in both generation and retrieval settings." + }, + { + "title": "Order-Embeddings of Images and Language", + "abstract": "Hypernymy, textual entailment, and image captioning can be seen as special cases of a single visual-semantic hierarchy over words, sentences, and images. In this paper we advocate for explicitly modeling the partial order structure of this hierarchy. Towards this goal, we introduce a general method for learning ordered representations, and show how it can be applied to a variety of tasks involving images and language. We show that the resulting representations improve performance over current approaches for hypernym prediction and image-caption retrieval." + }, + { + "title": "Ask, Attend and Answer: Exploring Question-Guided Spatial Attention for Visual Question Answering", + "abstract": null + }, + { + "title": "Visual7W: Grounded Question Answering in Images", + "abstract": "We have seen great progress in basic perceptual tasks such as object recognition and detection. However, AI models still fail to match humans in high-level vision tasks due to the lack of capacities for deeper reasoning. Recently the new task of visual question answering (QA) has been proposed to evaluate a model's capacity for deep image understanding. Previous works have established a loose, global association between QA sentences and images. However, many questions and answers, in practice, relate to local regions in the images. We establish a semantic link between textual descriptions and image regions by object-level grounding. It enables a new type of QA with visual answers, in addition to textual answers used in previous work. We study the visual QA tasks in a grounded setting with a large collection of 7W multiple-choice QA pairs. Furthermore, we evaluate human performance and several baseline models on the QA tasks. 
Finally, we propose a novel LSTM model with spatial attention to tackle the 7W QA tasks." + }, + { + "title": "Stacked Attention Networks for Image Question Answering", + "abstract": "This paper presents stacked attention networks (SANs) that learn to answer natural language questions from images. SANs use semantic representation of a question as query to search for the regions in an image that are related to the answer. We argue that image question answering (QA) often requires multiple steps of reasoning. Thus, we develop a multiple-layer SAN in which we query an image multiple times to infer the answer progressively. Experiments conducted on four image QA data sets demonstrate that the proposed SANs significantly outperform previous state-of-the-art approaches. The visualization of the attention layers illustrates the progress that the SAN locates the relevant visual clues that lead to the answer of the question layer-by-layer." + }, + { + "title": "Aligning where to see and what to tell: image caption with region-based attention and scene factorization", + "abstract": "Recent progress on automatic generation of image captions has shown that it is possible to describe the most salient information conveyed by images with accurate and meaningful sentences. In this paper, we propose an image caption system that exploits the parallel structures between images and sentences. In our model, the process of generating the next word, given the previously generated ones, is aligned with the visual perception experience where the attention shifting among the visual regions imposes a thread of visual ordering. This alignment characterizes the flow of \"abstract meaning\", encoding what is semantically shared by both the visual scene and the text description. Our system also makes another novel modeling contribution by introducing scene-specific contexts that capture higher-level semantic information encoded in an image. 
The contexts adapt language models for word generation to specific scene types. We benchmark our system and contrast to published results on several popular datasets. We show that using either region-based attention or scene-specific contexts improves systems without those components. Furthermore, combining these two modeling ingredients attains the state-of-the-art performance." + }, + { + "title": "You Only Look Once: Unified, Real-Time Object Detection", + "abstract": "We present YOLO, a new approach to object detection. Prior work on object detection repurposes classifiers to perform detection. Instead, we frame object detection as a regression problem to spatially separated bounding boxes and associated class probabilities. A single neural network predicts bounding boxes and class probabilities directly from full images in one evaluation. Since the whole detection pipeline is a single network, it can be optimized end-to-end directly on detection performance. Our unified architecture is extremely fast. Our base YOLO model processes images in real-time at 45 frames per second. A smaller version of the network, Fast YOLO, processes an astounding 155 frames per second while still achieving double the mAP of other real-time detectors. Compared to state-of-the-art detection systems, YOLO makes more localization errors but is less likely to predict false positives on background. Finally, YOLO learns very general representations of objects. It outperforms other detection methods, including DPM and R-CNN, when generalizing from natural images to other domains like artwork." + }, + { + "title": "Spatial Transformer Networks", + "abstract": "Convolutional Neural Networks define an exceptionally powerful class of models, but are still limited by the lack of ability to be spatially invariant to the input data in a computationally and parameter efficient manner. 
In this work we introduce a new learnable module, the Spatial Transformer, which explicitly allows the spatial manipulation of data within the network. This differentiable module can be inserted into existing convolutional architectures, giving neural networks the ability to actively spatially transform feature maps, conditional on the feature map itself, without any extra training supervision or modification to the optimisation process. We show that the use of spatial transformers results in models which learn invariance to translation, scale, rotation and more generic warping, resulting in state-of-the-art performance on several benchmarks, and for a number of classes of transformations." + }, + { + "title": "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks", + "abstract": "State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet and Fast R-CNN have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network (RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position. The RPN is trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. We further merge RPN and Fast R-CNN into a single network by sharing their convolutional features---using the recently popular terminology of neural networks with 'attention' mechanisms, the RPN component tells the unified network where to look. For the very deep VGG-16 model, our detection system has a frame rate of 5fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007, 2012, and MS COCO datasets with only 300 proposals per image. 
In ILSVRC and COCO 2015 competitions, Faster R-CNN and RPN are the foundations of the 1st-place winning entries in several tracks. Code has been made publicly available" + }, + { + "title": "What Value Do Explicit High Level Concepts Have in Vision to Language Problems?", + "abstract": "Much recent progress in Vision-to-Language (V2L) problems has been achieved through a combination of Convolutional Neural Networks (CNNs) and Recurrent Neural Networks (RNNs). This approach does not explicitly represent high-level semantic concepts, but rather seeks to progress directly from image features to text. In this paper we investigate whether this direct approach succeeds due to, or despite, the fact that it avoids the explicit representation of high-level information. We propose a method of incorporating high-level concepts into the successful CNN-RNN approach, and show that it achieves a significant improvement on the state-of-the-art in both image captioning and visual question answering. We also show that the same mechanism can be used to introduce external semantic information and that doing so further improves performance. We achieve the best reported results on both image captioning and VQA on several benchmark datasets, and provide an analysis of the value of explicit high-level concepts in V2L problems." + }, + { + "title": "VQA: Visual Question Answering", + "abstract": null + }, + { + "title": "Highway Networks", + "abstract": "There is plenty of theoretical and empirical evidence that depth of neural networks is a crucial ingredient for their success. However, network training becomes more difficult with increasing depth and training of very deep networks remains an open problem. In this extended abstract, we introduce a new architecture designed to ease gradient-based training of very deep networks. We refer to networks with this architecture as highway networks, since they allow unimpeded information flow across several layers on\"information highways\". 
The architecture is characterized by the use of gating units which learn to regulate the flow of information through a network. Highway networks with hundreds of layers can be trained directly using stochastic gradient descent and with a variety of activation functions, opening up the possibility of studying extremely deep and efficient architectures." + }, + { + "title": "Microsoft COCO Captions: Data Collection and Evaluation Server", + "abstract": "In this paper we describe the Microsoft COCO Caption dataset and evaluation server. When completed, the dataset will contain over one and a half million captions describing over 330,000 images. For the training and validation images, five independent human generated captions will be provided. To ensure consistency in evaluation of automatic caption generation algorithms, an evaluation server is used. The evaluation server receives candidate captions and scores them using several popular metrics, including BLEU, METEOR, ROUGE and CIDEr. Instructions for using the evaluation server are provided." + }, + { + "title": "Show, Attend and Tell: Neural Image Caption Generation with Visual Attention", + "abstract": "Inspired by recent work in machine translation and object detection, we introduce an attention based model that automatically learns to describe the content of images. We describe how we can train this model in a deterministic manner using standard backpropagation techniques and stochastically by maximizing a variational lower bound. We also show through visualization how the model is able to automatically learn to fix its gaze on salient objects while generating the corresponding words in the output sequence. We validate the use of attention with state-of-the-art performance on three benchmark datasets: Flickr9k, Flickr30k and MS COCO." 
+ }, + { + "title": "Deep Captioning with Multimodal Recurrent Neural Networks (m-RNN)", + "abstract": "In this paper, we present a multimodal Recurrent Neural Network (m-RNN) model for generating novel image captions. It directly models the probability distribution of generating a word given previous words and an image. Image captions are generated by sampling from this distribution. The model consists of two sub-networks: a deep recurrent neural network for sentences and a deep convolutional network for images. These two sub-networks interact with each other in a multimodal layer to form the whole m-RNN model. The effectiveness of our model is validated on four benchmark datasets (IAPR TC-12, Flickr 8K, Flickr 30K and MS COCO). Our model outperforms the state-of-the-art methods. In addition, we apply the m-RNN model to retrieval tasks for retrieving images or sentences, and achieves significant performance improvement over the state-of-the-art methods which directly optimize the ranking objective function for retrieval. The project page of this work is: www.stat.ucla.edu/~junhua.mao/m-RNN.html ." + }, + { + "title": "Deep visual-semantic alignments for generating image descriptions", + "abstract": "We present a model that generates natural language descriptions of images and their regions. Our approach leverages datasets of images and their sentence descriptions to learn about the inter-modal correspondences between language and visual data. Our alignment model is based on a novel combination of Convolutional Neural Networks over image regions, bidirectional Recurrent Neural Networks over sentences, and a structured objective that aligns the two modalities through a multimodal embedding. We then describe a Multimodal Recurrent Neural Network architecture that uses the inferred alignments to learn to generate novel descriptions of image regions. 
We demonstrate that our alignment model produces state of the art results in retrieval experiments on Flickr8K, Flickr30K and MSCOCO datasets. We then show that the generated descriptions significantly outperform retrieval baselines on both full images and on a new dataset of region-level annotations." + }, + { + "title": "CIDEr: Consensus-based image description evaluation", + "abstract": "Automatically describing an image with a sentence is a long-standing challenge in computer vision and natural language processing. Due to recent progress in object detection, attribute classification, action recognition, etc., there is renewed interest in this area. However, evaluating the quality of descriptions has proven to be challenging. We propose a novel paradigm for evaluating image descriptions that uses human consensus. This paradigm consists of three main parts: a new triplet-based method of collecting human annotations to measure consensus, a new automated metric that captures consensus, and two new datasets: PASCAL-50S and ABSTRACT-50S that contain 50 sentences describing each image. Our simple metric captures human judgment of consensus better than existing metrics across sentences generated by various sources. We also evaluate five state-of-the-art image description approaches using this new protocol and provide a benchmark for future comparisons. A version of CIDEr named CIDEr-D is available as a part of MS COCO evaluation server to enable systematic evaluation and benchmarking." + }, + { + "title": "From captions to visual concepts and back", + "abstract": "This paper presents a novel approach for automatically generating image descriptions: visual detectors, language models, and multimodal similarity models learnt directly from a dataset of image captions. We use multiple instance learning to train visual detectors for words that commonly occur in captions, including many different parts of speech such as nouns, verbs, and adjectives. 
The word detector outputs serve as conditional inputs to a maximum-entropy language model. The language model learns from a set of over 400,000 image descriptions to capture the statistics of word usage. We capture global semantics by re-ranking caption candidates using sentence-level features and a deep multimodal similarity model. Our system is state-of-the-art on the official Microsoft COCO benchmark, producing a BLEU-4 score of 29.1%. When human judges compare the system captions to ones written by other people on our held-out test set, the system captions have equal or better quality 34% of the time." + }, + { + "title": "Show and tell: A neural image caption generator", + "abstract": "Automatically describing the content of an image is a fundamental problem in artificial intelligence that connects computer vision and natural language processing. In this paper, we present a generative model based on a deep recurrent architecture that combines recent advances in computer vision and machine translation and that can be used to generate natural sentences describing an image. The model is trained to maximize the likelihood of the target description sentence given the training image. Experiments on several datasets show the accuracy of the model and the fluency of the language it learns solely from image descriptions. Our model is often quite accurate, which we verify both qualitatively and quantitatively. For instance, while the current state-of-the-art BLEU-1 score (the higher the better) on the Pascal dataset is 25, our approach yields 59, to be compared to human performance around 69. We also show BLEU-1 score improvements on Flickr30k, from 56 to 66, and on SBU, from 19 to 28. Lastly, on the newly released COCO dataset, we achieve a BLEU-4 of 27.7, which is the current state-of-the-art." 
+ }, + { + "title": "Long-term recurrent convolutional networks for visual recognition and description", + "abstract": "Models based on deep convolutional networks have dominated recent image interpretation tasks; we investigate whether models which are also recurrent, or “temporally deep”, are effective for tasks involving sequences, visual and otherwise. We develop a novel recurrent convolutional architecture suitable for large-scale visual learning which is end-to-end trainable, and demonstrate the value of these models on benchmark video recognition tasks, image description and retrieval problems, and video narration challenges. In contrast to current models which assume a fixed spatio-temporal receptive field or simple temporal averaging for sequential processing, recurrent convolutional models are “doubly deep” in that they can be compositional in spatial and temporal “layers”. Such models may have advantages when target concepts are complex and/or training data are limited. Learning long-term dependencies is possible when nonlinearities are incorporated into the network state updates. Long-term RNN models are appealing in that they directly can map variable-length inputs (e.g., video frames) to variable length outputs (e.g., natural language text) and can model complex temporal dynamics; yet they can be optimized with backpropagation. Our recurrent long-term models are directly connected to modern visual convnet models and can be jointly trained to simultaneously learn temporal dynamics and convolutional perceptual representations. Our results show such models have distinct advantages over state-of-the-art models for recognition or generation which are separately defined and/or optimized." 
+ }, + { + "title": "GloVe: Global Vectors for Word Representation", + "abstract": "Recent methods for learning vector space representations of words have succeeded in capturing fine-grained semantic and syntactic regularities using vector arithmetic, but the origin of these regularities has remained opaque. We analyze and make explicit the model properties needed for such regularities to emerge in word vectors. The result is a new global logbilinear regression model that combines the advantages of the two major model families in the literature: global matrix factorization and local context window methods. Our model efficiently leverages statistical information by training only on the nonzero elements in a word-word cooccurrence matrix, rather than on the entire sparse matrix or on individual context windows in a large corpus. The model produces a vector space with meaningful substructure, as evidenced by its performance of 75% on a recent word analogy task. It also outperforms related models on similarity tasks and named entity recognition." + }, + { + "title": "Edge Boxes: Locating Object Proposals from Edges", + "abstract": null + }, + { + "title": "ImageNet Large Scale Visual Recognition Challenge", + "abstract": null + }, + { + "title": "Learning Phrase Representations using RNN Encoder–Decoder for Statistical Machine Translation", + "abstract": "In this paper, we propose a novel neural network model called RNN Encoder‐ Decoder that consists of two recurrent neural networks (RNN). One RNN encodes a sequence of symbols into a fixedlength vector representation, and the other decodes the representation into another sequence of symbols. The encoder and decoder of the proposed model are jointly trained to maximize the conditional probability of a target sequence given a source sequence. 
The performance of a statistical machine translation system is empirically found to improve by using the conditional probabilities of phrase pairs computed by the RNN Encoder‐Decoder as an additional feature in the existing log-linear model. Qualitatively, we show that the proposed model learns a semantically and syntactically meaningful representation of linguistic phrases." + }, + { + "title": "Meteor Universal: Language Specific Translation Evaluation for Any Target Language", + "abstract": "This paper describes Meteor Universal, released for the 2014 ACL Workshop on Statistical Machine Translation. Meteor Universal brings language specific evaluation to previously unsupported target languages by (1) automatically extracting linguistic resources (paraphrase tables and function word lists) from the bitext used to train MT systems and (2) using a universal parameter set learned from pooling human judgments of translation quality from several language directions. Meteor Universal is shown to significantly outperform baseline BLEU on two new languages, Russian (WMT13) and Hindi (WMT14)." + }, + { + "title": "Microsoft COCO: Common Objects in Context", + "abstract": null + }, + { + "title": "Selective Search for Object Recognition", + "abstract": null + }, + { + "title": "ADADELTA: An Adaptive Learning Rate Method", + "abstract": "We present a novel per-dimension learning rate method for gradient descent called ADADELTA. The method dynamically adapts over time using only first order information and has minimal computational overhead beyond vanilla stochastic gradient descent. The method requires no manual tuning of a learning rate and appears robust to noisy gradient information, different model architecture choices, various data modalities and selection of hyperparameters. We show promising results compared to other methods on the MNIST digit classification task using a single machine and on a large scale voice dataset in a distributed cluster environment." 
+ }, + { + "title": "Maximum Expected BLEU Training of Phrase and Lexicon Translation Models", + "abstract": "This paper proposes a new discriminative training method in constructing phrase and lexicon translation models. In order to reliably learn a myriad of parameters in these models, we propose an expected BLEU score-based utility function with KL regularization as the objective, and train the models on a large parallel dataset. For training, we derive growth transformations for phrase and lexicon translation probabilities to iteratively improve the objective. The proposed method, evaluated on the Europarl German-to-English dataset, leads to a 1.1 BLEU point improvement over a state-of-the-art baseline translation system. In IWSLT 2011 Benchmark, our system using the proposed method achieves the best Chinese-to-English translation result on the task of translating TED talks." + }, + { + "title": "Top-Down Versus Bottom-Up Control of Attention in the Prefrontal and Posterior Parietal Cortices", + "abstract": "Attention can be focused volitionally by “top-down” signals derived from task demands and automatically by “bottom-up” signals from salient stimuli. The frontal and parietal cortices are involved, but their neural activity has not been directly compared. Therefore, we recorded from them simultaneously in monkeys. Prefrontal neurons reflected the target location first during top-down attention, whereas parietal neurons signaled it earlier during bottom-up attention. Synchrony between frontal and parietal areas was stronger in lower frequencies during top-down attention and in higher frequencies during bottom-up attention. This result indicates that top-down and bottom-up signals arise from the frontal and sensory cortex, respectively, and different modes of attention may emphasize synchrony at different frequencies." 
+ }, + { + "title": "ROUGE: A Package for Automatic Evaluation of Summaries", + "abstract": "ROUGE stands for Recall-Oriented Understudy for Gisting Evaluation. It includes measures to automatically determine the quality of a summary by comparing it to other (ideal) summaries created by humans. The measures count the number of overlapping units such as n-gram, word sequences, and word pairs between the computer-generated summary to be evaluated and the ideal summaries created by humans. This paper introduces four different ROUGE measures: ROUGE-N, ROUGE-L, ROUGE-W, and ROUGE-S included in the ROUGE summarization evaluation package and their evaluations. Three of them have been used in the Document Understanding Conference (DUC) 2004, a large-scale summarization evaluation sponsored by NIST." + }, + { + "title": "Bleu: a Method for Automatic Evaluation of Machine Translation", + "abstract": "Human evaluations of machine translation are extensive but expensive. Human evaluations can take months to finish and involve human labor that can not be reused. We propose a method of automatic machine translation evaluation that is quick, inexpensive, and language-independent, that correlates highly with human evaluation, and that has little marginal cost per run. We present this method as an automated understudy to skilled human judges which substitutes for them when there is need for quick or frequent evaluations." + }, + { + "title": "Control of goal-directed and stimulus-driven attention in the brain", + "abstract": null + }, + { + "title": "Objects and attention: the state of the art", + "abstract": null + }, + { + "title": "Long Short-Term Memory", + "abstract": "Learning to store information over extended time intervals by recurrent backpropagation takes a very long time, mostly because of insufficient, decaying error backflow. 
We briefly review Hochreiter's (1991) analysis of this problem, then address it by introducing a novel, efficient, gradient based method called long short-term memory (LSTM). Truncating the gradient where this does not do harm, LSTM can learn to bridge minimal time lags in excess of 1000 discrete-time steps by enforcing constant error flow through constant error carousels within special units. Multiplicative gate units learn to open and close access to the constant error flow. LSTM is local in space and time; its computational complexity per time step and weight is O(1). Our experiments with artificial data involve local, distributed, real-valued, and noisy pattern representations. In comparisons with real-time recurrent learning, back propagation through time, recurrent cascade correlation, Elman nets, and neural sequence chunking, LSTM leads to many more successful runs, and learns much faster. LSTM also solves complex, artificial long-time-lag tasks that have never been solved by previous recurrent network algorithms." + }, + { + "title": "Shifting visual attention between objects and locations: evidence from normal and parietal lesion subjects.", + "abstract": "Space- and object-based attention components were examined in neurologically normal and parietal-lesion subjects, who detected a luminance change at 1 of 4 ends of 2 outline rectangles. One rectangle end was precued (75% valid); on invalid-cue trials, the target appeared at the other end of the cued rectangle or at 1 end of the uncued rectangle. For normals, the cost for invalid cues was greater for targets in the uncued rectangle, indicating an object-based component. Both right- and left-hemisphere patients showed costs that were greater for contralesional targets.
For right-hemisphere patients, the object cost was equivalent for contralesional and ipsilesional targets, indicating a spatial deficit, whereas the object cost for left-hemisphere patients was larger for contralesional targets, indicating an object deficit." + }, + { + "title": "Perceptual grouping and attention in visual search for features and for objects.", + "abstract": "This article explores the effects of perceptual grouping on search for targets defined by separate features or by conjunction of features. Treisman and Gelade proposed a feature-integration theory of attention, which claims that in the absence of prior knowledge, the separable features of objects are correctly combined only when focused attention is directed to each item in turn. If items are preattentively grouped, however, attention may be directed to groups rather than to single items whenever no recombination of features within a group could generate an illusory target. This prediction is confirmed: In search for conjunctions, subjects appear to scan serially between groups rather than items. The scanning rate shows little effect of the spatial density of distractors, suggesting that it reflects serial fixations of attention rather than eye movements. Search for features, on the other hand, appears to be independent of perceptual grouping, suggesting that features are detected preattentively. A conjunction target can be camouflaged at the preattentive level by placing it at the boundary between two adjacent groups, each of which shares one of its features. This suggests that preattentive grouping creates separate feature maps within each separable dimension rather than one global configuration."
+ }, + { + "title": "A feature-integration theory of attention", + "abstract": null + }, + { + "title": "Optimization of image description metrics using policy gradient methods", + "abstract": null + }, + { + "title": "Deeper lstm and normalized cnn visual question answering model", + "abstract": null + }, + { + "title": "Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning", + "abstract": null + }, + { + "title": "• (c) A shiny metal pot filled with some diced veggies. • (d) The pan on the stove has chopped vegetables in it", + "abstract": null + }, + { + "title": "• (a) A young girl standing on top of a tennis court", + "abstract": null + }, + { + "title": "• (b) A giraffe standing on top of a green field. High n-gram similarity", + "abstract": null + }, + { + "title": "Examples of visual question answering failure cases involving reading and counting", + "abstract": null + }, + { + "title": "and as an in", + "abstract": null + } + ] + }, + "author_data": { + "5e934cc8-35dd-4aa9-a472-72d257fa964e": { + "pk": "5e934cc8-35dd-4aa9-a472-72d257fa964e", + "project_name": null, + "name": "Peter Anderson", + "bio": "I am a researcher with a strong focus on advancing the fields of computer vision and robotics, particularly in the context of real-world applications. My work has primarily revolved around enhancing image captioning models to improve their generalization to out-of-domain images, utilizing innovative techniques like constrained beam search and pretrained word embeddings. This approach has allowed me to achieve state-of-the-art results in image captioning tasks, demonstrating the potential of leveraging external image taggers without the need for retraining.\n\nIn addition to image captioning, I have explored hierarchical rank pooling for video sequence encoding, which has significantly improved activity recognition performance across multiple benchmarks. 
My research also delves into bi-level optimization problems, where I have contributed insights into differentiating argmin and argmax optimization techniques, which are crucial for end-to-end learning approaches.\n\nMy experience in robotics is highlighted by my work on the rUNSWift multirobotic system, where I developed various perception techniques for robot localization and navigation. This includes a unified field-feature inverse sensor model and a natural landmark localization system, both of which were successfully demonstrated in competitive environments. My research emphasizes the importance of integrating multiple sensory modalities and real-time processing capabilities, which are essential for the development of complex AI systems.\n\nOverall, my goal is to bridge the gap between theoretical advancements and practical applications, driving innovation in both computer vision and robotics through collaborative and iterative research methodologies.", + "collaborators": [ + "Basura Fernando", + "Stephen Gould", + "B. Hengst", + "Mark Johnson", + "Youssef Hunter", + "A. Cherian", + "Rodrigo Santa Cruz", + "Edison Guo", + "Marcus Hutter", + "Sean Harris", + "Belinda Teh", + "Roger Liu", + "Ritwik Roy", + "Sam Li", + "Carl Chateld", + "Yongki Yusmanthia", + "A. Sowmya" + ], + "pub_titles": [ + "Guided Open Vocabulary Image Captioning with Constrained Beam Search", + "On Differentiating Parameterized Argmin and Argmax Problems with Application to Bi-level Optimization", + "Discriminative Hierarchical Rank Pooling for Activity Recognition", + "An ICP inspired inverse sensor model with unknown data association", + "New Methods for Improving Perception in RoboCup SPL", + "Robocup Standard Platform League - rUNSWift 2012 Innovations" + ], + "pub_abstracts": [ + "Existing image captioning models do not generalize well to out-of-domain images containing novel scenes or objects. 
This limitation severely hinders the use of these models in real world applications dealing with images in the wild. We address this problem using a flexible approach that enables existing deep captioning architectures to take advantage of image taggers at test time, without re-training. Our method uses constrained beam search to force the inclusion of selected tag words in the output, and fixed, pretrained word embeddings to facilitate vocabulary expansion to previously unseen tag words. Using this approach we achieve state of the art results for out-of-domain captioning on MSCOCO (and improved results for in-domain captioning). Perhaps surprisingly, our results significantly outperform approaches that incorporate the same tag predictions into the learning algorithm. We also show that we can significantly improve the quality of generated ImageNet captions by leveraging ground-truth labels.", + "Some recent works in machine learning and computer vision involve the solution of a bi-level optimization problem. Here the solution of a parameterized lower-level problem binds variables that appear in the objective of an upper-level problem. The lower-level problem typically appears as an argmin or argmax optimization problem. Many techniques have been proposed to solve bi-level optimization problems, including gradient descent, which is popular with current end-to-end learning approaches. In this technical report we collect some results on differentiating argmin and argmax optimization problems with and without constraints and provide some insightful motivating examples.", + "We present hierarchical rank pooling, a video sequence encoding method for activity recognition. It consists of a network of rank pooling functions which captures the dynamics of rich convolutional neural network features within a video sequence. 
By stacking non-linear feature functions and rank pooling over one another, we obtain a high capacity dynamic encoding mechanism, which is used for action recognition. We present a method for jointly learning the video representation and activity classifier parameters. Our method obtains state-of-the art results on three important activity recognition benchmarks: 76.7% on Hollywood2, 66.9% on HMDB51 and, 91.4% on UCF101.", + "This paper introduces an Iterative Closest Point (ICP) inspired inverse sensor model for robot localisation given multiple simultaneous observations of aliased landmarks. Combined with a Kalman filter, the sensor model offers a robust alternative to maximum likelihood data association, or a computationally inexpensive alternative to a particle filter. The technique can also be used as a means for re-localising a kidnapped robot, or a sensor resetting method for a particle filter. In the RoboCup Standard Platform League, this sensor model is able to localise the robot from a single observation in 42% of field positions where multiple landmarks are visible.", + "This thesis introduces a number of new techniques motivated by the desire to improve robots’ perception of their environment in RoboCup SPL. These methods include a unified field-feature inverse sensor model, a natural landmark localisation system, a visual odometry module, and a robot detection system that combines vision with sonar sensors. All of these techniques were demonstrated in the 2012 RoboCup competition. Using a variation of ICP, the field-feature sensor model combines multiple simultaneous observations of aliased field-features into a single robot pose observation by: (1) matching observed features to the field map in a hierarchical fashion, and (2) minimising the squared positioning error simultaneously over all observed features. 
Results indicate that this sensor model is able to localise the robot from a single observation in 42% of field positions where multiple field-features are visible. However, it is not possible to distinguish between one end of the field and the other using field-features; this capability is provided by a natural landmark localisation system. Using a ‘bag of words’ image representation, the natural landmark localisation system stores up to 40 images of each goal area, and then matches camera frames to these stored images in real time on the Nao to resolve the field-end ambiguity. In a static environment this system is shown to perform flawlessly in repeated kidnap tests. To further improve robot localisation, a fast and unique method for calculating visual heading odometry is also presented. Experiments indicate that this system can reduce the odometric uncertainty of an uncalibrated Nao robot by 73%. The visual odometry module is also able to detect collisions with unseen objects, while remaining robust to the presence of moving objects in the environment. Both the visual odometry module and the natural landmark localisation system are based on modified 1D SURF image features extracted from pixels on the robot’s horizon. Consistent with the original SURF algorithm, the extracted features are robust to lighting changes, scale changes, and small changes in viewing angle or to the scene itself, while achieving a speed up of several orders of magnitude over SURF. This makes 1D SURF features suitable for visual navigation of resource constrained mobile robots. Finally, a combined vision and sonar robot detection system is presented that uses a novel sonar hardware control scheme to introduce a third sonar detection sector in front of the robot. Evidence suggests that during a penalty shoot-out, this system enables the striker to localise a stationary goalie to within 28 mm to 60 mm before shooting.
This capability also enabled rUNSWift to develop coordinated role-switching behaviours that remain operational during total wireless failures.", + "Robotic competitions encourage a developmental style of research and development where large scale robotic systems are incrementally constructed as a whole. This differs from the typical research approach of solving a specific problem in isolation, but is a crucial part of reaching the long-term goals of complex AI systems. This paper outlines the innovation and development of the autonomous UNSW multirobotic system (rUNSWift) that was entered in the Standard Platform Soccer League at the International RoboCup competition in 2012. The challenge is to deliver real-time functionality within the limited resources of an on-board processor. Novel developments in 2012 include: SLAM using one-dimensional SURF features with visual-odometry as a by-product; extending foveated imaging to field-line detection; a unified field-feature sensor model; a dual-mode Kalman filter to help disambiguate the symmetric field; robot-detection data-fusing visual and sonar observations; multi-robot tracking of the ball; and omni-directional kicking. The rUNSWift system was ranked in the top three world-wide." + ], + "domain": [ + "Computer Vision", + "Robotics", + "Machine Learning", + "Video Analysis" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "5963c180-2c7a-4b40-a400-cc4ab67f9a3f": { + "pk": "5963c180-2c7a-4b40-a400-cc4ab67f9a3f", + "project_name": null, + "name": "Chris Buehler", + "bio": "I am a researcher specializing in computer graphics and image-based rendering, with a focus on efficient algorithms for visual hull computation and real-time rendering systems.
My work has led to the development of innovative techniques that allow for the precise sampling of visual hulls, significantly reducing the number of samples needed while maintaining high fidelity. I have explored the intersection of generalized cones to compute visual hulls efficiently, and I have designed a real-time, distributed light field camera system that enables dynamic navigation through captured light fields.\n\nMy thesis introduced the unstructured lumigraph rendering (ULR) algorithm, which is capable of handling irregularly arranged image collections and adapts to varying geometric information. This work laid the foundation for a flexible image-based rendering approach that generalizes existing algorithms, allowing for lumigraph-style rendering from arbitrary camera configurations.\n\nI have also tackled challenges in video stabilization through image-based rendering techniques, focusing on non-metric reconstructions that simplify the process without requiring extensive camera calibration. My recent algorithms for creating and rendering image-based visual hulls have demonstrated real-time capabilities, making them suitable for dynamic scenes. Overall, my research aims to push the boundaries of photorealistic graphics and real-time rendering, contributing to advancements in how we visualize and interact with complex visual data.", + "collaborators": [ + "L. McMillan", + "W. Matusik", + "S. Gortler", + "Michael F. Cohen", + "M. Bosse", + "Jason C. Yang", + "M. Everett", + "R. 
Raskar" + ], + "pub_titles": [ + "Efficient View-Dependent Sampling of Visual Hulls", + "An Efficient Visual Hull Computation Algorithm", + "A Real-Time Distributed Light Field Camera", + "Rendering from unstructured collections of images", + "Unstructured lumigraph rendering", + "Polyhedral Visual Hulls for Real-Time Rendering", + "Non-metric image-based rendering for video stabilization", + "Image-based visual hulls", + "Creating and Rendering Image-Based Visual Hulls" + ], + "pub_abstracts": [ + "In this paper we present an efficient algorithm for sampling visual hulls. Our algorithm computes exact points and normals on the surface of visual hull instead of a more traditional volumetric representation. The main feature that distinguishes our algorithm from previous ones is that it allows for sampling along arbitrary viewing rays with no loss of efficiency. Using this property, we adaptively sample visual hulls to minimize the number of samples needed to attain a given fidelity. In our experiments, the number of samples can typically be reduced by an order of magnitude, resulting in a corresponding performance increase over previous algorithms.", + "In this paper we describe an efficient algorithm for computing the visual hull of an object. This problem is equivalent to computing the intersection of generalized cones. The naive visual hull computation algorithm requires intersecting 3D polyhedra. We exploit the special structure of generalized cone polyhedra and show how to reduce this computation to a set of intersections in 2D. Moreover, we describe how the 2D intersections can be carried out efficiently.", + "We present the design and implementation of a real-time, distributed light field camera. Our system allows multiple viewers to navigate virtual cameras in a dynamically changing light field that is captured in real-time. Our light field camera consists of 64 commodity video cameras that are connected to off-the-shelf computers. 
We employ a distributed rendering algorithm that allows us to overcome the data bandwidth problems inherent in dynamic light fields. Our algorithm works by selectively transmitting only those portions of the video streams that contribute to the desired virtual views. This technique not only reduces the total bandwidth, but it also allows us to scale the number of cameras in our system without increasing network bandwidth. We demonstrate our system with a number of examples.", + "Computer graphics researchers recently have turned to image-based rendering to achieve the goal of photorealistic graphics. Instead of constructing a scene with millions of polygons, the scene is represented by a collection of photographs along with a greatly simplified geometric model. This simple representation allows traditional light transport simulations to be replaced with basic image-processing routines that combine multiple images together to produce never-before-seen images from new vantage points. This thesis presents a new image-based rendering algorithm called unstructured lumigraph rendering (ULR). ULR is an image-based rendering algorithm that is specifically designed to work with unstructured (i.e., irregularly arranged) collections of images. The algorithm is unique in that it is capable of using any amount of geometric or image information that is available about a scene. Specifically, the research in this thesis makes the following contributions: An enumeration of image-based rendering properties that an ideal algorithm should attempt to satisfy. An algorithm that satisfies these properties should work as well as possible with any configuration of input images or geometric knowledge. An optimal formulation of the basic image-based rendering problem, the solution to which is designed to satisfy the aforementioned properties. The unstructured lumigraph rendering algorithm, which is an efficient approximation to the optimal image-based rendering solution. 
A non-metric ULR algorithm, which generalizes the basic ULR algorithm to work with uncalibrated images. A time-dependent ULR algorithm, which generalizes the basic ULR algorithm to work with time-dependent data. Thesis Supervisor: Leonard McMillan Title: Associate Professor", + "We describe an image based rendering approach that generalizes many current image based rendering algorithms, including light field rendering and view-dependent texture mapping. In particular, it allows for lumigraph-style rendering from a set of input cameras in arbitrary configurations (i.e., not restricted to a plane or to any specific manifold). In the case of regular and planar input camera positions, our algorithm reduces to a typical lumigraph approach. When presented with fewer cameras and good approximate geometry, our algorithm behaves like view-dependent texture mapping. The algorithm achieves this flexibility because it is designed to meet a set of specific goals that we describe. We demonstrate this flexibility with a variety of examples.", + "We present new algorithms for creating and rendering visual hulls in real-time. Unlike voxel or sampled approaches, we compute an exact polyhedral representation for the visual hull directly from the silhouettes. This representation has a number of advantages: 1) it is a view-independent representation, 2) it is well-suited to rendering with graphics hardware, and 3) it can be computed very quickly. We render these visual hulls with a view-dependent texturing strategy, which takes into account visibility information that is computed during the creation of the visual hull. We demonstrate these algorithms in a system that asynchronously renders dynamically created visual hulls in real-time. Our system outperforms similar systems of comparable computational power.", + "We consider the problem of video stabilization: removing unwanted image perturbations due to unstable camera motions. 
We approach this problem from an image-based rendering (IBR) standpoint. Given an unstabilized video sequence, the task is to synthesize a new sequence as seen from a stabilized camera trajectory. This task is relatively straightforward if one has a Euclidean reconstruction of the unstabilized camera trajectory and a suitable IBR algorithm. However, it is often not feasible to obtain a Euclidean reconstruction from an arbitrary video sequence. In light of this problem, we describe IBR techniques for non-metric reconstructions, which are often much easier to obtain since they do not require camera calibration. These rendering techniques are well suited to the video stabilization problem. The key idea behind our techniques is that all measurements are specified in the image space, rather than in the non-metric space.", + "In this paper, we describe an efficient image-based approach to computing and shading visual hulls from silhouette image data. Our algorithm takes advantage of epipolar geometry and incremental computation to achieve a constant rendering cost per rendered pixel. It does not suffer from the computation complexity, limited resolution, or quantization artifacts of previous volumetric approaches. We demonstrate the use of this algorithm in a real-time virtualized reality application running off a small number of video streams.", + "In this paper, we present efficient algorithms for creating and rendering image-based visual hulls. These algorithms are motivated by our desire to render real-time views of dynamic, real-world scenes. We first describe the visual hull, an abstract geometric entity useful for describing the volumes of objects as determined by their silhouettes. We then introduce the image-based visual hull, an efficient representation of an object''s visual hull. We demonstrate two desirable properties of the image-based visual hull. First, it can be computed efficiently (i.e., in real-time) from multiple silhouette images. 
Second, it can be quickly rendered from novel viewpoints. These two properties motivate our use of the image-based visual hull in a real-time rendering system that we are currently developing." + ], + "domain": [ + "Computer Graphics", + "Image-Based Rendering", + "Visual Hull", + "Real-Time Rendering" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "def9809f-bb88-49a9-a12e-00887b8d7276": { + "pk": "def9809f-bb88-49a9-a12e-00887b8d7276", + "project_name": null, + "name": "Damien Teney", + "bio": "I am a researcher with a strong focus on visual understanding and machine learning, particularly in the realms of Visual Question Answering (VQA), video segmentation, and object recognition. My work aims to bridge the gap between visual perception and intelligent reasoning, enabling systems to interpret and interact with their environments more effectively.\n\nIn my recent publications, I have tackled the challenge of Zero-Shot VQA, proposing new evaluation protocols and strategies that allow models to answer questions about unseen images without exhaustive training data. I have also developed structured representations for VQA that enhance joint reasoning over visual and textual domains, significantly improving accuracy on benchmark datasets.\n\nMy research extends to video segmentation, where I have introduced novel motion features and a graph-based approach to segment dynamic textures and complex scenes. This work has yielded state-of-the-art results and demonstrates the potential of integrating motion and appearance cues for robust segmentation.\n\nAdditionally, I have explored fine-grained recognition of human hand grasps and pose estimation of objects, employing probabilistic models to enhance accuracy and robustness in challenging conditions. 
My contributions to robotics include methods for markerless recognition of robotic arms, facilitating better interaction with the environment.\n\nOverall, my research is driven by a desire to create intelligent systems that can understand and navigate the complexities of visual information, ultimately enhancing their ability to perform tasks in real-world scenarios.", + "collaborators": [ + "J. Piater", + "A. Hengel", + "Matthew A. Brown", + "M. Hebert", + "Qi Wu", + "Peng Wang", + "Chunhua Shen", + "A. Dick", + "Lingqiao Liu", + "Dimitry Kit", + "P. Hall", + "Akanksha Saran", + "Kris M. Kitani", + "Iman Abbasnejad", + "Kerem Can", + "A. Denis", + "J. Pisane", + "J. Verly", + "J. Duff", + "A. Hor", + "A. R. Melnyk" + ], + "pub_titles": [ + "Zero-Shot Visual Question Answering", + "Graph-Structured Representations for Visual Question Answering", + "Learning similarity metrics for dynamic scene segmentation", + "Learning Filter-Based Motion Features for Dynamic Scene Analysis", + "Hand parsing for fine-grained recognition of human grasps in monocular images", + "A hierarchical Bayesian network for face recognition using 2D and 3D facial data", + "Segmentation of Dynamic Scenes with Distributions of Spatiotemporally Oriented Energies", + "Segmentation of Low-Level Motion Features to Infer Occlusion Boundaries and Local Depth Ordering", + "Markerless Self-Recognition and Segmentation of Robotic Manipulator in Still Images ICRA 2013 Mobile Manipulation Workshop on Interactive Perception", + "Continuous Pose Estimation in 2D Images at Instance and Category Levels", + "Probabilistic Models of Visual Appearance For Object Identity, Class, and Pose Inference", + "Generalized Exemplar-Based Full Pose Estimation from 2D Images without Correspondences", + "Sampling-Based Multiview Reconstruction without Correspondences for 3D Edges", + "Design, construction, and test of a reliable, redundant on-board computer (OBC) for the OUFTI-1 CubeSat of the University of Li`ege", + 
"Spectral response and xerographic electrical characteristics of some perylene bisimide pigments" + ], + "pub_abstracts": [ + "Part of the appeal of Visual Question Answering (VQA) is its promise to answer new questions about previously unseen images. Most current methods demand training questions that illustrate every possible concept, and will therefore never achieve this capability, since the volume of required training data would be prohibitive. Answering general questions about images requires methods capable of Zero-Shot VQA, that is, methods able to answer questions beyond the scope of the training questions. We propose a new evaluation protocol for VQA methods which measures their ability to perform Zero-Shot VQA, and in doing so highlights significant practical deficiencies of current approaches, some of which are masked by the biases in current datasets. We propose and evaluate several strategies for achieving Zero-Shot VQA, including methods based on pretrained word embeddings, object classifiers with semantic embeddings, and test-time retrieval of example images. Our extensive experiments are intended to serve as baselines for Zero-Shot VQA, and they also achieve state-of-the-art performance in the standard VQA evaluation setting.", + "This paper proposes to improve visual question answering (VQA) with structured representations of both scene contents and questions. A key challenge in VQA is to require joint reasoning over the visual and text domains. The predominant CNN/LSTM-based approach to VQA is limited by monolithic vector representations that largely ignore structure in the scene and in the question. CNN feature vectors cannot effectively capture situations as simple as multiple object instances, and LSTMs process questions as series of words, which do not reflect the true complexity of language structure. 
We instead propose to build graphs over the scene objects and over the question words, and we describe a deep neural network that exploits the structure in these representations. We show that this approach achieves significant improvements over the state-of-the-art, increasing accuracy from 71.2% to 74.4% on the abstract scenes multiple-choice benchmark, and from 34.7% to 39.1% over pairs of balanced scenes, i.e. images with fine-grained differences and opposite yes/no answers to a same question.", + "This paper addresses the segmentation of videos with arbitrary motion, including dynamic textures, using novel motion features and a supervised learning approach. Dynamic textures are commonplace in natural scenes, and exhibit complex patterns of appearance and motion (e.g. water, smoke, swaying foliage). These are difficult for existing segmentation algorithms, often violate the brightness constancy assumption needed for optical flow, and have complex segment characteristics beyond uniform appearance or motion. Our solution uses custom spatiotemporal filters that capture texture and motion cues, along with a novel metric-learning framework that optimizes this representation for specific objects and scenes. This is used within a hierarchical, graph-based segmentation setting, yielding state-of-the-art results for dynamic texture segmentation. We also demonstrate the applicability of our approach to general object and motion segmentation, showing significant improvements over unsupervised segmentation and results comparable to the best task specific approaches.", + "[1] K. G. Derpanis and R. P. Wildes. Spacetime texture representation and recognition based on a spatiotemporal orientation analysis. PAMI, 2012. [2] D. J. Heeger. Model for the extraction of image flow. J. Opt. Soc. Am. A, 1987. [3] K. Simonyan and A. Zisserman. Two-stream convolutional networks for action recognition in videos. NIPS Spotlight, 2014. [4] F. Solari, M. Chessa, and P. 
Medathati, N. Kornprobst. What can we expect from a V1-MT feedforward architecture for optical flow estimation ?, Signal Processing: Image Communication, 2015. [5] D. Teney and M. Brown. Segmentation of dynamic scenes with distributions of spatiotemporally oriented energies. In BMVC, 2014. Time Vertical pattern moving at 0.5 px/fr.", + "We propose a novel method for performing fine-grained recognition of human hand grasp types using a single monocular image to allow computational systems to better understand human hand use. In particular, we focus on recognizing challenging grasp categories which differ only by subtle variations in finger configurations. While much of the prior work on understanding human hand grasps has been based on manual detection of grasps in video, this is the first work to automate the analysis process for fine-grained grasp classification. Instead of attempting to utilize a parametric model of the hand, we propose a hand parsing framework which leverages a data-driven learning to generate a pixel-wise segmentation of a hand into finger and palm regions. The proposed approach makes use of appearance-based cues such as finger texture and hand shape to accurately determine hand parts. We then build on the hand parsing result to compute high-level grasp features to learn a supervised fine-grained grasp classifier. To validate our approach, we introduce a grasp dataset recorded with a wearable camera, where the hand and its parts have been manually segmented with pixel-wise accuracy. Our results show that our proposed automatic hand parsing technique can improve grasp classification accuracy by over 30 percentage points over a state-of-the-art grasp recognition technique.", + "In this paper, we tackle the problem of face classification and verification. We present a novel face representation method based on a Bayesian network. 
The model captures dependencies between 2D salient facial regions and the full 3D geometrical model of the face, which makes it robust to pose variations, and useable in unconstrained environments. We present experiments on the challenging databases FERET and LFW, which show a significant advantage over state-of-the-art methods.", + "In video segmentation, disambiguating appearance cues by grouping similar motions or dynamics is potentially powerful, though non-trivial. Dynamic changes of appearance can occur from rigid or non-rigid motion, as well as complex dynamic textures. While the former are easily captured by optical flow, phenomena such as a dissipating cloud of smoke, or flickering reflections on water, do not satisfy the assumption of brightness constancy, or cannot be modelled with rigid displacements in the image. To tackle this problem, we propose a robust representation of image dynamics as histograms of motion energy (HoME) obtained from convolutions of the video with spatiotemporal filters. They capture a wide range of dynamics and handle problems previously studied separately (motion and dynamic texture segmentation). They thus offer a potential solution for a new class of problems that contain these effects in the same scene. Our representation of image dynamics is integrated in a graph-based segmentation framework and combined with colour histograms to represent the appearance of regions. In the case of translating and occluding segments, the proposed features additionally serve to characterize the motion of the boundary between pairs of segments, to identify the occluder and inferring a local depth ordering. 
The resulting segmentation method is completely modelfree and unsupervised, and achieves state-of-the-art results on the SynthDB dataset for dynamic texture segmentation, on the MIT dataset for motion segmentation, and reasonable performance on the CMU dataset for occlusion boundaries.", + "Motion in videos is a powerful cue to aid in scene understanding, by identifying the boundaries and the depth ordering of occluding objects. It can help to separate objects using their intrinsic motion, or parallax-induced motion at different depths. Most existing work rely on the computation of the optical flow, grouped into similar regions according to a parametric (e.g. affine) motion model. Two limitations ensue from this approach. First, the computation of the optical flow, despite recent advances, remains computationally expensive and relies on assumptions (e.g. brightness constancy or rigidly moving objects) that may not hold true. Secondly, parametric motions may similarly be limited to simple scenes with translating objects. More complex cases include deformable objects, repetitive motions, etc. In this work, we consider the use of motion energies, directly obtained from convolutions of the video with spatiotemporal filters, as an alternative image feature to optical flow for motion segmentation. We represent the motion of a region of the video with distributions of such motion energies, thereby alleviating the limitations of parametric motion models. The combination of motion and appearance cues to improve boundary detection has mostly been addressed through supervised training [4]. This has some disadvantages related to the availability of suitable training data and to possible annotation bias in what actually constitutes relevant boundaries. 
Instead, we are interested in establishing a learning-free baseline, and we rather hypothesize that a segmentation framework is a suitable paradigm for grouping motions, similarly as it is for grouping static color and textures cues. Most work on video segmentation extends image segmentation techniques, and the joint grouping of appearance and motion features in complex scenes is still an open problem. [3], for example, briefly mentions the use of histograms of optical flow, though the improvement in performance was not rigorously evaluated. Our contributions consist in (i) the integration of lowlevel, filter-based motion features into an existing seg-", + "Vision is a crucial capability for enabling robots to perceive and interact with their environment, e.g. manipulating or grasping objects. A current trend is bringing closer the aspects of interaction and perception, on the one hand by integrating visual information directly in the control process, and on the other hand, using interaction itself to help perception, allowing robots to explore their environment. In the context of manipulation, physical parts of the robot are then likely to appear in the observations, and an important capability emerges as the recognition of those parts, in order to separate the observations of the scene from those of the robot itself. Identifying the robot’s own body parts in input images has been used before in different ways, helping obstacle avoidance or control directly (through visual servoing [1]). However, this is usually performed via indirect methods, tracking fiducial markers purposely attached to the robot [2], which imposes undesirable (e.g. visibility) constraints. Some recent work adresses the pose estimation of a robot manipulator directly [3], [4], but these methods focus on tracking the manipulator between consecutive frames, whereas the initial recognition is considered as the harder part. 
We propose a method for markerless, monocular recognition and pose estimation of an articulated robot arm, dealing with single images without initialization, allowing its use with unknown hand-eye calibration, imprecise kinematics or missing position feedback.", + "We present a general method for tackling the related problems of pose estimation of known object instances and object categories. By representing the training images as a probability distribution over the joint appearance/pose space, the method is naturally suitable for modeling the appearance of a single instance of an object, or of diverse instances of the same category. The training data is weighted and forms a generative model, the weights being based on the informative power of each image feature for specific poses. Pose inference is performed through probabilistic voting in pose space, which is intrinsically robust to clutter and occlusions, and which we render tractable by treating separately the least interdependent dimensions. The scalability of category-level models is ensured during training by clustering the available image features in the joint appearance/pose space. Finally, we show how to first efficiently use a category-model, then possibly recognize a particular trained instance to refine the pose estimate using the corresponding instance-specific model. Our implementation uses edge points as image features, and was tested on several existing datasets. We obtain results on par with or superior to state-of-the-art methods, on both instance- and category-level problems, including for generalization to unseen instances.", + "This thesis addresses the topic of object recognition in images within the context of robotics, where it should allow the robot to understand its environment, then plan actions such as the manipulation of objects. We consider the recognition of specific, known objects, as well as general categories of objects, e.g. coffee mugs of various shapes, colors, and appearance. 
Our central contribution is the modeling of the appearance of objects as probabilistic distributions of visual characteristics, learned from training examples. We present techniques to use such models to identify and localize objects in images of cluttered scenes, and determine their actual position and orientation in space.", + "This paper addresses the problem of full pose estimation of objects in 2D images, using registered 2D examples as training data. We present a general formulation of the problem, which departs from traditional approaches by not focusing on one specific type of image features. The proposed algorithm avoids relying on specific model-to-scene correspondences, allowing using similar-looking and generally unmatchable features. We effectively demonstrate this capability by applying the method to edge segments. Our algorithm uses successive histogram-based and probabilistic evaluations, which ultimately recover a complete description of the probability distribution of the pose of the object, in the 6 degree-of-freedom 3D pose space, thereby accounting for the inherent ambiguities in the 2D input data. Furthermore, we propose, in a rigorous framework, an efficient procedure for fusing multiple sources of evidence, such as multiple registered 2D views of the same scene. The proposed method is evaluated qualitatively and quantitatively on synthetic and real test images. It shows promising results under challenging conditions, including occlusions and heavy clutter, while being capable of handling objects with little texture and detail.", + "This paper introduces a novel method for feature-based 3D reconstruction using multiple calibrated 2D views. We use a probabilistic formulation of the problem in the 3D, reconstructed space that allows using features that cannot be matched one-to-one, or which cannot be precisely located, such as points along edges. 
The reconstructed scene, modelled as a probability distribution in the 3D space, is defined as the intersection of all reconstructions compatible with each available view. We introduce a method based on importance sampling to retrieve individual samples from that distribution, as well as an iterative method to identify contiguous regions of high density. This allows the reconstruction of continuous 3D curves compatible with all the given input views, without establishing specific correspondences and without relying on connectivity in the input images, while accounting for uncertainty in the input observations, due e.g.\\ to noisy images and poorly calibrated cameras. The technical formulation is attractive in its flexibility and genericity. The implemented system, evaluated on several very different publicly-available datasets, shows results competitive with existing methods, effectively dealing with arbitrary numbers of views, wide baselines and imprecise camera calibrations.", + "We describe the architecture of the on-board computer (OBC) of the educational nanosatellite OUFTI-1 of the University of Liege. The OBC is the brain of the satellite. For redundancy, we quickly converged towards the use of a pair of computer boards. The issues are: What should these computers be? How should they be connected and communicate? Which one should start and continue operation for as long as possible (the “default” OBC)? How should failures be detected? How should control be passed to the “backup” OBC? Should control ever go back to the “default” OBC?", + "The series of N,N'-disubstituted diimides of perylene-3,4,9,10-tetracarboxylic acid, represented by the general structure shown below, is of interest for two reasons: a) the color of the solid pigment is markedly dependant on the nature of the substituent, R, and can vary from red to brown or black and, b) they are generally well known as effective organic photoconductors. 
We have synthesized and characterized a series of compounds with a selection of alkyl groups having different degrees of chain branching (R=C5H11, 6 isomers) and chain length. Thin layers of pure material were vacuum deposited onto transparent conductive substrates and spectroscopically characterized. Xerographic photoreceptors were prepared by overcoating these with a charge transport layer and the spectral photosensitivity between 400 and 800 nm was measured. The effect of chain length and chain branching upon solid state absorption spectrum and xerographic spectral photosensitivity will be presented and compared with the properties of previously described perylene bisimide photoconductors." + ], + "domain": [ + "Computer Graphics", + "Image-Based Rendering", + "Visual Hull", + "Real-Time Rendering" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "32bc43fb-2cc7-417a-bcf0-92674dfe7598": { + "pk": "32bc43fb-2cc7-417a-bcf0-92674dfe7598", + "project_name": null, + "name": "Lei Zhang", + "bio": "I am a researcher with a strong focus on image recognition, retrieval, and cognitive health. My work spans a variety of innovative systems and methodologies aimed at enhancing how we interact with visual data. Recently, I developed an advanced image captioning system that excels in generating high-quality captions for images in real-world scenarios, significantly outperforming previous models. This system integrates deep vision models, entity recognition, and confidence scoring to ensure accuracy and relevance.\n\nIn addition to image captioning, I have contributed to large-scale image retrieval challenges, such as the MSR Image Recognition Challenge, where I facilitated the development of efficient evaluation platforms for image recognition tasks. 
My research also delves into sketch-based image retrieval, where I introduced shape words descriptors to improve accuracy while minimizing memory usage.\n\nI have explored the complexities of near-duplicate image discovery and visual instance mining, proposing scalable methods that leverage graph structures to enhance performance on large datasets. My work in cognitive health includes investigating the prevalence of mild cognitive impairment (MCI) among older adults, revealing significant gender disparities in risk factors.\n\nOverall, my research aims to bridge the gap between advanced computational techniques and practical applications, whether in enhancing image understanding or addressing critical health issues. I am passionate about making my findings accessible and impactful, contributing to both academic and real-world advancements.", + "collaborators": [ + "Changhu Wang", + "Xiaodong He", + "Yuxiao Hu", + "Qiu-An Huang", + "Yandong Guo", + "Jianfeng Gao", + "Jiujun Zhang", + "Changcheng Xiao", + "Liqing Zhang", + "Xin-Jing Wang", + "Y. Rui", + "Jianshe Wang", + "Kenneth Tran", + "Jin Li", + "S. Mehrotra", + "Yanshi Hu", + "Zhenhua Pan", + "Ying Hu", + "Ju Wang", + "Yao Lu", + "M. Xiang", + "Q. Shang", + "Xiwu Gao", + "Yue Shen", + "Yunhui Huang", + "Saehoon Kim", + "Seungjin Choi", + "J. Ge", + "Yi Sun", + "Song Zhou", + "Y. Zhang", + "Qiang Zhang", + "A. Kannan", + "Simon Baker", + "Krishnan Ramnath", + "Juliet Fiss", + "Dahua Lin", + "Lucy Vanderwende", + "Rizwan Ansary", + "Ashish Kapoor", + "Qifa Ke", + "M. Uyttendaele", + "Deyu Meng", + "Biao Zhang", + "Zongben Xu", + "Chenqiang Gao", + "G. Xiong", + "Xiao-peng Chen", + "Xuesong Li", + "D. Fang", + "Luxia Zhang", + "Li Yang", + "L. Yao", + "Zhi-song He", + "Li-qun Zhou", + "Xiangni Su", + "L. Shang", + "Qiao-ling Xu", + "Nannan Li", + "Jian-hua Chen", + "Liping Zhang", + "Qian-zhen Hua", + "Jungen Kang", + "Ying-ying Guo", + "Yujuan Chen", + "Hailong Li", + "Haixia Liu", + "Zhengjun Zha", + "M. 
Mühlhäuser", + "A. Smeaton", + "Ranran Shi", + "N. Cheng", + "X. Sun", + "Liucheng Wang" + ], + "pub_titles": [ + "Rich Image Captioning in the Wild", + "ICME 2016 Image Recognition Grand Challenge", + "Sketch-based Image Retrieval via Shape Words", + "Near Duplicate Image Discovery on One Billion Images", + "IdeaPanel: A Large Scale Interactive Sketch-based Image Search System", + "A hybrid frequency–time domain method for predicting multiaxial fatigue life of 7075-T6 aluminium alloy under random loading", + "Scalable Visual Instance Mining with Instance Graph", + "Partial-Duplicate Clustering and Visual Pattern Discovery on Web Scale Image Database", + "Mining text snippets for images on the web", + "AB91. Risk factors and treatment outcomes of new contralateral upper urinary urothelial carcinoma after nephroureterectomy: the experiences of a large Chinese center", + "Prevalence and Predictors of Mild Cognitive Impairment in Xi’an: A Community-Based Study among the Elders" + ], + "pub_abstracts": [ + "We present an image caption system that addresses new challenges of automatically describing images in the wild. The challenges include generating high quality caption with respect to human judgments, out-of-domain data handling, and low latency required in many applications. Built on top of a state-of-the-art framework, we developed a deep vision model that detects a broad range of visual concepts, an entity recognition model that identifies celebrities and landmarks, and a confidence model for the caption output. Experimental results show that our caption engine outperforms previous state-of-the-art systems significantly on both in-domain dataset (i.e. MS COCO) and out-of-domain datasets. We also make the system publicly accessible as a part of the Microsoft Cognitive Services.", + "This paper summarizes the MSR Image Recognition Challenge (IRC) running with ICME 2016 Grand Challenges. 
Since 2013, Microsoft Research has hosted a series of IRCs to motivate the academic and industrial community to solve real-world large-scale image retrieval and recognition problems. This IRC in ICME 2016 continually leveraged the Clickture dataset [1], a large-scale real-world image click data consisting of 40M web images, and a derived subset of 95K dog-related images for the challenge of dog breed recognition. To conduct fair and efficient evaluation, and make the recognition result more reproducible and accessible, the contest runs on an open platform, Prajna Hub, which can help convert a research algorithm into an online service with minimal effort of just a few hours. As part of the ICME 2016 Grand Challenges, more than 30 teams participated this year's MSR IRC and 10 teams successfully finished the task. More details of data, system, metrics, process, and result are described in this paper.", + "The explosive growth of touch screens has provided a good platform for sketch-based image retrieval. However, most previous works focused on low level descriptors of shapes and sketches. In this paper, we try to step forward and propose to leverage shape words descriptor for sketch-based image retrieval. First, the shape words are defined and an efficient algorithm is designed for shape words extraction. Then we generalize the classic Chamfer Matching algorithm to address the shape words matching problem. Finally, a novel inverted index structure is proposed to make shape words representation scalable to large scale image databases. Experimental results show that our method achieves competitive accuracy but requires much less memory, e.g., less than 3% of memory storage of MindFinder. Due to its competitive accuracy and low memory cost, our method can scale up to much larger database.", + "Near-duplicate image discovery is the task of detecting all clusters of images which duplicate at a significant region. 
Previous work generally take divide and conquer approaches composed of two steps: generating cluster seeds using min-hashing, and growing the seeds by searching the entire image space with the seeds as queries. Since the computational complexity of the seed growing step is generally O (NL) where N and L are the number of images and seeds respectively, existing work can hardly be scaled up to a billion-scale dataset because L is typically millions. In this paper, we study a feasible solution of near-duplicate image discovery on one billion images, which is easily implemented on MapReduce framework. The major contribution of this work is to introduce the seed growing step designed to efficiently reduce the number of false positives among cluster seeds with O (cNL) time complexity, where c is small enough for a billion-scale dataset. The basis component of the seed growing step is a bottom-k min-hash, which generates different signatures in a sketch to remove all candidate images that share only one common visual word with a cluster seed. Our evaluations suggest that the proposed method can discover near-duplicate clusters with high precision and recall, and represent some interesting properties of our 1 billion dataset.", + "In this work, we introduce the IdeaPanel system, an interactive sketch-based image search engine with millions of images. IdeaPanel enables users to sketch the target image in their minds and also supports tagging to describe their intentions. After a search is triggered, similar images will be returned in real time, based on which users can interactively refine their query sketches until ideal images are returned. Different from existing work, most of which requires a huge amount of memory for indexing and matching, IdeaPanel can achieve very competitive performance but requires much less memory storage. IdeaPanel needs only about 240MB memory to index 1.3M images (less than 3% of previous MindFinder system). 
Due to its high accuracy and low memory cost, IdealPanel can scale up to much larger database and thus has larger potential to return the most desired images for users.", + "A hybrid frequency–time domain method for predicting multiaxial fatigue life under random loading is developed on the basis of combination of the frequency domain and time domain analysis. The critical damage point of the structure is determined by the frequency domain equivalent stress method. Then, the fatigue life prediction is made in time domain by generating random load-time histories from the power spectral density of the critical point. The method is validated with the random vibration fatigue test of 7075-T6 aluminium alloy. It has been shown that the results of fatigue life calculated by hybrid method are well correlated with the experiment.", + "In this paper we address the problem of visual instance mining, which is to automatically discover frequently appearing visual instances from a large collection of images. We propose a scalable mining method by leveraging the graph structure with images as vertices. Different from most existing work that focused on either instance-level similarities or image-level context properties, our graph captures both information. The instance-level information is integrated during the construction of a weighted and undirected instance graph based on the similarity between augmented local features, while the image-level context is explored with a greedy breadth-first search algorithm to discover clusters of visual instances from the graph. This method is capable of mining challenging small visual instances with diverse variations. We evaluated our method on two fully annotated datasets and outperformed the state of the arts on both datasets with higher recalls. 
We also applied our method on a one-million Flickr dataset and proved its scalability.", + "In this paper, we study the problem of discovering visual patterns and partial-duplicate images, which is fundamental to visual concept representation and image parsing, but very challenging when the database is extremely large, such as billions of images indexed by a commercial search engine. Although extensive research with sophisticated algorithms has been conducted for either partial-duplicate clustering or visual pattern discovery, most of them can not be easily extended to this scale, since both are clustering problems in nature and require pairwise comparisons. To tackle this computational challenge, we introduce a novel and highly parallelizable framework to discover partial-duplicate images and visual patterns in a unified way in distributed computing systems. We emphasize the nested property of local features, and propose the generalized nested feature (GNF) as a mid-level representation for regions and local patterns. Initial coarse clusters are then discovered by GNFs, upon which n-gram GNF is defined to represent co-occurrent visual patterns. After that, efficient merging and refining algorithms are used to get the partial-duplicate clusters, and logical combinations of probabilistic GNF models are leveraged to represent the visual patterns of partially duplicate images. Extensive experiments show the parallelizable property and effectiveness of the algorithms on both partial-duplicate clustering and visual pattern discovery. With 2000 machines, it costs about eight and 400 minutes to process one million and 40 million images respectively, which is quite efficient compared to previous methods.", + "Images are often used to convey many different concepts or illustrate many different stories. We propose an algorithm to mine multiple diverse, relevant, and interesting text snippets for images on the web. Our algorithm scales to all images on the web. 
For each image, all webpages that contain it are considered. The top-K text snippet selection problem is posed as combinatorial subset selection with the goal of choosing an optimal set of snippets that maximizes a combination of relevancy, interestingness, and diversity. The relevancy and interestingness are scored by machine learned models. Our algorithm is run at scale on the entire image index of a major search engine resulting in the construction of a database of images with their corresponding text snippets. We validate the quality of the database through a large-scale comparative study. We showcase the utility of the database through two web-scale applications: (a) augmentation of images on the web as webpages are browsed and (b)~an image browsing experience (similar in spirit to web browsing) that is enabled by interconnecting semantically related images (which may not be visually related) through shared concepts in their corresponding text snippets.", + "Objective To explore the risk factors and treatment outcomes of contralateral new upper tract urothelial carcinoma (UTUC) after nephroureterectomy in a large single-center cohort of UTUC patients. Methods A retrospective analysis of the clinicopathological data of 509 consecutive patients treated by nephroureterectomy from 2000 to 2010 at a high-volume center in China was conducted. Results Thirty-five patients (6.9%) were found to develop contralateral UTUC. Gross hematuria was a common symptom, and aristolochic acid (AA) containing Chinese herbs were identified as an underlying cause. In multivariate analysis, renal transplant recipients (HR =16.507) and preoperative renal insufficiency (HR =2.523) were independent risk factors. No correlation was found in the clinical and pathological characteristics of primary and subsequent contralateral UTUC. 
A second round of nephroureterectomy should be performed on renal transplant patients, whereas patients who are older, exhibit relatively better renal function, and have tumors that are single, located in the ureter and small size could be treated with nephron-sparing surgery. No survival difference was detected between the two treatment groups, and developing contralateral UTUC was not associated with worse survival. Conclusions Contralateral UTUC is comparatively rare, and gross hematuria is a common symptom. Renal transplant history and renal insufficiency are independent risk factors, and prophylactic contralateral nephroureterectomy is considered for uremic UTUC patients with proper renal replacement treatment or renal transplant recipients. These features are likely related to the mechanisms of multifocality of UTUC, and the potential correlation with AA remains to be elucidated. Tumor characteristics and renal function are informative for the decisions concerning surgical options for contralateral UTUC.", + "Mild cognitive impairment (MCI) is an intermediate stage between normal cognitive function and dementia among aging individuals. This study was designed to estimate the prevalence of MCI and explore the possible risk factors including gender disparities among community-dwelling older individuals. The study was conducted in Xi’an, China. This is a cross-sectional study. A total of 815 individuals, 60 years and older were selected by stratified random cluster sampling. Cognitive function was measured using the mini-mental status examination (MMSE), the Chinese version of the Dementia Rating Scales (CDRS) was used to apply the diagnostic of non-dementia, and activities of daily living (ADL) and instrumental activities of daily living (IADL) systems were used to functional status. 
The association between sociodemographic characteristics, lifestyle, history of chronic diseases and MCI were evaluated separately for men and women using the Pearson χ2- test and binary logistic regression. Of the 815 community-dwelling individuals, 145 were found to have MCI. Overall, the prevalence of MCI was 18.5%, with a prevalence of 19.6% in women (105/535), and 15.3% (40/261) in men. The results of the binary logistical regression analysis indicated that age and history of stroke were associated with MCI in men. For women, the risk factors were lower level of educational and lack of religious attendance. Results suggested that the factors capable of influencing MCI differed profoundly between older men and older women. For this reason, different preventative measures should be adopted to delay or reverse cognitive impairment among community-dwelling older men and women." + ], + "domain": [ + "Image Processing", + "Machine Learning", + "Cognitive Science", + "Medical Research" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + } + }, + "reference_proposal": "**[Question 1] - What is the problem?** \nHow can we effectively combine bottom-up and top-down visual attention mechanisms to improve the performance of image captioning and visual question answering tasks?\n\n**[Question 2] - Why is it interesting and important?** \nSolving this problem is crucial for advancing the fields of computer vision and natural language processing, as it can lead to more human-like understanding and generation of image-related content. Improved performance in image captioning and VQA can enhance applications in accessibility, content creation, and human-computer interaction. 
This research could pave the way for more sophisticated AI systems that better understand visual contexts and generate relevant textual descriptions, ultimately influencing future research directions in multimodal learning and AI interpretability.\n\n**[Question 3] - Why is it hard?** \nThe challenge lies in the complexity of integrating two distinct attention mechanisms—bottom-up and top-down—while ensuring that the model can effectively learn to focus on salient image regions relevant to the task at hand. Naive approaches may fail because they often rely on uniform grids of image regions, which do not account for the varying importance of different objects or features in an image. Additionally, the technical obstacles include the need for robust feature extraction from images and the effective modeling of task-specific context, which requires sophisticated neural network architectures and training strategies.\n\n**[Question 4] - Why hasn't it been solved before?** \nPrevious research has primarily focused on top-down attention mechanisms, which limit the model's ability to dynamically identify and prioritize salient image regions. Existing solutions often overlook the importance of bottom-up signals that can provide critical context for attention. Barriers such as the lack of effective methods for integrating these two types of attention and the computational complexity involved have hindered progress. Our approach differs by explicitly combining bottom-up attention, which identifies salient regions using Faster R-CNN, with top-down attention driven by task-specific context, thereby addressing these limitations.\n\n**[Question 5] - What are the key components of my approach and results?** \nOur proposed methodology involves implementing a combined bottom-up and top-down visual attention mechanism. We will utilize the Faster R-CNN model to extract salient image regions, represented by pooled convolutional feature vectors. 
The top-down mechanism will leverage task-specific context to predict an attention distribution over these regions. We will evaluate our approach on image captioning and visual question answering tasks, using metrics such as BLEU scores for captioning and accuracy for VQA. We expect that our method will significantly enhance the quality of generated" + }, + "1311.2901": { + "paper_data": { + "title": "Visualizing and Understanding Convolutional Networks", + "url": "http://arxiv.org/abs/1311.2901v3", + "arxiv_id": "1311.2901", + "authors": [ + "Matthew D Zeiler", + "Rob Fergus" + ], + "abstract": "Large Convolutional Network models have recently demonstrated impressive classification performance on the ImageNet benchmark. However there is no clear understanding of why they perform so well, or how they might be improved. In this paper we address both issues. We introduce a novel visualization technique that gives insight into the function of intermediate feature layers and the operation of the classifier. We also perform an ablation study to discover the performance contribution from different model layers. This enables us to find model architectures that outperform Krizhevsky \\etal on the ImageNet classification benchmark. We show our ImageNet model generalizes well to other datasets: when the softmax classifier is retrained, it convincingly beats the current state-of-the-art results on Caltech-101 and Caltech-256 datasets.", + "introduction": " Introduction Since their introduction by (LeCun et al., 1989) in the early 1990's, Convolutional Networks (convnets) have demonstrated excellent performance at tasks such as hand-written digit classi\fcation and face detec- tion. In the last year, several papers have shown that they can also deliver outstanding performance on more challenging visual classi\fcation tasks. (Ciresan et al., 2012) demonstrate state-of-the-art performance on NORB and CIFAR-10 datasets. 
Most notably, (Krizhevsky et al., 2012) show record beating perfor- mance on the ImageNet 2012 classi\fcation benchmark, with their convnet model achieving an error rate of 16.4%, compared to the 2nd place result of 26.1%. Several factors are responsible for this renewed inter-est in convnet models: (i) the availability of much larger training sets, with millions of labeled exam- ples; (ii) powerful GPU implementations, making the training of very large models practical and (iii) bet- ter model regularization strategies, such as Dropout (Hinton et al., 2012). Despite this encouraging progress, there is still lit- tle insight into the internal operation and behavior of these complex models, or how they achieve such good performance. From a scienti\fc standpoint, this is deeply unsatisfactory. Without clear understanding of how and why they work, the development of better models is reduced to trial-and-error. In this paper we introduce a visualization technique that reveals the in- put stimuli that excite individual feature maps at any layer in the model. It also allows us to observe the evolution of features during training and to diagnose potential problems with the model. The visualization technique we propose uses a multi-layered Deconvo- lutional Network (deconvnet), as proposed by (Zeiler et al., 2011), to project the feature activations back to the input pixel space. We also perform a sensitivity analysis of the classi\fer output by occluding portions of the input image, revealing which parts of the scene are important for classi\fcation. Using these tools, we start with the architecture of (Krizhevsky et al., 2012) and explore di\u000berent archi- tectures, discovering ones that outperform their methods ([A]= (Sande et al., 2012) and [B] = (Yan et al., 2012)). 5.3. Feature Analysis We explore how discriminative the features in each layer of our Imagenet-pretrained model are. 
We do this by varying the number of layers retained from the Ima- geNet model and place either a linear SVM or softmax classi\fer on top. Table 7 shows Related Work Visualizing features to gain intuition about the net- work is common practice, but mostly limited to the 1st layer where projections to pixel space are possible. In higher layers this is not the case, and there are limited experiments on a pedestrian dataset (Dalal & Triggs, 2005). We also try a second strategy of training a model from scratch, i.e. resetting layers 1-7 to random values and train them, as well as the softmax, on the training images of the dataset. One complication is that some of the Caltech datasets have some images that are also in the ImageNet train- ing data. Using normalized correlation, we identi\fed these few \\overlap\" images2and removed them from our Imagenet training set and then retrained our Ima- genet models, so avoiding the possibility of train/test contamination. Caltech-101: We follow the procedure of (Fei-fei et al., 2006) and randomly select 15 or 30 images per class for training and test on up to 50 images per class reporting the average of the per-class accuracies in Ta- 2For Caltech-101, we found 44 images in common (out of 9,144 total images), with a maximum overlap of 10 for any given", + "references": [ + { + "title": "Learning and Transferring Mid-level Image Representations Using Convolutional Neural Networks", + "abstract": "Convolutional neural networks (CNN) have recently shown outstanding image classification performance in the large- scale visual recognition challenge (ILSVRC2012). The success of CNNs is attributed to their ability to learn rich mid-level image representations as opposed to hand-designed low-level features used in other image classification methods. Learning CNNs, however, amounts to estimating millions of parameters and requires a very large number of annotated image samples. 
This property currently prevents application of CNNs to problems with limited training data. In this work we show how image representations learned with CNNs on large-scale annotated datasets can be efficiently transferred to other visual recognition tasks with limited amount of training data. We design a method to reuse layers trained on the ImageNet dataset to compute mid-level image representation for images in the PASCAL VOC dataset. We show that despite differences in image statistics and tasks in the two datasets, the transferred representation leads to significantly improved results for object and action classification, outperforming the current state of the art on Pascal VOC 2007 and 2012 datasets. We also show promising results for object and action localization." + }, + { + "title": "Deep Inside Convolutional Networks: Visualising Image Classification Models and Saliency Maps", + "abstract": "This paper addresses the visualisation of image classification models, learnt using deep Convolutional Networks (ConvNets). We consider two visualisation techniques, based on computing the gradient of the class score with respect to the input image. The first one generates an image, which maximises the class score [Erhan et al., 2009], thus visualising the notion of the class, captured by a ConvNet. The second technique computes a class saliency map, specific to a given image and class. We show that such maps can be employed for weakly supervised object segmentation using classification ConvNets. Finally, we establish the connection between the gradient-based ConvNet visualisation methods and deconvolutional networks [Zeiler et al., 2013]." + }, + { + "title": "Some Improvements on Deep Convolutional Neural Network Based Image Classification", + "abstract": "Abstract: We investigate multiple techniques to improve upon the current state of the art deep convolutional neural network based image classification pipeline. 
The techiques include adding more image transformations to training data, adding more transformations to generate additional predictions at test time and using complementary models applied to higher resolution images. This paper summarizes our entry in the Imagenet Large Scale Visual Recognition Challenge 2013. Our system achieved a top 5 classification error rate of 13.55% using no external data which is over a 20% relative improvement on the previous year's winner." + }, + { + "title": "Rich Feature Hierarchies for Accurate Object Detection and Semantic Segmentation", + "abstract": "Object detection performance, as measured on the canonical PASCAL VOC dataset, has plateaued in the last few years. The best-performing methods are complex ensemble systems that typically combine multiple low-level image features with high-level context. In this paper, we propose a simple and scalable detection algorithm that improves mean average precision (mAP) by more than 30% relative to the previous best result on VOC 2012 -- achieving a mAP of 53.3%. Our approach combines two key insights: (1) one can apply high-capacity convolutional neural networks (CNNs) to bottom-up region proposals in order to localize and segment objects and (2) when labeled training data is scarce, supervised pre-training for an auxiliary task, followed by domain-specific fine-tuning, yields a significant performance boost. Since we combine region proposals with CNNs, we call our method R-CNN: Regions with CNN features. We also present experiments that provide insight into what the network learns, revealing a rich hierarchy of image features. Source code for the complete system is available at http://www.cs.berkeley.edu/~rbg/rcnn." 
+ }, + { + "title": "DeCAF: A Deep Convolutional Activation Feature for Generic Visual Recognition", + "abstract": "We evaluate whether features extracted from the activation of a deep convolutional network trained in a fully supervised fashion on a large, fixed set of object recognition tasks can be repurposed to novel generic tasks. Our generic tasks may differ significantly from the originally trained tasks and there may be insufficient labeled or unlabeled data to conventionally train or adapt a deep architecture to the new tasks. We investigate and visualize the semantic clustering of deep convolutional features with respect to a variety of such tasks, including scene recognition, domain adaptation, and fine-grained recognition challenges. We compare the efficacy of relying on various network levels to define a fixed feature, and report novel results that significantly outperform the state-of-the-art on several important vision challenges. We are releasing DeCAF, an open-source implementation of these deep convolutional activation features, along with all associated network parameters to enable vision researchers to be able to conduct experimentation with deep representations across a range of visual concept learning paradigms." + }, + { + "title": "Multipath Sparse Coding Using Hierarchical Matching Pursuit", + "abstract": "Complex real-world signals, such as images, contain discriminative structures that differ in many aspects including scale, invariance, and data channel. While progress in deep learning shows the importance of learning features through multiple layers, it is equally important to learn features through multiple paths. We propose Multipath Hierarchical Matching Pursuit (M-HMP), a novel feature learning architecture that combines a collection of hierarchical sparse features for image classification to capture multiple aspects of discriminative structures. 
Our building blocks are MI-KSVD, a codebook learning algorithm that balances the reconstruction error and the mutual incoherence of the codebook, and batch orthogonal matching pursuit (OMP), we apply them recursively at varying layers and scales. The result is a highly discriminative image representation that leads to large improvements to the state-of-the-art on many standard benchmarks, e.g., Caltech-101, Caltech-256, MITScenes, Oxford-IIIT Pet and Caltech-UCSD Bird-200." + }, + { + "title": "ImageNet classification with deep convolutional neural networks", + "abstract": "We trained a large, deep convolutional neural network to classify the 1.2 million high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 37.5% and 17.0%, respectively, which is considerably better than the previous state-of-the-art. The neural network, which has 60 million parameters and 650,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and three fully connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU implementation of the convolution operation. To reduce overfitting in the fully connected layers we employed a recently developed regularization method called \"dropout\" that proved to be very effective. We also entered a variant of this model in the ILSVRC-2012 competition and achieved a winning top-5 test error rate of 15.3%, compared to 26.2% achieved by the second-best entry." + }, + { + "title": "Multi-column deep neural network for traffic sign classification", + "abstract": null + }, + { + "title": "Improving neural networks by preventing co-adaptation of feature detectors", + "abstract": "When a large feedforward neural network is trained on a small training set, it typically performs poorly on held-out test data. 
This \"overfitting\" is greatly reduced by randomly omitting half of the feature detectors on each training case. This prevents complex co-adaptations in which a feature detector is only helpful in the context of several other specific feature detectors. Instead, each neuron learns to detect a feature that is generally helpful for producing the correct answer given the combinatorially large variety of internal contexts in which it must operate. Random \"dropout\" gives big improvements on many benchmark tasks and sets new records for speech and object recognition." + }, + { + "title": "Multi-column deep neural networks for image classification", + "abstract": "Traditional methods of computer vision and machine learning cannot match human performance on tasks such as the recognition of handwritten digits or traffic signs. Our biologically plausible, wide and deep artificial neural network architectures can. Small (often minimal) receptive fields of convolutional winner-take-all neurons yield large network depth, resulting in roughly as many sparsely connected neural layers as found in mammals between retina and visual cortex. Only winner neurons are trained. Several deep neural columns become experts on inputs preprocessed in different ways; their predictions are averaged. Graphics cards allow for fast training. On the very competitive MNIST handwriting benchmark, our method is the first to achieve near-human performance. On a traffic sign recognition benchmark it outperforms humans by a factor of two. We also improve the state-of-the-art on a plethora of common image classification benchmarks." + }, + { + "title": "Adaptive deconvolutional networks for mid and high level feature learning", + "abstract": "We present a hierarchical model that learns image decompositions via alternating layers of convolutional sparse coding and max pooling. 
When trained on natural images, the layers of our model capture image information in a variety of forms: low-level edges, mid-level edge junctions, high-level object parts and complete objects. To build our model we rely on a novel inference scheme that ensures each layer reconstructs the input, rather than just the output of the layer directly beneath, as is common with existing hierarchical approaches. This makes it possible to learn multiple layers of representation and we show models with 4 layers, trained on images from the Caltech-101 and 256 datasets. When combined with a standard classifier, features extracted from these models outperform SIFT, as well as representations from other feature learning methods." + }, + { + "title": "Efficient learning of sparse, distributed, convolutional feature representations for object recognition", + "abstract": "Informative image representations are important in achieving state-of-the-art performance in object recognition tasks. Among feature learning algorithms that are used to develop image representations, restricted Boltzmann machines (RBMs) have good expressive power and build effective representations. However, the difficulty of training RBMs has been a barrier to their wide use. To address this difficulty, we show the connections between mixture models and RBMs and present an efficient training method for RBMs that utilize these connections. To the best of our knowledge, this is the first work showing that RBMs can be trained with almost no hyperparameter tuning to provide classification performance similar to or significantly better than mixture models (e.g., Gaussian mixture models). Along with this efficient training, we evaluate the importance of convolutional training that can capture a larger spatial context with less redundancy, as compared to non-convolutional training. Overall, our method achieves state-of-the-art performance on both Caltech 101 / 256 datasets using a single type of feature." 
+ }, + { + "title": "Unbiased look at dataset bias", + "abstract": "Datasets are an integral part of contemporary object recognition research. They have been the chief reason for the considerable progress in the field, not just as source of large amounts of training data, but also as means of measuring and comparing performance of competing algorithms. At the same time, datasets have often been blamed for narrowing the focus of object recognition research, reducing it to a single benchmark performance number. Indeed, some datasets, that started out as data capture efforts aimed at representing the visual world, have become closed worlds unto themselves (e.g. the Corel world, the Caltech-101 world, the PASCAL VOC world). With the focus on beating the latest benchmark numbers on the latest dataset, have we perhaps lost sight of the original purpose? The goal of this paper is to take stock of the current state of recognition datasets. We present a comparison study using a set of popular datasets, evaluated based on a number of criteria including: relative data bias, cross-dataset generalization, effects of closed-world assumption, and sample value. The experimental results, some rather surprising, suggest directions that can improve dataset collection as well as algorithm evaluation protocols. But more broadly, the hope is to stimulate discussion in the community regarding this very important, but largely neglected issue." + }, + { + "title": "Tiled convolutional neural networks", + "abstract": "Convolutional neural networks (CNNs) have been successfully applied to many tasks such as digit and object recognition. Using convolutional (tied) weights significantly reduces the number of parameters that have to be learned, and also allows translational invariance to be hard-coded into the architecture. In this paper, we consider the problem of learning invariances, rather than relying on hard-coding. 
We propose tiled convolution neural networks (Tiled CNNs), which use a regular \"tiled\" pattern of tied weights that does not require that adjacent hidden units share identical weights, but instead requires only that hidden units k steps away from each other to have tied weights. By pooling over neighboring units, this architecture is able to learn complex invariances (such as scale and rotational invariance) beyond translational invariance. Further, it also enjoys much of CNNs' advantage of having a relatively small number of learned parameters (such as ease of learning and greater scalability). We provide an efficient learning algorithm for Tiled CNNs based on Topographic ICA, and show that learning complex invariant features allows us to achieve highly competitive results for both the NORB and CIFAR-10 datasets." + }, + { + "title": "What is the best multi-stage architecture for object recognition?", + "abstract": "In many recent object recognition systems, feature extraction stages are generally composed of a filter bank, a non-linear transformation, and some sort of feature pooling layer. Most systems use only one stage of feature extraction in which the filters are hard-wired, or two stages where the filters in one or both stages are learned in supervised or unsupervised mode. This paper addresses three questions: 1. How does the non-linearities that follow the filter banks influence the recognition accuracy? 2. does learning the filter banks in an unsupervised or supervised manner improve the performance over random filters or hardwired filters? 3. Is there any advantage to using an architecture with two stages of feature extraction, rather than one? We show that using non-linearities that include rectification and local contrast normalization is the single most important ingredient for good accuracy on object recognition benchmarks. We show that two stages of feature extraction yield better accuracy than one. 
Most surprisingly, we show that a two-stage system with random filters can yield almost 63% recognition rate on Caltech-101, provided that the proper non-linearities and pooling layers are used. Finally, we show that with supervised refinement, the system achieves state-of-the-art performance on NORB dataset (5.6%) and unsupervised pre-training followed by supervised refinement produces good accuracy on Caltech-101 (≫ 65%), and the lowest known error rate on the undistorted, unprocessed MNIST dataset (0.53%)." + }, + { + "title": "Linear spatial pyramid matching using sparse coding for image classification", + "abstract": "Recently SVMs using spatial pyramid matching (SPM) kernel have been highly successful in image classification. Despite its popularity, these nonlinear SVMs have a complexity O(n2 ~ n3) in training and O(n) in testing, where n is the training size, implying that it is nontrivial to scaleup the algorithms to handle more than thousands of training images. In this paper we develop an extension of the SPM method, by generalizing vector quantization to sparse coding followed by multi-scale spatial max pooling, and propose a linear SPM kernel based on SIFT sparse codes. This new approach remarkably reduces the complexity of SVMs to O(n) in training and a constant in testing. In a number of image categorization experiments, we find that, in terms of classification accuracy, the suggested linear SPM based on sparse coding of SIFT descriptors always significantly outperforms the linear SPM kernel on histograms, and is even better than the nonlinear SPM kernels, leading to state-of-the-art performance on several benchmarks by using a single type of descriptors." + }, + { + "title": "ImageNet: A large-scale hierarchical image database", + "abstract": "The explosion of image data on the Internet has the potential to foster more sophisticated and robust models and algorithms to index, retrieve, organize and interact with images and multimedia data. 
But exactly how such data can be harnessed and organized remains a critical problem. We introduce here a new database called “ImageNet”, a large-scale ontology of images built upon the backbone of the WordNet structure. ImageNet aims to populate the majority of the 80,000 synsets of WordNet with an average of 500-1000 clean and full resolution images. This will result in tens of millions of annotated images organized by the semantic hierarchy of WordNet. This paper offers a detailed analysis of ImageNet in its current state: 12 subtrees with 5247 synsets and 3.2 million images in total. We show that ImageNet is much larger in scale and diversity and much more accurate than the current image datasets. Constructing such a large-scale database is a challenging task. We describe the data collection scheme with Amazon Mechanical Turk. Lastly, we illustrate the usefulness of ImageNet through three simple applications in object recognition, image classification and automatic object clustering. We hope that the scale, accuracy, diversity and hierarchical structure of ImageNet can offer unparalleled opportunities to researchers in the computer vision community and beyond." + }, + { + "title": "Co-occurrence Histograms of Oriented Gradients for Pedestrian Detection", + "abstract": null + }, + { + "title": "Extracting and composing robust features with denoising autoencoders", + "abstract": "Previous work has shown that the difficulties in learning deep generative or discriminative models can be overcome by an initial unsupervised learning step that maps inputs to useful intermediate representations. We introduce and motivate a new training principle for unsupervised learning of a representation based on the idea of making the learned representations robust to partial corruption of the input pattern. This approach can be used to train autoencoders, and these denoising autoencoders can be stacked to initialize deep architectures. 
The algorithm can be motivated from a manifold learning and information theoretic perspective or from a generative model perspective. Comparative experiments clearly show the surprising advantage of corrupting the input of autoencoders on a pattern classification benchmark suite." + }, + { + "title": "Greedy Layer-Wise Training of Deep Networks", + "abstract": "Complexity theory of circuits strongly suggests that deep architectures can be much more efficient (sometimes exponentially) than shallow architectures, in terms of computational elements required to represent some functions. Deep multi-layer neural networks have many levels of non-linearities allowing them to compactly represent highly non-linear and highly-varying functions. However, until recently it was not clear how to train such deep networks, since gradient-based optimization starting from random initialization appears to often get stuck in poor solutions. Hinton et al. recently introduced a greedy layer-wise unsupervised learning algorithm for Deep Belief Networks (DBN), a generative model with many layers of hidden causal variables. In the context of the above optimization problem, we study this algorithm empirically and explore variants to better understand its success and extend it to cases where the inputs are continuous or where the structure of the input distribution is not revealing enough about the variable to be predicted in a supervised task. Our experiments also confirm the hypothesis that the greedy layer-wise unsupervised training strategy mostly helps the optimization, by initializing weights in a region near a good local minimum, giving rise to internal distributed representations that are high-level abstractions of the input, bringing better generalization." 
+ }, + { + "title": "On the Analysis and Interpretation of Inhomogeneous Quadratic Forms as Receptive Fields", + "abstract": "In this letter, we introduce some mathematical and numerical tools to analyze and interpret inhomogeneous quadratic forms. The resulting characterization is in some aspects similar to that given by experimental studies of cortical cells, making it particularly suitable for application to second-order approximations and theoretical models of physiological receptive fields. We first discuss two ways of analyzing a quadratic form by visualizing the coefficients of its quadratic and linear term directly and by considering the eigenvectors of its quadratic term. We then present an algorithm to compute the optimal excitatory and inhibitory stimulithose that maximize and minimize the considered quadratic form, respectively, given a fixed energy constraint. The analysis of the optimal stimuli is completed by considering their invariances, which are the transformations to which the quadratic form is most insensitive, and by introducing a test to determine which of these are statistically significant. Next we propose a way to measure the relative contribution of the quadratic and linear term to the total output of the quadratic form. Furthermore, we derive simpler versions of the above techniques in the special case of a quadratic form without linear term. In the final part of the letter, we show that for each quadratic form, it is possible to build an equivalent two-layer neural network, which is compatible with (but more general than) related networks used in some recent articles and with the energy model of complex cells. We show that the neural network is unique only up to an arbitrary orthogonal transformation of the excitatory and inhibitory subunits in the first layer." 
+ }, + { + "title": "A Fast Learning Algorithm for Deep Belief Nets", + "abstract": "We show how to use complementary priors to eliminate the explaining-away effects that make inference difficult in densely connected belief nets that have many hidden layers. Using complementary priors, we derive a fast, greedy algorithm that can learn deep, directed belief networks one layer at a time, provided the top two layers form an undirected associative memory. The fast, greedy algorithm is used to initialize a slower learning procedure that fine-tunes the weights using a contrastive version of the wake-sleep algorithm. After fine-tuning, a network with three hidden layers forms a very good generative model of the joint distribution of handwritten digit images and their labels. This generative model gives better digit classification than the best discriminative learning algorithms. The low-dimensional manifolds on which the digits lie are modeled by long ravines in the free-energy landscape of the top-level associative memory, and it is easy to explore these ravines by using the directed connections to display what the associative memory has in mind." + }, + { + "title": "One-shot learning of object categories", + "abstract": "Learning visual models of object categories notoriously requires hundreds or thousands of training examples. We show that it is possible to learn much information about a category from just one, or a handful, of images. The key insight is that, rather than learning from scratch, one can take advantage of knowledge coming from previously learned categories, no matter how different these categories might be. We explore a Bayesian implementation of this idea. Object categories are represented by probabilistic models. Prior knowledge is represented as a probability density function on the parameters of these models. The posterior model for an object category is obtained by updating the prior in the light of one or more observations. 
We test a simple implementation of our algorithm on a database of 101 diverse object categories. We compare category models learned by an implementation of our Bayesian approach to models learned from by maximum likelihood (ML) and maximum a posteriori (MAP) methods. We find that on a database of more than 100 categories, the Bayesian approach produces informative models when the number of training examples is too small for other methods to operate successfully." + }, + { + "title": "Backpropagation Applied to Handwritten Zip Code Recognition", + "abstract": "The ability of learning networks to generalize can be greatly enhanced by providing constraints from the task domain. This paper demonstrates how such constraints can be integrated into a backpropagation network through the architecture of the network. This approach has been successfully applied to the recognition of handwritten zip code digits provided by the U.S. Postal Service. A single network learns the entire recognition operation, going from the normalized image of the character to the final classification." + }, + { + "title": "Visualizing and Understanding Convolutional Networks", + "abstract": null + }, + { + "title": "Improving Histograms of Oriented Gradients for Pedestrian Detection", + "abstract": "— In this paper, we proposed a pedestrian detection system based on Hog Transform Using Open CV for smart vehicles. Two SVM classifiers on Histogram of Oriented Gradient (HOG) features are used to precisely locate pedestrians on the ROI. Experiments report over 30 time’s higher speed than the state-of-the-art method and a comparable detection rate." + }, + { + "title": "Classifi - cation entry", + "abstract": null + }, + { + "title": "Generalized hierarchical matching for sub-category aware object classification", + "abstract": null + }, + { + "title": "Hybrid coding for selective search. 
In PASCAL VOC Classification Challenge", + "abstract": null + }, + { + "title": "Visualizing Higher-Layer Features of a Deep Network", + "abstract": "Deep architectures have demonstrated state-of-the-art results in a variety of settings, especially with vision datasets. Beyond the model definitions and the quantitative analyses, there is a need for qualitative comparisons of the solutions learned by various deep architectures. The goal of this paper is to find good qualitative interpretations of high level features represented by such models. To this end, we contrast and compare several techniques applied on Stacked Denoising Autoencoders and Deep Belief Networks, trained on several vision datasets. We show that, perhaps counter-intuitively, such interpretation is possible at the unit level, that it is simple to accomplish and that the results are consistent across various techniques. We hope that such techniques will allow researchers in deep architectures to understand more of how and why deep architectures work." + }, + { + "title": "The caltech 256", + "abstract": null + }, + { + "title": "Clarifai (2013), http://www.image-net.org/challenges/LSVRC/2013/ results", + "abstract": null + } + ] + }, + "author_data": { + "269d42a6-6a9c-4a39-baf1-ce969039d3b6": { + "pk": "269d42a6-6a9c-4a39-baf1-ce969039d3b6", + "project_name": null, + "name": "Matthew D Zeiler", + "bio": "I am a researcher with a strong focus on developing innovative machine learning techniques, particularly in the realm of deep learning and image representation. My work has led to the creation of ADADELTA, a per-dimension learning rate method that adapts dynamically during training, eliminating the need for manual tuning and proving robust across various architectures and data types. 
\n\nI have also explored novel pooling methods, introducing a differentiable Gaussian-based pooling technique that optimizes location and appearance information simultaneously, enhancing the performance of hierarchical models. My research extends to Temporal Restricted Boltzmann Machines, where I model complex sequences, such as facial expressions, demonstrating significant improvements over traditional methods.\n\nIn addition, I have developed hierarchical models that learn rich image decompositions through alternating layers of convolutional sparse coding and max pooling. This approach captures a spectrum of image features, from low-level edges to high-level object parts, and has shown to outperform conventional feature extraction methods like SIFT.\n\nMy passion lies in uncovering robust low and mid-level image representations that retain critical cues often lost in traditional methods. By leveraging unsupervised learning frameworks, I aim to build hierarchies of features that enhance both image analysis and synthesis. My interdisciplinary work even extends to modeling animal behavior, where I apply machine learning techniques to understand the dynamics of pigeon courtship through motion capture data. Overall, my research is driven by a commitment to advancing the field of machine learning and its applications in understanding complex systems.", + "collaborators": [ + "R. Fergus", + "Graham W. Taylor", + "Dilip Krishnan", + "L. Sigal", + "I. Matthews", + "N. Troje", + "Geoffrey E. 
Hinton" + ], + "pub_titles": [ + "ADADELTA: An Adaptive Learning Rate Method", + "Differentiable Pooling for Hierarchical Feature Learning", + "Facial Expression Transfer with Input-Output Temporal Restricted Boltzmann Machines", + "Adaptive deconvolutional networks for mid and high level feature learning", + "Deconvolutional Networks for Feature Learning", + "Deconvolutional networks", + "Learning Image Decompositions with Hierarchical Sparse Coding", + "Modeling pigeon behavior using a Conditional Restricted Boltzmann Machine" + ], + "pub_abstracts": [ + "We present a novel per-dimension learning rate method for gradient descent called ADADELTA. The method dynamically adapts over time using only first order information and has minimal computational overhead beyond vanilla stochastic gradient descent. The method requires no manual tuning of a learning rate and appears robust to noisy gradient information, different model architecture choices, various data modalities and selection of hyperparameters. We show promising results compared to other methods on the MNIST digit classification task using a single machine and on a large scale voice dataset in a distributed cluster environment.", + "We introduce a parametric form of pooling, based on a Gaussian, which can be optimized alongside the features in a single global objective function. By contrast, existing pooling schemes are based on heuristics (e.g. local maximum) and have no clear link to the cost function of the model. Furthermore, the variables of the Gaussian explicitly store location information, distinct from the appearance captured by the features, thus providing a what/where decomposition of the input signal. Although the differentiable pooling scheme can be incorporated in a wide range of hierarchical models, we demonstrate it in the context of a Deconvolutional Network model (Zeiler et al. ICCV 2011). 
We also explore a number of secondary issues within this model and present detailed experiments on MNIST digits.", + "We present a type of Temporal Restricted Boltzmann Machine that defines a probability distribution over an output sequence conditional on an input sequence. It shares the desirable properties of RBMs: efficient exact inference, an exponentially more expressive latent state than HMMs, and the ability to model nonlinear structure and dynamics. We apply our model to a challenging real-world graphics problem: facial expression transfer. Our results demonstrate improved performance over several baselines modeling high-dimensional 2D and 3D data.", + "We present a hierarchical model that learns image decompositions via alternating layers of convolutional sparse coding and max pooling. When trained on natural images, the layers of our model capture image information in a variety of forms: low-level edges, mid-level edge junctions, high-level object parts and complete objects. To build our model we rely on a novel inference scheme that ensures each layer reconstructs the input, rather than just the output of the layer directly beneath, as is common with existing hierarchical approaches. This makes it possible to learn multiple layers of representation and we show models with 4 layers, trained on images from the Caltech-101 and 256 datasets. When combined with a standard classifier, features extracted from these models outperform SIFT, as well as representations from other feature learning methods.", + "Introduction Building robust low-level image representations, beyond edge primitives, is a long-standing goal in vision. In its most basic form, an image is a matrix of intensities. How we should progress from this matrix to stable mid-level representations, useful for high-level vision tasks, remains unclear. Popular feature representations such as SIFT or HOG spatially pool edge information to form descriptors that are invariant to local transformations. 
However, in doing so important cues such as edge intersections, grouping, parallelism and symmetry are lost. (a)", + "Building robust low and mid-level image representations, beyond edge primitives, is a long-standing goal in vision. Many existing feature detectors spatially pool edge information which destroys cues such as edge intersections, parallelism and symmetry. We present a learning framework where features that capture these mid-level cues spontaneously emerge from image data. Our approach is based on the convolutional decomposition of images under a sparsity constraint and is totally unsupervised. By building a hierarchy of such decompositions we can learn rich feature sets that are a robust image representation for both the analysis and synthesis of images.", + "We present a hierarchical model that learns image decompositions via alternating layers of convolutional sparse coding and max pooling. When trained on natural images, the layers of our model capture image information in a variety of forms: low-level edges, mid-level edge junctions, high-level object parts and complete objects. To build our model we rely on a novel inference scheme that ensures each layer reconstructs the input, rather than just the output of the layer directly beneath, as is common with existing hierarchical approaches. This scheme makes it possible to robustly learn multiple layers of representation and we show a model with 4 layers, trained on images from the Caltech101 dataset. We use our model to produce image decompositions that, when used as input to standard classification schemes, give a significant performance gain over low-level edge features and yield an overall performance competitive with leading approaches.", + "In an effort to better understand the complex courtship behaviour of pigeons, we have built a model learned from motion capture data. We employ a Conditional Restricted Boltzmann Machine (CRBM) with binary latent features and real-valued visible units.
The units are conditioned on information from previous time steps to capture dynam- ics. We validate a trained model by quantifying the characteristic \"head- bobbing\" present in pigeons. We also show how to predict missing data by marginalizing out the hidden variables and minimizing free energy." + ], + "domain": [ + "Machine Learning", + "Computer Vision", + "Deep Learning", + "Generative Models" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + } + }, + "reference_proposal": "**[Question 1] - What is the problem?** \nHow can we effectively visualize and understand the internal operations and feature representations of Convolutional Neural Networks (convnets) to improve model interpretability and performance?\n\n**[Question 2] - Why is it interesting and important?** \nSolving this problem is crucial for the research community as it addresses the black-box nature of deep learning models, particularly convnets, which have achieved remarkable performance in various visual classification tasks. By gaining insights into how these models operate, researchers can develop better architectures and training strategies, leading to advancements in machine learning applications across fields such as computer vision, healthcare, and autonomous systems. Improved interpretability can also enhance trust in AI systems, making them more acceptable for real-world applications.\n\n**[Question 3] - Why is it hard?** \nThe challenges in solving this problem stem from the complexity of deep learning models, which consist of numerous layers and millions of parameters. Naive approaches to understanding these models often fail because they do not account for the hierarchical nature of feature extraction and the non-linear transformations that occur at each layer. Additionally, the lack of direct interpretability of higher-layer features complicates the analysis. 
Technical obstacles include the need for sophisticated visualization techniques that can accurately project feature activations back to the input space and the difficulty in isolating the contributions of individual features to the model's output.\n\n**[Question 4] - Why hasn't it been solved before?** \nPrevious research has primarily focused on visualizing features at the first layer, where projections to pixel space are straightforward, while higher layers remain largely unexplored due to the complexity of their representations. Existing solutions have been limited by the lack of effective tools for analyzing feature maps and understanding their contributions to classification tasks. Barriers such as the absence of comprehensive sensitivity analysis methods and the challenge of training models from scratch without contamination from overlapping datasets have hindered progress. Our approach improves upon prior work by introducing a multi-layered Deconvolutional Network (deconvnet) for detailed visualization and sensitivity analysis, enabling a deeper understanding of feature evolution during training.\n\n**[Question 5] - What are the key components of my approach and results?** \nOur proposed methodology involves using a multi-layered Deconvolutional Network (deconvnet) to visualize feature activations at various layers of a pretrained convnet model. We will analyze the Caltech-101 dataset, employing a linear SVM or softmax classifier on top of varying retained layers to assess feature discriminability. The" + }, + "1510.00726": { + "paper_data": { + "title": "A Primer on Neural Network Models for Natural Language Processing", + "url": "http://arxiv.org/abs/1510.00726v1", + "arxiv_id": "1510.00726", + "authors": [ + "Yoav Goldberg" + ], + "abstract": "Over the past few years, neural networks have re-emerged as powerful machine-learning models, yielding state-of-the-art results in fields such as image recognition and speech processing. 
More recently, neural network models started to be applied also to textual natural language signals, again with very promising results. This tutorial surveys neural network models from the perspective of natural language processing research, in an attempt to bring natural-language researchers up to speed with the neural techniques. The tutorial covers input encoding for natural language tasks, feed-forward networks, convolutional networks, recurrent networks and recursive networks, as well as the computation graph abstraction for automatic gradient computation.", + "introduction": " Introduction to Automatic Differentiation and MATLAB Object-Oriented Programming. SIAM Review, 52(3), 545-563. Nguyen, T. H., & Grishman, R. (2015). Event Detection and Domain Adaptation with Convolutional Neural Networks. In Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers), pp. 365-371, Beijing, China. Association for Computational Linguistics. Nivre, J. (2008). Algorithms for Deterministic Incremental Dependency Parsing. Computational Linguistics, 34(4), 513-553. Okasaki, C. (1999). Purely Functional Data Structures. Cambridge University Press, Cambridge, U.K.; New York. Pascanu, R., Mikolov, T., & Bengio, Y. (2012). On the difficulty of training Recurrent Neural Networks. arXiv:1211.5063 [cs]. Pei, W., Ge, T., & Chang, B. (2015). An Effective Neural Network Model for Graph-based Dependency Parsing. In Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 313-322, Beijing, China. Association for Computational Linguistics. Pennington, J., Socher, R., & Manning, C. (2014). Glove: Global Vectors for Word Representation.
In Proceedings of the 2014 Conference on Empirical background, jargon, tools and methodology that will allow them to understand the principles behind the neural network models and apply them to their own work. This tutorial is expected to be self-contained, while presenting the different approaches under a unified notation and framework. It repeats a lot of material which is available elsewhere. It also points to external sources for more advanced topics when appropriate. This primer is not intended as a comprehensive resource for those that will go on and develop the next advances in neural-network machinery (though it may serve as a good entry point). Rather, it is aimed at those readers who are interested in taking the existing, useful technology and applying it in useful and creative ways to their favourite NLP problems. For more in-depth, general discussion on recursive neural networks and their use in natural language tasks, refer to the PhD thesis of Richard Socher (2014). 40. Before the methods. USSR Computational Mathematics and Mathematical Physics, 4(5), 1-17. Qian, Q., Tian, B., Huang, M., Liu, Y., Zhu, X., & Zhu, X. (2015). Learning Tag Embeddings and Tag-specific Composition Functions in Recursive Neural Network. In Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 1365-1374, Beijing, China. Association for Computational Linguistics. Rong, X. (2014). word2vec Parameter Learning Explained. arXiv:1411.2738 [cs]. Rumelhart, D. E., Hinton, G. E., & Williams, R. J. (1986). Learning representations by back-propagating errors. Nature, 323(6088), 533-536. Santos, C. D., & Zadrozny, B. (2014). Learning Character-level Representations for Part-of-Speech Tagging. pp. 1818-1826. Schuster, M., & Paliwal, K. K. (1997). Bidirectional recurrent neural networks. IEEE Transactions on Signal Processing, 45(11), 2673-2681.
Shawe-Taylor, J., & Cristianini, N. (2004). Kernel Methods in Natural Language Processing, pp. 1366-1371, Seattle, Washington, USA. Association for Computational Linguistics. Xu, W., Auli, M., & Clark, S. (2015). CCG Supertagging with a Recurrent Neural Network. In Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers), pp. 250-255, Beijing, China. Association for Computational Linguistics. Yin, W., & Schütze, H. (2015). Convolutional Neural", + "references": [ + { + "title": "Deep multi-task learning with low level tasks supervised at lower layers", + "abstract": "In all previous work on deep multi-task learning we are aware of, all task supervisions are on the same (outermost) layer. We present a multi-task learning architecture with deep bi-directional RNNs, where different tasks supervision can happen at different layers. We present experiments in syntactic chunking and CCG supertagging, coupled with the additional task of POS-tagging. We show that it is consistently better to have POS supervision at the innermost rather than the outermost layer. We argue that this is because “lowlevel” tasks are better kept at the lower layers, enabling the higher-level tasks to make use of the shared representation of the lower-level tasks. Finally, we also show how this architecture can be used for domain adaptation." + }, + { + "title": "Training with Exploration Improves a Greedy Stack LSTM Parser", + "abstract": "We adapt the greedy Stack-LSTM dependency parser of Dyer et al. (2015) to support a training-with-exploration procedure using dynamic oracles (Goldberg and Nivre, 2013) instead of cross-entropy minimization.
This form of training, which accounts for model predictions at training time rather than assuming an error-free action history, improves parsing accuracies for both English and Chinese, obtaining very strong results for both languages. We discuss some modifications needed in order to get training with exploration to work well for a probabilistic neural-network." + }, + { + "title": "Stack-propagation: Improved Representation Learning for Syntax", + "abstract": "Traditional syntax models typically leverage part-of-speech (POS) information by constructing features from hand-tuned templates. We demonstrate that a better approach is to utilize POS tags as a regularizer of learned representations. We propose a simple method for learning a stacked pipeline of models which we call “stack-propagation”. We apply this to dependency parsing and tagging, where we use the hidden layer of the tagger network as a representation of the input tokens for the parser. At test time, our parser does not require predicted POS tags. On 19 languages from the Universal Dependencies, our method is 1.3% (absolute) more accurate than a state-of-the-art graph-based approach and 2.7% more accurate than the most comparable greedy model." + }, + { + "title": "Benefits of Depth in Neural Networks", + "abstract": "For any positive integer $k$, there exist neural networks with $\\Theta(k^3)$ layers, $\\Theta(1)$ nodes per layer, and $\\Theta(1)$ distinct parameters which can not be approximated by networks with $\\mathcal{O}(k)$ layers unless they are exponentially large --- they must possess $\\Omega(2^k)$ nodes. 
This result is proved here for a class of nodes termed \"semi-algebraic gates\" which includes the common choices of ReLU, maximum, indicator, and piecewise polynomial functions, therefore establishing benefits of depth against not just standard networks with ReLU gates, but also convolutional networks with ReLU and maximization gates, sum-product networks, and boosted decision trees (in this last case with a stronger separation: $\\Omega(2^{k^3})$ total tree nodes are required)." + }, + { + "title": "Exploring the Limits of Language Modeling", + "abstract": "In this work we explore recent advances in Recurrent Neural Networks for large scale Language Modeling, a task central to language understanding. We extend current models to deal with two key challenges present in this task: corpora and vocabulary sizes, and complex, long term structure of language. We perform an exhaustive study on techniques such as character Convolutional Neural Networks or Long-Short Term Memory, on the One Billion Word Benchmark. Our best single model significantly improves state-of-the-art perplexity from 51.3 down to 30.0 (whilst reducing the number of parameters by a factor of 20), while an ensemble of models sets a new record by improving perplexity from 41.0 down to 23.7. We also release these models for the NLP and ML community to study and improve upon." + }, + { + "title": "Natural Language Understanding with Distributed Representation", + "abstract": "This is a lecture note for the course DS-GA 3001 at the Center for Data Science , New York University in Fall, 2015. As the name of the course suggests, this lecture note introduces readers to a neural network based approach to natural language understanding/processing. In order to make it as self-contained as possible, I spend much time on describing basics of machine learning and neural networks, only after which how they are used for natural languages is introduced. 
On the language front, I almost solely focus on language modelling and machine translation, two of which I personally find most fascinating and most fundamental to natural language understanding." + }, + { + "title": "Multi-task Sequence to Sequence Learning", + "abstract": "Sequence to sequence learning has recently emerged as a new paradigm in supervised learning. To date, most of its applications focused on only one task and not much work explored this framework for multiple tasks. This paper examines three multi-task learning (MTL) settings for sequence to sequence models: (a) the oneto-many setting - where the encoder is shared between several tasks such as machine translation and syntactic parsing, (b) the many-to-one setting - useful when only the decoder can be shared, as in the case of translation and image caption generation, and (c) the many-to-many setting - where multiple encoders and decoders are shared, which is the case with unsupervised objectives and translation. Our results show that training on a small amount of parsing and image caption data can improve the translation quality between English and German by up to 1.5 BLEU points over strong single-task baselines on the WMT benchmarks. Furthermore, we have established a new state-of-the-art result in constituent parsing with 93.0 F1. Lastly, we reveal interesting properties of the two unsupervised learning objectives, autoencoder and skip-thought, in the MTL context: autoencoder helps less in terms of perplexities but more on BLEU scores compared to skip-thought." + }, + { + "title": "Sentence Compression by Deletion with LSTMs", + "abstract": "We present an LSTM approach to deletion-based sentence compression where the task is to translate a sentence into a sequence of zeros and ones, corresponding to token deletion decisions. 
We demonstrate that even the most basic version of the system, which is given no syntactic information (no PoS or NE tags, or dependencies) or desired compression length, performs surprisingly well: around 30% of the compressions from a large test set could be regenerated. We compare the LSTM system with a competitive baseline which is trained on the same amount of data but is additionally provided with all kinds of linguistic features. In an experiment with human raters the LSTMbased model outperforms the baseline achieving 4.5 in readability and 3.8 in informativeness." + }, + { + "title": "The Forest Convolutional Network: Compositional Distributional Semantics with a Neural Chart and without Binarization", + "abstract": "According to the principle of compositionality, the meaning of a sentence is computed from the meaning of its parts and the way they are syntactically combined. In practice, however, the syntactic structure is computed by automatic parsers which are far-from-perfect and not tuned to the specifics of the task. Current recursive neural network (RNN) approaches for computing sentence meaning therefore run into a number of practical difficulties, including the need to carefully select a parser appropriate for the task, deciding how and to what extent syntactic context modifies the semantic composition function, as well as on how to transform parse trees to conform to the branching settings (typically, binary branching) of the RNN. This paper introduces a new model, the Forest Convolutional Network, that avoids all of these challenges, by taking a parse forest as input, rather than a single tree, and by allowing arbitrary branching factors. We report improvements over the state-of-the-art in sentiment analysis and question classification." + }, + { + "title": "Character-Aware Neural Language Models", + "abstract": "\n \n We describe a simple neural language model that relies only on character-level inputs. 
Predictions are still made at the word-level. Our model employs a convolutional neural network (CNN) and a highway net work over characters, whose output is given to a long short-term memory (LSTM) recurrent neural network language model (RNN-LM). On the English Penn Treebank the model is on par with the existing state-of-the-art despite having 60% fewer parameters. On languages with rich morphology (Arabic, Czech, French, German, Spanish, Russian), the model outperforms word-level/morpheme-level LSTM baselines, again with fewer parameters. The results suggest that on many languages, character inputs are sufficient for language modeling. Analysis of word representations obtained from the character composition part of the model reveals that the model is able to encode, from characters only, both semantic and orthographic information.\n \n" + }, + { + "title": "Finding Function in Form: Compositional Character Models for Open Vocabulary Word Representation", + "abstract": "We introduce a model for constructing vector representations of words by composing characters using bidirectional LSTMs. Relative to traditional word representation models that have independent vectors for each word type, our model requires only a single vector per character type and a fixed set of parameters for the compositional model. Despite the compactness of this model and, more importantly, the arbitrary nature of the form‐function relationship in language, our “composed” word representations yield state-of-the-art results in language modeling and part-of-speech tagging. Benefits over traditional baselines are particularly pronounced in morphologically rich languages (e.g., Turkish)." + }, + { + "title": "Improved Transition-based Parsing by Modeling Characters instead of Words with LSTMs", + "abstract": "We present extensions to a continuousstate dependency parsing method that makes it applicable to morphologically rich languages. 
Starting with a highperformance transition-based parser that uses long short-term memory (LSTM) recurrent neural networks to learn representations of the parser state, we replace lookup-based word representations with representations constructed from the orthographic representations of the words, also using LSTMs. This allows statistical sharing across word forms that are similar on the surface. Experiments for morphologically rich languages show that the parsing model benefits from incorporating the character-based encodings of words." + }, + { + "title": "A Dependency-Based Neural Network for Relation Classification", + "abstract": "Previous research on relation classification has verified the effectiveness of using dependency shortest paths or subtrees. In this paper, we further explore how to make full use of the combination of these dependency information. We first propose a new structure, termed augmented dependency path (ADP), which is composed of the shortest dependency path between two entities and the subtrees attached to the shortest path. To exploit the semantic representation behind the ADP structure, we develop dependency-based neural networks (DepNN): a recursive neural network designed to model the subtrees, and a convolutional neural network to capture the most important features on the shortest path. Experiments on the SemEval-2010 dataset show that our proposed method achieves state-of-art results." + }, + { + "title": "An Empirical Exploration of Recurrent Network Architectures", + "abstract": "The Recurrent Neural Network (RNN) is an extremely powerful sequence model that is often difficult to train. The Long Short-Term Memory (LSTM) is a specific RNN architecture whose design makes it much easier to train. While wildly successful in practice, the LSTM's architecture appears to be ad-hoc so it is not clear if it is optimal, and the significance of its individual components is unclear. 
\n \nIn this work, we aim to determine whether the LSTM architecture is optimal or whether much better architectures exist. We conducted a thorough architecture search where we evaluated over ten thousand different RNN architectures, and identified an architecture that outperforms both the LSTM and the recently-introduced Gated Recurrent Unit (GRU) on some but not all tasks. We found that adding a bias of 1 to the LSTM's forget gate closes the gap between the LSTM and the GRU." + }, + { + "title": "Long Short-Term Memory Over Recursive Structures", + "abstract": "The chain-structured long short-term memory (LSTM) has showed to be effective in a wide range of problems such as speech recognition and machine translation. In this paper, we propose to extend it to tree structures, in which a memory cell can reflect the history memories of multiple child cells or multiple descendant cells in a recursive process. We call the model S-LSTM, which provides a principled way of considering long-distance interaction over hierarchies, e.g., language or image parse structures. We leverage the models for semantic composition to understand the meaning of text, a fundamental problem in natural language understanding, and show that it outperforms a state-of-the-art recursive model by replacing its composition layers with the S-LSTM memory blocks. We also show that utilizing the given structures is helpful in achieving a performance better than that without considering the structures." + }, + { + "title": "Transition-based Neural Constituent Parsing", + "abstract": "Constituent parsing is typically modeled by a chart-based algorithm under probabilistic context-free grammars or by a transition-based algorithm with rich features. Previous models rely heavily on richer syntactic information through lexicalizing rules, splitting categories, or memorizing long histories. 
However enriched models incur numerous parameters and sparsity issues, and are insufficient for capturing various syntactic phenomena. We propose a neural network structure that explicitly models the unbounded history of actions performed on the stack and queue employed in transition-based parsing, in addition to the representations of partially parsed tree structure. Our transition-based neural constituent parsing achieves performance comparable to the state-of-the-art parsers, demonstrating F1 score of 90.68% for English and 84.33% for Chinese, without reranking, feature templates or additional data to train model parameters." + }, + { + "title": "CCG Supertagging with a Recurrent Neural Network", + "abstract": "Recent work on supertagging using a feedforward neural network achieved significant improvements for CCG supertagging and parsing (Lewis and Steedman, 2014). However, their architecture is limited to considering local contexts and does not naturally model sequences of arbitrary length. In this paper, we show how directly capturing sequence information using a recurrent neural network leads to further accuracy improvements for both supertagging (up to 1.9%) and parsing (up to 1% F1), on CCGBank, Wikipedia and biomedical text." + }, + { + "title": "Predicting Polarities of Tweets by Composing Word Embeddings with Long Short-Term Memory", + "abstract": "In this paper, we introduce Long ShortTerm Memory (LSTM) recurrent network for twitter sentiment prediction. With the help of gates and constant error carousels in the memory block structure, the model could handle interactions between words through a flexible compositional function. Experiments on a public noisy labelled data show that our model outperforms several feature-engineering approaches, with the result comparable to the current best data-driven technique. 
According to the evaluation on a generated negation phrase test set, the proposed architecture doubles the performance of non-neural model based on bag-of-word features. Furthermore, words with special functions (such as negation and transition) are distinguished and the dissimilarities of words with opposite sentiment are magnified. An interesting case study on negation expression processing shows a promising potential of the architecture dealing with complex sentiment phrases." + }, + { + "title": "Non-Linear Text Regression with a Deep Convolutional Neural Network", + "abstract": "Text regression has traditionally been tackled using linear models. Here we present a non-linear method based on a deep convolutional neural network. We show that despite having millions of parameters, this model can be trained on only a thousand documents, resulting in a 40% relative improvement over sparse linear models, the previous state of the art. Further, this method is flexible allowing for easy incorporation of side information such as document meta-data. Finally we present a novel technique for interpreting the effect of different text inputs on this complex non-linear model." + }, + { + "title": "Semantic Clustering and Convolutional Neural Network for Short Text Categorization", + "abstract": "Short texts usually encounter data sparsity and ambiguity problems in representations for their lack of context. In this paper, we propose a novel method to model short texts based on semantic clustering and convolutional neural network. Particularly, we first discover semantic cliques in embedding spaces by a fast clustering algorithm. Then, multi-scale semantic units are detected under the supervision of semantic cliques, which introduce useful external knowledge for short texts. These meaningful semantic units are combined and fed into convolutional layer, followed by max-pooling operation. 
Experimental results on two open benchmarks validate the effectiveness of the proposed method." + }, + { + "title": "A Neural Probabilistic Structured-Prediction Model for Transition-Based Dependency Parsing", + "abstract": "Neural probabilistic parsers are attractive for their capability of automatic feature combination and small data sizes. A transition-based greedy neural parser has given better accuracies over its linear counterpart. We propose a neural probabilistic structured-prediction model for transition-based dependency parsing, which integrates search and learning. Beam search is used for decoding, and contrastive learning is performed for maximizing the sentence-level log-likelihood. In standard Penn Treebank experiments, the structured neural parser achieves a 1.8% accuracy improvement upon a competitive greedy neural parser baseline, giving performance comparable to the best linear parser." + }, + { + "title": "Learning Tag Embeddings and Tag-specific Composition Functions in Recursive Neural Network", + "abstract": "Recursive neural network is one of the most successful deep learning models for natural language processing due to the compositional nature of text. The model recursively composes the vector of a parent phrase from those of child words or phrases, with a key component named composition function. Although a variety of composition functions have been proposed, the syntactic information has not been fully encoded in the composition process. We propose two models, Tag Guided RNN (TGRNN for short) which chooses a composition function according to the part-ofspeech tag of a phrase, and Tag Embedded RNN/RNTN (TE-RNN/RNTN for short) which learns tag embeddings and then combines tag and word embeddings together. 
In the fine-grained sentiment classification, experiment results show the proposed models obtain remarkable improvement: TG-RNN/TE-RNN obtain remarkable improvement over baselines, TE-RNTN obtains the second best result among all the top performing models, and all the proposed models have much less parameters/complexity than their counterparts." + }, + { + "title": "An Effective Neural Network Model for Graph-based Dependency Parsing", + "abstract": "Most existing graph-based parsing models rely on millions of hand-crafted features, which limits their generalization ability and slows down the parsing speed. In this paper, we propose a general and effective Neural Network model for graph-based dependency parsing. Our model can automatically learn high-order feature combinations using only atomic features by exploiting a novel activation function tanhcube. Moreover, we propose a simple yet effective way to utilize phrase-level information that is expensive to use in conventional graph-based parsers. Experiments on the English Penn Treebank show that parsers based on our model perform better than conventional graph-based parsers." + }, + { + "title": "Dependency-based Convolutional Neural Networks for Sentence Embedding", + "abstract": "In sentence modeling and classification, convolutional neural network approaches have recently achieved state-of-the-art results, but all such efforts process word vectors sequentially and neglect long-distance dependencies. To exploit both deep learning and linguistic structures, we propose a tree-based convolutional neural network model which exploit various long-distance relationships between words. Our model improves the sequential baselines on all three sentiment and question classification tasks, and achieves the highest published accuracy on TREC." 
+ }, + { + "title": "Event Extraction via Dynamic Multi-Pooling Convolutional Neural Networks", + "abstract": "Traditional approaches to the task of ACE event extraction primarily rely on elaborately designed features and complicated natural language processing (NLP) tools. These traditional approaches lack generalization, take a large amount of human effort and are prone to error propagation and data sparsity problems. This paper proposes a novel event-extraction method, which aims to automatically extract lexical-level and sentence-level features without using complicated NLP tools. We introduce a word-representation model to capture meaningful semantic regularities for words and adopt a framework based on a convolutional neural network (CNN) to capture sentence-level clues. However, CNN can only capture the most important information in a sentence and may miss valuable facts when considering multiple-event sentences. We propose a dynamic multi-pooling convolutional neural network (DMCNN), which uses a dynamic multi-pooling layer according to event triggers and arguments, to reserve more crucial information. The experimental results show that our approach significantly outperforms other state-of-the-art methods." + }, + { + "title": "Question Answering over Freebase with Multi-Column Convolutional Neural Networks", + "abstract": "Answering natural language questions over a knowledge base is an important and challenging task. Most of existing systems typically rely on hand-crafted features and rules to conduct question understanding and/or answer ranking. In this paper, we introduce multi-column convolutional neural networks (MCCNNs) to understand questions from three different aspects (namely, answer path, answer context, and answer type) and learn their distributed representations. Meanwhile, we jointly learn low-dimensional embeddings of entities and relations in the knowledge base. Question-answer pairs are used to train the model to rank candidate answers. 
We also leverage question paraphrases to train the column networks in a multi-task learning manner. We use FREEBASE as the knowledge base and conduct extensive experiments on the WEBQUESTIONS dataset. Experimental results show that our method achieves better or comparable performance compared with baseline systems. In addition, we develop a method to compute the salience scores of question words in different column networks. The results help us intuitively understand what MCCNNs learn." + }, + { + "title": "Deep Unordered Composition Rivals Syntactic Methods for Text Classification", + "abstract": "Many existing deep learning models for natural language processing tasks focus on learning the compositionality of their inputs, which requires many expensive computations. We present a simple deep neural network that competes with and, in some cases, outperforms such models on sentiment analysis and factoid question answering tasks while taking only a fraction of the training time. While our model is syntactically-ignorant, we show significant improvements over previous bag-of-words models by deepening our network and applying a novel variant of dropout. Moreover, our model performs better than syntactic models on datasets with high syntactic variance. We show that our model makes similar errors to syntactically-aware models, indicating that for the tasks we consider, nonlinearly transforming the input is more important than tailoring a network to incorporate word order and syntax." + }, + { + "title": "Neural CRF Parsing", + "abstract": "This paper describes a parsing model that combines the exact dynamic programming of CRF parsing with the rich nonlinear featurization of neural net approaches. Our model is structurally a CRF that factors over anchored rule productions, but instead of linear potential functions based on sparse features, we use nonlinear potentials computed via a feedforward neural network. 
Because potentials are still local to anchored rules, structured inference (CKY) is unchanged from the sparse case. Computing gradients during learning involves backpropagating an error signal formed from standard CRF sufficient statistics (expected rule counts). Using only dense features, our neural CRF already exceeds a strong baseline CRF model (Hall et al., 2014). In combination with sparse features, our system achieves 91.1 F1 on section 23 of the Penn Treebank, and more generally outperforms the best prior single parser results on a range of languages." + }, + { + "title": "Multi-domain Dialog State Tracking using Recurrent Neural Networks", + "abstract": "Dialog state tracking is a key component of many modern dialog systems, most of which are designed with a single, well-defined domain in mind. This paper shows that dialog data drawn from different dialog domains can be used to train a general belief tracking model which can operate across all of these domains, exhibiting superior performance to each of the domain-specific models. We propose a training procedure which uses out-of-domain data to initialise belief tracking models for entirely new domains. This procedure leads to improvements in belief tracking performance regardless of the amount of in-domain data available for training the model." + }, + { + "title": "A Neural Network Approach to Context-Sensitive Generation of Conversational Responses", + "abstract": "We present a novel response generation system that can be trained end to end on large quantities of unstructured Twitter conversations. A neural network architecture is used to address sparsity issues that arise when integrating contextual information into classic statistical models, allowing the system to take into account previous dialog utterances. Our dynamic-context generative models show consistent gains over both context-sensitive and non-context-sensitive Machine Translation and Information Retrieval baselines." 
+ }, + { + "title": "Structured Training for Neural Network Transition-Based Parsing", + "abstract": "We present structured perceptron training for neural network transition-based dependency parsing. We learn the neural network representation using a gold corpus augmented by a large number of automatically parsed sentences. Given this fixed network representation, we learn a final layer using the structured perceptron with beam-search decoding. On the Penn Treebank, our parser reaches 94.26% unlabeled and 92.41% labeled attachment accuracy, which to our knowledge is the best accuracy on Stanford Dependencies to date. We also provide indepth ablative analysis to determine which aspects of our model provide the largest gains in accuracy." + }, + { + "title": "Dropout as a Bayesian Approximation: Representing Model Uncertainty in Deep Learning", + "abstract": "Deep learning tools have gained tremendous attention in applied machine learning. However such tools for regression and classification do not capture model uncertainty. In comparison, Bayesian models offer a mathematically grounded framework to reason about model uncertainty, but usually come with a prohibitive computational cost. In this paper we develop a new theoretical framework casting dropout training in deep neural networks (NNs) as approximate Bayesian inference in deep Gaussian processes. A direct result of this theory gives us tools to model uncertainty with dropout NNs -- extracting information from existing models that has been thrown away so far. This mitigates the problem of representing uncertainty in deep learning without sacrificing either computational complexity or test accuracy. We perform an extensive study of the properties of dropout's uncertainty. Various network architectures and non-linearities are assessed on tasks of regression and classification, using MNIST as an example. 
We show a considerable improvement in predictive log-likelihood and RMSE compared to existing state-of-the-art methods, and finish by using dropout's uncertainty in deep reinforcement learning." + }, + { + "title": "Visualizing and Understanding Recurrent Networks", + "abstract": "Recurrent Neural Networks (RNNs), and specifically a variant with Long Short-Term Memory (LSTM), are enjoying renewed interest as a result of successful applications in a wide range of machine learning problems that involve sequential data. However, while LSTMs provide exceptional results in practice, the source of their performance and their limitations remain rather poorly understood. Using character-level language models as an interpretable testbed, we aim to bridge this gap by providing an analysis of their representations, predictions and error types. In particular, our experiments reveal the existence of interpretable cells that keep track of long-range dependencies such as line lengths, quotes and brackets. Moreover, our comparative analysis with finite horizon n-gram models traces the source of the LSTM improvements to long-range structural dependencies. Finally, we provide analysis of the remaining errors and suggests areas for further study." + }, + { + "title": "Transition-Based Dependency Parsing with Stack Long Short-Term Memory", + "abstract": "This work was sponsored in part by the U. S. Army Research Laboratory and the U. S. Army Research Office/nunder contract/grant number W911NF-10-1-0533, and in part by NSF CAREER grant IIS-1054319./nMiguel Ballesteros is supported by the European Commission under the contract numbers FP7-ICT-610411 (project MULTISENSOR) and H2020-RIA-645012 (project KRISTINA)." + }, + { + "title": "A Re-ranking Model for Dependency Parser with Recursive Convolutional Neural Network", + "abstract": "In this work, we address the problem to model all the nodes (words or phrases) in a dependency tree with the dense representations. 
We propose a recursive convolutional neural network (RCNN) architecture to capture syntactic and compositional-semantic representations of phrases and words in a dependency tree. Different with the original recursive neural network, we introduce the convolution and pooling layers, which can model a variety of compositions by the feature maps and choose the most informative compositions by the pooling layers. Based on RCNN, we use a discriminative model to re-rank a $k$-best list of candidate dependency parsing trees. The experiments show that RCNN is very effective to improve the state-of-the-art dependency parsing on both English and Chinese datasets." + }, + { + "title": "Improving Distributional Similarity with Lessons Learned from Word Embeddings", + "abstract": "Recent trends suggest that neural-network-inspired word embedding models outperform traditional count-based distributional models on word similarity and analogy detection tasks. We reveal that much of the performance gains of word embeddings are due to certain system design choices and hyperparameter optimizations, rather than the embedding algorithms themselves. Furthermore, we show that these modifications can be transferred to traditional distributional models, yielding similar gains. In contrast to prior reports, we observe mostly local or insignificant performance differences between the methods, with no global advantage to any single approach over the others." + }, + { + "title": "Classifying Relations by Ranking with Convolutional Neural Networks", + "abstract": "Relation classification is an important semantic processing task for which state-ofthe-art systems still rely on costly handcrafted features. In this work we tackle the relation classification task using a convolutional neural network that performs classification by ranking (CR-CNN). We propose a new pairwise ranking loss function that makes it easy to reduce the impact of artificial classes. 
We perform experiments using the the SemEval-2010 Task 8 dataset, which is designed for the task of classifying the relationship between two nominals marked in a sentence. Using CRCNN, we outperform the state-of-the-art for this dataset and achieve a F1 of 84.1 without using any costly handcrafted features. Additionally, our experimental results show that: (1) our approach is more effective than CNN followed by a softmax classifier; (2) omitting the representation of the artificial class Other improves both precision and recall; and (3) using only word embeddings as input features is enough to achieve state-of-the-art results if we consider only the text between the two target nominals." + }, + { + "title": "A Simple Way to Initialize Recurrent Networks of Rectified Linear Units", + "abstract": "Learning long term dependencies in recurrent networks is difficult due to vanishing and exploding gradients. To overcome this difficulty, researchers have developed sophisticated optimization techniques and network architectures. In this paper, we propose a simpler solution that use recurrent neural networks composed of rectified linear units. Key to our solution is the use of the identity matrix or its scaled version to initialize the recurrent weight matrix. We find that our solution is comparable to LSTM on our four benchmarks: two toy problems involving long-range temporal structures, a large language modeling problem and a benchmark speech recognition problem." + }, + { + "title": "Long Short-Term Memory Over Tree Structures", + "abstract": "The chain-structured long short-term memory (LSTM) has showed to be effective in a wide range of problems such as speech recognition and machine translation. In this paper, we propose to extend it to tree structures, in which a memory cell can reflect the history memories of multiple child cells or multiple descendant cells in a recursive process. 
We call the model S-LSTM, which provides a principled way of considering long-distance interaction over hierarchies, e.g., language or image parse structures. We leverage the models for semantic composition to understand the meaning of text, a fundamental problem in natural language understanding, and show that it outperforms a state-of-the-art recursive model by replacing its composition layers with the S-LSTM memory blocks. We also show that utilizing the given structures is helpful in achieving a performance better than that without considering the structures." + }, + { + "title": "LSTM: A Search Space Odyssey", + "abstract": "Several variants of the long short-term memory (LSTM) architecture for recurrent neural networks have been proposed since its inception in 1995. In recent years, these networks have become the state-of-the-art models for a variety of machine learning problems. This has led to a renewed interest in understanding the role and utility of various computational components of typical LSTM variants. In this paper, we present the first large-scale analysis of eight LSTM variants on three representative tasks: speech recognition, handwriting recognition, and polyphonic music modeling. The hyperparameters of all LSTM variants for each task were optimized separately using random search, and their importance was assessed using the powerful functional ANalysis Of VAriance framework. In total, we summarize the results of 5400 experimental runs ( $\\approx 15$ years of CPU time), which makes our study the largest of its kind on LSTM networks. Our results show that none of the variants can improve upon the standard LSTM architecture significantly, and demonstrate the forget gate and the output activation function to be its most critical components. We further observe that the studied hyperparameters are virtually independent and derive guidelines for their efficient adjustment." 
+ }, + { + "title": "Improved Semantic Representations From Tree-Structured Long Short-Term Memory Networks", + "abstract": "Because of their superior ability to preserve sequence information over time, Long Short-Term Memory (LSTM) networks, a type of recurrent neural network with a more complex computational unit, have obtained strong results on a variety of sequence modeling tasks. The only underlying LSTM structure that has been explored so far is a linear chain. However, natural language exhibits syntactic properties that would naturally combine words to phrases. We introduce the Tree-LSTM, a generalization of LSTMs to tree-structured network topologies. Tree-LSTMs outperform all existing systems and strong LSTM baselines on two tasks: predicting the semantic relatedness of two sentences (SemEval 2014, Task 1) and sentiment classification (Stanford Sentiment Treebank)." + }, + { + "title": "Automatic differentiation in machine learning: a survey", + "abstract": "Derivatives, mostly in the form of gradients and Hessians, are ubiquitous in machine learning. Automatic differentiation (AD), also called algorithmic differentiation or simply “auto-diff”, is a family of techniques similar to but more general than backpropagation for efficiently and accurately evaluating derivatives of numeric functions expressed as computer programs. AD is a small but established field with applications in areas including computational fluid dynamics, atmospheric sciences, and engineering design optimization. Until \nvery recently, the fields of machine learning and AD have largely been unaware of each other and, in some cases, have independently discovered each other’s results. Despite its \nrelevance, general-purpose AD has been missing from the machine learning toolbox, a situation slowly changing with its ongoing adoption under the names “dynamic computational \ngraphs” and “differentiable programming”. 
We survey the intersection of AD and machine learning, cover applications where AD has direct relevance, and address the main imple- \nmentation techniques. By precisely defining the main differentiation techniques and their interrelationships, we aim to bring clarity to the usage of the terms “autodiff”, “automatic differentiation”, and “symbolic differentiation” as these are encountered more and more in machine learning settings." + }, + { + "title": "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift", + "abstract": "Training Deep Neural Networks is complicated by the fact that the distribution of each layer's inputs changes during training, as the parameters of the previous layers change. This slows down the training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. We refer to this phenomenon as internal covariate shift, and address the problem by normalizing layer inputs. Our method draws its strength from making normalization a part of the model architecture and performing the normalization for each training mini-batch. Batch Normalization allows us to use much higher learning rates and be less careful about initialization, and in some cases eliminates the need for Dropout. Applied to a state-of-the-art image classification model, Batch Normalization achieves the same accuracy with 14 times fewer training steps, and beats the original model by a significant margin. Using an ensemble of batch-normalized networks, we improve upon the best published result on ImageNet classification: reaching 4.82% top-5 test error, exceeding the accuracy of human raters." + }, + { + "title": "Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification", + "abstract": "Rectified activation units (rectifiers) are essential for state-of-the-art neural networks. 
In this work, we study rectifier neural networks for image classification from two aspects. First, we propose a Parametric Rectified Linear Unit (PReLU) that generalizes the traditional rectified unit. PReLU improves model fitting with nearly zero extra computational cost and little overfitting risk. Second, we derive a robust initialization method that particularly considers the rectifier nonlinearities. This method enables us to train extremely deep rectified models directly from scratch and to investigate deeper or wider network architectures. Based on the learnable activation and advanced initialization, we achieve 4.94% top-5 test error on the ImageNet 2012 classification dataset. This is a 26% relative improvement over the ILSVRC 2014 winner (GoogLeNet, 6.66% [33]). To our knowledge, our result is the first to surpass the reported human-level performance (5.1%, [26]) on this dataset." + }, + { + "title": "Learning Longer Memory in Recurrent Neural Networks", + "abstract": "Recurrent neural network is a powerful model that learns temporal patterns in sequential data. For a long time, it was believed that recurrent networks are difficult to train using simple optimizers, such as stochastic gradient descent, due to the so-called vanishing gradient problem. In this paper, we show that learning longer term patterns in real data, such as in natural language, is perfectly possible using gradient descent. This is achieved by using a slight structural modification of the simple recurrent neural network architecture. We encourage some of the hidden units to change their state slowly by making part of the recurrent weight matrix close to identity, thus forming kind of a longer term memory. We evaluate our model in language modeling experiments, where we obtain similar performance to the much more complex Long Short Term Memory (LSTM) networks (Hochreiter & Schmidhuber, 1997)." 
+ }, + { + "title": "Adam: A Method for Stochastic Optimization", + "abstract": "We introduce Adam, an algorithm for first-order gradient-based optimization of stochastic objective functions, based on adaptive estimates of lower-order moments. The method is straightforward to implement, is computationally efficient, has little memory requirements, is invariant to diagonal rescaling of the gradients, and is well suited for problems that are large in terms of data and/or parameters. The method is also appropriate for non-stationary objectives and problems with very noisy and/or sparse gradients. The hyper-parameters have intuitive interpretations and typically require little tuning. Some connections to related algorithms, on which Adam was inspired, are discussed. We also analyze the theoretical convergence properties of the algorithm and provide a regret bound on the convergence rate that is comparable to the best known results under the online convex optimization framework. Empirical results demonstrate that Adam works well in practice and compares favorably to other stochastic optimization methods. Finally, we discuss AdaMax, a variant of Adam based on the infinity norm." + }, + { + "title": "Embedding Word Similarity with Neural Machine Translation", + "abstract": "Neural language models learn word representations, or embeddings, that capture rich linguistic and conceptual information. Here we investigate the embeddings learned by neural machine translation models, a recently-developed class of neural language model. We show that embeddings from translation models outperform those learned by monolingual models at tasks that require knowledge of both conceptual similarity and lexical-syntactic role. We further show that these effects hold when translating from both English to French and English to German, and argue that the desirable properties of translation embeddings should emerge largely independently of the source and target languages. 
Finally, we apply a new method for training neural translation models with very large vocabularies, and show that this vocabulary expansion algorithm results in minimal degradation of embedding quality. Our embedding spaces can be queried in an online demo and downloaded from our web page. Overall, our analyses indicate that translation-based embeddings should be used in applications that require concepts to be organised according to similarity and/or lexical function, while monolingual embeddings are better suited to modelling (nonspecific) inter-word relatedness." + }, + { + "title": "Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling", + "abstract": "In this paper we compare different types of recurrent units in recurrent neural networks (RNNs). Especially, we focus on more sophisticated units that implement a gating mechanism, such as a long short-term memory (LSTM) unit and a recently proposed gated recurrent unit (GRU). We evaluate these recurrent units on the tasks of polyphonic music modeling and speech signal modeling. Our experiments revealed that these advanced recurrent units are indeed better than more traditional recurrent units such as tanh units. Also, we found GRU to be comparable to LSTM." + }, + { + "title": "Neural Word Embedding as Implicit Matrix Factorization", + "abstract": "We analyze skip-gram with negative-sampling (SGNS), a word embedding method introduced by Mikolov et al., and show that it is implicitly factorizing a word-context matrix, whose cells are the pointwise mutual information (PMI) of the respective word and context pairs, shifted by a global constant. We find that another embedding method, NCE, is implicitly factorizing a similar matrix, where each cell is the (shifted) log conditional probability of a word given its context. We show that using a sparse Shifted Positive PMI word-context matrix to represent words improves results on two word similarity tasks and one of two analogy tasks. 
When dense low-dimensional vectors are preferred, exact factorization with SVD can achieve solutions that are at least as good as SGNS's solutions for word similarity tasks. On analogy questions SGNS remains superior to SVD. We conjecture that this stems from the weighted nature of SGNS's factorization." + }, + { + "title": "Effective Use of Word Order for Text Categorization with Convolutional Neural Networks", + "abstract": "Convolutional neural network (CNN) is a neural network that can make use of the internal structure of data such as the 2D structure of image data. This paper studies CNN on text categorization to exploit the 1D structure (namely, word order) of text data for accurate prediction. Instead of using low-dimensional word vectors as input as is often done, we directly apply CNN to high-dimensional text data, which leads to directly learning embedding of small text regions for use in classification. In addition to a straightforward adaptation of CNN from image to text, a simple but new variation which employs bag-of-word conversion in the convolution layer is proposed. An extension to combine multiple convolution layers is also explored for higher accuracy. The experiments demonstrate the effectiveness of our approach in comparison with state-of-the-art methods." + }, + { + "title": "word2vec Parameter Learning Explained", + "abstract": "The word2vec model and application by Mikolov et al. have attracted a great amount of attention in recent two years. The vector representations of words learned by word2vec models have been shown to carry semantic meanings and are useful in various NLP tasks. As an increasing number of researchers would like to experiment with word2vec or similar techniques, I notice that there lacks a material that comprehensively explains the parameter learning process of word embedding models in details, thus preventing researchers that are non-experts in neural networks from understanding the working mechanism of such models. 
\nThis note provides detailed derivations and explanations of the parameter update equations of the word2vec models, including the original continuous bag-of-word (CBOW) and skip-gram (SG) models, as well as advanced optimization techniques, including hierarchical softmax and negative sampling. Intuitive interpretations of the gradient equations are also provided alongside mathematical derivations. \nIn the appendix, a review on the basics of neuron networks and backpropagation is provided. I also created an interactive demo, wevi, to facilitate the intuitive understanding of the model." + }, + { + "title": "BilBOWA: Fast Bilingual Distributed Representations without Word Alignments", + "abstract": "We introduce BilBOWA (Bilingual Bag-of-Words without Alignments), a simple and computationally-efficient model for learning bilingual distributed representations of words which can scale to large monolingual datasets and does not require word-aligned parallel training data. Instead it trains directly on monolingual data and extracts a bilingual signal from a smaller set of raw-text sentence-aligned data. This is achieved using a novel sampled bag-of-words cross-lingual objective, which is used to regularize two noise-contrastive language models for efficient cross-lingual feature learning. We show that bilingual embeddings learned using the proposed model outperform state-of-the-art methods on a cross-lingual document classification task as well as a lexical translation task on WMT11 data." + }, + { + "title": "Improved CCG Parsing with Semi-supervised Supertagging", + "abstract": "Current supervised parsers are limited by the size of their labelled training data, making improving them with unlabelled data an important goal. We show how a state-of-the-art CCG parser can be enhanced, by predicting lexical categories using unsupervised vector-space embeddings of words. 
The use of word embeddings enables our model to better generalize from the labelled data, and allows us to accurately assign lexical categories without depending on a POS-tagger. Our approach leads to substantial improvements in dependency parsing results over the standard supervised CCG parser when evaluated on Wall Street Journal (0.8%), Wikipedia (1.8%) and biomedical (3.4%) text. We compare the performance of two recently proposed approaches for classification using a wide variety of word embeddings. We also give a detailed error analysis demonstrating where using embeddings outperforms traditional feature sets, and showing how including POS features can decrease accuracy." + }, + { + "title": "The Inside-Outside Recursive Neural Network model for Dependency Parsing", + "abstract": "We propose the first implementation of an infinite-order generative dependency model. The model is based on a new recursive neural network architecture, the Inside-Outside Recursive Neural Network. This architecture allows information to flow not only bottom-up, as in traditional recursive neural networks, but also topdown. This is achieved by computing content as well as context representations for any constituent, and letting these representations interact. Experimental results on the English section of the Universal Dependency Treebank show that the infinite-order model achieves a perplexity seven times lower than the traditional third-order model using counting, and tends to choose more accurate parses in k-best lists. In addition, reranking with this model achieves state-of-the-art unlabelled attachment scores and unlabelled exact match scores." + }, + { + "title": "GloVe: Global Vectors for Word Representation", + "abstract": "Recent methods for learning vector space representations of words have succeeded in capturing fine-grained semantic and syntactic regularities using vector arithmetic, but the origin of these regularities has remained opaque. 
We analyze and make explicit the model properties needed for such regularities to emerge in word vectors. The result is a new global logbilinear regression model that combines the advantages of the two major model families in the literature: global matrix factorization and local context window methods. Our model efficiently leverages statistical information by training only on the nonzero elements in a word-word cooccurrence matrix, rather than on the entire sparse matrix or on individual context windows in a large corpus. The model produces a vector space with meaningful substructure, as evidenced by its performance of 75% on a recent word analogy task. It also outperforms related models on similarity tasks and named entity recognition." + }, + { + "title": "Recursive Deep Models for Discourse Parsing", + "abstract": "Text-level discourse parsing remains a challenge: most approaches employ features that fail to capture the intentional, semantic, and syntactic aspects that govern discourse coherence. In this paper, we propose a recursive model for discourse parsing that jointly models distributed representations for clauses, sentences, and entire discourses. The learned representations can to some extent learn the semantic and intentional import of words and larger discourse units automatically,. The proposed framework obtains comparable performance regarding standard discoursing parsing evaluations when compared against current state-of-art systems." + }, + { + "title": "Opinion Mining with Deep Recurrent Neural Networks", + "abstract": "Recurrent neural networks (RNNs) are connectionist models of sequential data that are naturally applicable to the analysis of natural language. Recently, “depth in space” — as an orthogonal notion to “depth in time” — in RNNs has been investigated by stacking multiple layers of RNNs and shown empirically to bring a temporal hierarchy to the architecture. 
In this work we apply these deep RNNs to the task of opinion expression extraction formulated as a token-level sequence-labeling task. Experimental results show that deep, narrow RNNs outperform traditional shallow, wide RNNs with the same number of parameters. Furthermore, our approach outperforms previous CRF-based baselines, including the state-of-the-art semi-Markov CRF model, and does so without access to the powerful opinion lexicons and syntactic features relied upon by the semi-CRF, as well as without the standard layer-by-layer pre-training typically required of RNN architectures." + }, + { + "title": "A Neural Network Approach to Selectional Preference Acquisition", + "abstract": "This paper investigates the use of neural networks for the acquisition of selectional preferences. Inspired by recent advances of neural network models for nlp applications, we propose a neural network model that learns to discriminate between felicitous and infelicitous arguments for a particular predicate. The model is entirely unsupervised ‐ preferences are learned from unannotated corpus data. We propose two neural network architectures: one that handles standard two-way selectional preferences and one that is able to deal with multi-way selectional preferences. The model’s performance is evaluated on a pseudo-disambiguation task, on which it is shown to achieve state of the art performance." + }, + { + "title": "Translation Modeling with Bidirectional Recurrent Neural Networks", + "abstract": "This work presents two different translation models using recurrent neural networks. The first one is a word-based approach using word alignments. Second, we present phrase-based translation models that are more consistent with phrasebased decoding. Moreover, we introduce bidirectional recurrent neural models to the problem of machine translation, allowing us to use the full source sentence in our models, which is also of theoretical interest. 
We demonstrate that our translation models are capable of improving strong baselines already including recurrent neural language models on three tasks: IWSLT 2013 German→English, BOLT Arabic→English and Chinese→English.
Our main result is that on an English to French translation task from the WMT-14 dataset, the translations produced by the LSTM achieve a BLEU score of 34.8 on the entire test set, where the LSTM's BLEU score was penalized on out-of-vocabulary words. Additionally, the LSTM did not have difficulty on long sentences. For comparison, a phrase-based SMT system achieves a BLEU score of 33.3 on the same dataset. When we used the LSTM to rerank the 1000 hypotheses produced by the aforementioned SMT system, its BLEU score increases to 36.5, which is close to the previous state of the art. The LSTM also learned sensible phrase and sentence representations that are sensitive to word order and are relatively invariant to the active and the passive voice. Finally, we found that reversing the order of the words in all source sentences (but not target sentences) improved the LSTM's performance markedly, because doing so introduced many short term dependencies between the source and the target sentence which made the optimization problem easier." + }, + { + "title": "Recurrent Neural Network Regularization", + "abstract": "We present a simple regularization technique for Recurrent Neural Networks (RNNs) with Long Short-Term Memory (LSTM) units. Dropout, the most successful technique for regularizing neural networks, does not work well with RNNs and LSTMs. In this paper, we show how to correctly apply dropout to LSTMs, and show that it substantially reduces overfitting on a variety of tasks. These tasks include language modeling, speech recognition, image caption generation, and machine translation." + }, + { + "title": "On the Properties of Neural Machine Translation: Encoder–Decoder Approaches", + "abstract": "Neural machine translation is a relatively new approach to statistical machine translation based purely on neural networks. The neural machine translation models often consist of an encoder and a decoder. 
The encoder extracts a fixed-length representation from a variable-length input sentence, and the decoder generates a correct translation from this representation. In this paper, we focus on analyzing the properties of the neural machine translation using two models; RNN Encoder--Decoder and a newly proposed gated recursive convolutional neural network. We show that the neural machine translation performs relatively well on short sentences without unknown words, but its performance degrades rapidly as the length of the sentence and the number of unknown words increase. Furthermore, we find that the proposed gated recursive convolutional network learns a grammatical structure of a sentence automatically." + }, + { + "title": "Convolutional Neural Networks for Sentence Classification", + "abstract": "We report on a series of experiments with convolutional neural networks (CNN) trained on top of pre-trained word vectors for sentence-level classification tasks. We show that a simple CNN with little hyperparameter tuning and static vectors achieves excellent results on multiple benchmarks. Learning task-specific vectors through fine-tuning offers further gains in performance. We additionally propose a simple modification to the architecture to allow for the use of both task-specific and static vectors. The CNN models discussed herein improve upon the state of the art on 4 out of 7 tasks, which include sentiment analysis and question classification." + }, + { + "title": "Deep Convolutional Neural Networks for Sentiment Analysis of Short Texts", + "abstract": "Sentiment analysis of short texts such as single sentences and Twitter messages is challenging because of the limited contextual information that they normally contain. Effectively solving this task requires strategies that combine the small text content with prior knowledge and use more than just bag-of-words. 
In this work we propose a new deep convolutional neural network that exploits from character- to sentence-level information to perform sentiment analysis of short texts.
+ }, + { + "title": "Learning Character-level Representations for Part-of-Speech Tagging", + "abstract": "Distributed word representations have recently been proven to be an invaluable resource for NLP. These representations are normally learned using neural networks and capture syntactic and semantic information about words. Information about word morphology and shape is normally ignored when learning word representations. However, for tasks like part-of-speech tagging, intra-word information is extremely useful, specially when dealing with morphologically rich languages. In this paper, we propose a deep neural network that learns character-level representation of words and associate them with usual word representations to perform POS tagging. Using the proposed approach, while avoiding the use of any handcrafted feature, we produce state-of-the-art POS taggers for two languages: English, with 97.32% accuracy on the Penn Treebank WSJ corpus; and Portuguese, with 97.47% accuracy on the Mac-Morpho corpus, where the latter represents an error reduction of 12.2% on the best previous known result." + }, + { + "title": "Identifying and attacking the saddle point problem in high-dimensional non-convex optimization", + "abstract": "A central challenge to many fields of science and engineering involves minimizing non-convex error functions over continuous, high dimensional spaces. Gradient descent or quasi-Newton methods are almost ubiquitously used to perform such minimizations, and it is often thought that a main source of difficulty for these local methods to find the global minimum is the proliferation of local minima with much higher error than the global minimum. Here we argue, based on results from statistical physics, random matrix theory, neural network theory, and empirical evidence, that a deeper and more profound difficulty originates from the proliferation of saddle points, not local minima, especially in high dimensional problems of practical interest. 
Such saddle points are surrounded by high error plateaus that can dramatically slow down learning, and give the illusory impression of the existence of a local minimum. Motivated by these arguments, we propose a new approach to second-order optimization, the saddle-free Newton method, that can rapidly escape high dimensional saddle points, unlike gradient descent and quasi-Newton methods. We apply this algorithm to deep or recurrent neural network training, and provide numerical evidence for its superior optimization performance." + }, + { + "title": "Learning Phrase Representations using RNN Encoder–Decoder for Statistical Machine Translation", + "abstract": "In this paper, we propose a novel neural network model called RNN Encoder‐ Decoder that consists of two recurrent neural networks (RNN). One RNN encodes a sequence of symbols into a fixedlength vector representation, and the other decodes the representation into another sequence of symbols. The encoder and decoder of the proposed model are jointly trained to maximize the conditional probability of a target sequence given a source sequence. The performance of a statistical machine translation system is empirically found to improve by using the conditional probabilities of phrase pairs computed by the RNN Encoder‐Decoder as an additional feature in the existing log-linear model. Qualitatively, we show that the proposed model learns a semantically and syntactically meaningful representation of linguistic phrases." + }, + { + "title": "Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)", + "abstract": null + }, + { + "title": "Decoder Integration and Expected BLEU Training for Recurrent Neural Network Language Models", + "abstract": "Neural network language models are often trained by optimizing likelihood, but we would prefer to optimize for a task specific metric, such as BLEU in machine translation. 
We show how a recurrent neural network language model can be optimized towards an expected BLEU loss instead of the usual cross-entropy criterion. Furthermore, we tackle the issue of directly integrating a recurrent network into firstpass decoding under an efficient approximation. Our best results improve a phrasebased statistical machine translation system trained on WMT 2012 French-English data by up to 2.0 BLEU, and the expected BLEU objective improves over a crossentropy trained model by up to 0.6 BLEU in a single reference setup." + }, + { + "title": "Tailoring Continuous Word Representations for Dependency Parsing", + "abstract": "Word representations have proven useful for many NLP tasks, e.g., Brown clusters as features in dependency parsing (Koo et al., 2008). In this paper, we investigate the use of continuous word representations as features for dependency parsing. We compare several popular embeddings to Brown clusters, via multiple types of features, in both news and web domains. We find that all embeddings yield significant parsing gains, including some recent ones that can be trained in a fraction of the time of others. Explicitly tailoring the representations for the task leads to further improvements. Moreover, an ensemble of all representations achieves the best results, suggesting their complementarity." + }, + { + "title": "Dependency-Based Word Embeddings", + "abstract": "While continuous word embeddings are gaining popularity, current models are based solely on linear contexts. In this work, we generalize the skip-gram model with negative sampling introduced by Mikolov et al. to include arbitrary contexts. In particular, we perform experiments with dependency-based contexts, and show that they produce markedly different embeddings. The dependencybased embeddings are less topical and exhibit more functional similarity than the original skip-gram embeddings." 
+ }, + { + "title": "Adaptive Recursive Neural Network for Target-dependent Twitter Sentiment Classification", + "abstract": "We propose Adaptive Recursive Neural Network (AdaRNN) for target-dependent Twitter sentiment classification. AdaRNN adaptively propagates the sentiments of words to target depending on the context and syntactic relationships between them. It consists of more than one composition functions, and we model the adaptive sentiment propagations as distributions over these composition functions. The experimental studies illustrate that AdaRNN improves the baseline methods. Furthermore, we introduce a manually annotated dataset for target-dependent Twitter sentiment analysis." + }, + { + "title": "Tagging The Web: Building A Robust Web Tagger with Neural Network", + "abstract": "In this paper, we address the problem of web-domain POS tagging using a twophase approach. The first phase learns representations that capture regularities underlying web text. The representation is integrated as features into a neural network that serves as a scorer for an easy-first POS tagger. Parameters of the neural network are trained using guided learning in the second phase. Experiment on the SANCL 2012 shared task show that our approach achieves 93.15% average tagging accuracy, which is the best accuracy reported so far on this data set, higher than those given by ensembled syntactic parsers." + }, + { + "title": "Normalizing tweets with edit scripts and recurrent neural embeddings", + "abstract": "Tweets often contain a large proportion of abbreviations, alternative spellings, novel words and other non-canonical language. These features are problematic for standard language analysis tools and it can be desirable to convert them to canonical form. We propose a novel text normalization model based on learning edit operations from labeled data while incorporating features induced from unlabeled data via character-level neural text embeddings. 
The text embeddings are generated using a Simple Recurrent Network. We find that enriching the feature set with text embeddings substantially lowers word error rates on an English tweet normalization dataset. Our model improves on the state-of-the-art with little training data and without any lexical resources.
To show the importance of modeling subsentential elements, we crowdsource political annotations at a phrase and sentence level. Our model outperforms existing models on our newly annotated dataset and an existing dataset." + }, + { + "title": "Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", + "abstract": null + }, + { + "title": "Compositional Morphology for Word Representations and Language Modelling", + "abstract": "This paper presents a scalable method for integrating compositional morphological representations into a vector-based probabilistic language model. Our approach is evaluated in the context of log-bilinear language models, rendered suitably efficient for implementation inside a machine translation decoder by factoring the vocabulary. We perform both intrinsic and extrinsic evaluations, presenting results on a range of languages which demonstrate that our model learns morphological representations that both perform well on word similarity tasks and lead to substantial reductions in perplexity. When used for translation into morphologically rich languages with large vocabularies, our models obtain improvements of up to 1.2 BLEU points relative to a baseline system using back-off n-gram models." + }, + { + "title": "Multilingual Models for Compositional Distributed Semantics", + "abstract": "We present a novel technique for learning semantic representations, which extends the distributional hypothesis to multilingual data and joint-space embeddings. Our models leverage parallel data and learn to strongly align the embeddings of semantically equivalent sentences, while maintaining sufficient distance between those of dissimilar sentences. The models do not rely on word alignments or any syntactic information and are successfully applied to a number of diverse languages. We extend our approach to learn semantic representations at the document level, too. 
We evaluate these models on two cross-lingual document classification tasks, outperforming the prior state of the art. Through qualitative analysis and the study of pivoting effects we demonstrate that our representations are semantically plausible and can capture semantic relationships across languages without parallel data." + }, + { + "title": "Introductory Lectures on Convex Optimization - A Basic Course", + "abstract": null + }, + { + "title": "A Convolutional Neural Network for Modelling Sentences", + "abstract": "The ability to accurately represent sentences is central to language understanding. We describe a convolutional architecture dubbed the Dynamic Convolutional Neural Network (DCNN) that we adopt for the semantic modelling of sentences. The network uses Dynamic k-Max Pooling, a global pooling operation over linear sequences. The network handles input sentences of varying length and induces a feature graph over the sentence that is capable of explicitly capturing short and long-range relations. The network does not rely on a parse tree and is easily applicable to any language. We test the DCNN in four experiments: small scale binary and multi-class sentiment prediction, six-way question classification and Twitter sentiment prediction by distant supervision. The network achieves excellent performance in the first three tasks and a greater than 25% error reduction in the last task with respect to the strongest baseline." + }, + { + "title": "Improving Vector Space Word Representations Using Multilingual Correlation", + "abstract": "The distributional hypothesis of Harris (1954), according to which the meaning of words is evidenced by the contexts they occur in, has motivated several effective techniques for obtaining vector space semantic representations of words using unannotated text corpora. 
This paper argues that lexico-semantic content should additionally be invariant across languages and proposes a simple technique based on canonical correlation analysis (CCA) for incorporating multilingual evidence into vectors generated monolingually. We evaluate the resulting word representations on standard lexical semantic evaluation tasks and show that our method produces substantially better semantic representations than monolingual techniques." + }, + { + "title": "word2vec Explained: deriving Mikolov et al.'s negative-sampling word-embedding method", + "abstract": "The word2vec software of Tomas Mikolov and colleagues (this https URL ) has gained a lot of traction lately, and provides state-of-the-art word embeddings. The learning models behind the software are described in two research papers. We found the description of the models in these papers to be somewhat cryptic and hard to follow. While the motivations and presentation may be obvious to the neural-networks language-modeling crowd, we had to struggle quite a bit to figure out the rationale behind the equations. \nThis note is an attempt to explain equation (4) (negative sampling) in \"Distributed Representations of Words and Phrases and their Compositionality\" by Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado and Jeffrey Dean." + }, + { + "title": "Learning word embeddings efficiently with noise-contrastive estimation", + "abstract": "Continuous-valued word embeddings learned by neural language models have recently been shown to capture semantic and syntactic information about words very well, setting performance records on several word similarity tasks. The best results are obtained by learning high-dimensional embeddings from very large quantities of data, which makes scalability of the training method a critical factor. \n \nWe propose a simple and scalable new approach to learning word embeddings based on training log-bilinear models with noise-contrastive estimation. 
Our approach is simpler, faster, and produces better results than the current state-of-the-art method. We achieve results comparable to the best ones reported, which were obtained on a cluster, using four times less data and more than an order of magnitude less computing time. We also investigate several model types and find that the embeddings learned by the simpler models perform at least as well as those learned by the more complex ones." + }, + { + "title": "Training Deterministic Parsers with Non-Deterministic Oracles", + "abstract": "Greedy transition-based parsers are very fast but tend to suffer from error propagation. This problem is aggravated by the fact that they are normally trained using oracles that are deterministic and incomplete in the sense that they assume a unique canonical path through the transition system and are only valid as long as the parser does not stray from this path. In this paper, we give a general characterization of oracles that are nondeterministic and complete, present a method for deriving such oracles for transition systems that satisfy a property we call arc decomposition, and instantiate this method for three well-known transition systems from the literature. We say that these oracles are dynamic, because they allow us to dynamically explore alternative and nonoptimal paths during training — in contrast to oracles that statically assume a unique optimal path. Experimental evaluation on a wide range of data sets clearly shows that using dynamic oracles to train greedy parsers gives substantial improvements in accuracy. Moreover, this improvement comes at no cost in terms of efficiency, unlike other techniques like beam search." 
+ }, + { + "title": "Distributed Representations of Words and Phrases and their Compositionality", + "abstract": "The recently introduced continuous Skip-gram model is an efficient method for learning high-quality distributed vector representations that capture a large number of precise syntactic and semantic word relationships. In this paper we present several extensions that improve both the quality of the vectors and the training speed. By subsampling of the frequent words we obtain significant speedup and also learn more regular word representations. We also describe a simple alternative to the hierarchical softmax called negative sampling. \n \nAn inherent limitation of word representations is their indifference to word order and their inability to represent idiomatic phrases. For example, the meanings of \"Canada\" and \"Air\" cannot be easily combined to obtain \"Air Canada\". Motivated by this example, we present a simple method for finding phrases in text, and show that learning good vector representations for millions of phrases is possible." + }, + { + "title": "Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank", + "abstract": "Semantic word spaces have been very useful but cannot express the meaning of longer phrases in a principled way. Further progress towards understanding compositionality in tasks such as sentiment detection requires richer supervised training and evaluation resources and more powerful models of composition. To remedy this, we introduce a Sentiment Treebank. It includes fine grained sentiment labels for 215,154 phrases in the parse trees of 11,855 sentences and presents new challenges for sentiment compositionality. To address them, we introduce the Recursive Neural Tensor Network. When trained on the new treebank, this model outperforms all previous methods on several metrics. It pushes the state of the art in single sentence positive/negative classification from 80% up to 85.4%. 
The accuracy of predicting fine-grained sentiment labels for all phrases reaches 80.7%, an improvement of 9.7% over bag of features baselines. Lastly, it is the only model that can accurately capture the effects of negation and its scope at various tree levels for both positive and negative phrases." + }, + { + "title": "Decoding with Large-Scale Neural Language Models Improves Translation", + "abstract": "We explore the application of neural language models to machine translation. We develop a new model that combines the neural probabilistic language model of Bengio et al., rectified linear units, and noise-contrastive estimation, and we incorporate it into a machine translation system both by reranking k-best lists and by direct integration into the decoder. Our large-scale, large-vocabulary experiments across four language pairs show that our neural language model improves translation quality by up to 1.1 Bleu." + }, + { + "title": "Joint Language and Translation Modeling with Recurrent Neural Networks", + "abstract": "We present a joint language and translation model based on a recurrent neural network which predicts target words based on an unbounded history of both source and target words. The weaker independence assumptions of this model result in a vastly larger search space compared to related feedforward-based language or translation models. We tackle this issue with a new lattice rescoring algorithm and demonstrate its effectiveness empirically. Our joint model builds on a well known recurrent neural network language model (Mikolov, 2012) augmented by a layer of additional inputs from the source language. We show competitive accuracy compared to the traditional channel model features. Our best results improve the output of a system trained on WMT 2012 French-English data by up to 1.5 BLEU, and by 1.1 BLEU on average across several test sets." 
+ }, + { + "title": "Simple Customization of Recursive Neural Networks for Semantic Relation Classification", + "abstract": "In this paper, we present a recursive neural network (RNN) model that works on a syntactic tree. Our model differs from previous RNN models in that the model allows for an explicit weighting of important phrases for the target task. We also propose to average parameters in training. Our experimental results on semantic relation classification show that both phrase categories and task-specific weighting significantly improve the prediction accuracy of the model. We also show that averaging the model parameters is effective in stabilizing the learning and improves generalization capacity. The proposed model marks scores competitive with state-of-the-art RNN-based models." + }, + { + "title": "Effect of Non-linear Deep Architecture in Sequence Labeling", + "abstract": "If we compare the widely used Conditional Random Fields (CRF) with newly proposed “deep architecture” sequence models (Collobert et al., 2011), there are two things changing: from linear architecture to non-linear, and from discrete feature representation to distributional. It is unclear, however, what utility nonlinearity offers in conventional featurebased models. In this study, we show the close connection between CRF and “sequence model” neural nets, and present an empirical investigation to compare their performance on two sequence labeling tasks ‐ Named Entity Recognition and Syntactic Chunking. Our results suggest that non-linear models are highly effective in low-dimensional distributional spaces. Somewhat surprisingly, we find that a nonlinear architecture offers no benefits in a high-dimensional discrete feature space." 
+ }, + { + "title": "Parsing with Compositional Vector Grammars", + "abstract": "Natural language parsing has typically been done with small sets of discrete categories such as NP and VP, but this representation does not capture the full syntactic nor semantic richness of linguistic phrases, and attempts to improve on this by lexicalizing phrases or splitting categories only partly address the problem at the cost of huge feature spaces and sparseness. Instead, we introduce a Compositional Vector Grammar (CVG), which combines PCFGs with a syntactically untied recursive neural network that learns syntactico-semantic, compositional vector representations. The CVG improves the PCFG of the Stanford Parser by 3.8% to obtain an F1 score of 90.4%. It is fast to train and implemented approximately as an efficient reranker it is about 20% faster than the current Stanford factored parser. The CVG learns a soft notion of head words and improves performance on the types of ambiguities that require semantic information such as PP attachments." + }, + { + "title": "Combination of Recurrent Neural Networks and Factored Language Models for Code-Switching Language Modeling", + "abstract": "In this paper, we investigate the application of recurrent neural network language models (RNNLM) and factored language models (FLM) to the task of language modeling for Code-Switching speech. We present a way to integrate partof-speech tags (POS) and language information (LID) into these models which leads to significant improvements in terms of perplexity. Furthermore, a comparison between RNNLMs and FLMs and a detailed analysis of perplexities on the different backoff levels are performed. Finally, we show that recurrent neural networks and factored language models can be combined using linear interpolation to achieve the best performance. 
The final combined language model provides 37.8% relative improvement in terms of perplexity on the SEAME development set and a relative improvement of 32.7% on the evaluation set compared to the traditional n-gram language model." + }, + { + "title": "Efficient Implementation of Beam-Search Incremental Parsers", + "abstract": "Beam search incremental parsers are accurate, but not as fast as they could be. We demonstrate that, contrary to popular belief, most current implementations of beam parsers in fact run in O(n 2 ), rather than linear time, because each statetransition is actually implemented as an O(n) operation. We present an improved implementation, based on Tree Structured Stack (TSS), in which a transition is performed in O(1), resulting in a real lineartime algorithm, which is verified empirically. We further improve parsing speed by sharing feature-extraction and dotproduct across beam items. Practically, our methods combined offer a speedup of 2x over strong baselines on Penn Treebank sentences, and are orders of magnitude faster on much longer sentences." + }, + { + "title": "Adaptation Data Selection using Neural Language Models: Experiments in Machine Translation", + "abstract": "Data selection is an effective approach to domain adaptation in statistical machine translation. The idea is to use language models trained on small in-domain text to select similar sentences from large general-domain corpora, which are then incorporated into the training data. Substantial gains have been demonstrated in previous works, which employ standard ngram language models. Here, we explore the use of neural language models for data selection. We hypothesize that the continuous vector representation of words in neural language models makes them more effective than n-grams for modeling unknown word contexts, which are prevalent in general-domain text. 
In a comprehensive evaluation of 4 language pairs (English to German, French, Russian, Spanish), we found that neural language models are indeed viable tools for data selection: while the improvements are varied (i.e. 0.1 to 1.7 gains in BLEU), they are fast to train on small in-domain data and can sometimes substantially outperform conventional n-grams." + }, + { + "title": "The Role of Syntax in Vector Space Models of Compositional Semantics", + "abstract": "Modelling the compositional process by which the meaning of an utterance arises from the meaning of its parts is a fundamental task of Natural Language Processing. In this paper we draw upon recent advances in the learning of vector space representations of sentential semantics and the transparent interface between syntax and semantics provided by Combinatory Categorial Grammar to introduce Combinatory Categorial Autoencoders. This model leverages the CCG combinatory operators to guide a non-linear transformation of meaning within a sentence. We use this model to learn high dimensional embeddings for sentences and evaluate them in a range of tasks, demonstrating that the incorporation of syntax allows a concise model to learn representations that are both effective and general." + }, + { + "title": "Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", + "abstract": null + }, + { + "title": "Connecting Language and Knowledge Bases with Embedding Models for Relation Extraction", + "abstract": "This paper proposes a novel approach for relation extraction from free text which is trained to jointly use information from the text and from existing knowledge. Our model is based on scoring functions that operate by learning low-dimensional embeddings of words, entities and relationships from a knowledge base. 
We empirically show on New York Times articles aligned with Freebase relations that our approach is able to efficiently use the extra information provided by a large subset of Freebase data (4M entities, 23k relationships) to improve over methods that rely on text features alone." + } + ] + }, + "author_data": { + "449daedb-95cb-4090-a1cc-09b49e39c29b": { + "pk": "449daedb-95cb-4090-a1cc-09b49e39c29b", + "project_name": null, + "name": "Yoav Goldberg", + "bio": "I am a researcher deeply engaged in the fields of natural language processing (NLP) and computational linguistics, with a particular focus on parsing, word embeddings, and machine translation. My work has evolved from enhancing traditional models, such as the skip-gram model, to developing innovative parsing techniques that improve accuracy and efficiency across various languages.\n\nIn my recent research, I have explored the intricacies of dependency parsing, introducing dynamic oracles that allow for more flexible training of parsers, which has led to significant improvements in parsing accuracy. I have also contributed to the understanding of word embeddings by analyzing their underlying structures and proposing methods to recover relational similarities more effectively.\n\nMy work on precision-biased parsing highlights my commitment to refining parsing tasks to favor precision, which is crucial in many real-world applications. Additionally, I have tackled the challenges of adapting parsers to new domains, particularly in the biomedical field, by leveraging domain-specific selectional preferences.\n\nI am passionate about creating resources that facilitate further research, such as the dataset of syntactic n-grams I developed, which opens new avenues for lexical semantics studies. 
Overall, my research aims to bridge theoretical advancements with practical applications, enhancing the capabilities of NLP systems in understanding and processing human language.", + "collaborators": [ + "Omer Levy", + "Joakim Nivre", + "Michael Elhadad", + "Francesco Sartorio", + "G. Satta", + "Roee Aharoni", + "Moshe Koppel", + "Ryan T. McDonald", + "Gabriel Stanovsky", + "Jessica Ficler", + "Ido Dagan", + "Matthew Honnibal", + "Mark Johnson", + "Jon Orwant", + "Djamé Seddah", + "Reut Tsarfaty", + "Sandra Kübler", + "Marie Candito", + "Jinho D. Choi", + "Richárd Farkas", + "Jennifer Foster", + "Iakes Goenaga", + "Koldo Gojenola", + "Spence Green", + "Nizar Habash", + "Marco Kuhlmann", + "Wolfgang Maier", + "A. Przepiórkowski", + "Ryan Roth", + "Wolfgang Seeker", + "Yannick Versley", + "V. Vincze", + "Marcin Woliński", + "Alina Wróblewska", + "Eric Villemonte de la Clergerie", + "Kai Zhao", + "Liang Huang", + "Raphael Cohen", + "Zhao-bin Gao", + "J. Wong", + "S. Merrick", + "M. Karim", + "M. Uemastu", + "M. 
Li" + ], + "pub_titles": [ + "Dependency-Based Word Embeddings", + "A Tabular Method for Dynamic Oracles in Transition-Based Parsing", + "Automatic Detection of Machine Translated Text and Translation Quality Estimation", + "word2vec Explained: deriving Mikolov et al.'s negative-sampling word-embedding method", + "Linguistic Regularities in Sparse and Explicit Word Representations", + "Squibs: Constrained Arc-Eager Dependency Parsing", + "Intermediary Semantic Representation through Proposition Structures", + "Neural Word Embedding as Implicit Matrix Factorization", + "Dynamic-oracle Transition-based Parsing with Calibrated Probabilistic Output", + "A Non-Monotonic Arc-Eager Transition System for Dependency Parsing", + "A Dataset of Syntactic-Ngrams over Time from a Very Large Corpus of English Books", + "Word Segmentation, Unknown-word Resolution, and Morphological Agreement in a Hebrew Parsing System", + "Overview of the SPMRL 2013 Shared Task: A Cross-Framework Evaluation of Parsing Morphologically Rich Languages", + "Training Deterministic Parsers with Non-Deterministic Oracles", + "Efficient Implementation of Beam-Search Incremental Parsers", + "Precision-biased Parsing and High-Quality Parse Selection", + "Domain Adaptation of a Dependency Parser with a Class-Class Selectional Preference Model", + "A Dynamic Oracle for Arc-Eager Dependency Parsing", + "Task-specific Word-Clustering for Part-of-Speech Tagging" + ], + "pub_abstracts": [ + "While continuous word embeddings are gaining popularity, current models are based solely on linear contexts. In this work, we generalize the skip-gram model with negative sampling introduced by Mikolov et al. to include arbitrary contexts. In particular, we perform experiments with dependency-based contexts, and show that they produce markedly different embeddings. 
The dependencybased embeddings are less topical and exhibit more functional similarity than the original skip-gram embeddings.", + "We develop parsing oracles for two transition-based dependency parsers, including the arc-standard parser, solving a problem that was left open in (Goldberg and Nivre, 2013). We experimentally show that using these oracles during training yields superior parsing accuracies on many languages.", + "We show that it is possible to automatically detect machine translated text at sentence level from monolingual corpora, using text classification methods. We show further that the accuracy with which a learned classifier can detect text as machine translated is strongly correlated with the translation quality of the machine translation system that generated it. Finally, we offer a generic machine translation quality estimation technique based on this approach, which does not require reference sentences.", + "The word2vec software of Tomas Mikolov and colleagues (this https URL ) has gained a lot of traction lately, and provides state-of-the-art word embeddings. The learning models behind the software are described in two research papers. We found the description of the models in these papers to be somewhat cryptic and hard to follow. While the motivations and presentation may be obvious to the neural-networks language-modeling crowd, we had to struggle quite a bit to figure out the rationale behind the equations. This note is an attempt to explain equation (4) (negative sampling) in \"Distributed Representations of Words and Phrases and their Compositionality\" by Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado and Jeffrey Dean.", + "Recent work has shown that neuralembedded word representations capture many relational similarities, which can be recovered by means of vector arithmetic in the embedded space. 
We show that Mikolov et al.’s method of first adding and subtracting word vectors, and then searching for a word similar to the result, is equivalent to searching for a word that maximizes a linear combination of three pairwise word similarities. Based on this observation, we suggest an improved method of recovering relational similarities, improving the state-of-the-art results on two recent word-analogy datasets. Moreover, we demonstrate that analogy recovery is not restricted to neural word embeddings, and that a similar amount of relational similarities can be recovered from traditional distributional word representations.", + "Arc-eager dependency parsers process sentences in a single left-to-right pass over the input and have linear time complexity with greedy decoding or beam search. We show how such parsers can be constrained to respect two different types of conditions on the output dependency graph: span constraints, which require certain spans to correspond to subtrees of the graph, and arc constraints, which require certain arcs to be present in the graph. The constraints are incorporated into the arc-eager transition system as a set of preconditions for each transition and preserve the linear time complexity of the parser.", + "We propose an intermediary-level semantic representation, providing a higher level of abstraction than syntactic parse trees, while not committing to decisions in cases such as quantification, grounding or verbspecific roles assignments. The proposal is centered around the proposition structure of the text, and includes also implicit propositions which can be inferred from the syntax but are not transparent in parse trees, such as copular relations introduced by appositive constructions. 
Other benefits over dependency-trees are explicit marking of logical relations between propositions, explicit marking of multiword predicate such as light-verbs, and a consistent representation for syntacticallydifferent but semantically-similar structures. The representation is meant to serve as a useful input layer for semanticoriented applications, as well as to provide a better starting point for further levels of semantic analysis such as semantic-rolelabeling and semantic-parsing.", + "We analyze skip-gram with negative-sampling (SGNS), a word embedding method introduced by Mikolov et al., and show that it is implicitly factorizing a word-context matrix, whose cells are the pointwise mutual information (PMI) of the respective word and context pairs, shifted by a global constant. We find that another embedding method, NCE, is implicitly factorizing a similar matrix, where each cell is the (shifted) log conditional probability of a word given its context. We show that using a sparse Shifted Positive PMI word-context matrix to represent words improves results on two word similarity tasks and one of two analogy tasks. When dense low-dimensional vectors are preferred, exact factorization with SVD can achieve solutions that are at least as good as SGNS's solutions for word similarity tasks. On analogy questions SGNS remains superior to SVD. We conjecture that this stems from the weighted nature of SGNS's factorization.", + "We adapt the dynamic-oracle training method of Goldberg and Nivre (2012; 2013) to train clas-sifiers that produce probabilistic output. Evaluation of an Arc-Eager parser on 6 languages shows that the AdaGrad-RDA based training procedure results in models that provide the same high level of accuracy as the averaged-perceptron trained models, while being sparser and providing well-calibrated probabilistic output.", + "Previous incremental parsers have used monotonic state transitions. 
However, transitions can be made to revise previous decisions quite naturally, based on further information. We show that a simple adjustment to the Arc-Eager transition system to relax its monotonicity constraints can improve accuracy, so long as the training data includes examples of mistakes for the nonmonotonic transitions to repair. We evaluate the change in the context of a stateof-the-art system, and obtain a statistically significant improvement (p < 0.001) on the English evaluation and 5/10 of the CoNLL languages.", + "We created a dataset of syntactic-ngrams (counted dependency-tree fragments) based on a corpus of 3.5 million English books. The dataset includes over 10 billion distinct items covering a wide range of syntactic configurations. It also includes temporal information, facilitating new kinds of research into lexical semantics over time. This paper describes the dataset, the syntactic representation, and the kinds of information provided.", + "We present a constituency parsing system for Modern Hebrew. The system is based on the PCFG-LA parsing method of Petrov et al. 2006, which is extended in various ways in order to accommodate the specificities of Hebrew as a morphologically rich language with a small treebank. We show that parsing performance can be enhanced by utilizing a language resource external to the treebank, specifically, a lexicon-based morphological analyzer. We present a computational model of interfacing the external lexicon and a treebank-based parser, also in the common case where the lexicon and the treebank follow different annotation schemes. We show that Hebrew word-segmentation and constituency-parsing can be performed jointly using CKY lattice parsing. Performing the tasks jointly is effective, and substantially outperforms a pipeline-based model. 
We suggest modeling grammatical agreement in a constituency-based parser as a filter mechanism that is orthogonal to the grammar, and present a concrete implementation of the method. Although the constituency parser does not make many agreement mistakes to begin with, the filter mechanism is effective in fixing the agreement mistakes that the parser does make.These contributions extend outside of the scope of Hebrew processing, and are of general applicability to the NLP community. Hebrew is a specific case of a morphologically rich language, and ideas presented in this work are useful also for processing other languages, including English. The lattice-based parsing methodology is useful in any case where the input is uncertain. Extending the lexical coverage of a treebank-derived parser using an external lexicon is relevant for any language with a small treebank.", + "This paper reports on the first shared task on statistical parsing of morphologically rich languages (MRLs). The task features data sets from nine languages, each available both in constituency and dependency annotation. We report on the preparation of the data sets, on the proposed parsing scenarios, and on the evaluation metrics for parsing MRLs given different representation types. We present and analyze parsing results obtained by the task participants, and then provide an analysis and comparison of the parsers across languages and frameworks, reported for gold input as well as more realistic parsing scenarios.", + "Greedy transition-based parsers are very fast but tend to suffer from error propagation. This problem is aggravated by the fact that they are normally trained using oracles that are deterministic and incomplete in the sense that they assume a unique canonical path through the transition system and are only valid as long as the parser does not stray from this path. 
In this paper, we give a general characterization of oracles that are nondeterministic and complete, present a method for deriving such oracles for transition systems that satisfy a property we call arc decomposition, and instantiate this method for three well-known transition systems from the literature. We say that these oracles are dynamic, because they allow us to dynamically explore alternative and nonoptimal paths during training — in contrast to oracles that statically assume a unique optimal path. Experimental evaluation on a wide range of data sets clearly shows that using dynamic oracles to train greedy parsers gives substantial improvements in accuracy. Moreover, this improvement comes at no cost in terms of efficiency, unlike other techniques like beam search.", + "Beam search incremental parsers are accurate, but not as fast as they could be. We demonstrate that, contrary to popular belief, most current implementations of beam parsers in fact run in O(n 2 ), rather than linear time, because each statetransition is actually implemented as an O(n) operation. We present an improved implementation, based on Tree Structured Stack (TSS), in which a transition is performed in O(1), resulting in a real lineartime algorithm, which is verified empirically. We further improve parsing speed by sharing feature-extraction and dotproduct across beam items. Practically, our methods combined offer a speedup of 2x over strong baselines on Penn Treebank sentences, and are orders of magnitude faster on much longer sentences.", + "We introduce precision-biased parsing: a parsing task which favors precision over recall by allowing the parser to abstain from decisions deemed uncertain. We focus on dependency-parsing and present an ensemble method which is capable of assigning parents to 84% of the text tokens while being over 96% accurate on these tokens. 
We use the precision-biased parsing task to solve the related high-quality parse-selection task: finding a subset of high-quality (accurate) trees in a large collection of parsed text. We present a method for choosing over a third of the input trees while keeping unlabeled dependency parsing accuracy of 97% on these trees. We also present a method which is not based on an ensemble but rather on directly predicting the risk associated with individual parser decisions. In addition to its efficiency, this method demonstrates that a parsing system can provide reasonable estimates of confidence in its predictions without relying on ensembles or aggregate corpus counts.", + "When porting parsers to a new domain, many of the errors are related to wrong attachment of out-of-vocabulary words. Since there is no available annotated data to learn the attachment preferences of the target domain words, we attack this problem using a model of selectional preferences based on domain-specific word classes. Our method uses Latent Dirichlet Allocations (LDA) to learn a domain-specific Selectional Preference model in the target domain using un-annotated data. The model provides features that model the affinities among pairs of words in the domain. To incorporate these new features in the parsing model, we adopt the co-training approach and retrain the parser with the selectional preferences features. We apply this method for adapting Easy First, a fast non-directional parser trained on WSJ, to the biomedical domain (Genia Treebank). The Selectional Preference features reduce error by 4.5% over the co-training baseline.", + "The standard training regime for transition-based dependency parsers makes use of an oracle, which predicts an optimal transition sequence for a sentence and its gold tree. 
We present an improved oracle for the arc-eager transition system, which provides a set of optimal transitions for every valid parser configuration, including configurations from which the gold tree is not reachable. In such cases, the oracle provides transitions that will lead to the best reachable tree from the given configuration. The oracle is efficient to implement and provably correct. We use the oracle to train a deterministic left-to-right dependency parser that is less sensitive to error propagation, using an online training procedure that also explores parser configurations resulting from non-optimal sequences of transitions. This new parser outperforms greedy parsers trained using conventional oracles on a range of data sets, with an average improvement of over 1.2 LAS points and up to almost 3 LAS points on some data sets.", + "While the use of cluster features became ubiquitous in core NLP tasks, most cluster features in NLP are based on distributional similarity. We propose a new type of clustering criteria, specific to the task of part-of-speech tagging. Instead of distributional similarity, these clusters are based on the beha vior of a baseline tagger when applied to a large corpus. These cluster features provide similar gains in accuracy to those achieved by distributional-similarity derived clusters. Using both types of cluster features together further improve tagging accuracies. We show that the method is effective for both the in-domain and out-of-domain scenarios for English, and for French, German and Italian. The effect is larger for out-of-domain text." 
+ ], + "domain": [ + "Natural Language Processing", + "Dependency Parsing", + "Word Embeddings", + "Machine Translation" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + } + }, + "reference_proposal": "### [Question 1] - What is the problem?\nHow can we effectively apply neural network models to improve event detection and domain adaptation in natural language processing tasks?\n\n### [Question 2] - Why is it interesting and important?\nSolving this problem is crucial for advancing the field of natural language processing (NLP) as it can lead to more accurate and context-aware models that understand and interpret events in text. Improved event detection can enhance applications such as information retrieval, sentiment analysis, and automated content generation. By addressing this question, future research can build on more robust frameworks, leading to practical applications in various domains, including social media analysis, news aggregation, and real-time event monitoring.\n\n### [Question 3] - Why is it hard?\nThe challenges in this area stem from the complexity of natural language, which includes ambiguity, context sensitivity, and the need for models to generalize across different domains. Naive approaches may fail due to their inability to capture the nuances of language and the relationships between events. Additionally, technical obstacles such as the need for large, annotated datasets and the computational resources required for training deep learning models complicate the problem. 
Theoretical challenges include understanding how to effectively represent and learn from diverse event types and their interrelations.\n\n### [Question 4] - Why hasn't it been solved before?\nPrevious research has often focused on isolated aspects of event detection or domain adaptation, leading to a lack of comprehensive models that integrate both effectively. Limitations in existing solutions include insufficient datasets, inadequate model architectures, and a lack of attention to the dynamic nature of language and events. Barriers such as the complexity of designing models that can adapt to various domains and the need for extensive computational resources have hindered progress. Our approach aims to combine advanced neural network architectures with innovative training techniques to overcome these limitations and provide a more unified solution.\n\n### [Question 5] - What are the key components of my approach and results?\nOur proposed methodology involves developing a convolutional neural network (CNN) model tailored for event detection, utilizing a diverse dataset that includes annotated text from various domains. We will employ metrics such as precision, recall, and F1-score to evaluate model performance. The expected outcomes include a significant improvement in event detection accuracy and the model's ability to adapt to new domains with minimal retraining. Additionally, we aim to provide insights into the model's decision-making process, enhancing interpretability and usability in real-world applications." + }, + "1409.1259": { + "paper_data": { + "title": "On the Properties of Neural Machine Translation: Encoder-Decoder Approaches", + "url": "http://arxiv.org/abs/1409.1259v2", + "arxiv_id": "1409.1259", + "authors": [ + "Kyunghyun Cho", + "Bart van Merrienboer", + "Dzmitry Bahdanau", + "Yoshua Bengio" + ], + "abstract": "Neural machine translation is a relatively new approach to statistical machine translation based purely on neural networks. 
The neural machine translation models often consist of an encoder and a decoder. The encoder extracts a fixed-length representation from a variable-length input sentence, and the decoder generates a correct translation from this representation. In this paper, we focus on analyzing the properties of the neural machine translation using two models; RNN Encoder--Decoder and a newly proposed gated recursive convolutional neural network. We show that the neural machine translation performs relatively well on short sentences without unknown words, but its performance degrades rapidly as the length of the sentence and the number of unknown words increase. Furthermore, we find that the proposed gated recursive convolutional network learns a grammatical structure of a sentence automatically.", + "introduction": " Introduction A new approach for statistical machine transla- tion based purely on neural networks has recently been proposed (Kalchbrenner and Blunsom, 2013; Sutskever et al., 2014). This new approach, which we refer to as neural machine translation , is in- spired by the recent trend of deep representational learning. All the neural network models used in (Kalchbrenner and Blunsom, 2013; Sutskever et al., 2014; Cho et al., 2014) consist of an encoder and a decoder. The encoder extracts a fixed-length vector representation from a variable-length input sentence, and from this representation the decoder \u0003Research done while visiting Universit ´e de Montr ´ealgenerates a correct, variable-length target transla- tion. The emergence of the neural machine transla- tion is highly significant, both practically and the- oretically. Neural machine translation models re- quire only a fraction of the memory needed by traditional statistical machine translation (SMT) models. The models we trained for this paper require only 500MB of memory in total. This stands in stark contrast with existing SMT sys- tems, which often require tens of gigabytes of memory. 
This makes the neural machine trans- lation appealing in practice. Furthermore, un- like conventional translation systems, each and ev- ery component of the neural translation model is trained jointly to maximize the translation perfor- mance. As this approach is relatively new, there has not been much work on analyzing the properties and behavior of these models. For instance: What are the properties of sentences on which this ap- proach performs better? How does the choice of source/target vocabulary affect the performance? In which cases does the neural machine translation fail? It is crucial to understand the properties and be- havior of this new neural machine translation ap- proach in order to determine future research di- rections. Also, understanding the weaknesses and strengths of neural machine translation might lead to better ways of integrating SMT and neural ma- chine translation systems. In this paper, we analyze two neural machine translation models. One of them is the RNN Encoder–Decoder that was proposed recently in (Cho et al., 2014). The other model replaces the encoder in the RNN Encoder–Decoder model with a novel neural network, which we call a gated recursive convolutional neural network (grConv). We evaluate these two models on the task of trans- lation from French to English.arXiv:1409.1259v2 [cs.CL] 7 Oct 2014Our analysis shows that the performance of the neural machine translation model degrades quickly as the length of a source sentence in- creases. Furthermore, we find that the vocabulary size has a high impact on the translation perfor- mance. Nonetheless, qualitatively we find that the both models are able to generate correct transla- tions most of the time. Furthermore, the newly proposed grConv model is able to learn, without supervision, a kind of syntactic structure over the source language. 
2 Neural Networks for Variable-Length Sequences In this section, we describe two types of neural networks that are able to process variable-length sequences. These are the recurrent neural net- work and the proposed gated recursive convolu- tional neural network. 2.1 Recurrent Neural Network with Gated Hidden Neurons z rh h~x (a) (b) Figure 1: The graphical illustration of (a) the re- current neural network and (b) the hidden unit that adaptively forgets and remembers. A recurrent neural network (RNN, Fig. 1 (a)) works on a variable-length sequence x= (x1;x2;\u0001\u0001\u0001;xT)by maintaining a hidden state h over time. At each timestep t, the hidden state h(t) is updated by h(t)=f\u0010 h(t\u00001);xt\u0011 ; wherefis an activation function. Often fis as simple as performing a linear transformation on the input vectors, summing them, and applying an element-wise logistic", + "references": [ + { + "title": "Sequence to Sequence Learning with Neural Networks", + "abstract": "Deep Neural Networks (DNNs) are powerful models that have achieved excellent performance on difficult learning tasks. Although DNNs work well whenever large labeled training sets are available, they cannot be used to map sequences to sequences. In this paper, we present a general end-to-end approach to sequence learning that makes minimal assumptions on the sequence structure. Our method uses a multilayered Long Short-Term Memory (LSTM) to map the input sequence to a vector of a fixed dimensionality, and then another deep LSTM to decode the target sequence from the vector. Our main result is that on an English to French translation task from the WMT-14 dataset, the translations produced by the LSTM achieve a BLEU score of 34.8 on the entire test set, where the LSTM's BLEU score was penalized on out-of-vocabulary words. Additionally, the LSTM did not have difficulty on long sentences. For comparison, a phrase-based SMT system achieves a BLEU score of 33.3 on the same dataset. 
When we used the LSTM to rerank the 1000 hypotheses produced by the aforementioned SMT system, its BLEU score increases to 36.5, which is close to the previous state of the art. The LSTM also learned sensible phrase and sentence representations that are sensitive to word order and are relatively invariant to the active and the passive voice. Finally, we found that reversing the order of the words in all source sentences (but not target sentences) improved the LSTM's performance markedly, because doing so introduced many short term dependencies between the source and the target sentence which made the optimization problem easier." + }, + { + "title": "Learning Phrase Representations using RNN Encoder–Decoder for Statistical Machine Translation", + "abstract": "In this paper, we propose a novel neural network model called RNN Encoder‐ Decoder that consists of two recurrent neural networks (RNN). One RNN encodes a sequence of symbols into a fixedlength vector representation, and the other decodes the representation into another sequence of symbols. The encoder and decoder of the proposed model are jointly trained to maximize the conditional probability of a target sequence given a source sequence. The performance of a statistical machine translation system is empirically found to improve by using the conditional probabilities of phrase pairs computed by the RNN Encoder‐Decoder as an additional feature in the existing log-linear model. Qualitatively, we show that the proposed model learns a semantically and syntactically meaningful representation of linguistic phrases." + }, + { + "title": "Recurrent Continuous Translation Models", + "abstract": "We introduce a class of probabilistic continuous translation models called Recurrent Continuous Translation Models that are purely based on continuous representations for words, phrases and sentences and do not rely on alignments or phrasal translation units. The models have a generation and a conditioning aspect. 
The generation of the translation is modelled with a target Recurrent Language Model, whereas the conditioning on the source sentence is modelled with a Convolutional Sentence Model. Through various experiments, we show first that our models obtain a perplexity with respect to gold translations that is > 43% lower than that of stateof-the-art alignment-based translation models. Secondly, we show that they are remarkably sensitive to the word order, syntax, and meaning of the source sentence despite lacking alignments. Finally we show that they match a state-of-the-art system when rescoring n-best lists of translations." + }, + { + "title": "Generating Sequences With Recurrent Neural Networks", + "abstract": "This paper shows how Long Short-term Memory recurrent neural networks can be used to generate complex sequences with long-range structure, simply by predicting one data point at a time. The approach is demonstrated for text (where the data are discrete) and online handwriting (where the data are real-valued). It is then extended to handwriting synthesis by allowing the network to condition its predictions on a text sequence. The resulting system is able to generate highly realistic cursive handwriting in a wide variety of styles." + }, + { + "title": "ADADELTA: An Adaptive Learning Rate Method", + "abstract": "We present a novel per-dimension learning rate method for gradient descent called ADADELTA. The method dynamically adapts over time using only first order information and has minimal computational overhead beyond vanilla stochastic gradient descent. The method requires no manual tuning of a learning rate and appears robust to noisy gradient information, different model architecture choices, various data modalities and selection of hyperparameters. We show promising results compared to other methods on the MNIST digit classification task using a single machine and on a large scale voice dataset in a distributed cluster environment." 
+ }, + { + "title": "Sequence Transduction with Recurrent Neural Networks", + "abstract": "Many machine learning tasks can be expressed as the transformation---or \\emph{transduction}---of input sequences into output sequences: speech recognition, machine translation, protein secondary structure prediction and text-to-speech to name but a few. One of the key challenges in sequence transduction is learning to represent both the input and output sequences in a way that is invariant to sequential distortions such as shrinking, stretching and translating. Recurrent neural networks (RNNs) are a powerful sequence learning architecture that has proven capable of learning such representations. However RNNs traditionally require a pre-defined alignment between the input and output sequences to perform transduction. This is a severe limitation since \\emph{finding} the alignment is the most difficult aspect of many sequence transduction problems. Indeed, even determining the length of the output sequence is often challenging. This paper introduces an end-to-end, probabilistic sequence transduction system, based entirely on RNNs, that is in principle able to transform any input sequence into any finite, discrete output sequence. Experimental results for phoneme recognition are provided on the TIMIT speech corpus." + }, + { + "title": "Domain Adaptation via Pseudo In-Domain Data Selection", + "abstract": "We explore efficient domain adaptation for the task of statistical machine translation based on extracting sentences from a large general-domain parallel corpus that are most relevant to the target domain. These sentences may be selected with simple cross-entropy based methods, of which we present three. As these sentences are not themselves identical to the in-domain data, we call them pseudo in-domain subcorpora. 
These subcorpora -- 1% the size of the original -- can then used to train small domain-adapted Statistical Machine Translation (SMT) systems which outperform systems trained on the entire corpus. Performance is further improved when we use these domain-adapted models in combination with a true in-domain model. The results show that more training data is not always better, and that best results are attained via proper domain-relevant data selection, as well as combining in- and general-domain systems during decoding." + }, + { + "title": "Better Evaluation Metrics Lead to Better Machine Translation", + "abstract": "Many machine translation evaluation metrics have been proposed after the seminal BLEU metric, and many among them have been found to consistently outperform BLEU, demonstrated by their better correlations with human judgment. It has long been the hope that by tuning machine translation systems against these new generation metrics, advances in automatic machine translation evaluation can lead directly to advances in automatic machine translation. However, to date there has been no unambiguous report that these new metrics can improve a state-of-the-art machine translation system over its BLEU-tuned baseline. \n \nIn this paper, we demonstrate that tuning Joshua, a hierarchical phrase-based statistical machine translation system, with the TESLA metrics results in significantly better human-judged translation quality than the BLEU-tuned baseline. TESLA-M in particular is simple and performs well in practice on large datasets. We release all our implementation under an open source license. It is our hope that this work will encourage the machine translation community to finally move away from BLEU as the unquestioned default and to consider the new generation metrics when tuning their systems." 
+ }, + { + "title": "Statistical Phrase-Based Translation", + "abstract": "We propose a new phrase-based translation model and decoding algorithm that enables us to evaluate and compare several, previously proposed phrase-based translation models. Within our framework, we carry out a large number of experiments to understand better and explain why phrase-based models out-perform word-based models. Our empirical results, which hold for all examined language pairs, suggest that the highest levels of performance can be obtained through relatively simple means: heuristic learning of phrase translations from word-based alignments and lexical weighting of phrase translations. Surprisingly, learning phrases longer than three words and learning phrases from high-accuracy word-level alignment models does not have a strong impact on performance. Learning only syntactically motivated phrases degrades the performance of our systems." + }, + { + "title": "Long Short-Term Memory", + "abstract": "Learning to store information over extended time intervals by recurrent backpropagation takes a very long time, mostly because of insufficient, decaying error backflow. We briefly review Hochreiter's (1991) analysis of this problem, then address it by introducing a novel, efficient, gradient based method called long short-term memory (LSTM). Truncating the gradient where this does not do harm, LSTM can learn to bridge minimal time lags in excess of 1000 discrete-time steps by enforcing constant error flow through constant error carousels within special units. Multiplicative gate units learn to open and close access to the constant error flow. LSTM is local in space and time; its computational complexity per time step and weight is O. 1. Our experiments with artificial data involve local, distributed, real-valued, and noisy pattern representations. 
In comparisons with real-time recurrent learning, back propagation through time, recurrent cascade correlation, Elman nets, and neural sequence chunking, LSTM leads to many more successful runs, and learns much faster. LSTM also solves complex, artificial long-time-lag tasks that have never been solved by previous recurrent network algorithms." + }, + { + "title": "Anonymized", + "abstract": null + }, + { + "title": "Two recurrent continuous translation models Association for Computational Linguis- tics", + "abstract": null + }, + { + "title": "Audio Chord Recognition with Recurrent Neural Networks", + "abstract": "In this paper, we present an audio chord recognition system based on a recurrent neural network. The audio features are obtained from a deep neural network optimized with a combination of chromagram targets and chord information, and aggregated over different time scales. Contrarily to other existing approaches, our system incorporates acoustic and musicological models under a single training objective. We devise an efficient algorithm to search for the global mode of the output distribution while taking long-term dependencies into account. The resulting method is competitive with state-of-the-art approaches on the MIREX dataset in the major/minor prediction task." + }, + { + "title": "BLEU Deconstructed: Designing a Better MT Evaluation Metric", + "abstract": "BLEU is the de facto standard automatic evaluation metric in machine translation. While BLEU is undeniably useful, it has a number of limitations. Although it works well for large documents and multiple references, it is unreliable at the sentence or sub-sentence levels, and with a single reference. In this paper, we propose new variants of BLEU which address these limitations, resulting in a more flexible metric which is not only more reliable, but also allows for more accurate discriminative training. 
Our best metric has better correlation with human judgements than standard BLEU, despite using a simpler formulation. Moreover, these improvements carry over to a system tuned for our new metric." + }, + { + "title": "her new position of foreign affairs and security policy representative as a reply to a question: ”Who is the European Union? Which phone number should I call?”;", + "abstract": null + } + ] + }, + "author_data": {}, + "reference_proposal": "**[Question 1] - What is the problem?** \nHow can we analyze the properties and performance limitations of neural machine translation models, particularly in relation to sentence length and vocabulary size?\n\n**[Question 2] - Why is it interesting and important?** \nUnderstanding the strengths and weaknesses of neural machine translation (NMT) models is crucial for advancing the field of machine translation. By identifying the conditions under which these models perform well or fail, researchers can refine existing models and develop new approaches that integrate the strengths of both neural and statistical machine translation systems. This research could lead to improved translation accuracy and efficiency, ultimately benefiting applications in global communication, information access, and cross-lingual understanding.\n\n**[Question 3] - Why is it hard?** \nAnalyzing the performance of NMT models presents several challenges. The complexity of language, including variations in sentence structure and vocabulary, makes it difficult to pinpoint specific factors that influence translation quality. Naive approaches may overlook the nuanced interactions between sentence length and vocabulary size, leading to incomplete or misleading conclusions. 
Additionally, the need for extensive datasets and computational resources to evaluate model performance across diverse linguistic contexts adds to the difficulty of this research.\n\n**[Question 4] - Why hasn't it been solved before?** \nPrevious research on NMT has primarily focused on model development and performance benchmarks, often neglecting a detailed analysis of the models' behavior under varying conditions. Limitations in computational resources and the complexity of language processing have hindered comprehensive studies. Additionally, existing work has not sufficiently explored the interplay between sentence length and vocabulary size, which are critical factors affecting translation performance. Our approach aims to fill these gaps by providing a systematic analysis of these variables in the context of NMT.\n\n**[Question 5] - What are the key components of my approach and results?** \nOur proposed methodology involves a detailed evaluation of two neural machine translation models: the RNN Encoder-Decoder and the gated recursive convolutional neural network (grConv). We will use a dataset of French-English translation pairs and employ metrics such as BLEU score to assess translation quality. We expect to uncover insights into how sentence length and vocabulary size impact translation performance, as well as demonstrate the grConv model's ability to learn syntactic structures without supervision. The anticipated outcomes will contribute to a deeper understanding of NMT and inform future research directions." + }, + "1506.00019": { + "paper_data": { + "title": "A Critical Review of Recurrent Neural Networks for Sequence Learning", + "url": "http://arxiv.org/abs/1506.00019v4", + "arxiv_id": "1506.00019", + "authors": [ + "Zachary C. Lipton", + "John Berkowitz", + "Charles Elkan" + ], + "abstract": "Countless learning tasks require dealing with sequential data. 
Image captioning, speech synthesis, and music generation all require that a model produce outputs that are sequences. In other domains, such as time series prediction, video analysis, and musical information retrieval, a model must learn from inputs that are sequences. Interactive tasks, such as translating natural language, engaging in dialogue, and controlling a robot, often demand both capabilities. Recurrent neural networks (RNNs) are connectionist models that capture the dynamics of sequences via cycles in the network of nodes. Unlike standard feedforward neural networks, recurrent networks retain a state that can represent information from an arbitrarily long context window. Although recurrent neural networks have traditionally been difficult to train, and often contain millions of parameters, recent advances in network architectures, optimization techniques, and parallel computation have enabled successful large-scale learning with them. In recent years, systems based on long short-term memory (LSTM) and bidirectional (BRNN) architectures have demonstrated ground-breaking performance on tasks as varied as image captioning, language translation, and handwriting recognition. In this survey, we review and synthesize the research that over the past three decades first yielded and then made practical these powerful learning models. When appropriate, we reconcile conflicting notation and nomenclature. Our goal is to provide a self-contained explication of the state of the art together with a historical perspective and references to primary research.", + "introduction": " Introduction Neural networks are powerful learning models that achieve state-of-the-art re- sults in a wide range of supervised and unsupervised machine learning tasks. 1arXiv:1506.00019v4 [cs.LG] 17 Oct 2015They are suited especially well for machine perception tasks, where the raw un- derlying features are not individually interpretable. 
This success is attributed to their ability to learn hierarchical representations, unlike traditional meth- ods that rely upon hand-engineered features [Farabet et al., 2013]. Over the past several years, storage has become more a\u000bordable, datasets have grown far larger, and the \feld of parallel computing has advanced considerably. In the setting of large datasets, simple linear models tend to under-\ft, and often under-utilize computing resources. Deep learning methods for online learning and stochastic optimization. The Journal of Machine Learning Research , 12:2121{2159, 2011. Charles Elkan. Learning meanings for sentences. http://cseweb.ucsd.edu/ ~elkan/250B/learningmeaning.pdf , 2015. Accessed: 2015-05-18. Je\u000brey L. Elman. Finding structure in time. Cognitive science , 14(2):179{211, 1990. Clement Farabet, Camille Couprie, Laurent Najman, and Yann LeCun. Learn- ing hierarchical features for scene labeling. Pattern Analysis and Machine Intelligence, IEEE Transactions on , 35(8):1915{1929, 2013. Felix A. Gers. Long short-term memory in recurrent neural networks. Un- published PhD dissertation, \u0013Ecole Polytechnique F\u0013 ed\u0013 erale de Lausanne, Lau- sanne, Switzerland , 2001. Felix A. Gers and J urgen Schmidhuber. Recurrent nets that time and count. In Neural Networks, 2000. IJCNN 2000, Proceedings of the IEEE-INNS-ENNS International Joint Conference on , volume 3, pages 189{194. IEEE, 2000. Felix A. Gers, J urgen Schmidhuber, and Fred Cummins. Learning to forget: Continual prediction with LSTM. Neural computation , 12(10):2451{2471, 2000. Xavier Glorot, Antoine Bordes, and Yoshua Bengio. Deep sparse recti\fer net- works. In Proceedings of the 14th International Conference on Arti\fcial In- telligence and Statistics. JMLR W&CP Volume , volume 15, pages 315{323, 2011. Yoav Goldberg and Omer Levy. word2vec explained: deriving Mikolov et al.'s negative-sampling word-embedding method. arXiv preprint arXiv:1402.3722 , 2014. 33Alex Graves. 
Supervised sequence labelling with recurrent neural networks , vol- ume 385. Springer, 2012. Alex Graves and J urgen Schmidhuber. Framewise phoneme classi\fcation with bidirectional LSTM and other neural network architectures. Neural Networks , 18(5):602{610, 2005. Alex Graves, Marcus Liwicki, Santiago Fern\u0013 andez, Roman Bertolami, Horst Bunke, and J urgen Schmidhuber. A novel connectionist system for uncon- strained handwriting recognition. Pattern Analysis and Machine Intelligence, IEEE Transactions on , 31(5):855{868, 2009. Alex Graves, Greg Wayne, and Ivo Danihelka. Neural Turing machines. arXiv preprint arXiv:1410.5401 , 2014. Frdric Gruau, L'universite Claude Bernard lyon I, Of A Diplome De Doctorat, M. Jacques Demongeot, Examinators M. Michel Cosnard, M. Jacques Ma- zoyer, M. Pierre Peretto, and M. Darell Whitley. Neural network synthesis using cellular encoding and the genetic algorithm., 1994. Steven A. Harp and Tariq Samad. Optimizing neural networks with genetic algorithms. In Proceedings of the 54th American Power Conference, Chicago , volume 2, 2013. Geo\u000brey E. Hinton. Learning distributed representations of concepts, 1986. Sepp Hochreiter and Jurgen Schmidhuber. Bridging long time lags by weight guessing and \\long short-term memory\". Spatiotemporal Models in Biological and Arti\fcial Systems , 37:65{72, 1996. Sepp Hochreiter and J urgen Schmidhuber. Long short-term memory. Neural Computation , 9(8):1735{1780, 1997. Sepp Hochreiter, Yoshua Bengio, Paolo Frasconi, and J urgen Schmidhuber. Gra- dient \row in recurrent nets: the di\u000eculty of learning long-term dependencies. A \feld guide to dynamical recurrent neural networks , 2001. John J. Hop\feld. Neural networks and physical systems with emergent collective computational abilities. Proceedings of the National Academy of Sciences , 79 (8):2554{2558, 1982. 
Yangqing Jia, Evan", + "references": [ + { + "title": "Efficient Elastic Net Regularization for Sparse Linear Models", + "abstract": "We extend previous work on efficiently training linear model s by applying stochastic updates to non-zero features only, lazily bring ing weights current as needed. To date, only the closed form updates for the l1, l1, and the rarely used l2 norm have been described. We extend this work by showing the proper closed form updates for the popular l 2 and elastic net regularized models. We show a dynamic programming algorithm to calculate the proper elastic net update with only one constant-time subproblem computation per update. Our algorithm handles both fixed and decreasing learning rates and we derive th e result for both stochastic gradient descent (SGD) and forward backward splitting (FoBoS) . We empirically validate the algorithm, showing that on a bag-of-words dataset with 260, 941 features and 88 nonzero features on average per example, our method trains a logistic regression classifier with elastic net reg ularization 612 times faster than an otherwise identical implementation with dense updates." + }, + { + "title": "Sequence to Sequence -- Video to Text", + "abstract": "Real-world videos often have complex dynamics, methods for generating open-domain video descriptions should be sensitive to temporal structure and allow both input (sequence of frames) and output (sequence of words) of variable length. To approach this problem we propose a novel end-to-end sequence-to-sequence model to generate captions for videos. For this we exploit recurrent neural networks, specifically LSTMs, which have demonstrated state-of-the-art performance in image caption generation. Our LSTM model is trained on video-sentence pairs and learns to associate a sequence of video frames to a sequence of words in order to generate a description of the event in the video clip. 
Our model naturally is able to learn the temporal structure of the sequence of frames as well as the sequence model of the generated sentences, i.e. a language model. We evaluate several variants of our model that exploit different visual features on a standard set of YouTube videos and two movie description datasets (M-VAD and MPII-MD)." + }, + { + "title": "A survey on the application of recurrent neural networks to statistical language modeling", + "abstract": null + }, + { + "title": "Unsupervised Learning of Video Representations using LSTMs", + "abstract": "We use Long Short Term Memory (LSTM) networks to learn representations of video sequences. Our model uses an encoder LSTM to map an input sequence into a fixed length representation. This representation is decoded using single or multiple decoder LSTMs to perform different tasks, such as reconstructing the input sequence, or predicting the future sequence. We experiment with two kinds of input sequences - patches of image pixels and high-level representations (\"percepts\") of video frames extracted using a pretrained convolutional net. We explore different design choices such as whether the decoder LSTMs should condition on the generated output. We analyze the outputs of the model qualitatively to see how well the model can extrapolate the learned video representation into the future and into the past. We further evaluate the representations by finetuning them for a supervised learning problem - human action recognition on the UCF-101 and HMDB-51 datasets. We show that the representations help improve classification accuracy, especially when there are only few training examples. Even models pretrained on unrelated datasets (300 hours of YouTube videos) can help action recognition performance." 
+ }, + { + "title": "Deep Captioning with Multimodal Recurrent Neural Networks (m-RNN)", + "abstract": "In this paper, we present a multimodal Recurrent Neural Network (m-RNN) model for generating novel image captions. It directly models the probability distribution of generating a word given previous words and an image. Image captions are generated by sampling from this distribution. The model consists of two sub-networks: a deep recurrent neural network for sentences and a deep convolutional network for images. These two sub-networks interact with each other in a multimodal layer to form the whole m-RNN model. The effectiveness of our model is validated on four benchmark datasets (IAPR TC-12, Flickr 8K, Flickr 30K and MS COCO). Our model outperforms the state-of-the-art methods. In addition, we apply the m-RNN model to retrieval tasks for retrieving images or sentences, and achieves significant performance improvement over the state-of-the-art methods which directly optimize the ranking objective function for retrieval. The project page of this work is: www.stat.ucla.edu/~junhua.mao/m-RNN.html ." + }, + { + "title": "Deep visual-semantic alignments for generating image descriptions", + "abstract": "We present a model that generates natural language descriptions of images and their regions. Our approach leverages datasets of images and their sentence descriptions to learn about the inter-modal correspondences between language and visual data. Our alignment model is based on a novel combination of Convolutional Neural Networks over image regions, bidirectional Recurrent Neural Networks over sentences, and a structured objective that aligns the two modalities through a multimodal embedding. We then describe a Multimodal Recurrent Neural Network architecture that uses the inferred alignments to learn to generate novel descriptions of image regions. 
We demonstrate that our alignment model produces state of the art results in retrieval experiments on Flickr8K, Flickr30K and MSCOCO datasets. We then show that the generated descriptions significantly outperform retrieval baselines on both full images and on a new dataset of region-level annotations." + }, + { + "title": "Show and tell: A neural image caption generator", + "abstract": "Automatically describing the content of an image is a fundamental problem in artificial intelligence that connects computer vision and natural language processing. In this paper, we present a generative model based on a deep recurrent architecture that combines recent advances in computer vision and machine translation and that can be used to generate natural sentences describing an image. The model is trained to maximize the likelihood of the target description sentence given the training image. Experiments on several datasets show the accuracy of the model and the fluency of the language it learns solely from image descriptions. Our model is often quite accurate, which we verify both qualitatively and quantitatively. For instance, while the current state-of-the-art BLEU-1 score (the higher the better) on the Pascal dataset is 25, our approach yields 59, to be compared to human performance around 69. We also show BLEU-1 score improvements on Flickr30k, from 56 to 66, and on SBU, from 19 to 28. Lastly, on the newly released COCO dataset, we achieve a BLEU-4 of 27.7, which is the current state-of-the-art." + }, + { + "title": "Neural Turing Machines", + "abstract": "We extend the capabilities of neural networks by coupling them to external memory resources, which they can interact with by attentional processes. The combined system is analogous to a Turing Machine or Von Neumann architecture but is differentiable end-toend, allowing it to be efficiently trained with gradient descent. 
Preliminary results demonstrate that Neural Turing Machines can infer simple algorithms such as copying, sorting, and associative recall from input and output examples." + }, + { + "title": "Learning to Execute", + "abstract": "Recurrent Neural Networks (RNNs) with Long Short-Term Memory units (LSTM) are widely used because they are expressive and are easy to train. Our interest lies in empirically evaluating the expressiveness and the learnability of LSTMs in the sequence-to-sequence regime by training them to evaluate short computer programs, a domain that has traditionally been seen as too complex for neural networks. We consider a simple class of programs that can be evaluated with a single left-to-right pass using constant memory. Our main result is that LSTMs can learn to map the character-level representations of such programs to their correct outputs. Notably, it was necessary to use curriculum learning, and while conventional curriculum learning proved ineffective, we developed a new variant of curriculum learning that improved our networks' performance in all experimental conditions. The improved curriculum had a dramatic impact on an addition problem, making it possible to train an LSTM to add two 9-digit numbers with 99% accuracy." + }, + { + "title": "GloVe: Global Vectors for Word Representation", + "abstract": "Recent methods for learning vector space representations of words have succeeded in capturing fine-grained semantic and syntactic regularities using vector arithmetic, but the origin of these regularities has remained opaque. We analyze and make explicit the model properties needed for such regularities to emerge in word vectors. The result is a new global logbilinear regression model that combines the advantages of the two major model families in the literature: global matrix factorization and local context window methods. 
Our model efficiently leverages statistical information by training only on the nonzero elements in a word-word cooccurrence matrix, rather than on the entire sparse matrix or on individual context windows in a large corpus. The model produces a vector space with meaningful substructure, as evidenced by its performance of 75% on a recent word analogy task. It also outperforms related models on similarity tasks and named entity recognition." + }, + { + "title": "Deeply-Supervised Nets", + "abstract": "Our proposed deeply-supervised nets (DSN) method simultaneously minimizes classification error while making the learning process of hidden layers direct and transparent. We make an attempt to boost the classification performance by studying a new formulation in deep networks. Three aspects in convolutional neural networks (CNN) style architectures are being looked at: (1) transparency of the intermediate layers to the overall classification; (2) discriminativeness and robustness of learned features, especially in the early layers; (3) effectiveness in training due to the presence of the exploding and vanishing gradients. We introduce \"companion objective\" to the individual hidden layers, in addition to the overall objective at the output layer (a different strategy to layer-wise pre-training). We extend techniques from stochastic gradient methods to analyze our algorithm. The advantage of our method is evident and our experimental result on benchmark datasets shows significant performance gain over existing methods (e.g. all state-of-the-art results on MNIST, CIFAR-10, CIFAR-100, and SVHN)." + }, + { + "title": "Optimal Thresholding of Classifiers to Maximize F1 Measure", + "abstract": null + }, + { + "title": "Sequence to Sequence Learning with Neural Networks", + "abstract": "Deep Neural Networks (DNNs) are powerful models that have achieved excellent performance on difficult learning tasks. 
Although DNNs work well whenever large labeled training sets are available, they cannot be used to map sequences to sequences. In this paper, we present a general end-to-end approach to sequence learning that makes minimal assumptions on the sequence structure. Our method uses a multilayered Long Short-Term Memory (LSTM) to map the input sequence to a vector of a fixed dimensionality, and then another deep LSTM to decode the target sequence from the vector. Our main result is that on an English to French translation task from the WMT-14 dataset, the translations produced by the LSTM achieve a BLEU score of 34.8 on the entire test set, where the LSTM's BLEU score was penalized on out-of-vocabulary words. Additionally, the LSTM did not have difficulty on long sentences. For comparison, a phrase-based SMT system achieves a BLEU score of 33.3 on the same dataset. When we used the LSTM to rerank the 1000 hypotheses produced by the aforementioned SMT system, its BLEU score increases to 36.5, which is close to the previous state of the art. The LSTM also learned sensible phrase and sentence representations that are sensitive to word order and are relatively invariant to the active and the passive voice. Finally, we found that reversing the order of the words in all source sentences (but not target sentences) improved the LSTM's performance markedly, because doing so introduced many short term dependencies between the source and the target sentence which made the optimization problem easier." + }, + { + "title": "Caffe: Convolutional Architecture for Fast Feature Embedding", + "abstract": "Caffe provides multimedia scientists and practitioners with a clean and modifiable framework for state-of-the-art deep learning algorithms and a collection of reference models. The framework is a BSD-licensed C++ library with Python and MATLAB bindings for training and deploying general-purpose convolutional neural networks and other deep models efficiently on commodity architectures. 
Caffe fits industry and internet-scale media needs by CUDA GPU computation, processing over 40 million images a day on a single K40 or Titan GPU (approx 2 ms per image). By separating model representation from actual implementation, Caffe allows experimentation and seamless switching among platforms for ease of development and deployment from prototyping machines to cloud environments. Caffe is maintained and developed by the Berkeley Vision and Learning Center (BVLC) with the help of an active community of contributors on GitHub. It powers ongoing research projects, large-scale industrial applications, and startup prototypes in vision, speech, and multimedia." + }, + { + "title": "Identifying and attacking the saddle point problem in high-dimensional non-convex optimization", + "abstract": "A central challenge to many fields of science and engineering involves minimizing non-convex error functions over continuous, high dimensional spaces. Gradient descent or quasi-Newton methods are almost ubiquitously used to perform such minimizations, and it is often thought that a main source of difficulty for these local methods to find the global minimum is the proliferation of local minima with much higher error than the global minimum. Here we argue, based on results from statistical physics, random matrix theory, neural network theory, and empirical evidence, that a deeper and more profound difficulty originates from the proliferation of saddle points, not local minima, especially in high dimensional problems of practical interest. Such saddle points are surrounded by high error plateaus that can dramatically slow down learning, and give the illusory impression of the existence of a local minimum. Motivated by these arguments, we propose a new approach to second-order optimization, the saddle-free Newton method, that can rapidly escape high dimensional saddle points, unlike gradient descent and quasi-Newton methods. 
We apply this algorithm to deep or recurrent neural network training, and provide numerical evidence for its superior optimization performance." + }, + { + "title": "Grounded Compositional Semantics for Finding and Describing Images with Sentences", + "abstract": "Previous work on Recursive Neural Networks (RNNs) shows that these models can produce compositional feature vectors for accurately representing and classifying sentences or images. However, the sentence vectors of previous models cannot accurately represent visually grounded meaning. We introduce the DT-RNN model which uses dependency trees to embed sentences into a vector space in order to retrieve images that are described by those sentences. Unlike previous RNN-based models which use constituency trees, DT-RNNs naturally focus on the action and agents in a sentence. They are better able to abstract from the details of word order and syntactic expression. DT-RNNs outperform other recursive and recurrent neural networks, kernelized CCA and a bag-of-words baseline on the tasks of finding an image that fits a sentence description and vice versa. They also give more similar representations to sentences that describe the same image." + }, + { + "title": "word2vec Explained: deriving Mikolov et al.'s negative-sampling word-embedding method", + "abstract": "The word2vec software of Tomas Mikolov and colleagues (this https URL ) has gained a lot of traction lately, and provides state-of-the-art word embeddings. The learning models behind the software are described in two research papers. We found the description of the models in these papers to be somewhat cryptic and hard to follow. While the motivations and presentation may be obvious to the neural-networks language-modeling crowd, we had to struggle quite a bit to figure out the rationale behind the equations. 
\nThis note is an attempt to explain equation (4) (negative sampling) in \"Distributed Representations of Words and Phrases and their Compositionality\" by Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado and Jeffrey Dean." + }, + { + "title": "Wikipedia", + "abstract": "The task of this article is to analyze the political economy of Wikipedia. We discuss the specifics of Wikipedia’s mode of production. The basic principles of what we call the info-communist mode of production will be presented. Our analysis is grounded in Marxist philosophy and Marxist political economy, and is connected to the current discourse about the renewal and reloading of the idea of communism that is undertaken by thinkers like Slavoj Žižek and Alain Badiou. We explore to which extent Wikipedia encompasses principles that go beyond the capitalist mode of production and represent the info-communist mode of production. We present the subjective dimension of the mode of production (cooperative labor), the objective dimension of the mode of production (common ownership of the means of production), and the subject–object dimension of the mode of production (the effects and products of the mode of production)." + }, + { + "title": "Joint Language and Translation Modeling with Recurrent Neural Networks", + "abstract": "We present a joint language and translation model based on a recurrent neural network which predicts target words based on an unbounded history of both source and target words. The weaker independence assumptions of this model result in a vastly larger search space compared to related feedforward-based language or translation models. We tackle this issue with a new lattice rescoring algorithm and demonstrate its effectiveness empirically. Our joint model builds on a well known recurrent neural network language model (Mikolov, 2012) augmented by a layer of additional inputs from the source language. 
We show competitive accuracy compared to the traditional channel model features. Our best results improve the output of a system trained on WMT 2012 French-English data by up to 1.5 BLEU, and by 1.1 BLEU on average across several test sets." + }, + { + "title": "Learning Hierarchical Features for Scene Labeling", + "abstract": "Scene labeling consists of labeling each pixel in an image with the category of the object it belongs to. We propose a method that uses a multiscale convolutional network trained from raw pixels to extract dense feature vectors that encode regions of multiple sizes centered on each pixel. The method alleviates the need for engineered features, and produces a powerful representation that captures texture, shape, and contextual information. We report results using multiple postprocessing methods to produce the final labeling. Among those, we propose a technique to automatically retrieve, from a pool of segmentation components, an optimal set of components that best explain the scene; these components are arbitrary, for example, they can be taken from a segmentation tree or from any family of oversegmentations. The system yields record accuracies on the SIFT Flow dataset (33 classes) and the Barcelona dataset (170 classes) and near-record accuracy on Stanford background dataset (eight classes), while being an order of magnitude faster than competing approaches, producing a 320×240 image labeling in less than a second, including feature extraction." + }, + { + "title": "On the importance of initialization and momentum in deep learning", + "abstract": "Deep and recurrent neural networks (DNNs and RNNs respectively) are powerful models that were considered to be almost impossible to train using stochastic gradient descent with momentum. 
In this paper, we show that when stochastic gradient descent with momentum uses a well-designed random initialization and a particular type of slowly increasing schedule for the momentum parameter, it can train both DNNs and RNNs (on datasets with long-term dependencies) to levels of performance that were previously achievable only with Hessian-Free optimization. We find that both the initialization and the momentum are crucial since poorly initialized networks cannot be trained with momentum and well-initialized networks perform markedly worse when the momentum is absent or poorly tuned. \n \nOur success training these models suggests that previous attempts to train deep and recurrent neural networks from random initializations have likely failed due to poor initialization schemes. Furthermore, carefully tuned momentum methods suffice for dealing with the curvature issues in deep and recurrent network training objectives without the need for sophisticated second-order methods." + }, + { + "title": "On rectified linear units for speech processing", + "abstract": "Deep neural networks have recently become the gold standard for acoustic modeling in speech recognition systems. The key computational unit of a deep network is a linear projection followed by a point-wise non-linearity, which is typically a logistic function. In this work, we show that we can improve generalization and make training of deep networks faster and simpler by substituting the logistic units with rectified linear units. These units are linear when their input is positive and zero otherwise. In a supervised setting, we can successfully train very deep nets from random initialization on a large vocabulary speech recognition task achieving lower word error rates than using a logistic network with the same topology. Similarly in an unsupervised setting, we show how we can learn sparse features that can be useful for discriminative tasks. 
All our experiments are executed in a distributed environment using several hundred machines and several hundred hours of speech data." + }, + { + "title": "Efficient Estimation of Word Representations in Vector Space", + "abstract": "We propose two novel model architectures for computing continuous vector\nrepresentations of words from very large data sets. The quality of these\nrepresentations is measured in a word similarity task, and the results are\ncompared to the previously best performing techniques based on different types\nof neural networks. We observe large improvements in accuracy at much lower\ncomputational cost, i.e. it takes less than a day to learn high quality word\nvectors from a 1.6 billion words data set. Furthermore, we show that these\nvectors provide state-of-the-art performance on our test set for measuring\nsyntactic and semantic word similarities." + }, + { + "title": "ADADELTA: An Adaptive Learning Rate Method", + "abstract": "We present a novel per-dimension learning rate method for gradient descent called ADADELTA. The method dynamically adapts over time using only first order information and has minimal computational overhead beyond vanilla stochastic gradient descent. The method requires no manual tuning of a learning rate and appears robust to noisy gradient information, different model architecture choices, various data modalities and selection of hyperparameters. We show promising results compared to other methods on the MNIST digit classification task using a single machine and on a large scale voice dataset in a distributed cluster environment." + }, + { + "title": "Advances in optimizing recurrent networks", + "abstract": "After a more than decade-long period of relatively little research activity in the area of recurrent neural networks, several new developments will be reviewed here that have allowed substantial progress both in understanding and in technical solutions towards more efficient training of recurrent networks. 
These advances have been motivated by and related to the optimization issues surrounding deep learning. Although recurrent networks are extremely powerful in what they can in principle represent in terms of modeling sequences, their training is plagued by two aspects of the same issue regarding the learning of long-term dependencies. Experiments reported here evaluate the use of clipping gradients, spanning longer time ranges with leaky integration, advanced momentum techniques, using more powerful output probability models, and encouraging sparser gradients to help symmetry breaking and credit assignment. The experiments are performed on text and music data and show off the combined effects of these techniques in generally improving both training and test error." + }, + { + "title": "ImageNet classification with deep convolutional neural networks", + "abstract": "We trained a large, deep convolutional neural network to classify the 1.2 million high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 37.5% and 17.0%, respectively, which is considerably better than the previous state-of-the-art. The neural network, which has 60 million parameters and 650,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and three fully connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU implementation of the convolution operation. To reduce overfitting in the fully connected layers we employed a recently developed regularization method called \"dropout\" that proved to be very effective. We also entered a variant of this model in the ILSVRC-2012 competition and achieved a winning top-5 test error rate of 15.3%, compared to 26.2% achieved by the second-best entry." 
+ }, + { + "title": "On the difficulty of training recurrent neural networks", + "abstract": "There are two widely known issues with properly training recurrent neural networks, the vanishing and the exploding gradient problems detailed in Bengio et al. (1994). In this paper we attempt to improve the understanding of the underlying issues by exploring these problems from an analytical, a geometric and a dynamical systems perspective. Our analysis is used to justify a simple yet effective solution. We propose a gradient norm clipping strategy to deal with exploding gradients and a soft constraint for the vanishing gradients problem. We validate empirically our hypothesis and proposed solutions in the experimental section." + }, + { + "title": "Supervised Sequence Labelling with Recurrent Neural Networks", + "abstract": null + }, + { + "title": "Dynamic Pooling and Unfolding Recursive Autoencoders for Paraphrase Detection", + "abstract": "Paraphrase detection is the task of examining two sentences and determining whether they have the same meaning. In order to obtain high accuracy on this task, thorough syntactic and semantic analysis of the two statements is needed. We introduce a method for paraphrase detection based on recursive autoencoders (RAE). Our unsupervised RAEs are based on a novel unfolding objective and learn feature vectors for phrases in syntactic trees. These features are used to measure the word- and phrase-wise similarity between two sentences. Since sentences may be of arbitrary length, the resulting matrix of similarity measures is of variable size. We introduce a novel dynamic pooling layer which computes a fixed-sized representation from the variable-sized matrices. The pooled representation is then used as input to a classifier. Our method outperforms other state-of-the-art approaches on the challenging MSRP paraphrase corpus." 
+ }, + { + "title": "Semi-Supervised Recursive Autoencoders for Predicting Sentiment Distributions", + "abstract": "We introduce a novel machine learning framework based on recursive autoencoders for sentence-level prediction of sentiment label distributions. Our method learns vector space representations for multi-word phrases. In sentiment prediction tasks these representations outperform other state-of-the-art approaches on commonly used datasets, such as movie reviews, without using any pre-defined sentiment lexica or polarity shifting rules. We also evaluate the model's ability to predict sentiment distributions on a new dataset based on confessions from the experience project. The dataset consists of personal user stories annotated with multiple labels which, when aggregated, form a multinomial distribution that captures emotional reactions. Our algorithm can more accurately predict distributions over such labels compared to several competitive baselines." + }, + { + "title": "Generating Text with Recurrent Neural Networks", + "abstract": "Recurrent Neural Networks (RNNs) are very powerful sequence models that do not enjoy widespread use because it is extremely difficult to train them properly. Fortunately, recent advances in Hessian-free optimization have been able to overcome the difficulties associated with training RNNs, making it possible to apply them successfully to challenging sequence problems. In this paper we demonstrate the power of RNNs trained with the new Hessian-Free optimizer (HF) by applying them to character-level language modeling tasks. The standard RNN architecture, while effective, is not ideally suited for such tasks, so we introduce a new RNN variant that uses multiplicative (or \"gated\") connections which allow the current input character to determine the transition matrix from one hidden state vector to the next. 
After training the multiplicative RNN with the HF optimizer for five days on 8 high-end Graphics Processing Units, we were able to surpass the performance of the best previous single method for character-level language modeling – a hierarchical non-parametric sequence model. To our knowledge this represents the largest recurrent neural network application to date." + }, + { + "title": "Parsing Natural Scenes and Natural Language with Recursive Neural Networks", + "abstract": "Recursive structure is commonly found in the inputs of different modalities such as natural scene images or natural language sentences. Discovering this recursive structure helps us to not only identify the units that an image or sentence contains but also how they interact to form a whole. We introduce a max-margin structure prediction architecture based on recursive neural networks that can successfully recover such structure both in complex scene images as well as sentences. The same algorithm can be used both to provide a competitive syntactic parser for natural language sentences from the Penn Treebank and to outperform alternative approaches for semantic scene segmentation, annotation and classification. For segmentation and annotation our algorithm obtains a new level of state-of-the-art performance on the Stanford background dataset (78.1%). The features from the image parse tree outperform Gist descriptors for scene classification by 4%." + }, + { + "title": "Learning Recurrent Neural Networks with Hessian-Free Optimization", + "abstract": "In this work we resolve the long-outstanding problem of how to effectively train recurrent neural networks (RNNs) on complex and difficult sequence modeling problems which may contain long-term data dependencies. Utilizing recent advances in the Hessian-free optimization approach (Martens, 2010), together with a novel damping scheme, we successfully train RNNs on two sets of challenging problems. 
First, a collection of pathological synthetic datasets which are known to be impossible for standard optimization approaches (due to their extremely long-term dependencies), and second, on three natural and highly complex real-world sequence datasets where we find that our method significantly outperforms the previous state-of-the-art method for training neural sequence models: the Long Short-term Memory approach of Hochreiter and Schmidhuber (1997). Additionally, we offer a new interpretation of the generalized Gauss-Newton matrix of Schraudolph (2002) which is used within the HF approach of Martens." + }, + { + "title": "Adaptive Subgradient Methods for Online Learning and Stochastic Optimization", + "abstract": "We present a new family of subgradient methods that dynamically incorporate knowledge of the geometry of the data observed in earlier iterations to perform more informative gradient-based learning. Metaphorically, the adaptation allows us to find needles in haystacks in the form of very predictive but rarely seen features. Our paradigm stems from recent advances in stochastic optimization and online learning which employ proximal functions to control the gradient steps of the algorithm. We describe and analyze an apparatus for adaptively modifying the proximal function, which significantly simplifies setting a learning rate and results in regret guarantees that are provably as good as the best proximal function that can be chosen in hindsight. We give several efficient algorithms for empirical risk minimization problems with common and important regularization functions and domain constraints. We experimentally study our theoretical analysis and show that adaptive subgradient methods outperform state-of-the-art, yet non-adaptive, subgradient algorithms." + }, + { + "title": "Rectified Linear Units Improve Restricted Boltzmann Machines", + "abstract": "Restricted Boltzmann machines were developed using binary stochastic hidden units. 
These can be generalized by replacing each binary unit by an infinite number of copies that all have the same weights but have progressively more negative biases. The learning and inference rules for these \"Stepped Sigmoid Units\" are unchanged. They can be approximated efficiently by noisy, rectified linear units. Compared with binary units, these units learn features that are better for object recognition on the NORB dataset and face verification on the Labeled Faces in the Wild dataset. Unlike binary units, rectified linear units preserve information about relative intensities as information travels through multiple layers of feature detectors." + }, + { + "title": "Efficient Learning using Forward-Backward Splitting", + "abstract": "We describe, analyze, and experiment with a new framework for empirical loss minimization with regularization. Our algorithmic framework alternates between two phases. On each iteration we first perform an unconstrained gradient descent step. We then cast and solve an instantaneous optimization problem that trades off minimization of a regularization term while keeping close proximity to the result of the first phase. This yields a simple yet effective algorithm for both batch penalized risk minimization and online learning. Furthermore, the two phase approach enables sparse solutions when used in conjunction with regularization functions that promote sparsity, such as l1. We derive concrete and very simple algorithms for minimization of loss functions with l1, l2, l22, and l∞ regularization. We also show how to construct efficient algorithms for mixed-norm l1/lq regularization. We further extend the algorithms and give efficient implementations for very high-dimensional data with sparsity. We demonstrate the potential of the proposed framework in experiments with synthetic and natural datasets." 
+ }, + { + "title": "Evolving Memory Cell Structures for Sequence Learning", + "abstract": null + }, + { + "title": "A Novel Connectionist System for Unconstrained Handwriting Recognition", + "abstract": "Recognizing lines of unconstrained handwritten text is a challenging task. The difficulty of segmenting cursive or overlapping characters, combined with the need to exploit surrounding context, has led to low recognition rates for even the best current recognizers. Most recent progress in the field has been made either through improved preprocessing or through advances in language modeling. Relatively little work has been done on the basic recognition algorithms. Indeed, most systems rely on the same hidden Markov models that have been used for decades in speech and handwriting recognition, despite their well-known shortcomings. This paper proposes an alternative approach based on a novel type of recurrent neural network, specifically designed for sequence labeling tasks where the data is hard to segment and contains long-range bidirectional interdependencies. In experiments on two large unconstrained handwriting databases, our approach achieves word recognition accuracies of 79.7 percent on online data and 74.1 percent on offline data, significantly outperforming a state-of-the-art HMM-based system. In addition, we demonstrate the network's robustness to lexicon size, measure the individual influence of its hidden layers, and analyze its use of context. Last, we provide an in-depth discussion of the differences between the network and HMMs, suggesting reasons for the network's superior performance." + }, + { + "title": "Sparse Online Learning via Truncated Gradient", + "abstract": "We propose a general method called truncated gradient to induce sparsity in the weights of online-learning algorithms with convex loss. This method has several essential properties. 
First, the degree of sparsity is continuous—a parameter controls the rate of sparsification from no sparsification to total sparsification. Second, the approach is theoretically motivated, and an instance of it can be regarded as an online counterpart of the popular L1-regularization method in the batch setting. We prove small rates of sparsification result in only small additional regret with respect to typical online-learning guarantees. Finally, the approach works well empirically. We apply it to several datasets and find for datasets with large numbers of features, substantial sparsity is discoverable." + }, + { + "title": "METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments", + "abstract": "We describe METEOR, an automatic metric for machine translation evaluation that is based on a generalized concept of unigram matching between the machineproduced translation and human-produced reference translations. Unigrams can be matched based on their surface forms, stemmed forms, and meanings; furthermore, METEOR can be easily extended to include more advanced matching strategies. Once all generalized unigram matches between the two strings have been found, METEOR computes a score for this matching using a combination of unigram-precision, unigram-recall, and a measure of fragmentation that is designed to directly capture how well-ordered the matched words in the machine translation are in relation to the reference. We evaluate METEOR by measuring the correlation between the metric scores and human judgments of translation quality. We compute the Pearson R correlation value between its scores and human quality assessments of the LDC TIDES 2003 Arabic-to-English and Chinese-to-English datasets. We perform segment-bysegment correlation, and show that METEOR gets an R correlation value of 0.347 on the Arabic data and 0.331 on the Chinese data. 
This is shown to be an improvement on using simply unigramprecision, unigram-recall and their harmonic F1 combination. We also perform experiments to show the relative contributions of the various mapping modules." + }, + { + "title": "2005 Special Issue: Framewise phoneme classification with bidirectional LSTM and other neural network architectures", + "abstract": null + }, + { + "title": "A Neural Probabilistic Language Model", + "abstract": "A goal of statistical language modeling is to learn the joint probability function of sequences of words. This is intrinsically difficult because of the curse of dimensionality: we propose to fight it with its own weapons. In the proposed approach one learns simultaneously (1) a distributed representation for each word (i.e. a similarity between words) along with (2) the probability function for word sequences, expressed with these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar to words forming an already seen sentence. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach very significantly improves on a state-of-the-art trigram model." + }, + { + "title": "Bleu: a Method for Automatic Evaluation of Machine Translation", + "abstract": "Human evaluations of machine translation are extensive but expensive. Human evaluations can take months to finish and involve human labor that can not be reused. We propose a method of automatic machine translation evaluation that is quick, inexpensive, and language-independent, that correlates highly with human evaluation, and that has little marginal cost per run. We present this method as an automated understudy to skilled human judges which substitutes for them when there is need for quick or frequent evaluations." 
+ }, + { + "title": "Learning to Forget: Continual Prediction with LSTM", + "abstract": "Long short-term memory (LSTM; Hochreiter & Schmidhuber, 1997) can solve numerous tasks not solvable by previous learning algorithms for recurrent neural networks (RNNs). We identify a weakness of LSTM networks processing continual input streams that are not a priori segmented into subsequences with explicitly marked ends at which the network's internal state could be reset. Without resets, the state may grow indefinitely and eventually cause the network to break down. Our remedy is a novel, adaptive forget gate that enables an LSTM cell to learn to reset itself at appropriate times, thus releasing internal resources. We review illustrative benchmark problems on which standard LSTM outperforms other RNN algorithms. All algorithms (including LSTM) fail to solve continual versions of these problems. LSTM with forget gates, however, easily solves them, and in an elegant way." + }, + { + "title": "Recurrent nets that time and count", + "abstract": "The size of the time intervals between events conveys information essential for numerous sequential tasks such as motor control and rhythm detection. While hidden Markov models tend to ignore this information, recurrent neural networks (RNN) can in principle learn to make use of it. We focus on long short-term memory (LSTM) because it usually outperforms other RNN. Surprisingly, LSTM augmented by \"peephole connections\" from its internal cells to its multiplicative gates can learn the fine distinction between sequences of spikes separated by either 50 or 49 discrete time steps, without the help of any short training exemplars. Without external resets or teacher forcing or loss of performance on tasks reported earlier, our LSTM variant also learns to generate very stable sequences of highly nonlinear, precisely timed spikes. This makes LSTM a promising approach for real-world tasks that require to time and count." 
+ }, + { + "title": "Long Short-Term Memory", + "abstract": "Learning to store information over extended time intervals by recurrent backpropagation takes a very long time, mostly because of insufficient, decaying error backflow. We briefly review Hochreiter's (1991) analysis of this problem, then address it by introducing a novel, efficient, gradient based method called long short-term memory (LSTM). Truncating the gradient where this does not do harm, LSTM can learn to bridge minimal time lags in excess of 1000 discrete-time steps by enforcing constant error flow through constant error carousels within special units. Multiplicative gate units learn to open and close access to the constant error flow. LSTM is local in space and time; its computational complexity per time step and weight is O(1). Our experiments with artificial data involve local, distributed, real-valued, and noisy pattern representations. In comparisons with real-time recurrent learning, back propagation through time, recurrent cascade correlation, Elman nets, and neural sequence chunking, LSTM leads to many more successful runs, and learns much faster. LSTM also solves complex, artificial long-time-lag tasks that have never been solved by previous recurrent network algorithms." + }, + { + "title": "Bidirectional recurrent neural networks", + "abstract": "In the first part of this paper, a regular recurrent neural network (RNN) is extended to a bidirectional recurrent neural network (BRNN). The BRNN can be trained without the limitation of using input information just up to a preset future frame. This is accomplished by training it simultaneously in positive and negative time direction. Structure and training procedure of the proposed network are explained. In regression and classification experiments on artificial data, the proposed structure gives better results than other approaches. For real data, classification experiments for phonemes from the TIMIT database show the same tendency. 
In the second part of this paper, it is shown how the proposed bidirectional structure can be easily modified to allow efficient estimation of the conditional posterior probability of complete symbol sequences without making any explicit assumption about the shape of the distribution. For this part, experiments on real data are reported." + }, + { + "title": "Working memory and executive control.", + "abstract": "A major problem in analysing the executive processes that seem to depend upon the prefrontal cortex stems from the absence of a well developed cognitive model of such processes. It is suggested that the central executive component of an earlier model of working memory might provide a suitable framework for such an analysis. The approach is illustrated using one proposed component of executive control, namely the capacity to combine two concurrent tasks. The application of the approach to patients suffering from Alzheimer's disease, and patients with acquired brain damage is discussed. Finally, a study is described in which the dual task performance of patients with known frontal lesions is shown to be associated with observed behavioural problems. The paper concludes with the discussion of the prospects for extending the approach to include a range of other executive processes, and to the way in which such an analysis may subsequently lead to a more integrated model of the central executive, and a better understanding of its relationship to the prefrontal cortex." + }, + { + "title": "Gradient calculations for dynamic recurrent neural networks: a survey", + "abstract": "Surveys learning algorithms for recurrent neural networks with hidden units and puts the various techniques into a common framework. The authors discuss fixed point learning algorithms, namely recurrent backpropagation and deterministic Boltzmann machines, and nonfixed point algorithms, namely backpropagation through time, Elman's history cutoff, and Jordan's output feedback architecture. 
Forward propagation, an on-line technique that uses adjoint equations, and variations thereof, are also discussed. In many cases, the unified presentation leads to generalizations of various sorts. The author discusses advantages and disadvantages of temporally continuous neural networks in contrast to clocked ones continues with some \"tricks of the trade\" for training, using, and simulating continuous time and recurrent neural networks. The author presents some simulations, and at the end, addresses issues of computational complexity and learning speed." + }, + { + "title": "Learning long-term dependencies with gradient descent is difficult", + "abstract": "Recurrent neural networks can be used to map input sequences to output sequences, such as for recognition, production or prediction problems. However, practical difficulties have been reported in training recurrent neural networks to perform tasks in which the temporal contingencies present in the input/output sequences span long intervals. We show why gradient based learning algorithms face an increasingly difficult problem as the duration of the dependencies to be captured increases. These results expose a trade-off between efficient learning by gradient descent and latching on information for long periods. Based on an understanding of this problem, alternatives to standard gradient descent are considered." + }, + { + "title": "Original Contribution: Training a 3-node neural network is NP-complete", + "abstract": null + }, + { + "title": "Backpropagation Through Time: What It Does and How to Do It", + "abstract": "Backpropagation is now the most widely used tool in the field of artificial neural networks. 
At the core of backpropagation is a method for calculating derivatives exactly and efficiently in any large system made up of elementary subsystems or calculations which are represented by known, differentiable functions; thus, backpropagation has many applications which do not involve neural networks as such. This paper first reviews basic backpropagation, a simple method which is now being widely used in areas like pattern recognition and fault diagnosis. Next, it presents the basic equations for backpropagation through time, and discusses applications to areas like pattern recognition involving dynamic systems, systems identification, and control. Finally, i t describes further extensions of this method, to deal with systems other than neural networks, systems involving simultaneous equations or true recurrent networks, and other practical issues which arise with this method. Pseudocode is provided to clarify the algorithms. The chain rule for ordered derivatives-the theorem which underlies backpropagation-is briefly discussed." + }, + { + "title": "Finding Structure in Time", + "abstract": "Time underlies many interesting human behaviors. Thus, the question of how to represent time in connectionist models is very important. One approach is to represent time implicitly by its effects on processing rather than explicitly (as in a spatial representation). The current report develops a proposal along these lines first described by Jordan (1986) which involves the use of recurrent links in order to provide networks with a dynamic memory. In this approach, hidden unit patterns are fed back to themselves; the internal representations which develop thus reflect task demands in the context of prior internal states. A set of simulations is reported which range from relatively simple problems (temporal version of XOR) to discovering syntactic/semantic features for words. 
The networks are able to learn interesting internal representations which incorporate task demands with memory demands; indeed, in this approach the notion of memory is inextricably bound up with task processing. These representations reveal a rich structure, which allows them to be highly context-dependent while also expressing generalizations across classes of items. These representations suggest a method for representing lexical categories and the type/token distinction." + }, + { + "title": "A Learning Algorithm for Continually Running Fully Recurrent Neural Networks", + "abstract": "The exact form of a gradient-following learning algorithm for completely recurrent networks running in continually sampled time is derived and used as the basis for practical algorithms for temporal supervised learning tasks. These algorithms have (1) the advantage that they do not require a precisely defined training interval, operating while the network runs; and (2) the disadvantage that they require nonlocal communication in the network being trained and are computationally expensive. These algorithms allow networks having recurrent connections to learn complex tasks that require the retention of information over time periods having either fixed or indefinite length." + }, + { + "title": "Learning internal representations by error propagation", + "abstract": null + }, + { + "title": "Neural networks and physical systems with emergent collective computational abilities.", + "abstract": "Computational properties of use of biological organisms or to the construction of computers can emerge as collective properties of systems having a large number of simple equivalent components (or neurons). The physical meaning of content-addressable memory is described by an appropriate phase space flow of the state of a system. A model of such a system is given, based on aspects of neurobiology but readily adapted to integrated circuits. 
The collective properties of this model produce a content-addressable memory which correctly yields an entire memory from any subpart of sufficient size. The algorithm for the time evolution of the state of the system is based on asynchronous parallel processing. Additional emergent collective properties include some capacity for generalization, familiarity recognition, categorization, error correction, and time sequence retention. The collective properties are only weakly sensitive to details of the modeling or the failure of individual devices." + }, + { + "title": "Error bounds for convolutional codes and an asymptotically optimum decoding algorithm", + "abstract": "The probability of error in decoding an optimal convolutional code transmitted over a memoryless channel is bounded from above and below as a function of the constraint length of the code. For all but pathological channels the bounds are asymptotically (exponentially) tight for rates above R_{0} , the computational cutoff rate of sequential decoding. As a function of constraint length the performance of optimal convolutional codes is shown to be superior to that of block codes of the same length, the relative improvement increasing with rate. The upper bound is obtained for a specific probabilistic nonsequential decoding algorithm which is shown to be asymptotically optimum for rates above R_{0} and whose performance bears certain similarities to that of sequential decoding algorithms." + }, + { + "title": "Computing Machinery and Intelligence", + "abstract": "I propose to consider the question, “Can machines think?”♣ This should begin with definitions of the meaning of the terms “machine” and “think”. The definitions might be framed so as to reflect so far as possible the normal use of the words, but this attitude is dangerous. 
If the meaning of the words “machine” and “think” are to be found by examining how they are commonly used it is difficult to escape the conclusion that the meaning and the answer to the question, “Can machines think?” is to be sought in a statistical survey such as a Gallup poll." + }, + { + "title": "The unreasonable effectiveness of recurrent neural networks. http://karpathy.github.io", + "abstract": null + }, + { + "title": "Recurrent Neural Networks for Noise Reduction in Robust ASR", + "abstract": "Recent work on deep neural networks as acoustic models for automatic speech recognition (ASR) have demonstrated substantial performance improvements. We introduce a model which uses a deep recurrent auto encoder neural network to denoise input features for robust ASR. The model is trained on stereo (noisy and clean) audio features to predict clean features given noisy input. The model makes no assumptions about how noise affects the signal, nor the existence of distinct noise environments. Instead, the model can learn to model any type of distortion or additive noise given sufficient training data. We demonstrate the model is competitive with existing feature denoising approaches on the Aurora2 task, and outperforms a tandem approach where deep networks are used to predict phoneme posteriors directly." + }, + { + "title": "Learning meanings for sentences", + "abstract": "Consider an English sentence of length n, say “Cats like to chase mice” with n = 5. Suppose that we are given a binary tree representing the syntactic structure of the sentence. Each word is a leaf node of the tree, and there are n− 1 internal nodes. Each internal node covers a phrase of two or more consecutive words. We will associate a column vector in R with each node, to represent the meaning of the corresponding phrase. A typical value for the dimensionality d is 100. The meaning of each word is initialized to be a random vector in R. 
This means that we create a fixed lexicon containing a random vector for each word. Each random vector is generated independently from a Gaussian of dimension d with mean zero and diagonal covariance matrix σI . Each time the same word is used in any sentence, the same vector is used as its meaning. The meaning of a phrase is a function of the meanings of its two components. This is called a compositional approach to semantics. Let the node k have children i and j, whose meanings are xi and xj . The meaning of node k is" + }, + { + "title": "Lecture 6.5- RMSprop: Divide the gradient by a running average of its recent magnitude", + "abstract": null + }, + { + "title": "Torch7: A Matlab-like Environment for Machine Learning", + "abstract": "Torch7 is a versatile numeric computing framework and machine learning library that extends Lua. Its goal is to provide a flexible environment to design and train learning machines. Flexibility is obtained via Lua, an extremely lightweight scripting language. High performance is obtained via efficient OpenMP/SSE and CUDA implementations of low-level numeric routines. Torch7 can easily be interfaced to third-party software thanks to Lua’s light interface." + }, + { + "title": "Deep sparse rectifier networks", + "abstract": null + }, + { + "title": "Learning Continuous Phrase Representations and Syntactic Parsing with Recursive Neural Networks", + "abstract": "Natural language parsing has typically been done with small sets of discrete categories such as NP and VP, but this representation does not capture the full syntactic nor semantic richness of linguistic phrases, and attempts to improve on this by lex-icalizing phrases only partly address the problem at the cost of huge feature spaces and sparseness. To address this, we introduce a recursive neural network architecture for jointly parsing natural language and learning vector space representations for variable-sized inputs. 
At the core of our architecture are context-aware recursive neural networks (CRNN). These networks can induce distributed feature representations for unseen phrases and provide syntactic information to accurately predict phrase structure trees. Most excitingly, the representation of each phrase also captures semantic information: For instance, the phrases “decline to comment” and “would not disclose the terms” are close by in the induced embedding space. Our current system achieves an unlabeled bracketing F-measure of 92.1% on the Wall Street Journal development dataset for sentences up to length 15." + }, + { + "title": "3 Learning distributed representations of concepts", + "abstract": "There have been many different proposals for how conceptual information may be represented in neural networks. These range from extreme localist theories in which each concept is represented by a single neural unit (Barlow 1972) to extreme distributed theories in which a concept corresponds to a pattern of activity over a large part of the cortex. These two extremes are the natural implementations of two different theories of semantics. In the structuralist approach, concepts are defined by their relationships to other concepts rather than by some internal essence. The natural expression of this approach in a neural net is to make each concept be a single unit with no internal structure and to use the connections between units to encode the relationships between concepts. In the componential approach each concept is simply a set offeatures and so a neural net can be made to implement a set of concepts by assigning a unit to each feature and setting the strengths of the connections between units so that each concept corresponds to a stable pattern of activity distributed over the whole network (Hopfield 1982; Kohonen 1977; Willshaw, Buneman, and Longuet-Higgins 1969). The network can then perform concept completion (i.e. 
retrieve the whole concept from a sufficient subset of its features). The problem with componential theories is that they have little to say about how concepts are used for structured reasoning. They are primarily concerned with the similarities between concepts or with pairwise associations. They provide no obvious way of representing articulated structures composed of a number of concepts playing different roles within the structure." + }, + { + "title": "Theano: a CPU and GPU math expression compiler", + "abstract": null + }, + { + "title": "Lazy Sparse Stochastic Gradient Descent for Regularized Mutlinomial Logistic Regression", + "abstract": "Stochastic gradient descent efficiently estimates maximum likelihood logistic regression coefficients from sparse input data. Regularization with respect to a prior coefficient distribution destroys the sparsity of the gradient evaluated at a single example. Sparsity is restored by lazily shrinking a coefficient along the cumulative gradient of the prior just before the coefficient is needed. 1 Multinomial Logistic Model A multinomial logistic model classifies d-dimensional real-valued input vectors x ∈ R into one of k outcomes c ∈ {0, . . . , k − 1} using k − 1 parameter vectors β0, . . . , βk−2 ∈ R: p(c | x, β) =  exp(βc · x) Zx if c < k − 1 1 Zx if c = k − 1 (1) where the linear predictor is inner product: βc · x = ∑" + }, + { + "title": "A novel approach to on-line handwriting recognition based on bidirectional long short-term memory networks", + "abstract": "In this paper we introduce a new connectionist approach to on-line handwriting recognition and address in particular the problem of recognizing handwritten whiteboard notes. The approach uses a bidirectional recurrent neural network with the long short-term memory architecture. We use a recently introduced objective function, known as Connectionist Temporal Classification (CTC), that directly trains the network to label unsegmented sequence data. 
Our new system achieves a word recognition rate of 74.0%, compared with 65.4% using a previously developed HMMbased recognition system." + }, + { + "title": "Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization", + "abstract": null + }, + { + "title": "The Principled Design of Large-Scale Recursive Neural Network Architectures--DAG-RNNs and the Protein Structure Prediction Problem", + "abstract": "We describe a general methodology for the design of large-scale recursive neural network architectures (DAG-RNNs) which comprises three fundamental steps: (1) representation of a given domain using suitable directed acyclic graphs (DAGs) to connect visible and hidden node variables; (2) parameterization of the relationship between each variable and its parent variables by feedforward neural networks; and (3) application of weight-sharing within appropriate subsets of DAG connections to capture stationarity and control model complexity. Here we use these principles to derive several specific classes of DAG-RNN architectures based on lattices, trees, and other structured graphs. These architectures can process a wide range of data structures with variable sizes and dimensions. While the overall resulting models remain probabilistic, the internal deterministic dynamics allows efficient propagation of information, as well as training by gradient descent, in order to tackle large-scale problems. These methods are used here to derive state-of-the-art predictors for protein structural features such as secondary structure (1D) and both fineand coarsegrained contact maps (2D). Extensions, relationships to graphical models, and implications for the design of neural architectures are briefly discussed. The protein prediction servers are available over the Web at: www.igb.uci.edu/tools.htm." 
+ }, + { + "title": "Long short-term memory in recurrent neural networks", + "abstract": "These Ecole polytechnique federale de Lausanne EPFL, n° 2366 (2001)Faculte informatique et communicationsJury: Paolo Frasconi, Roger Hersch, Martin Rajman, Jurgen Schmidhuber Public defense: 2001-4-6 Reference doi:10.5075/epfl-thesis-2366Print copy in library catalog Record created on 2005-03-16, modified on 2016-08-08" + }, + { + "title": "Gradient Flow in Recurrent Nets: the Difficulty of Learning Long-Term Dependencies", + "abstract": "D3EGF(FIH)J KMLONPEGQSRPETN UCV.WYX(Z R.[ V R6\\M[ X N@]_^O\\`JaNcb V RcQ W d EGKeL(^(QgfhKeLOE?i)^(QSj ETNPfPQkRl[ V R)m\"[ X ^(KeLOEG^ npo qarpo m\"[ X ^(KeLOEG^tsAu EGNPb V ^ v wyx zlwO{(|(}<~OC}€(‚(xp{aƒy„.~A}†…ˆ‡_~ ‰CŠlƒ3‰#|<€Az†w#|l€6‡ ‹(| Œ JpfhL XVŽ EG^O QgJ ‘ ETFOR†’“] ^O\\”J•NPb V RcQ—– X E)ETR ˜6EGKeLOETNcKMLOEš™ Fˆ› ETN V RcQgJp^(^OE ZgZ E i ^(Qkj EGNPfhQSRO› E œOE2m1Jp^ RcNY› E V•Z sOŸž! ¡ q.n sCD X KGKa’8¢EG^ RPNhE¤£ ¥¦Q ZgZ E•s m§J•^ RPNO› E V•Z s( ̈ X › EG©#EKas# V ^ V œ V s(H a «a•¬3­ ®#|.€Y ̄y} xa°OC}l{x“‡ ‰ ƒyxl€Y~3{| „ ±2‡Pz „ ž V J Z J U N V fhKTJp^(Q ‘ ETFOR†’ J•\\ D vYf3RPEGb ́f V ^(œ§ˆJpbF X RPETN@D KTQ—EG^(KTE i ^(QSjpEGNPfhQSR4vμJ•\\ U¶Z JaNPEG^(K·E jYQ V œ(Q ̧D V ^ R V m V N3R V aOs#1 o ¡Ga r U Q—NhE^OoTE1⁄4»,] R V•Z vC1⁄2 3⁄4 „ x ± x  ‹#¿ }À‡ ‰3€t}l‚C}2‡P}<~ ¬t[ X NP•E^§D KeL(b ́Qgœ(L X ©yETN ] ‘ DY]_Á ˆJ•NPfhJàZ j EToQ V a• rpopo2Ä X  V ^(J(sCD Å)QSRPoTEGN ZgV ^(œ Æ ‰#|•{3 ̄|.€(C}.‹C¿Y}p„ ‡Pz†w" + }, + { + "title": "A novel approach for optimizing neural networks with genetic algorithms", + "abstract": "A novel approach for optimizing feed forward neural networks is proposed in this paper, the genetic algorithms is not based on the traditional criterion of minimized square error, however its fitness function is determined by the average risk. 
The method considered not only the errors between the network's outputs and the desired outputs, but also the risk caused by these errors, because the errors for different types of samples in training set may present different risks. The neural networks optimized by the proposed approach shows good performance on the samples both inside and outside training set." + }, + { + "title": "Serial Order: A Parallel Distributed Processing Approach", + "abstract": null + }, + { + "title": "Bridging Long Time Lags by Weight Guessing and \\Long Short Term Memory\"", + "abstract": ". Numerous recent papers (including many NIPS papers) focus on standard recurrent nets' inability to deal with long time lags between relevant input signals and teacher signals. Rather sophisticated, alternative methods were proposed. We (cid:12)rst show: problems used to promote certain algorithms in numerous previous papers can be solved more quickly by random weight guessing than by the proposed algorithms. This does not mean that guessing is a good algorithm. It just casts doubt on whether the other algorithms are, or whether the chosen problems are meaningful. We then use long short term memory (LSTM), our own recent algorithm, to solve hard problems that can neither be quickly solved by random weight guessing nor by any other recurrent net algorithm we are aware of." + }, + { + "title": "Neural network synthesis using cellular encoding and the genetic algorithm", + "abstract": null + }, + { + "title": "Turing computability with neural nets", + "abstract": null + }, + { + "title": "Evolving networks: using the genetic algorithm with connectionist learning", + "abstract": "It is appealing to consider hybrids of neural-network learning algorithms with evolutionary search procedures, simply because Nature has so successfully done so. 
In fact, computational models of learning and evolution ooer theoretical biology new tools for addressing questions about Nature that have dogged that eld since Darwin Belew, 1990]. The concern of this paper, however, is strictly artiicial: Can hybrids of connectionist learning algorithms and genetic algorithms produce more eecient and eeective algorithms than either technique applied in isolation? The paper begins with a survey of recent work (by us and others) that combines Holland's Genetic Algorithm (GA) with con-nectionist techniques and delineates some of the basic design problems these hybrids share. This analysis suggests the dangers of overly literal representations of the network on the genome (e.g., encoding each weight explicitly). A preliminary set of experiments that use the GA to nd unusual but successful values for BP parameters (learning rate, momentum) are also reported. The focus of the report is a series of experiments that use the GA to explore the space of initial weight values , from which two diierent gradient techniques (conjugate gradient and back propagation) are then allowed to optimize. We nd that use of the GA provides much greater conndence in the face of the stochas-tic variation that can plague gradient techniques, and can also allow training times to be reduced by as much as two orders of magnitude. Computational trade-oos between BP and the GA are considered, including discussion of a software facility that exploits the parallelism inherent in GA/BP hybrids. This evidence leads us to conclude that the GA's global sampling characteristics compliment connectionist local search techniques well, leading to eecient and reliable hybrids." + }, + { + "title": "Handwritten Digit Recognition with a Back-Propagation Network", + "abstract": "We present an application of back-propagation networks to handwritten digit recognition. 
Minimal preprocessing of the data was required, but architecture of the network was highly constrained and specifically designed for the task. The input of the network consists of normalized images of isolated digits. The method has 1% error rate and about a 9% reject rate on zipcode digits provided by the U.S. Postal Service." + }, + { + "title": "CONDITIONAL MARKOV PROCESSES", + "abstract": "Relationships are given between the probabilities of conditional Markov chains for neighboring tests. The conditional probabilities at the end of the observation interval (the final probabilities) are satisfied by equations of the first kind corresponding to an increase in the observation interval. The equations of the second kind for the conditional probabilities within the observation interval are written in terms of these final probabilities.The following special cases are considered. Gaussian noise with independent values which becomes a delta-correlational process when the moments of time are compacted, and a continuous Markov process.The related problem of the time reversal of ordinary (a priori) Markov processes is treated as a side issue." + }, + { + "title": "Backpropagation -Wikipedia, the free encyclopedia", + "abstract": null + } + ] + }, + "author_data": {}, + "reference_proposal": "### [Question 1] - What is the problem?\nHow can we improve the efficiency and effectiveness of deep learning models in processing large datasets while addressing the challenges of hierarchical representation learning?\n\n### [Question 2] - Why is it interesting and important?\nSolving this problem is crucial for the research community as it can lead to significant advancements in the field of machine learning, particularly in applications requiring real-time processing of large-scale data. Improved methodologies could enhance the performance of neural networks in various domains, such as natural language processing, computer vision, and speech recognition. 
This research could pave the way for more interpretable models, enabling practitioners to better understand and trust the decisions made by AI systems, ultimately leading to broader adoption and innovation in AI technologies.\n\n### [Question 3] - Why is it hard?\nThe challenges in addressing this problem stem from the complexity of designing neural networks that can effectively learn hierarchical representations without overfitting or underutilizing computational resources. Naive approaches may fail due to the difficulty in capturing long-term dependencies in data, which is essential for tasks involving sequential information. Additionally, the optimization of deep learning models in the context of large datasets presents technical obstacles, such as the need for efficient training algorithms and the management of computational costs, which can hinder scalability and performance.\n\n### [Question 4] - Why hasn't it been solved before?\nPrevious research has often focused on specific aspects of deep learning, such as architecture design or optimization techniques, without fully addressing the integration of these elements for large-scale data processing. Limitations in computational resources and the lack of comprehensive datasets have also posed barriers to progress. Moreover, existing solutions may not adequately consider the trade-offs between model complexity and interpretability. Our approach aims to bridge these gaps by proposing a unified framework that combines advanced optimization techniques with novel architectural designs tailored for large datasets.\n\n### [Question 5] - What are the key components of my approach and results?\nOur proposed methodology involves developing a hybrid deep learning architecture that integrates recurrent neural networks (RNNs) with attention mechanisms to enhance the learning of hierarchical representations. 
We will utilize a large-scale dataset from a relevant domain, such as image classification or natural language processing, and evaluate our model using metrics such as accuracy, F1 score, and computational efficiency. The expected outcomes include improved model performance on benchmark tasks, reduced training times, and enhanced interpretability of the learned representations, demonstrating the effectiveness of our approach in addressing the identified challenges." + }, + "1404.1100": { + "paper_data": { + "title": "A Tutorial on Principal Component Analysis", + "url": "http://arxiv.org/abs/1404.1100v1", + "arxiv_id": "1404.1100", + "authors": [ + "Jonathon Shlens" + ], + "abstract": "Principal component analysis (PCA) is a mainstay of modern data analysis - a black box that is widely used but (sometimes) poorly understood. The goal of this paper is to dispel the magic behind this black box. This manuscript focuses on building a solid intuition for how and why principal component analysis works. This manuscript crystallizes this knowledge by deriving from simple intuitions, the mathematics behind PCA. This tutorial does not shy away from explaining the ideas informally, nor does it shy away from the mathematics. The hope is that by addressing both aspects, readers of all levels will be able to gain a better understanding of PCA as well as the when, the how and the why of applying this technique.", + "introduction": " INTRODUCTION Principal component analysis (PCA) is a standard tool in mod- ern data analysis - in diverse fields from neuroscience to com- puter graphics - because it is a simple, non-parametric method for extracting relevant information from confusing data sets. With minimal effort PCA provides a roadmap for how to re- duce a complex data set to a lower dimension to reveal the sometimes hidden, simplified structures that often underlie it. 
The goal of this tutorial is to provide both an intuitive feel for PCA, and a thorough discussion by largely building on ideas from linear algebra and avoiding challenging topics in statistics and optimization theory (but see Appendix B: Code This code is written for Matlab 6.5 (Release 13) from Mathworks8. The code is not computationally effi- cient but explanatory (terse comments begin with a %). This first version follows Section 5 by examining the covariance of the data set. function [signals,PC,V] = pca1(data) % PCA1: Perform PCA using covariance. % data - MxN matrix of input data % (M dimensions, N trials) % signals - MxN matrix of projected data % PC - each column is a PC % V - Mx1 matrix of variances [M,N] = size(data); % subtract off the mean for each dimension mn = mean(data,2); data = data - repmat(mn,1,N); % calculate the covariance matrix covariance = 1 / (N-1) * data * data’; % find the eigenvectors and eigenvalues 8http://www.mathworks.com12 [PC, V] = eig(covariance); % extract diagonal of matrix as vector V = diag(V); % sort the variances in decreasing order [junk, rindices] = sort(-1*V); V = V(rindices); PC = PC(:,rindices); % project the original data set signals = PC’ * data; This second version follows section 6 computing PCA through SVD. function [signals,PC,V] = pca2(data) % PCA2: Perform PCA using SVD. % data - MxN matrix of input data % (M dimensions, N trials) % signals - MxN matrix of projected data % PC - each column is a PC% V - Mx1 matrix of variances [M,N] = size(data); % subtract off the mean for each dimension mn = mean(data,2); data = data - repmat(mn,1,N); % construct the matrix Y Y = data’ / sqrt(N-1); % SVD does it all [u,S,PC] = svd(Y); % calculate the variances S = diag(S); V = S .* S; % project the original data signals = PC’ * data; Discussion). II.Large variances have important structure. This assumption also encompasses the belief that the data has a high SNR. 
Hence, principal compo- nents with larger associated variances represent interesting structure, while those with lower vari- ances represent noise. Note that this is a strong, and sometimes, incorrect assumption (see Dis- cussion). III.The principal components are orthogonal. This assumption provides an intuitive simplifica- tion that makes PCA soluble with linear algebra decomposition techniques. These techniques are highlighted in the two following sections. We have discussed all aspects of deriving PCA - what remain are the linear algebra solutions. The first solution is some- what straightforward while the second solution involves un- derstanding an important algebraic decomposition. V. SOLVING PCA USING EIGENVECTOR DECOMPOSITION We derive our first algebraic solution to PCA based on an im- portant property of eigenvector decomposition. Once again, the data set is X, anm\u0002nmatrix, where mis the number of measurement types and nis the number of samples. The goal is summarized as follows. Find some orthonormal matrix PinY=PXsuch thatCY\u00111 nYYTis a diagonal matrix. The rows ofPare the principal components ofX.We begin by rewriting CYin terms of the unknown variable. CY=1 nYYT =1 n(PX)(PX)T =1 nPXXTPT =P(1 nXXT)PT CY=PCXPT Note that we have identified the covariance matrix of Xin the last line. 
Our plan is to recognize that", + "references": [ + { + "title": "Lithofacies Clustering Using Principal Component Analysis and Neural Network: Applications to Wireline Logs", + "abstract": null + }, + { + "title": "Principal Component Analysis", + "abstract": null + }, + { + "title": "Analysis of dynamic brain imaging data.", + "abstract": null + }, + { + "title": "The “independent components” of natural scenes are edge filters", + "abstract": null + }, + { + "title": "Statistical Factor Analysis and Related Methods: Theory and Applications", + "abstract": "Preliminaries Matrixes, Vector Spaces The Ordinary Principal Components Model Statistical Testing of the Ordinary Principal Components Model Extensions of the Ordinary Principal Components Model Factor Analysis Factor Analysis of Correlated Observations Ordinal and Nominal Random Data Other Models for Discrete Data Factor Analysis and Least Squares Regression Exercises References Index." + }, + { + "title": "LIII. On lines and planes of closest fit to systems of points in space", + "abstract": "(1901). LIII. On lines and planes of closest fit to systems of points in space. The London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science: Vol. 2, No. 11, pp. 559-572." + }, + { + "title": "Integrated Reservoir Modeling of a Pinedale Tight-gas Reservoir in the Greater Green River Basin, Wyoming", + "abstract": "The Pinedale anticline is a large natural gas field in the Greater Green River Basin of Wyoming, located north of the giant Jonah field. Gas production is from overpressured fluvial channel sandstones of the Upper Cretaceous Mesaverde and Lance formations and the lower Tertiary “unnamed Tertiary” formation. To date, most studies have focused on the regional geology and potential hydrocarbon economics. This chapter discusses an integrated approach for reservoir modeling to reduce uncertainty in this tight-gas field development. In this study, fluvial facies were defined using wireline logs. 
Object-based modeling was used to integrate well-log facies, object dimension, channel sinuosity, and orientation in building the three-dimensional facies model. The facies model was then used to guide petrophysical property modeling. Dependencies between rock properties were modeled using a geostatistical method. The final model honors the fluvial depositional characteristics and dependencies between the rock properties and was used for better uncertainty management in reservoir simulation and performance forecasting." + }, + { + "title": "Introduction to the Singular Value Decomposition", + "abstract": null + }, + { + "title": "Applied Multivariate Data Analysis (2nd Edition)", + "abstract": null + }, + { + "title": "Linear Algebra and It’s Applications", + "abstract": null + }, + { + "title": "Neural networks for pattern recognition", + "abstract": "From the Publisher: \nThis is the first comprehensive treatment of feed-forward neural networks from the perspective of statistical pattern recognition. After introducing the basic concepts, the book examines techniques for modelling probability density functions and the properties and merits of the multi-layer perceptron and radial basis function network models. Also covered are various forms of error functions, principal algorithms for error function minimalization, learning and generalization in neural networks, and Bayesian techniques and their applications. Designed as a text, with over 100 exercises, this fully up-to-date work will benefit anyone involved in the fields of neural computation and pattern recognition." + } + ] + }, + "author_data": { + "64be5490-6ce8-4101-ac75-78f1fe11e584": { + "pk": "64be5490-6ce8-4101-ac75-78f1fe11e584", + "project_name": null, + "name": "Jonathon Shlens", + "bio": "I am a researcher deeply engaged in the intersection of computer vision and neuroscience, with a focus on understanding and improving visual recognition systems. 
My work addresses the challenges of scaling object recognition to large categories by integrating visual data with semantic information from unannotated text. I have developed innovative models that leverage this semantic knowledge, achieving state-of-the-art performance in zero-shot learning tasks, which allows for accurate predictions on unseen object categories.\n\nIn addition to my contributions to visual-semantic embedding, I have explored the incorporation of contextual information to enhance object recognition and localization. By utilizing co-occurrence statistics from web documents, I have demonstrated significant improvements in recognition accuracy across various datasets.\n\nMy research also extends to the neural coding of visual information, where I have investigated the efficiency of retinal ganglion cells in transmitting spatial information. I have developed models that optimize this transmission and have shown that the ganglion cell population operates with high efficiency and redundancy, aligning closely with theoretical predictions.\n\nThrough my work, I aim to bridge the gap between computational models and biological systems, providing insights that can inform both fields. I am passionate about advancing our understanding of visual processing and developing systems that can learn and adapt in complex environments.", + "collaborators": [ + "E. Chichilnisky", + "A. Litke", + "J. Gauthier", + "G. Field", + "A. Sher", + "M. Greschner", + "Timothy A. Machado", + "K. Mathieson", + "D. Gunning", + "L. Paninski", + "Eero P. Simoncelli", + "Lauren H Jepson", + "G. Corrado", + "Samy Bengio", + "J. Dean", + "Thomas L. Dean", + "Jonathan W. Pillow", + "Andrea Frome", + "Tomas Mikolov", + "Y. Singer", + "Mark A. Ruzon", + "Mark E. Segal", + "Sudheendra Vijayanarasimhan", + "J. Yagnik", + "W. Dąbrowski", + "Marc'Aurelio Ranzato", + "D. Erhan", + "Eugene Ie", + "Quoc V. Le", + "Andrew Rabinovich", + "Mohammad Norouzi", + "E. Doi", + "C. Bakolitsa", + "M. 
Vidne", + "Yashar Ahmadian", + "J. Kulkarni", + "C. Rangel", + "D. Marshak" + ], + "pub_titles": [ + "DeViSE: A Deep Visual-Semantic Embedding Model", + "Using Web Co-occurrence Statistics for Improving Image Categorization", + "Fast, Accurate Detection of 100,000 Object Classes on a Single Machine: Technical Supplement", + "Fast, Accurate Detection of 100,000 Object Classes on a Single Machine", + "Zero-Shot Learning by Convex Combination of Semantic Embeddings", + "A Model-Based Spike Sorting Algorithm for Removing Correlation Artifacts in Multi-Neuron Recordings", + "Efficient Coding of Spatial Information in the Primate Retina", + "Three Controversial Hypotheses Concerning Computation in the Primate Cortex", + "Correlated firing among major ganglion cell types in primate retina", + "Mapping a Neural Circuit : A Complete Input-Output Diagram in the Primate Retina", + "The Structure of Large-Scale Synchronized Firing in Primate Retina", + "Uniform Signal Redundancy of Parasol and Midget Ganglion Cells in Primate Retina", + "Receptive Fields in Primate Retina Are Coordinated to Sample Visual Space More Uniformly" + ], + "pub_abstracts": [ + "Modern visual recognition systems are often limited in their ability to scale to large numbers of object categories. This limitation is in part due to the increasing difficulty of acquiring sufficient training data in the form of labeled images as the number of object categories grows. One remedy is to leverage data from other sources - such as text data - both to train visual models and to constrain their predictions. In this paper we present a new deep visual-semantic embedding model trained to identify visual objects using both labeled image data as well as semantic information gleaned from unannotated text. 
We demonstrate that this model matches state-of-the-art performance on the 1000-class ImageNet object recognition challenge while making more semantically reasonable errors, and also show that the semantic information can be exploited to make predictions about tens of thousands of image labels not observed during training. Semantic knowledge improves such zero-shot predictions achieving hit rates of up to 18% across thousands of novel labels never seen by the visual model.", + "Object recognition and localization are important tasks in computer vision. The focus of this work is the incorporation of contextual information in order to improve object recognition and localization. For instance, it is natural to expect not to see an elephant to appear in the middle of an ocean. We consider a simple approach to encapsulate such common sense knowledge using co-occurrence statistics from web documents. By merely counting the number of times nouns (such as elephants, sharks, oceans, etc.) co-occur in web documents, we obtain a good estimate of expected co-occurrences in visual data. We then cast the problem of combining textual co-occurrence statistics with the predictions of image-based classifiers as an optimization problem. The resulting optimization problem serves as a surrogate for our inference procedure. Albeit the simplicity of the resulting optimization problem, it is effective in improving both recognition and localization accuracy. Concretely, we observe significant improvements in recognition and localization rates for both ImageNet Detection 2012 and Sun 2012 datasets.", + "In the paper [1] published in CVPR, we presented a method that can directly use deformable part models (DPMs) trained as in [3]. After training, HOG based part filters are hashed, and, during inference, counts of hashing collisions summed over all hash bands serve as a proxy for part-filter / sliding-window dot products, i.e., filter responses. 
These counts are an approximation and so we take the original HOG-based filters for the top hash counts and calculate the exact dot products for scoring. It is possible to train DPM models not on HOG data but on a hashed WTA [4] version of this data. The resulting part filters are sparse, real-valued vectors the size of WTA vectors computed from sliding windows. Given the WTA hash of a window, we exactly recover dot products of the top responses using an extension of locality-sensitive hashing. In this supplement, we sketch a method for training such WTA-based models.", + "Many object detection systems are constrained by the time required to convolve a target image with a bank of filters that code for different aspects of an object's appearance, such as the presence of component parts. We exploit locality-sensitive hashing to replace the dot-product kernel operator in the convolution with a fixed number of hash-table probes that effectively sample all of the filter responses in time independent of the size of the filter bank. To show the effectiveness of the technique, we apply it to evaluate 100,000 deformable-part models requiring over a million (part) filters on multiple scales of a target image in less than 20 seconds using a single multi-core processor with 20GB of RAM. This represents a speed-up of approximately 20,000 times - four orders of magnitude - when compared with performing the convolutions explicitly on the same hardware. While mean average precision over the full set of 100,000 object classes is around 0.16 due in large part to the challenges in gathering training data and collecting ground truth for so many classes, we achieve a mAP of at least 0.20 on a third of the classes and 0.30 or better on about 20% of the classes.", + "Abstract: Several recent publications have proposed methods for mapping images into continuous semantic embedding spaces. In some cases the embedding space is trained jointly with the image transformation. 
In other cases the semantic embedding space is established by an independent natural language processing task, and then the image transformation into that space is learned in a second stage. Proponents of these image embedding systems have stressed their advantages over the traditional \\nway{} classification framing of image understanding, particularly in terms of the promise for zero-shot learning -- the ability to correctly annotate images of previously unseen object categories. In this paper, we propose a simple method for constructing an image embedding system from any existing \\nway{} image classifier and a semantic word embedding model, which contains the $\\n$ class labels in its vocabulary. Our method maps images into the semantic embedding space via convex combination of the class label embedding vectors, and requires no additional training. We show that this simple and direct method confers many of the advantages associated with more complex image embedding schemes, and indeed outperforms state of the art methods on the ImageNet zero-shot learning task.", + "We examine the problem of estimating the spike trains of multiple neurons from voltage traces recorded on one or more extracellular electrodes. Traditional spike-sorting methods rely on thresholding or clustering of recorded signals to identify spikes. While these methods can detect a large fraction of the spikes from a recording, they generally fail to identify synchronous or near-synchronous spikes: cases in which multiple spikes overlap. Here we investigate the geometry of failures in traditional sorting algorithms, and document the prevalence of such errors in multi-electrode recordings from primate retina. We then develop a method for multi-neuron spike sorting using a model that explicitly accounts for the superposition of spike waveforms. We model the recorded voltage traces as a linear combination of spike waveforms plus a stochastic background component of correlated Gaussian noise. 
Combining this measurement model with a Bernoulli prior over binary spike trains yields a posterior distribution for spikes given the recorded data. We introduce a greedy algorithm to maximize this posterior that we call “binary pursuit”. The algorithm allows modest variability in spike waveforms and recovers spike times with higher precision than the voltage sampling rate. This method substantially corrects cross-correlation artifacts that arise with conventional methods, and substantially outperforms clustering methods on both real and simulated data. Finally, we develop diagnostic tools that can be used to assess errors in spike sorting in the absence of ground truth.", + "Sensory neurons have been hypothesized to efficiently encode signals from the natural environment subject to resource constraints. The predictions of this efficient coding hypothesis regarding the spatial filtering properties of the visual system have been found consistent with human perception, but they have not been compared directly with neural responses. Here, we analyze the information that retinal ganglion cells transmit to the brain about the spatial information in natural images subject to three resource constraints: the number of retinal ganglion cells, their total response variances, and their total synaptic strengths. We derive a model that optimizes the transmitted information and compare it directly with measurements of complete functional connectivity between cone photoreceptors and the four major types of ganglion cells in the primate retina, obtained at single-cell resolution. We find that the ganglion cell population exhibited 80% efficiency in transmitting spatial information relative to the model. Both the retina and the model exhibited high redundancy (∼30%) among ganglion cells of the same cell type. 
A novel and unique prediction of efficient coding, the relationships between projection patterns of individual cones to all ganglion cells, was consistent with the observed projection patterns in the retina. These results indicate a high level of efficiency with near-optimal redundancy in visual signaling by the retina.", + " We consider three hypotheses concerning the primate neocortex which have influenced computational neuroscience in recent years. Is the mind modular in terms of its being profitably described as a collection of relatively independent functional units? Does the regular structure of the cortex imply a single algorithm at work, operating on many different inputs in parallel? Can the cognitive differences between humans and our closest primate relatives be explained in terms of a scalable cortical architecture? We bring to bear diverse sources of evidence to argue that the answers to each of these questions — with some judicious qualifications — are in the affirmative. In particular, we argue that while our higher cognitive functions may interact in a complicated fashion, many of the component functions operate through well-defined interfaces and, perhaps more important, are built on a neural substrate that scales easily under the control of a modular genetic architecture. Processing in the primary sensory cortices seem amenable to similar algorithmic principles, and, even for those cases where alternative principles are at play, the regular structure of cortex allows the same or greater advantages as the architecture scales. Similar genetic machinery to that used by nature to scale body plans has apparently been applied to scale cortical computations. The resulting replicated computing units can be used to build larger working memory and support deeper recursions needed to qualitatively improve our abilities to handle language, abstraction and social interaction. 
", + "This paper examines the correlated firing among multiple ganglion cell types in the retina. For many years it has been known that ganglion cells exhibit a tendency to fire simultaneously more or less frequently than would be predicted by chance. However, the particular patterns of correlated activity in the primate retina have been unclear. Here we reveal systematic, distance‐dependent correlations between different ganglion cell types. For the most part, the patterns of activity are consistent with a model in which noise in cone photoreceptors propagates through common retinal circuitry, creating correlations among ganglion cell signals.", + "1 Systems Neurobiology Laboratory, Salk Institute for Biological Studies, La Jolla, CA 2 Santa Cruz Institute for Particle Physics, University of California, Santa Cruz, CA 3 Department of Physics and Astronomy, University of Glasgow, Glasgow, UK 4 Faculty of Physics and Applied Computer Science, AGH University of Science and Technology, 23 30-059, Krakow, Poland 5 Department of Statistics and Center for Theoretical Neuroscience, Columbia University, New York, NY * These authors contributed equally.", + "Synchronized firing among neurons has been proposed to constitute an elementary aspect of the neural code in sensory and motor systems. However, it remains unclear how synchronized firing affects the large-scale patterns of activity and redundancy of visual signals in a complete population of neurons. We recorded simultaneously from hundreds of retinal ganglion cells in primate retina, and examined synchronized firing in completely sampled populations of ∼50–100 ON-parasol cells, which form a major projection to the magnocellular layers of the lateral geniculate nucleus. Synchronized firing in pairs of cells was a subset of a much larger pattern of activity that exhibited local, isotropic spatial properties. 
However, a simple model based solely on interactions between adjacent cells reproduced 99% of the spatial structure and scale of synchronized firing. No more than 20% of the variability in firing of an individual cell was predictable from the activity of its neighbors. These results held both for spontaneous firing and in the presence of independent visual modulation of the firing of each cell. In sum, large-scale synchronized firing in the entire population of ON-parasol cells appears to reflect simple neighbor interactions, rather than a unique visual signal or a highly redundant coding scheme.", + "The collective representation of visual space in high resolution visual pathways was explored by simultaneously measuring the receptive fields of hundreds of ON and OFF midget and parasol ganglion cells in isolated primate retina. As expected, the receptive fields of all four cell types formed regular mosaics uniformly tiling the visual scene. Surprisingly, comparison of all four mosaics revealed that the overlap of neighboring receptive fields was nearly identical, for both the excitatory center and inhibitory surround components of the receptive field. These observations contrast sharply with the large differences in the dendritic overlap between the parasol and midget cell populations, revealing a surprising lack of correspondence between the anatomical and functional architecture in the dominant circuits of the primate retina.", + "In the visual system, large ensembles of neurons collectively sample visual space with receptive fields (RFs). A puzzling problem is how neural ensembles provide a uniform, high-resolution visual representation in spite of irregularities in the RFs of individual cells. This problem was approached by simultaneously mapping the RFs of hundreds of primate retinal ganglion cells. As observed in previous studies, RFs exhibited irregular shapes that deviated from standard Gaussian models. 
Surprisingly, these irregularities were coordinated at a fine spatial scale: RFs interlocked with their neighbors, filling in gaps and avoiding large variations in overlap. RF shapes were coordinated with high spatial precision: the observed uniformity was degraded by angular perturbations as small as 15°, and the observed populations sampled visual space with more than 50% of the theoretical ideal uniformity. These results show that the primate retina encodes light with an exquisitely coordinated array of RF shapes, illustrating a higher degree of functional precision in the neural circuitry than previously appreciated." + ], + "domain": [ + "Computer Vision", + "Neural Encoding", + "Object Recognition", + "Deep Learning" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + } + }, + "reference_proposal": "**[Question 1] - What is the problem?** \nHow can we improve the computational efficiency and interpretability of Principal Component Analysis (PCA) while maintaining its effectiveness in extracting relevant information from complex datasets?\n\n**[Question 2] - Why is it interesting and important?** \nSolving this problem is crucial for the research community as it can enhance the usability of PCA across various fields, such as neuroscience and computer graphics, where data complexity is increasing. An improved PCA method could lead to more accurate data interpretations, facilitate better decision-making, and inspire further research into dimensionality reduction techniques. Additionally, advancements in PCA could lead to practical applications in areas like image processing, genomics, and machine learning, where efficient data analysis is paramount.\n\n**[Question 3] - Why is it hard?** \nThe challenges in improving PCA stem from its reliance on assumptions such as high signal-to-noise ratio (SNR) and the orthogonality of principal components. 
Naive approaches may fail because they do not account for the potential misrepresentation of noise as significant structure in the data. Moreover, the computational complexity of eigenvector decomposition and singular value decomposition (SVD) can be prohibitive for large datasets, necessitating the development of more efficient algorithms that still adhere to the theoretical foundations of PCA.\n\n**[Question 4] - Why hasn't it been solved before?** \nPrevious research on PCA has primarily focused on its theoretical underpinnings and straightforward implementations, often overlooking the need for computational efficiency and practical applicability in real-world scenarios. Barriers such as a lack of innovative algorithmic approaches and the complexity of existing methods have hindered progress. My approach aims to bridge these gaps by introducing novel techniques that enhance both the speed and interpretability of PCA, while also addressing the limitations of traditional methods.\n\n**[Question 5] - What are the key components of my approach and results?** \nMy proposed methodology involves developing a new algorithm that optimizes PCA through advanced matrix factorization techniques, utilizing a large dataset from a relevant domain (e.g., image or genomic data). I will evaluate the performance of this algorithm using metrics such as explained variance and computational time compared to traditional PCA methods. The expected outcomes include a more efficient PCA implementation that retains high accuracy in data representation, ultimately leading to better insights and applications in various fields." + }, + "1804.03189": { + "paper_data": { + "title": "Deep Painterly Harmonization", + "url": "http://arxiv.org/abs/1804.03189v4", + "arxiv_id": "1804.03189", + "authors": [ + "Fujun Luan", + "Sylvain Paris", + "Eli Shechtman", + "Kavita Bala" + ], + "abstract": "Copying an element from a photo and pasting it into a painting is a challenging task. 
Applying photo compositing techniques in this context yields subpar results that look like a collage --- and existing painterly stylization algorithms, which are global, perform poorly when applied locally. We address these issues with a dedicated algorithm that carefully determines the local statistics to be transferred. We ensure both spatial and inter-scale statistical consistency and demonstrate that both aspects are key to generating quality results. To cope with the diversity of abstraction levels and types of paintings, we introduce a technique to adjust the parameters of the transfer depending on the painting. We show that our algorithm produces significantly better results than photo compositing or global stylization techniques and that it enables creative painterly edits that would be otherwise difficult to achieve.", + "introduction": " Introduction Image compositing is a key operation to create new visual con- tent. It allows artists to remix existing materials into new pieces and artists such as Man Ray and David Hockney have created mas- terpieces using this technique. Compositing can be used in differ- ent contexts. In applications like photo collage, visible seams are desirable. But in others, the objective is to make the compositing inconspicuous, for instance, to add an object into a photograph in a way that makes it look like the object was present in the original scene. Many tools have been developed for photographiccompositing, e.g., to remove boundary seams [PGB03], match the color [XADR12] or also fine texture [SJMP10]. However, there is no equivalent for paintings. If one seeks to add an object into a painting, the options are limited. One can paint the object man- ually or with a painting engine [CKIW15] but this requires time and skills that few people have. As we shall see, resorting to algo- rithms designed for photographs produces subpar Related Work Image Harmonization. 
The simplest way to blend images is to combine the foreground and background painting. Our methods and our Background Our work builds upon the style transfer technique introduced by Gatys et al. [GEB16] (Neural Style) and several additional recon- struction losses proposed later to improve its experiments show that our approach succeeds on a diversity of input and style images, many of which are challenging for other Discussion. Our constrained mapping was inspired by the nearest-neighbor field upsampling used in the Deep Analogywork [LYY\u000317, § 4.4] that constrains the matches at each layer to come from a local region around the location at a previous layer. When the input and style images are similar, this technique per- forms well. In our context, the intermediate image and the style image are even more similar. This encouraged us to be even stricter by forcing the matches to come from the exact same location. Be- side this similar aspect, the other algorithmic parts are different and as we shall see, our approach produces better Results of the “Comparison” user study. Our algorithm is often the most preferred among the four algorithms. Image Harmonization Comparisons. We compare our related work and through user studies. Main Conclusions We have described an algorithm to copy an object in a photograph and paste it into a painting seamlessly, i.e., the composite still looks like a genuine painting. We have introduced a two-pass algorithm that first transfers the overall style of the painting to the input and then refines the result to accurately match the painting’s color and texture. This latter pass relies on mapping neural response statis- tics that ensures consistency across the network layers and in im- age space. To cope with different painting styles, we have trained a separate network to adjust the transfer parameters as a function of the style of the References [BSFG09] B ARNES C., S HECHTMAN E., F INKELSTEIN A., G OLDMAN D. 
B.: Patchmatch: A randomized correspondence algorithm for struc- tural image editing. ACM Trans. Graph. 28 , 3 (2009), 24–1. 2, 6 [CHM\u000310] C HUH.-K., H SUW.-H., M ITRA N. J., C OHEN -ORD., WONG T.-T., L EET.-Y.: Camouflage images. ACM Trans. Graph. 29, 4 (2010), 51–1. 2, 9 [CKIW15] C HEN Z., K IMB., I TOD., W ANG H.: Wetbrush: Gpu-based3d painting simulation at", + "references": [ + { + "title": "Universal Style Transfer via Feature Transforms", + "abstract": "Universal style transfer aims to transfer arbitrary visual styles to content images. Existing feed-forward based methods, while enjoying the inference efficiency, are mainly limited by inability of generalizing to unseen styles or compromised visual quality. In this paper, we present a simple yet effective method that tackles these limitations without training on any pre-defined styles. The key ingredient of our method is a pair of feature transforms, whitening and coloring, that are embedded to an image reconstruction network. The whitening and coloring transforms reflect a direct matching of feature covariance of the content image to a given style image, which shares similar spirits with the optimization of Gram matrix based cost in neural style transfer. We demonstrate the effectiveness of our algorithm by generating high-quality stylized images with comparisons to a number of recent methods. We also analyze our method by visualizing the whitened features and synthesizing textures via simple feature coloring." + }, + { + "title": "Visual attribute transfer through deep image analogy", + "abstract": "We propose a new technique for visual attribute transfer across images that may have very different appearance but have perceptually similar semantic structure. By visual attribute transfer, we mean transfer of visual information (such as color, tone, texture, and style) from one image to another. 
For example, one image could be that of a painting or a sketch while the other is a photo of a real scene, and both depict the same type of scene. Our technique finds semantically-meaningful dense correspondences between two input images. To accomplish this, it adapts the notion of \"image analogy\" [Hertzmann et al. 2001] with features extracted from a Deep Convolutional Neutral Network for matching; we call our technique deep image analogy. A coarse-to-fine strategy is used to compute the nearest-neighbor field for generating the results. We validate the effectiveness of our proposed method in a variety of cases, including style/texture transfer, color/style swap, sketch/painting to photo, and time lapse." + }, + { + "title": "Deep Photo Style Transfer", + "abstract": "This paper introduces a deep-learning approach to photographic style transfer that handles a large variety of image content while faithfully transferring the reference style. Our approach builds upon the recent work on painterly transfer that separates style from the content of an image by considering different layers of a neural network. However, as is, this approach is not suitable for photorealistic style transfer. Even when both the input and reference images are photographs, the output still exhibits distortions reminiscent of a painting. Our contribution is to constrain the transformation from the input to the output to be locally affine in colorspace, and to express this constraint as a custom fully differentiable energy term. We show that this approach successfully suppresses distortion and yields satisfying photorealistic style transfers in a broad variety of scenarios, including transfer of the time of day, weather, season, and artistic edits." + }, + { + "title": "Arbitrary Style Transfer in Real-Time with Adaptive Instance Normalization", + "abstract": "Gatys et al. 
recently introduced a neural algorithm that renders a content image in the style of another image, achieving so-called style transfer. However, their framework requires a slow iterative optimization process, which limits its practical application. Fast approximations with feed-forward neural networks have been proposed to speed up neural style transfer. Unfortunately, the speed improvement comes at a cost: the network is usually tied to a fixed set of styles and cannot adapt to arbitrary new styles. In this paper, we present a simple yet effective approach that for the first time enables arbitrary style transfer in real-time. At the heart of our method is a novel adaptive instance normalization (AdaIN) layer that aligns the mean and variance of the content features with those of the style features. Our method achieves speed comparable to the fastest existing approach, without the restriction to a pre-defined set of styles. In addition, our approach allows flexible user controls such as content-style trade-off, style interpolation, color & spatial controls, all using a single feed-forward neural network." + }, + { + "title": "Deep Image Harmonization", + "abstract": "Compositing is one of the most common operations in photo editing. To generate realistic composites, the appearances of foreground and background need to be adjusted to make them compatible. Previous approaches to harmonize composites have focused on learning statistical relationships between hand-crafted appearance features of the foreground and background, which is unreliable especially when the contents in the two layers are vastly different. In this work, we propose an end-to-end deep convolutional neural network for image harmonization, which can capture both the context and semantic information of the composite images during harmonization. We also introduce an efficient way to collect large-scale and high-quality training data that can facilitate the training process. 
Experiments on the synthesized dataset and real composite images show that the proposed network outperforms previous state-of-the-art methods." + }, + { + "title": "Stable and Controllable Neural Texture Synthesis and Style Transfer Using Histogram Losses", + "abstract": "Recently, methods have been proposed that perform texture synthesis and style transfer by using convolutional neural networks (e.g. Gatys et al. [2015,2016]). These methods are exciting because they can in some cases create results with state-of-the-art quality. However, in this paper, we show these methods also have limitations in texture quality, stability, requisite parameter tuning, and lack of user controls. This paper presents a multiscale synthesis pipeline based on convolutional neural networks that ameliorates these issues. We first give a mathematical explanation of the source of instabilities in many previous approaches. We then improve these instabilities by using histogram losses to synthesize textures that better statistically match the exemplar. We also show how to integrate localized style losses in our multiscale framework. These losses can improve the quality of large features, improve the separation of content and style, and offer artistic controls such as paint by numbers. We demonstrate that our approach offers improved quality, convergence in fewer iterations, and more stability over the optimization." + }, + { + "title": "Controlling Perceptual Factors in Neural Style Transfer", + "abstract": "Neural Style Transfer has shown very exciting results enabling new forms of image manipulation. Here we extend the existing method to introduce control over spatial location, colour information and across spatial scale. We demonstrate how this enhances the method by allowing high-resolution controlled stylisation and helps to alleviate common failure cases such as applying ground textures to sky regions. 
Furthermore, by decomposing style into these perceptual factors we enable the combination of style information from multiple sources to generate new, perceptually appealing styles from existing ones. We also describe how these methods can be used to more efficiently produce large size, high-quality stylisation. Finally we show how the introduced control measures can be applied in recent methods for Fast Neural Style Transfer." + }, + { + "title": "Deconvolution and Checkerboard Artifacts", + "abstract": null + }, + { + "title": "Image Style Transfer Using Convolutional Neural Networks", + "abstract": "Rendering the semantic content of an image in different styles is a difficult image processing task. Arguably, a major limiting factor for previous approaches has been the lack of image representations that explicitly represent semantic information and, thus, allow to separate image content from style. Here we use image representations derived from Convolutional Neural Networks optimised for object recognition, which make high level image information explicit. We introduce A Neural Algorithm of Artistic Style that can separate and recombine the image content and style of natural images. The algorithm allows us to produce new images of high perceptual quality that combine the content of an arbitrary photograph with the appearance of numerous wellknown artworks. Our results provide new insights into the deep image representations learned by Convolutional Neural Networks and demonstrate their potential for high level image synthesis and manipulation." + }, + { + "title": "Precomputed Real-Time Texture Synthesis with Markovian Generative Adversarial Networks", + "abstract": null + }, + { + "title": "Perceptual Losses for Real-Time Style Transfer and Super-Resolution", + "abstract": null + }, + { + "title": "Texture Networks: Feed-forward Synthesis of Textures and Stylized Images", + "abstract": "Gatys et al. 
recently demonstrated that deep networks can generate beautiful textures and stylized images from a single texture example. However, their methods require a slow and memory-consuming optimization process. We propose here an alternative approach that moves the computational burden to a learning stage. Given a single example of a texture, our approach trains compact feed-forward convolutional networks to generate multiple samples of the same texture of arbitrary size and to transfer artistic style from a given image to any other image. The resulting networks are remarkably light-weight and can generate textures of quality comparable to Gatys et al., but hundreds of times faster. More generally, our approach highlights the power and flexibility of generative feed-forward models trained with complex and expressive loss functions." + }, + { + "title": "Combining Markov Random Fields and Convolutional Neural Networks for Image Synthesis", + "abstract": "This paper studies a combination of generative Markov random field (MRF) models and discriminatively trained deep convolutional neural networks (dCNNs) for synthesizing 2D images. The generative MRF acts on higher-levels of a dCNN feature pyramid, controlling the image layout at an abstract level. We apply the method to both photographic and non-photo-realistic (artwork) synthesis tasks. The MRF regularizer prevents over-excitation artifacts and reduces implausible feature mixtures common to previous dCNN inversion approaches, permitting synthesizing photographic content with increased visual plausibility. Unlike standard MRF-based texture synthesis, the combined system can both match and adapt local features with considerable variability, yielding results far out of reach of classic generative MRF methods." + }, + { + "title": "Wetbrush", + "abstract": "We present a real-time painting system that simulates the interactions among brush, paint, and canvas at the bristle level. 
The key challenge is how to model and simulate sub-pixel paint details, given the limited computational resource in each time step. To achieve this goal, we propose to define paint liquid in a hybrid fashion: the liquid close to the brush is modeled by particles, and the liquid away from the brush is modeled by a density field. Based on this representation, we develop a variety of techniques to ensure the performance and robustness of our simulator under large time steps, including brush and particle simulations in non-inertial frames, a fixed-point method for accelerating Jacobi iterations, and a new Eulerian-Lagrangian approach for simulating detailed liquid effects. The resulting system can realistically simulate not only the motions of brush bristles and paint liquid, but also the liquid transfer processes among different representations. We implement the whole system on GPU by CUDA. Our experiment shows that artists can use the system to draw realistic and vivid digital paintings, by applying the painting techniques that they are familiar with but not offered by many existing systems." + }, + { + "title": "Learning a Discriminative Model for the Perception of Realism in Composite Images", + "abstract": "What makes an image appear realistic? In this work, we are answering this question from a data-driven perspective by learning the perception of visual realism directly from large amounts of data. In particular, we train a Convolutional Neural Network (CNN) model that distinguishes natural photographs from automatically generated composite images. The model learns to predict visual realism of a scene in terms of color, lighting and texture compatibility, without any human annotations pertaining to it. Our model outperforms previous works that rely on hand-crafted heuristics, for the task of classifying realistic vs. unrealistic photos. 
Furthermore, we apply our learned model to compute optimal parameters of a compositing method, to maximize the visual realism score predicted by our CNN model. We demonstrate its advantage against existing methods via a human perception study." + }, + { + "title": "Understanding deep image representations by inverting them", + "abstract": "Image representations, from SIFT and Bag of Visual Words to Convolutional Neural Networks (CNNs), are a crucial component of almost any image understanding system. Nevertheless, our understanding of them remains limited. In this paper we conduct a direct analysis of the visual information contained in representations by asking the following question: given an encoding of an image, to which extent is it possible to reconstruct the image itself? To answer this question we contribute a general framework to invert representations. We show that this method can invert representations such as HOG more accurately than recent alternatives while being applicable to CNNs too. We then use this technique to study the inverse of recent state-of-the-art CNN image representations for the first time. Among our findings, we show that several layers in CNNs retain photographically accurate information about the image, with different degrees of geometric and photometric invariance." + }, + { + "title": "Very Deep Convolutional Networks for Large-Scale Image Recognition", + "abstract": "In this work we investigate the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting. Our main contribution is a thorough evaluation of networks of increasing depth using an architecture with very small (3x3) convolution filters, which shows that a significant improvement on the prior-art configurations can be achieved by pushing the depth to 16-19 weight layers. 
These findings were the basis of our ImageNet Challenge 2014 submission, where our team secured the first and the second places in the localisation and classification tracks respectively. We also show that our representations generalise well to other datasets, where they achieve state-of-the-art results. We have made our two best-performing ConvNet models publicly available to facilitate further research on the use of deep visual representations in computer vision." + }, + { + "title": "Image melding", + "abstract": "Current methods for combining two different images produce visible artifacts when the sources have very different textures and structures. We present a new method for synthesizing a transition region between two source images, such that inconsistent color, texture, and structural properties all change gradually from one source to the other. We call this process image melding. Our method builds upon a patch-based optimization foundation with three key generalizations: First, we enrich the patch search space with additional geometric and photometric transformations. Second, we integrate image gradients into the patch representation and replace the usual color averaging with a screened Poisson equation solver. And third, we propose a new energy based on mixed L2/L0 norms for colors and gradients that produces a gradual transition between sources without sacrificing texture sharpness. Together, all three generalizations enable patch-based solutions to a broad class of image melding problems involving inconsistent sources: object cloning, stitching challenging panoramas, hole filling from multiple photos, and image harmonization. In several cases, our unified method outperforms previous state-of-the-art methods specifically designed for those applications." + }, + { + "title": "Understanding and improving the realism of image composites", + "abstract": "Compositing is one of the most commonly performed operations in computer graphics. 
A realistic composite requires adjusting the appearance of the foreground and background so that they appear compatible; unfortunately, this task is challenging and poorly understood. We use statistical and visual perception experiments to study the realism of image composites. First, we evaluate a number of standard 2D image statistical measures, and identify those that are most significant in determining the realism of a composite. Then, we perform a human subjects experiment to determine how the changes in these key statistics influence human judgements of composite realism. Finally, we describe a data-driven algorithm that automatically adjusts these statistical measures in a foreground to make it more compatible with its background in a composite. We show a number of compositing results, and evaluate the performance of both our algorithm and previous work with a human subjects study." + }, + { + "title": "Guided Image Filtering", + "abstract": null + }, + { + "title": "Multi-scale image harmonization", + "abstract": "Traditional image compositing techniques, such as alpha matting and gradient domain compositing, are used to create composites that have plausible boundaries. But when applied to images taken from different sources or shot under different conditions, these techniques can produce unrealistic results. In this work, we present a framework that explicitly matches the visual appearance of images through a process we call image harmonization, before blending them. At the heart of this framework is a multi-scale technique that allows us to transfer the appearance of one image to another. We show that by carefully manipulating the scales of a pyramid decomposition of an image, we can match contrast, texture, noise, and blur, while avoiding image artifacts. The output composite can then be reconstructed from the modified pyramid coefficients while enforcing both alpha-based and seamless boundary constraints. 
We show how the proposed framework can be used to produce realistic composites with minimal user interaction in a number of different scenarios." + }, + { + "title": "Camouflage images", + "abstract": "Camouflage images contain one or more hidden figures that remain imperceptible or unnoticed for a while. In one possible explanation, the ability to delay the perception of the hidden figures is attributed to the theory that human perception works in two main phases: feature search and conjunction search. Effective camouflage images make feature based recognition difficult, and thus force the recognition process to employ conjunction search, which takes considerable effort and time. In this paper, we present a technique for creating camouflage images. To foil the feature search, we remove the original subtle texture details of the hidden figures and replace them by that of the surrounding apparent image. To leave an appropriate degree of clues for the conjunction search, we compute and assign new tones to regions in the embedded figures by performing an optimization between two conflicting terms, which we call immersion and standout, corresponding to hiding and leaving clues, respectively. We show a large number of camouflage images generated by our technique, with or without user guidance. We have tested the quality of the images in an extensive user study, showing a good control of the difficulty levels." + }, + { + "title": "PatchMatch: a randomized correspondence algorithm for structural image editing", + "abstract": "This paper presents interactive image editing tools using a new randomized algorithm for quickly finding approximate nearest-neighbor matches between image patches. Previous research in graphics and vision has leveraged such nearest-neighbor searches to provide a variety of high-level digital image editing tools. However, the cost of computing a field of such matches for an entire image has eluded previous efforts to provide interactive performance. 
Our algorithm offers substantial performance improvements over the previous state of the art (20-100x), enabling its use in interactive editing tools. The key insights driving the algorithm are that some good patch matches can be found via random sampling, and that natural coherence in the imagery allows us to propagate such matches quickly to surrounding areas. We offer theoretical analysis of the convergence properties of the algorithm, as well as empirical and practical evidence for its high quality and performance. This one simple algorithm forms the basis for a variety of tools -- image retargeting, completion and reshuffling -- that can be used together in the context of a high-level image editing application. Finally, we propose additional intuitive constraints on the synthesis process that offer the user a level of control unavailable in previous methods." + }, + { + "title": "Poisson image editing", + "abstract": "Using generic interpolation machinery based on solving Poisson equations, a variety of novel tools are introduced for seamless editing of image regions. The first set of tools permits the seamless importation of both opaque and transparent source image regions into a destination region. The second set is based on similar mathematical ideas and allows the user to modify the appearance of the image seamlessly, within a selected region. These changes can be arranged to affect the texture, the illumination, and the color of objects lying in the region, or to make tileable a rectangular selection." + }, + { + "title": "On the limited memory BFGS method for large scale optimization", + "abstract": null + }, + { + "title": "Compositing digital images", + "abstract": "Most computer graphics pictures have been computed all at once, so that the rendering program takes care of all computations relating to the overlap of objects. 
There are several applications, however, where elements must be rendered separately, relying on compositing techniques for the anti-aliased accumulation of the full image. This paper presents the case for four-channel pictures, demonstrating that a matte component can be computed similarly to the color channels. The paper discusses guidelines for the generation of elements and the arithmetic for their arbitrary compositing." + }, + { + "title": "Neural Information Processing", + "abstract": null + } + ] + }, + "author_data": { + "e0b952eb-747d-467e-b421-9dec1123deed": { + "pk": "e0b952eb-747d-467e-b421-9dec1123deed", + "project_name": null, + "name": "Fujun Luan", + "bio": "I am a researcher specializing in the intersection of deep learning and computer graphics, with a particular focus on photorealistic style transfer, procedural modeling, and advanced rendering techniques. My work on photographic style transfer has led to a novel approach that minimizes distortions while effectively transferring styles across a wide range of image content, allowing for realistic transformations that capture variations in time, weather, and artistic edits.\n\nIn addition to style transfer, I have developed innovative techniques for rendering procedural textiles, significantly reducing memory usage while maintaining high realism. My automatic fitting approach for creating yarn-based fabric models leverages CT data to achieve fiber-level detail, enhancing both the design and editing processes in fabric modeling.\n\nI am also passionate about improving Monte Carlo rendering methods, particularly through adaptive sampling strategies that utilize machine learning to enhance image quality and reduce noise. 
My exploration of global illumination techniques has further solidified my commitment to merging traditional rendering methods with cutting-edge machine learning approaches.\n\nOverall, my research aims to push the boundaries of what is possible in realistic rendering and modeling, making complex visual effects more accessible and efficient for various applications in design, entertainment, and beyond.", + "collaborators": [ + "Kavita Bala", + "Shuang Zhao", + "Sylvain Paris", + "Eli Shechtman", + "Li Fan Wu", + "Kun Xu" + ], + "pub_titles": [ + "Deep Photo Style Transfer", + "Supplemental Document for Deep Photo Style Transfer", + "Fiber‐Level On‐the‐Fly Procedural Textiles", + "Fitting procedural yarn models for realistic cloth rendering", + "A Learning Approach for Adaptive Sampling in Monte Carlo Rendering", + "Final Project : Real-Time Global Illumination with Radiance Regression Functions" + ], + "pub_abstracts": [ + "This paper introduces a deep-learning approach to photographic style transfer that handles a large variety of image content while faithfully transferring the reference style. Our approach builds upon the recent work on painterly transfer that separates style from the content of an image by considering different layers of a neural network. However, as is, this approach is not suitable for photorealistic style transfer. Even when both the input and reference images are photographs, the output still exhibits distortions reminiscent of a painting. Our contribution is to constrain the transformation from the input to the output to be locally affine in colorspace, and to express this constraint as a custom fully differentiable energy term. 
We show that this approach successfully suppresses distortion and yields satisfying photorealistic style transfers in a broad variety of scenarios, including transfer of the time of day, weather, season, and artistic edits.", + "This document contains: links to two user studies (section 1), comparison against Wu et al. [6] (section 2), results with only semantic segmentation or photorealism regularization (section 3), merging classes for DilatedNet [1] Segmentation (section 4), a solution for handling noisy (section 5) or high-resolution (section 6) input, an extension for CNNMRF in photographic transfer (section 7), and some ideas we came up with, but ultimately did not work well, before reaching the matting Laplacian solution (section 8).", + "Procedural textile models are compact, easy to edit, and can achieve state‐of‐the‐art realism with fiber‐level details. However, these complex models generally need to be fully instantiated (aka. realized) into 3D volumes or fiber meshes and stored in memory, We introduce a novel realization‐minimizing technique that enables physically based rendering of procedural textiles, without the need of full model realizations. The key ingredients of our technique are new data structures and search algorithms that look up regular and flyaway fibers on the fly, efficiently and consistently. Our technique works with compact fiber‐level procedural yarn models in their exact form with no approximation imposed. In practice, our method can render very large models that are practically unrenderable using existing methods, while using considerably less memory (60–200× less) and achieving good performance.", + "Fabrics play a significant role in many applications in design, prototyping, and entertainment. Recent fiber-based models capture the rich visual appearance of fabrics, but are too onerous to design and edit. Yarn-based procedural models are powerful and convenient, but too regular and not realistic enough in appearance. 
In this paper, we introduce an automatic fitting approach to create high-quality procedural yarn models of fabrics with fiber-level details. We fit CT data to procedural models to automatically recover a full range of parameters, and augment the models with a measurement-based model of flyaway fibers. We validate our fabric models against CT measurements and photographs, and demonstrate the utility of this approach for fabric modeling and editing.", + "Monte Carlo rendering is widely used in photorealistic images generation, which can simulate a series of real-world visual effects such as soft shadow, glossy reflection, caustics, color bleeding, motion blur, depth of field, and so on. However, because the integration domain contains difficult paths with multiple scattering or interreflections, the final image often contains high variance, which can be seen as, the noise. One direction to solve this problem is to do adaptive sampling and reconstruction, which generates final images by adaptive sampling and filtering with single or multiple iterations. In this paper, combined with machine learning, we focus on the adaptive sampling phase which tries to adaptively throw more samples to high error region. As a result, our method leads to better adaptive sampling strategy than state-of-the-arts, and better final images in both visual qualities and MSEs.", + "This is a report for machine learning final project, which combines realistic rendering and machine learning. As is known, global illumination is very challenging for its expensive computing requirement. In general, Monte Carlo ray tracing or Photon Mapping techniques are used in rendering global illumination, which requires several hours or even days to generate a smooth and physically correct image. So it seems impossible to render global illumination in real time. Microsoft Research Asia solved this problem with a novel method combining rendering and neural networks. 
In my final project, I follow the MSRA 2013 SIGGRAPH paper \"Global Illumination with Radiance Regressions\" and make some extra extensions. The result turns out to be pretty good." + ], + "domain": [ + "Computer Graphics", + "Deep Learning", + "Photorealistic Rendering", + "Style Transfer" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "65df31ff-f82a-4223-8aae-61257e8d0673": { + "pk": "65df31ff-f82a-4223-8aae-61257e8d0673", + "project_name": null, + "name": "Eli Shechtman", + "bio": "I am a researcher specializing in computer vision and image processing, with a focus on enhancing image editing techniques through innovative algorithms and deep learning methods. My recent work includes developing Nautilus, a method for automatically identifying and manipulating symmetric regions in images, which has applications in inpainting and beautification. I have also extended patch-based synthesis to plenoptic images, enabling interactive light field editing that handles complex object boundaries effectively.\n\nMy research delves into the realm of generative modeling, where I explore the distribution of possible outputs in image-to-image translation tasks. By employing conditional generative adversarial networks (GANs), I aim to mitigate issues like mode collapse, ensuring diverse and realistic outputs. Additionally, I have tackled challenges in product image generation by creating a method that transfers user edits across variations in geometry and lighting, enhancing the iterative refinement process.\n\nI am particularly passionate about portrait photography, having developed techniques for relighting and stylization that respect the subject's identity while achieving artistic effects. My work on eye editing and pose modification demonstrates my commitment to creating user-friendly, automated solutions for common image editing challenges. 
Through my research, I strive to push the boundaries of what is possible in image manipulation, making advanced techniques accessible to both professionals and novice users alike.", + "collaborators": [ + "Kalyan Sunkavalli", + "Trevor Darrell", + "Oliver Wang", + "Zhixin Shu", + "Sunil Hadap", + "Sylvain Paris", + "D. Samaras", + "M. Lukác", + "D. Sýkora", + "Ondrej Jamriska", + "Jue Wang", + "Shimin Hu", + "Jun-Yan Zhu", + "Richard Zhang", + "Deepak Pathak", + "Alexei A. Efros", + "Bryan C. Russell", + "N. Carr", + "T. Pajdla", + "Fang-Lue Zhang", + "Zi-Ye Zhou", + "Jiaxin Shi", + "James W. Hennessey", + "Wilmot Li", + "N. Mitra", + "J. Fiser", + "David Simons", + "Jingwan Lu", + "P. Asente", + "Fujun Luan", + "Kavita Bala", + "S. Azadi", + "Matthew Fisher", + "Vladimir G. Kim", + "Zhaowen Wang", + "Ersin Yumer", + "Nicholas I. Kolkin", + "Gregory Shakhnarovich", + "Roey Mechrez", + "Lihi Zelnik-Manor", + "Lisa Anne Hendricks", + "Josef Sivic", + "Leon A. Gatys", + "M. Bethge", + "Aaron Hertzmann", + "Oren Boiman", + "T. Thonat", + "G. Drettakis", + "Li-Qian Ma", + "Chao Yang", + "Xin Lu", + "Zhe L. Lin", + "Hao Li", + "Ohad Fried", + "Dan B. 
Goldman", + "Adam Finkelstein" + ], + "pub_titles": [ + "Nautilus", + "PlenoPatch: Patch-Based Plenoptic Image Manipulation", + "Toward Multimodal Image-to-Image Translation", + "Transferring image-based edits for multi-channel compositing", + "Portrait lighting transfer using a mass transport approach", + "Example-based synthesis of stylized facial animations", + "Deep Photo Style Transfer", + "Multi-content GAN for Few-Shot Font Style Transfer", + "Neural Face Editing with Intrinsic Image Disentangling", + "Training Deep Networks to be Spatially Sensitive", + "Multimodal Image-to-Image Translation by Enforcing Bi-Cycle Consistency", + "Photorealistic Style Transfer with Screened Poisson Equation", + "Localizing Moments in Video with Natural Language", + "Preserving Color in Neural Artistic Style Transfer", + "Yelp Restaurant Photo Classification", + "Multi-View Inpainting for Image-Based Scene Editing and Rendering", + "Appearance Harmonization for Single Image Shadow Removal", + "High-Resolution Image Inpainting Using Multi-scale Neural Patch Synthesis", + "EyeOpener", + "Perspective-aware manipulation of portrait photos" + ], + "pub_abstracts": [ + "Natural images often exhibit symmetries that should be taken into account when editing them. In this paper we present Nautilus --- a method for automatically identifying symmetric regions in an image along with their corresponding symmetry transformations. We compute dense local similarity symmetry transformations using a novel variant of the Generalised PatchMatch algorithm that uses Metropolis-Hastings sampling. We combine and refine these local symmetries using an extended Lucas-Kanade algorithm to compute regional transformations and their spatial extents. Our approach produces dense estimates of complex symmetries that are combinations of translation, rotation, scale, and reflection under perspective distortion. 
This enables a number of automatic symmetry-aware image editing applications including inpainting, rectification, beautification, and segmentation, and we demonstrate state-of-the-art applications for each of them.", + "Patch-based image synthesis methods have been successfully applied for various editing tasks on still images, videos and stereo pairs. In this work we extend patch-based synthesis to plenoptic images captured by consumer-level lenselet-based devices for interactive, efficient light field editing. In our method the light field is represented as a set of images captured from different viewpoints. We decompose the central view into different depth layers, and present it to the user for specifying the editing goals. Given an editing task, our method performs patch-based image synthesis on all affected layers of the central view, and then propagates the edits to all other views. Interaction is done through a conventional 2D image editing user interface that is familiar to novice users. Our method correctly handles object boundary occlusion with semi-transparency, thus can generate more realistic results than previous methods. We demonstrate compelling results on a wide range of applications such as hole-filling, object reshuffling and resizing, changing object depth, light field upscaling and parallax magnification.", + "Many image-to-image translation problems are ambiguous, as a single input image may correspond to multiple possible outputs. In this work, we aim to model a \\emph{distribution} of possible outputs in a conditional generative modeling setting. The ambiguity of the mapping is distilled in a low-dimensional latent vector, which can be randomly sampled at test time. A generator learns to map the given input, combined with this latent code, to the output. We explicitly encourage the connection between output and the latent code to be invertible. 
This helps prevent a many-to-one mapping from the latent code to the output during training, also known as the problem of mode collapse, and produces more diverse results. We explore several variants of this approach by employing different training objectives, network architectures, and methods of injecting the latent code. Our proposed method encourages bijective consistency between the latent encoding and output modes. We present a systematic comparison of our method and other variants on both perceptual realism and diversity.", + "A common way to generate high-quality product images is to start with a physically-based render of a 3D scene, apply image-based edits on individual render channels, and then composite the edited channels together (in some cases, on top of a background photograph). This workflow requires users to manually select the right render channels, prescribe channel-specific masks, and set appropriate edit parameters. Unfortunately, such edits cannot be easily reused for global variations of the original scene, such as a rigid-body transformation of the 3D objects or a modified viewpoint, which discourages iterative refinement of both global scene changes and image-based edits. We propose a method to automatically transfer such user edits across variations of object geometry, illumination, and viewpoint. This transfer problem is challenging since many edits may be visually plausible but non-physical, with a successful transfer dependent on an unknown set of scene attributes that may include both photometric and non-photometric features. To address this challenge, we present a transfer algorithm that extends the image analogies formulation to include an augmented set of photometric and non-photometric guidance channels and, more importantly, adaptively estimate weights for the various candidate channels in a way that matches the characteristics of each individual edit. 
We demonstrate our algorithm on a variety of complex edit-transfer scenarios for creating high-quality product images.", + "Lighting is a critical element of portrait photography. However, good lighting design typically requires complex equipment and significant time and expertise. Our work simplifies this task using a relighting technique that transfers the desired illumination of one portrait onto another. The novelty in our approach to this challenging problem is our formulation of relighting as a mass transport problem. We start from standard color histogram matching that only captures the overall tone of the illumination, and we show how to use the mass-transport formulation to make it dependent on facial geometry. We fit a three-dimensional (3D) morphable face model to the portrait, and for each pixel, we combine the color value with the corresponding 3D position and normal. We then solve a mass-transport problem in this augmented space to generate a color remapping that achieves localized, geometry-aware relighting. Our technique is robust to variations in facial appearance and small errors in face reconstruction. As we demonstrate, this allows our technique to handle a variety of portraits and illumination conditions, including scenarios that are challenging for previous methods.", + "We introduce a novel approach to example-based stylization of portrait videos that preserves both the subject's identity and the visual richness of the input style exemplar. Unlike the current state-of-the-art based on neural style transfer [Selim et al. 2016], our method performs non-parametric texture synthesis that retains more of the local textural details of the artistic exemplar and does not suffer from image warping artifacts caused by aligning the style exemplar with the target face. Our method allows the creation of videos with less than full temporal coherence [Ruder et al. 2016]. 
By introducing a controllable amount of temporal dynamics, it more closely approximates the appearance of real hand-painted animation in which every frame was created independently. We demonstrate the practical utility of the proposed solution on a variety of style exemplars and target videos.", + "This paper introduces a deep-learning approach to photographic style transfer that handles a large variety of image content while faithfully transferring the reference style. Our approach builds upon the recent work on painterly transfer that separates style from the content of an image by considering different layers of a neural network. However, as is, this approach is not suitable for photorealistic style transfer. Even when both the input and reference images are photographs, the output still exhibits distortions reminiscent of a painting. Our contribution is to constrain the transformation from the input to the output to be locally affine in colorspace, and to express this constraint as a custom fully differentiable energy term. We show that this approach successfully suppresses distortion and yields satisfying photorealistic style transfers in a broad variety of scenarios, including transfer of the time of day, weather, season, and artistic edits.", + "In this work, we focus on the challenge of taking partial observations of highly-stylized text and generalizing the observations to generate unobserved glyphs in the ornamented typeface. To generate a set of multi-content images following a consistent style from very few examples, we propose an end-to-end stacked conditional GAN model considering content along channels and style along network layers. Our proposed network transfers the style of given glyphs to the contents of unseen ones, capturing highly stylized fonts found in the real-world such as those on movie posters or infographics. We seek to transfer both the typographic stylization (ex. serifs and ears) as well as the textual stylization (ex. 
color gradients and effects.) We base our experiments on our collected data set including 10,000 fonts with different styles and demonstrate effective generalization from a very small number of observed glyphs.", + "Traditional face editing methods often require a number of sophisticated and task specific algorithms to be applied one after the other — a process that is tedious, fragile, and computationally intensive. In this paper, we propose an end-to-end generative adversarial network that infers a face-specific disentangled representation of intrinsic face properties, including shape (i.e. normals), albedo, and lighting, and an alpha matte. We show that this network can be trained on in-the-wild images by incorporating an in-network physically-based image formation module and appropriate loss functions. Our disentangling latent representation allows for semantically relevant edits, where one aspect of facial appearance can be manipulated while keeping orthogonal properties fixed, and we demonstrate its use for a number of facial editing applications.", + "In many computer vision tasks, for example saliency prediction or semantic segmentation, the desired output is a foreground map that predicts pixels where some criteria is satisfied. Despite the inherently spatial nature of this task commonly used learning objectives do not incorporate the spatial relationships between misclassified pixels and the underlying ground truth. The Weighted F-measure, a recently proposed evaluation metric, does reweight errors spatially, and has been shown to closely correlate with human evaluation of quality, and stably rank predictions with respect to noisy ground truths (such as a sloppy human annotator might generate). However it suffers from computational complexity which makes it intractable as an optimization objective for gradient descent, which must be evaluated thousands or millions of times while learning a model's parameters. 
We propose a differentiable and efficient approximation of this metric. By incorporating spatial information into the objective we can use a simpler model than competing methods without sacrificing accuracy, resulting in faster inference speeds and alleviating the need for pre/post-processing. We match (or improve) performance on several tasks compared to prior state of the art by traditional metrics, and in many cases significantly improve performance by the weighted F-measure.", + "Many image-to-image translation problems are ambiguous, with a single input image corresponding to multiple possible outputs. In this work, we aim to model a distribution of possible outputs in a conditional generative modeling setting. The ambiguity of the mapping is encoded in a low-dimensional latent vector, which can be randomly sampled at test time. A generator learns to map the input, along with the latent code, to an output. We explicitly enforce cycle consistency between the latent code and the output. Encouraging invertibility helps prevent a many-to-one mapping from the latent code to the output during training, also known as the problem of mode collapse, and helps produce more diverse results. We evaluate the relationship between perceptual realism and diversity of images generated by our method, and test on a variety of domains.", + "Recent work has shown impressive success in transferring painterly style to images. These approaches, however, fall short of photorealistic style transfer. Even when both the input and reference images are photographs, the output still exhibits distortions reminiscent of a painting. In this paper we propose an approach that takes as input a stylized image and makes it more photorealistic. It relies on the Screened Poisson Equation, maintaining the fidelity of the stylized image while constraining the gradients to those of the original input image. 
Our method is fast, simple, fully automatic and shows positive progress in making a stylized image photorealistic. Our results exhibit finer details and are less prone to artifacts than the state-of-the-art.", + "We consider retrieving a specific temporal segment, or moment, from a video given a natural language text description. Methods designed to retrieve whole video clips with natural language determine what occurs in a video but not when. To address this issue, we propose the Moment Context Network (MCN) which effectively localizes natural language queries in videos by integrating local and global video features over time. A key obstacle to training our MCN model is that current video datasets do not include pairs of localized video segments and referring expressions, or text descriptions which uniquely identify a corresponding moment. Therefore, we collect the Distinct Describable Moments (DiDeMo) dataset which consists of over 10,000 unedited, personal videos in diverse visual settings with pairs of localized video segments and referring expressions. We demonstrate that MCN outperforms several baseline methods and believe that our initial results together with the release of DiDeMo will inspire further research on localizing video moments with natural language.", + "This note presents an extension to the neural artistic style transfer algorithm (Gatys et al.). The original algorithm transforms an image to have the style of another given image. For example, a photograph can be transformed to have the style of a famous painting. Here we address a potential shortcoming of the original method: the algorithm transfers the colors of the original painting, which can alter the appearance of the scene in undesirable ways. We describe simple linear methods for transferring style while preserving colors.", + "This project is about attaching labels to restaurants based on the photos uploaded by users. 
In this paper we have explored different methods to extract image features, extract restaurant features and classify restaurants. For the training set, the F1 score considering all labels is 0.7306. The mean F1 score is 0.69719. The mean F1 score on hidden test set is 0.69059.", + "We propose a method to remove objects such as people and cars from multi-view urban image datasets, enabling free-viewpoint IBR in the edited scenes. Our method combines information from multi-view 3D reconstruction with image inpainting techniques, by formulating the problem as an optimization of a global patch-based objective function. We use Image-Based Rendering (IBR) techniques to reproject information from neighboring views, and 3D multi-view stereo reconstruction to perform multiview coherent initialization for inpainting of pixels not filled by reprojection. Our algorithm performs multi-view consistent inpainting for color and 3D by blending reprojections with patch-based image inpainting. We run our algorithm on casually captured datasets, and Google StreetView data, removing objects cars, people and pillars, showing that our approach produces results of sufficient quality for free-viewpoint IBR on \"cleaned up\" scenes, as well as IBR scene editing, such as limited motion of real objects.", + "Shadow removal is a challenging problem and previous approaches often produce de‐shadowed regions that are visually inconsistent with the rest of the image. We propose an automatic shadow region harmonization approach that makes the appearance of a de‐shadowed region (produced using any previous technique) compatible with the rest of the image. We use a shadow‐guided patch‐based image synthesis approach that reconstructs the shadow region using patches sampled from non‐shadowed regions. This result is then refined based on the reconstruction confidence to handle unique textures. 
Qualitative comparisons over a wide range of images, and a quantitative evaluation on a benchmark dataset show that our technique significantly improves upon the state‐of‐the‐art.", + "Recent advances in deep learning have shown exciting promise in filling large holes in natural images with semantically plausible and context aware details, impacting fundamental image manipulation tasks such as object removal. While these learning-based methods are significantly more effective in capturing high-level features than prior techniques, they can only handle very low-resolution inputs due to memory limitations and difficulty in training. Even for slightly larger images, the inpainted regions would appear blurry and unpleasant boundaries become visible. We propose a multi-scale neural patch synthesis approach based on joint optimization of image content and texture constraints, which not only preserves contextual structures but also produces high-frequency details by matching and adapting patches with the most similar mid-layer feature correlations of a deep classification network. We evaluate our method on the ImageNet and Paris Streetview datasets and achieved state-of-the-art inpainting accuracy. We show our approach produces sharper and more coherent results than prior methods, especially for high-resolution images.", + "Closed eyes and look-aways can ruin precious moments captured in photographs. In this article, we present a new framework for automatically editing eyes in photographs. We leverage a user’s personal photo collection to find a “good” set of reference eyes and transfer them onto a target image. Our example-based editing approach is robust and effective for realistic image editing. A fully automatic pipeline for realistic eye editing is challenging due to the unconstrained conditions under which the face appears in a typical photo collection. 
We use crowd-sourced human evaluations to understand the aspects of the target-reference image pair that will produce the most realistic results. We subsequently train a model that automatically selects the top-ranked reference candidate(s) by narrowing the gap in terms of pose, local contrast, lighting conditions, and even expressions. Finally, we develop a comprehensive pipeline of three-dimensional face estimation, image warping, relighting, image harmonization, automatic segmentation, and image compositing in order to achieve highly believable results. We evaluate the performance of our method via quantitative and crowd-sourced experiments.", + "This paper introduces a method to modify the apparent relative pose and distance between camera and subject given a single portrait photo. Our approach fits a full perspective camera and a parametric 3D head model to the portrait, and then builds a 2D warp in the image plane to approximate the effect of a desired change in 3D. We show that this model is capable of correcting objectionable artifacts such as the large noses sometimes seen in \"selfies,\" or to deliberately bring a distant camera closer to the subject. This framework can also be used to re-pose the subject, as well as to create stereo pairs from an input portrait. We show convincing results on both an existing dataset as well as a new dataset we captured to validate our method." 
+ ], + "domain": [ + "Image Processing", + "Computer Vision", + "Generative Models", + "Style Transfer" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + }, + "fb8c5fa4-09d6-4caf-9c14-ed7c71c5ffcd": { + "pk": "fb8c5fa4-09d6-4caf-9c14-ed7c71c5ffcd", + "project_name": null, + "name": "Kavita Bala", + "bio": "I am a researcher deeply engaged in the intersection of computer vision, graphics, and machine learning, with a particular focus on understanding and manipulating visual information. My work spans a variety of applications, from intrinsic image decomposition and shading estimation to fashion analysis and product videography. \n\nOne of my significant contributions is the development of the Shading Annotations in the Wild (SAW) dataset, which provides a large-scale resource for training models to predict per-pixel shading in images. This dataset has proven invaluable in enhancing intrinsic image decomposition techniques, reducing artifacts in the process. Additionally, I have explored the dynamics of fashion trends through a framework that analyzes millions of images, allowing for insights into global and local style choices.\n\nMy research also delves into advanced techniques for image transformation, such as photorealistic style transfer and single-image analogies, where I leverage deep learning architectures to achieve high-quality results. I have introduced methods for ambient occlusion computation and illumination estimation from photo collections, which are crucial for realistic scene modeling.\n\nFurthermore, I am passionate about making sophisticated visual effects accessible to novice users, as demonstrated in my work on DIY lighting design for product videography. 
My goal is to bridge the gap between complex visual tasks and user-friendly solutions, empowering individuals to create high-quality visual content.\n\nOverall, my research aims to enhance our understanding of visual phenomena and develop practical tools that leverage this knowledge for various applications in vision and graphics.", + "collaborators": [ + "Noah Snavely", + "Balazs Kovacs", + "Sylvain Paris", + "Sean Bell", + "Fujun Luan", + "Shuang Zhao", + "P. Upchurch", + "Ivaylo Boyadzhiev", + "Ioannis Gkioulekas", + "F. Durand", + "Todd Zickler", + "Scott Wehrwein", + "E. Adelson", + "K. Matzen", + "Eli Shechtman", + "Nicolas Bonneel", + "Pramook Khungurn", + "Rundong Wu", + "James Noeckel", + "Steve Marschner", + "Jiawen Chen", + "Anat Levin", + "Daniel Sedra", + "Andrew Mullen", + "H. Hirsh", + "Jacob V. Gardner", + "Geoff Pleiss", + "Robert Pless", + "Kilian Q. Weinberger", + "D. C. Hauagge", + "C. Loan", + "B. Walter", + "Andreas Veit", + "Julian McAuley", + "Serge J. Belongie", + "Manohar B. 
Srikanth" + ], + "pub_titles": [ + "Shading Annotations in the Wild", + "StreetStyle: Exploring world-wide clothing styles from millions of photos", + "Deep Photo Style Transfer", + "Intrinsic Decompositions for Image Editing", + "Shading Annotations in the Wild ( Supplementary Material )", + "Fiber‐Level On‐the‐Fly Procedural Textiles", + "Fast rendering of fabric micro-appearance models under directional and spherical gaussian lights", + "Do-it-yourself lighting design for product videography", + "From A to Z: Supervised Transfer of Style and Content Using Deep Neural Network Generators", + "Interactive Consensus Agreement Games for Labeling Images", + "Fitting procedural yarn models for realistic cloth rendering", + "Deep Feature Interpolation for Image Content Changes", + "Photometric Ambient Occlusion for Intrinsic Image Decomposition", + "Shadow Detection and Sun Direction in Photo Collections", + "Band-Sifting Decomposition for Image-Based Material Editing", + "On the appearance of translucent edges", + "Learning Visual Clothing Style with Heterogeneous Dyadic Co-Occurrences" + ], + "pub_abstracts": [ + "Understanding shading effects in images is critical for a variety of vision and graphics problems, including intrinsic image decomposition, shadow removal, image relighting, and inverse rendering. As is the case with other vision tasks, machine learning is a promising approach to understanding shading - but there is little ground truth shading data available for real-world images. We introduce Shading Annotations in the Wild (SAW), a new large-scale, public dataset of shading annotations in indoor scenes, comprised of multiple forms of shading judgments obtained via crowdsourcing, along with shading annotations automatically generated from RGB-D imagery. We use this data to train a convolutional neural network to predict per-pixel shading information in an image. 
We demonstrate the value of our data and network in an application to intrinsic images, where we can reduce decomposition artifacts produced by existing algorithms. Our database is available at http://opensurfaces.cs.cornell.edu/saw.", + "Each day billions of photographs are uploaded to photo-sharing services and social media platforms. These images are packed with information about how people live around the world. In this paper we exploit this rich trove of data to understand fashion and style trends worldwide. We present a framework for visual discovery at scale, analyzing clothing and fashion across millions of images of people around the world and spanning several years. We introduce a large-scale dataset of photos of people annotated with clothing attributes, and use this dataset to train attribute classifiers via deep learning. We also present a method for discovering visually consistent style clusters that capture useful visual correlations in this massive dataset. Using these tools, we analyze millions of photos to derive visual insight, producing a first-of-its-kind analysis of global and per-city fashion choices and spatio-temporal trends.", + "This paper introduces a deep-learning approach to photographic style transfer that handles a large variety of image content while faithfully transferring the reference style. Our approach builds upon the recent work on painterly transfer that separates style from the content of an image by considering different layers of a neural network. However, as is, this approach is not suitable for photorealistic style transfer. Even when both the input and reference images are photographs, the output still exhibits distortions reminiscent of a painting. Our contribution is to constrain the transformation from the input to the output to be locally affine in colorspace, and to express this constraint as a custom fully differentiable energy term. 
We show that this approach successfully suppresses distortion and yields satisfying photorealistic style transfers in a broad variety of scenarios, including transfer of the time of day, weather, season, and artistic edits.", + "Intrinsic images are a mid‐level representation of an image that decompose the image into reflectance and illumination layers. The reflectance layer captures the color/texture of surfaces in the scene, while the illumination layer captures shading effects caused by interactions between scene illumination and surface geometry. Intrinsic images have a long history in computer vision and recently in computer graphics, and have been shown to be a useful representation for tasks ranging from scene understanding and reconstruction to image editing. In this report, we review and evaluate past work on this problem. Specifically, we discuss each work in terms of the priors they impose on the intrinsic image problem. We introduce a new synthetic ground‐truth dataset that we use to evaluate the validity of these priors and the performance of the methods. Finally, we evaluate the performance of the different methods in the context of image‐editing applications.", + "Our goal is to curate a dataset that is generally useful to tasks that use shading estimation. One specifically important task is intrinsic image decomposition. Since the majority of intrinsic image decomposition algorithms make a Lambertian assumption, and disregard specular surfaces, we want to eliminate points that have a substantial glossy component. Unfortunately, the IIW [2] dataset did not take this into account. We address this limitation of IIW by annotating all old and new points in the IIW dataset with a “glossy” label.", + "Procedural textile models are compact, easy to edit, and can achieve state‐of‐the‐art realism with fiber‐level details. However, these complex models generally need to be fully instantiated (aka. 
realized) into 3D volumes or fiber meshes and stored in memory, We introduce a novel realization‐minimizing technique that enables physically based rendering of procedural textiles, without the need of full model realizations. The key ingredients of our technique are new data structures and search algorithms that look up regular and flyaway fibers on the fly, efficiently and consistently. Our technique works with compact fiber‐level procedural yarn models in their exact form with no approximation imposed. In practice, our method can render very large models that are practically unrenderable using existing methods, while using considerably less memory (60–200× less) and achieving good performance.", + "Rendering fabrics using micro-appearance models---fiber-level microgeometry coupled with a fiber scattering model---can take hours per frame. We present a fast, precomputation-based algorithm for rendering both single and multiple scattering in fabrics with repeating structure illuminated by directional and spherical Gaussian lights. Precomputed light transport (PRT) is well established but challenging to apply directly to cloth. This paper shows how to decompose the problem and pick the right approximations to achieve very high accuracy, with significant performance gains over path tracing. We treat single and multiple scattering separately and approximate local multiple scattering using precomputed transfer functions represented in spherical harmonics. We handle shadowing between fibers with precomputed per-fiber-segment visibility functions, using two different representations to separately deal with low and high frequency spherical Gaussian lights. Our algorithm is designed for GPU performance and high visual quality. Compared to existing PRT methods, it is more accurate. 
In tens of seconds on a commodity GPU, it renders high-quality supersampled images that take path tracing tens of minutes on a compute cluster.", + "The growth of online marketplaces for selling goods has increased the need for product photography by novice users and consumers. Additionally, the increased use of online media and large-screen billboards promotes the adoption of videos for advertising, going beyond just using still imagery. Lighting is a key distinction between professional and casual product videography. Professionals use specialized hardware setups, and bring expert skills to create good lighting that shows off the product's shape and material, while also producing aesthetically pleasing results. In this paper, we introduce a new do-it-yourself (DIY) approach to lighting design that lets novice users create studio quality product videography. We identify design principles to light products through emphasizing highlights, rim lighting, and contours. We devise a set of computational metrics to achieve these design goals. Our workflow is: the user acquires a video of the product by mounting a video camera on a tripod and using a tablet to light objects by waving the tablet around the object. We automatically analyze and split this acquired video into snippets that match our design principles. Finally, we present an interface that lets users easily select snippets with specific characteristics and then assembles them to produce a final pleasing video of the product. Alternatively, they can rely on our template mechanism to automatically assemble a video.", + "We propose a new neural network architecture for solving single-image analogies - the generation of an entire set of stylistically similar images from just a single input image. Solving this problem requires separating image style from content. 
Our network is a modified variational autoencoder (VAE) that supports supervised training of single-image analogies and in-network evaluation of outputs with a structured similarity objective that captures pixel covariances. On the challenging task of generating a 62-letter font from a single example letter we produce images with 22.4% lower dissimilarity to the ground truth than state-of-the-art.", + " Scene understanding algorithms in computer vision are improving dramatically by training deep convolutional neural networks on millions of accurately annotated images. Collecting large-scale datasets for this kind of training is challenging, and the learning algorithms are only as good as the data they train on. Training annotations are often obtained by taking the majority label from independent crowdsourced workers using platforms such as Amazon Mechanical Turk. However, the accuracy of the resulting annotations can vary, with the hardest-to-annotate samples having prohibitively low accuracy. Our insight is that in cases where independent worker annotations are poor more accurate results can be obtained by having workers collaborate. This paper introduces consensus agreement games, a novel method for assigning annotations to images by the agreement of multiple consensuses of small cliques of workers. We demonstrate that this approach reduces error by 37.8% on two different datasets at a cost of $0.10 or $0.17 per annotation. The higher cost is justified because our method does not need to be run on the entire dataset. Ultimately, our method enables us to more accurately annotate images and build more challenging training datasets for learning algorithms. ", + "Fabrics play a significant role in many applications in design, prototyping, and entertainment. Recent fiber-based models capture the rich visual appearance of fabrics, but are too onerous to design and edit. 
Yarn-based procedural models are powerful and convenient, but too regular and not realistic enough in appearance. In this paper, we introduce an automatic fitting approach to create high-quality procedural yarn models of fabrics with fiber-level details. We fit CT data to procedural models to automatically recover a full range of parameters, and augment the models with a measurement-based model of flyaway fibers. We validate our fabric models against CT measurements and photographs, and demonstrate the utility of this approach for fabric modeling and editing.", + "We propose Deep Feature Interpolation (DFI), a new data-driven baseline for automatic high-resolution image transformation. As the name suggests, DFI relies only on simple linear interpolation of deep convolutional features from pre-trained convnets. We show that despite its simplicity, DFI can perform high-level semantic transformations like make older/younger, make bespectacled, add smile, among others, surprisingly well–sometimes even matching or outperforming the state-of-the-art. This is particularly unexpected as DFI requires no specialized network architecture or even any deep network to be trained for these tasks. DFI therefore can be used as a new baseline to evaluate more complex algorithms and provides a practical answer to the question of which image transformation tasks are still challenging after the advent of deep learning.", + "We present a method for computing ambient occlusion (AO) for a stack of images of a Lambertian scene from a fixed viewpoint. Ambient occlusion, a concept common in computer graphics, characterizes the local visibility at a point: it approximates how much light can reach that point from different directions without getting blocked by other geometry. While AO has received surprisingly little attention in vision, we show that it can be approximated using simple, per-pixel statistics over image stacks, based on a simplified image formation model. 
We use our derived AO measure to compute reflectance and illumination for objects without relying on additional smoothness priors, and demonstrate state-of-the art performance on the MIT Intrinsic Images benchmark. We also demonstrate our method on several synthetic and real scenes, including 3D printed objects with known ground truth geometry.", + "Modeling the appearance of outdoor scenes from photo collections is challenging because of appearance variation, especially due to illumination. In this paper we present a simple and robust algorithm for estimating illumination properties-shadows and sun direction-from photo collections. These properties are key to a variety of scene modeling applications, including outdoor intrinsic images, realistic 3D scene rendering, and temporally varying (4D) reconstruction. Our shadow detection method uses illumination ratios to analyze lighting independent of camera effects, and determines shadow labels for each 3D point in a reconstruction. These shadow labels can then be used to detect shadow boundaries and estimate sun direction, as well as to compute dense shadow labels in pixel space. We demonstrate our method on large Internet photo collections of scenes, and show that it outperforms prior multi-image shadow detection and sun direction estimation methods.", + "Photographers often “prep” their subjects to achieve various effects; for example, toning down overly shiny skin, covering blotches, etc. Making such adjustments digitally after a shoot is possible, but difficult without good tools and good skills. Making such adjustments to video footage is harder still. We describe and study a set of 2D image operations, based on multiscale image analysis, that are easy and straightforward and that can consistently modify perceived material properties. These operators first build a subband decomposition of the image and then selectively modify the coefficients within the subbands. We call this selection process band sifting. 
We show that different siftings of the coefficients can be used to modify the appearance of properties such as gloss, smoothness, pigmentation, or weathering. The band-sifting operators have particularly striking effects when applied to faces; they can provide “knobs” to make a face look wetter or drier, younger or older, and with heavy or light variation in pigmentation. Through user studies, we identify a set of operators that yield consistent subjective effects for a variety of materials and scenes. We demonstrate that these operators are also useful for processing video sequences.", + "Edges in images of translucent objects are very different from edges in images of opaque objects. The physical causes for these differences are hard to characterize analytically and are not well understood. This paper considers one class of translucency edges-those caused by a discontinuity in surface orientation-and describes the physical causes of their appearance. We simulate thousands of translucency edge profiles using many different scattering material parameters, and we explain the resulting variety of edge patterns by qualitatively analyzing light transport. We also discuss the existence of shape and material metamers, or combinations of distinct shape or material parameters that generate the same edge profile. This knowledge is relevant to visual inference tasks that involve translucent objects, such as shape or material estimation.", + "With the rapid proliferation of smart mobile devices, users now take millions of photos every day. These include large numbers of clothing and accessory images. We would like to answer questions like 'What outfit goes well with this pair of shoes?' To answer these types of questions, one has to go beyond learning visual similarity and learn a visual notion of compatibility across categories. In this paper, we propose a novel learning framework to help answer these types of questions. 
The main idea of this framework is to learn a feature transformation from images of items into a latent space that expresses compatibility. For the feature transformation, we use a Siamese Convolutional Neural Network (CNN) architecture, where training examples are pairs of items that are either compatible or incompatible. We model compatibility based on co-occurrence in large-scale user behavior data, in particular co-purchase data from Amazon.com. To learn cross-category fit, we introduce a strategic method to sample training data, where pairs of items are heterogeneous dyads, i.e., the two elements of a pair belong to different high-level categories. While this approach is applicable to a wide variety of settings, we focus on the representative problem of learning compatible clothing style. Our results indicate that the proposed framework is capable of learning semantic information about visual style and is able to generate outfits of clothes, with items from different categories, that go well together." + ], + "domain": [ + "Computer Vision", + "Machine Learning", + "Image Processing", + "Style Transfer" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + } + }, + "reference_proposal": "**[Question 1] - What is the problem?** \nHow can we seamlessly integrate an object from a photograph into a painting while maintaining the painting's original style and appearance?\n\n**[Question 2] - Why is it interesting and important?** \nSolving this problem is significant for both the artistic community and the field of machine learning. It allows for the creation of new visual content that respects the integrity of traditional art forms, enabling artists to remix and innovate without requiring advanced skills. 
This research could lead to advancements in image processing techniques, enhancing tools for digital artists and potentially influencing future research in style transfer and image harmonization. By addressing this question, we can expand the capabilities of automated systems in creative fields, leading to practical applications in art restoration, digital content creation, and augmented reality.\n\n**[Question 3] - Why is it hard?** \nThe challenge lies in the complexity of accurately replicating the unique characteristics of various painting styles while ensuring that the integrated object appears natural within the artwork. Naive approaches may fail because they do not account for the intricate textures, colors, and brushwork that define a painting's style. Technical obstacles include the need for sophisticated algorithms that can analyze and adapt to different artistic styles, as well as the requirement for high-quality training data that captures the nuances of both photographs and paintings. Theoretical challenges also arise in understanding how to effectively transfer style without losing the integrity of the original image.\n\n**[Question 4] - Why hasn't it been solved before?** \nPrevious research has primarily focused on photographic compositing, leaving a gap in techniques specifically designed for paintings. Existing solutions often rely on algorithms that do not translate well to the unique characteristics of painted images, leading to subpar results. Barriers include a lack of dedicated datasets for training models on painting styles and insufficient methodologies that can adapt to the diverse range of artistic expressions. 
Our approach differs by employing a two-pass algorithm that specifically targets the nuances of painting styles, utilizing neural response statistics to ensure consistency and fidelity in the final composite.\n\n**[Question 5] - What are the key components of my approach and results?** \nOur proposed methodology involves a two-pass algorithm: the first pass transfers the overall style of the painting to the input object, while the second pass refines the result to match the painting's color and texture accurately. We will use a dataset comprising various paintings and photographs to train our model, focusing on metrics such as visual coherence and user preference in composite images. The expected outcome is a seamless integration of photographic" + }, + "1908.00709": { + "paper_data": { + "title": "AutoML: A Survey of the State-of-the-Art", + "url": "http://arxiv.org/abs/1908.00709v6", + "arxiv_id": "1908.00709", + "authors": [ + "Xin He", + "Kaiyong Zhao", + "Xiaowen Chu" + ], + "abstract": "Deep learning (DL) techniques have penetrated all aspects of our lives and brought us great convenience. However, building a high-quality DL system for a specific task highly relies on human expertise, hindering the applications of DL to more areas. Automated machine learning (AutoML) becomes a promising solution to build a DL system without human assistance, and a growing number of researchers focus on AutoML. In this paper, we provide a comprehensive and up-to-date review of the state-of-the-art (SOTA) in AutoML. First, we introduce AutoML methods according to the pipeline, covering data preparation, feature engineering, hyperparameter optimization, and neural architecture search (NAS). We focus more on NAS, as it is currently very hot sub-topic of AutoML. 
We summarize the performance of the representative NAS algorithms on the CIFAR-10 and ImageNet datasets and further discuss several worthy studying directions of NAS methods: one/two-stage NAS, one-shot NAS, and joint hyperparameter and architecture optimization. Finally, we discuss some open problems of the existing AutoML methods for future research.", + "introduction": " Introduction In recent years, deep learning has been applied in vari- ous \felds and used to solve many challenging AI tasks, in areas such as image classi\fcation [ 1,2], object detection [ 3], and language modeling [ 4,5]. Speci\fcally, since AlexNet [ 1] outperformed all other traditional manual Methods in Natural Language Processing and the 9th International Joint Conference on Natural Lan- guage Processing (EMNLP-IJCNLP), Association for Compu- tational Linguistics, Hong Kong, China, 2019, pp. 3585{3590. doi:10.18653/v1/D19-1367 . URL https://www.aclweb.org/anthology/D19-1367 [268] J. Chen, K. Chen, X. Chen, X. Qiu, X. Huang, Exploring shared structures and hierarchies for multiple nlp tasks, arXiv preprint arXiv:1808.07658. [269] H. Mazzawi, X. Gonzalvo, A. Kracun, P. Sridhar, N. Subrah- manya, I. Lopez-Moreno, H.-J. Park, P. Violette, Improving keyword spotting and language identi\fcation via neural ar- chitecture search at scale., in: INTERSPEECH, 2019, pp. 1278{1282. [270] Y. He, J. Lin, Z. Liu, H. Wang, L.-J. Li, S. Han, Amc: Automl for model compression and acceleration on mobile devices, in: Proceedings of the European Conference on Computer Vision (ECCV), 2018, pp. 784{800. [271] X. Xiao, Z. Wang, S. Rajasekaran, Autoprune: Automatic network pruning by regularizing auxiliary parameters, in: H. M. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch\u0013 e-Buc, E. B. Fox, R. Garnett (Eds.), Advances in Neural Information Processing Systems 32: Annual Conference on Neural Informa- tion Processing Systems 2019, NeurIPS 2019, December 8-14, 2019, Vancouver, BC, Canada, 2019, pp. 
13681{13691. 35URL https://proceedings.neurips.cc/paper/2019/hash/ 4efc9e02abdab6b6166251918570a307- results to those of the models designed by experts on NLP tasks. Besides the CV and NLP tasks, Table 5 also shows that AutoML technique has been applied to other tasks, such as network compression, federate learning, image caption, 23Category Application introduction to kernel and nearest-neighbor nonparametric regression, The American Statistician 46 (3) (1992) 175{185. [123] A. Yang, P. M. Esperan\u0018 ca, F. M. Carlucci, NAS evaluation is frustratingly hard, in: 8th International Conference on Learning Representations, ICLR 2020, Addis Ababa, Ethiopia, April 26- 30, 2020, OpenReview.net, 2020. URL https://openreview.net/forum?id=HygrdpVKvr [124] F. Chollet, Xception: Deep learning with depthwise separable convolutions, in: 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017, Honolulu, HI, USA, July 21-26, 2017, IEEE Computer Society, 2017, pp. 1800{1807. doi:10.1109/CVPR.2017.195 . URL https://doi.org/10.1109/CVPR.2017.195 [125] F. Yu, V. Koltun, Multi-scale context aggregation by dilated convolutions, in: Y. Bengio, Y. LeCun (Eds.), 4th International Conference on Learning Representations, ICLR 2016, San Juan, Puerto Rico, May 2-4, 2016, Conference Track Proceedings, 2016. URL http://arxiv.org/abs/1511.07122 [126] J. Hu, L. Shen, G. Sun, Squeeze-and-excitation networks, in: 2018 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2018, Salt Lake City, UT, USA, June 18-22, 2018, IEEE Computer Society, 2018, pp. 7132{7141. doi:10.1109/CVPR.2018.00745 . URL http://openaccess.thecvf.com/content_cvpr_2018/ html/Hu_Squeeze-and-Excitation_Networks_CVPR_2018_ paper.html [127] G. Huang, Z. Liu, L. van der Maaten, K. Q. Weinberger, Densely connected convolutional networks, in: 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017, Honolulu, HI, USA, July 21-26, 2017, IEEE Computer Society, 2017, pp. 2261{2269. 
doi:10.1109/CVPR.2017.243 . URL https://doi.org/10.1109/CVPR.2017.243[128] X. Chen, L. Xie, J. Wu, Q. Tian, Progressive di\u000berentiable architecture search: Bridging the depth gap between search and evaluation, in: 2019 IEEE/CVF International Conference on Computer Vision, ICCV 2019, Seoul, Korea (South), October 27 - November 2, 2019, IEEE, 2019, pp. 1294{1303. doi: 10.1109/ICCV.2019.00138 . URL https://doi.org/10.1109/ICCV.2019.00138 [129] C. Liu, L. Chen, F. Schro\u000b, H. Adam, W. Hua, A. L. Yuille, F. Li, Auto-deeplab: Hierarchical neural architecture search for semantic image segmentation, in: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019, Long Beach, CA, USA, June 16-20, 2019, Computer Vision Founda- tion / IEEE, 2019, pp. 82{92. doi:10.1109/CVPR.2019.00017 . URL http://openaccess.thecvf.com/content_CVPR_ 2019/html/Liu_Auto-DeepLab_Hierarchical_Neural_ Architecture_Search_for_Semantic_Image_Segmentation_ CVPR_2019_paper.html [130] M.", + "references": [ + { + "title": "The Bottleneck", + "abstract": null + }, + { + "title": "VEGA: Towards an End-to-End Configurable AutoML Pipeline", + "abstract": "Automated Machine Learning (AutoML) is an important industrial solution for automatic discovery and deployment of the machine learning models. However, designing an integrated AutoML system faces four great challenges of configurability, scalability, integrability, and platform diversity. In this work, we present VEGA, an efficient and comprehensive AutoML framework that is compatible and optimized for multiple hardware platforms. a) The VEGA pipeline integrates various modules of AutoML, including Neural Architecture Search (NAS), Hyperparameter Optimization (HPO), Auto Data Augmentation, Model Compression, and Fully Train. b) To support a variety of search algorithms and tasks, we design a novel fine-grained search space and its description language to enable easy adaptation to different search algorithms and tasks. 
c) We abstract the common components of deep learning frameworks into a unified interface. VEGA can be executed with multiple back-ends and hardwares. Extensive benchmark experiments on multiple tasks demonstrate that VEGA can improve the existing AutoML algorithms and discover new high-performance models against SOTA methods, e.g. the searched DNet model zoo for Ascend 10x faster than EfficientNet-B5 and 9.2x faster than RegNetX-32GF on ImageNet. VEGA is open-sourced at this https URL." + }, + { + "title": "Evolutionary recurrent neural network for image captioning", + "abstract": null + }, + { + "title": "AutoGAN-Distiller: Searching to Compress Generative Adversarial Networks", + "abstract": "The compression of Generative Adversarial Networks (GANs) has lately drawn attention, due to the increasing demand for deploying GANs into mobile devices for numerous applications such as image translation, enhancement and editing. However, compared to the substantial efforts to compressing other deep models, the research on compressing GANs (usually the generators) remains at its infancy stage. Existing GAN compression algorithms are limited to handling specific GAN architectures and losses. Inspired by the recent success of AutoML in deep compression, we introduce AutoML to GAN compression and develop an AutoGAN-Distiller (AGD) framework. Starting with a specifically designed efficient search space, AGD performs an end-to-end discovery for new efficient generators, given the target computational resource constraints. The search is guided by the original GAN model via knowledge distillation, therefore fulfilling the compression. AGD is fully automatic, standalone (i.e., needing no trained discriminators), and generically applicable to various GAN models. We evaluate AGD in two representative GAN tasks: image translation and super resolution. 
Without bells and whistles, AGD yields remarkably lightweight yet more competitive compressed models, that largely outperform existing alternatives. Our codes and pretrained models are available at this https URL." + }, + { + "title": "NAS-Bench-NLP: Neural Architecture Search Benchmark for Natural Language Processing", + "abstract": "Neural Architecture Search (NAS) is a promising and rapidly evolving research area. Training a large number of neural networks requires an exceptional amount of computational power, which makes NAS unreachable for those researchers who have limited or no access to high-performance clusters and supercomputers. A few benchmarks with precomputed neural architectures performances have been recently introduced to overcome this problem and ensure reproducible experiments. However, these benchmarks are only for the computer vision domain and, thus, are built from the image datasets and convolution-derived architectures. In this work, we step outside the computer vision domain by leveraging the language modeling task, which is the core of natural language processing (NLP). Our main contribution is as follows: we have provided search space of recurrent neural networks on the text datasets and trained 14k architectures within it; we have conducted both intrinsic and extrinsic evaluation of the trained models using datasets for semantic relatedness and language understanding evaluation; finally, we have tested several NAS algorithms to demonstrate how the precomputed results can be utilized. We consider that the benchmark will provide more reliable empirical findings in the community and stimulate progress in developing new NAS methods well suited for recurrent architectures." + }, + { + "title": "AMER: Automatic Behavior Modeling and Interaction Exploration in Recommender System", + "abstract": "User behavior and feature interactions are crucial in deep learning-based recommender systems. 
There has been a diverse set of behavior modeling and interaction exploration methods in the literature. Nevertheless, the design of task-aware recommender systems still requires feature engineering and architecture engineering from domain experts. In this work, we introduce AMER, namely Automatic behavior Modeling and interaction Exploration in Recommender systems with Neural Architecture Search (NAS). The core contributions of AMER include the three-stage search space and the tailored three-step searching pipeline. In the first step, AMER searches for residual blocks that incorporate commonly used operations in the block-wise search space of stage 1 to model sequential patterns in user behavior. In the second step, it progressively investigates useful low-order and high-order feature interactions in the non-sequential interaction space of stage 2. Finally, an aggregation multi-layer perceptron (MLP) with shortcut connection is selected from flexible dimension settings of stage~3 to combine features extracted from the previous steps. For efficient and effective NAS, AMER employs the one-shot random search in all three steps. Further analysis reveals that AMER's search space could cover most of the representative behavior extraction and interaction investigation methods, which demonstrates the universality of our design. The extensive experimental results over various scenarios reveal that AMER could outperform competitive baselines with elaborate feature engineering and architecture engineering, indicating both effectiveness and robustness of the proposed method." + }, + { + "title": "Benchmarking Deep Learning Models and Automated Model Design for COVID-19 Detection with Chest CT Scans", + "abstract": "COVID-19 pandemic has spread all over the world for months. As its transmissibility and high pathogenicity seriously threaten people's lives, the accurate and fast detection of the COVID-19 infection is crucial. 
Although many recent studies have shown that deep learning based solutions can help detect COVID-19 based on chest CT scans, there lacks a consistent and systematic comparison and evaluation on these techniques. In this paper, we first build a clean and segmented CT dataset called Clean-CC-CCII by fixing the errors and removing some noises in a large CT scan dataset CC-CCII with three classes: novel coronavirus pneumonia (NCP), common pneumonia (CP), and normal controls (Normal). After cleaning, our dataset consists of a total of 340,190 slices of 3,993 scans from 2,698 patients. Then we benchmark and compare the performance of a series of state-of-the-art (SOTA) 3D and 2D convolutional neural networks (CNNs). The results show that 3D CNNs outperform 2D CNNs in general. With extensive effort of hyperparameter tuning, we find that the 3D CNN model DenseNet3D121 achieves the highest accuracy of 88.63% (F1-score is 88.14% and AUC is 0.940), and another 3D CNN model ResNet3D34 achieves the best AUC of 0.959 (accuracy is 87.83% and F1-score is 86.04%). We further demonstrate that the mixup data augmentation technique can largely improve the model performance. At last, we design an automated deep learning methodology to generate a lightweight deep learning model MNas3DNet41 that achieves an accuracy of 87.14%, F1-score of 87.25%, and AUC of 0.957, which are on par with the best models made by AI experts. The automated deep learning design is a promising methodology that can help health-care professionals develop effective deep learning models using their private data sets. Our Clean-CC-CCII dataset and source code are available at: https://github.com/arthursdays/HKBU_HPML_COVID-19." + }, + { + "title": "Differentiable Neural Input Search for Recommender Systems", + "abstract": "Latent factor models are the driving forces of the state-of-the-art recommender systems, with an important insight of vectorizing raw input features into dense embeddings. 
The dimensions of different feature embeddings are often set to a same value empirically, which limits the predictive performance of latent factor models. Existing works have proposed heuristic or reinforcement learning-based methods to search for mixed feature embedding dimensions. For efficiency concern, these methods typically choose embedding dimensions from a restricted set of candidate dimensions. However, this restriction will hurt the flexibility of dimension selection, leading to suboptimal performance of search results. In this paper, we propose Differentiable Neural Input Search (DNIS), a method that searches for mixed feature embedding dimensions in a more flexible space through continuous relaxation and differentiable optimization. The key idea is to introduce a soft selection layer that controls the significance of each embedding dimension, and optimize this layer according to model's validation performance. DNIS is model-agnostic and thus can be seamlessly incorporated with existing latent factor models for recommendation. We conduct experiments with various architectures of latent factor models on three public real-world datasets for rating prediction, Click-Through-Rate (CTR) prediction, and top-k item recommendation. The results demonstrate that our method achieves the best predictive performance compared with existing neural input search approaches with fewer embedding parameters and less time cost." + }, + { + "title": "Revisiting the Train Loss: an Efficient Performance Estimator for Neural Architecture Search", + "abstract": "Reliable yet efficient evaluation of generalisation performance of a proposed architecture is crucial to the success of neural architecture search (NAS). Traditional approaches face a variety of limitations: training each architecture to completion is prohibitively expensive, early stopping estimates may correlate poorly with fully trained performance, and model-based estimators require large training sets. 
Instead, motivated by recent results linking training speed and generalisation with stochastic gradient descent, we propose to estimate the final test performance based on the sum of training losses. Our estimator is inspired by the marginal likelihood, which is used for Bayesian model selection. Our model-free estimator is simple, efficient, and cheap to implement, and does not require hyperparameter-tuning or surrogate training before deployment. We demonstrate empirically that our estimator consistently outperforms other baselines and can achieve a rank correlation of 0.95 with final test accuracy on the NAS-Bench201 dataset within 50 epochs." + }, + { + "title": "AutoHAS: Efficient Hyperparameter and Architecture Search.", + "abstract": "Deep learning models often require extensive efforts in optimizing hyperparameters and architectures. Standard hyperparameter optimization methods are expensive because of their multi-trial nature: different configurations are tried separately to find the best. In this paper, we propose AutoHAS, an efficient framework for both hyperparameter and architecture search. AutoHAS generalizes the concept of efficient architecture search, ENAS and DARTS, to hyperparameter search and hence can jointly optimize both in a single training. A key challenge in such generalization is that ENAS and DARTS are designed to optimize discrete architecture choices, whereas hyperparameter choices are often continuous. To tackle this challenge, we discretize the continuous space into a linear combination of multiple categorical basis. Furthermore, we extend the idea of weight sharing and augment it with REINFORCE to reduce its memory cost. In order to decouple the shared network weights and controller optimization, we also propose to create temporary weights for evaluating the sampled hyperparameters and updating the controller. 
Experimental results show AutoHAS can improve the ImageNet accuracy by up to 0.8% for highly-optimized state-of-the-art ResNet/EfficientNet models, and up to 11% for less-optimized models. Compared to random search and Bayesian search, AutoHAS consistently achieves better accuracy with 10x less computation cost." + }, + { + "title": "AutoHAS: Differentiable Hyper-parameter and Architecture Search", + "abstract": "Neural Architecture Search (NAS) has achieved significant progress in pushing state-of-the-art performance. While previous NAS methods search for different network architectures with the same hyper-parameters, we argue that such search would lead to sub-optimal results. We empirically observe that different architectures tend to favor their own hyper-parameters. In this work, we extend NAS to a broader and more practical space by combining hyper-parameter and architecture search. As architecture choices are often categorical whereas hyper-parameter choices are often continuous, a critical challenge here is how to handle these two types of values in a joint search space. To tackle this challenge, we propose AutoHAS, a differentiable hyper-parameter and architecture search approach, with the idea of discretizing the continuous space into a linear combination of multiple categorical basis. A key element of AutoHAS is the use of weight sharing across all architectures and hyper-parameters which enables efficient search over the large joint search space. Experimental results on MobileNet/ResNet/EfficientNet/BERT show that AutoHAS significantly improves accuracy up to 2% on ImageNet and F1 score up to 0.4 on SQuAD 1.1, with search cost comparable to training a single model. Compared to other AutoML methods, such as random search or Bayesian methods, AutoHAS can achieve better accuracy with 10x less compute cost." 
+ }, + { + "title": "FBNetV3: Joint Architecture-Recipe Search using Neural Acquisition Function", + "abstract": "Neural Architecture Search (NAS) yields state-of-the-art neural networks that outperform their best manually-designed counterparts. However, previous NAS methods search for architectures under one training recipe (i.e., training hyperparameters), ignoring the significance of training recipes and overlooking superior architectures under other training recipes. Thus, they fail to find higher-accuracy architecture-recipe combinations. To address this oversight, we present JointNAS to search both (a) architectures and (b) their corresponding training recipes. To accomplish this, we introduce a neural acquisition function that scores architectures and training recipes jointly. Following pre-training on a proxy dataset, this acquisition function guides both coarse-grained and fine-grained searches to produce FBNetV3. FBNetV3 is a family of state-of-the-art compact ImageNet models, outperforming both automatically and manually-designed architectures. For example, FBNetV3 matches both EfficientNet and ResNeSt accuracy with 1.4x and 5.0x fewer FLOPs, respectively. Furthermore, the JointNAS-searched training recipe yields significant performance gains across different networks and tasks." + }, + { + "title": "A Comprehensive Survey of Neural Architecture Search", + "abstract": "Deep learning has made substantial breakthroughs in many fields due to its powerful automatic representation capabilities. It has been proven that neural architecture design is crucial to the feature representation of data and the final performance. However, the design of the neural architecture heavily relies on the researchers’ prior knowledge and experience. And due to the limitations of humans’ inherent knowledge, it is difficult for people to jump out of their original thinking paradigm and design an optimal model. 
Therefore, an intuitive idea would be to reduce human intervention as much as possible and let the algorithm automatically design the neural architecture. Neural Architecture Search (NAS) is just such a revolutionary algorithm, and the related research work is complicated and rich. Therefore, a comprehensive and systematic survey on the NAS is essential. Previously related surveys have begun to classify existing work mainly based on the key components of NAS: search space, search strategy, and evaluation strategy. While this classification method is more intuitive, it is difficult for readers to grasp the challenges and the landmark work involved. Therefore, in this survey, we provide a new perspective: beginning with an overview of the characteristics of the earliest NAS algorithms, summarizing the problems in these early NAS algorithms, and then providing solutions for subsequent related research work. In addition, we conduct a detailed and comprehensive analysis, comparison, and summary of these works. Finally, we provide some possible future research directions." + }, + { + "title": "Searching Better Architectures for Neural Machine Translation", + "abstract": "Neural architecture search (NAS) has played important roles in the evolution of neural architectures. However, no much attention has been paid to improve neural machine translation (NMT) through NAS approaches. In this work, we propose a gradient-based NAS algorithm for NMT, which automatically discovers architectures with better performances. Compared with previous NAS work, we jointly search the network operations (e.g., LSTM, CNN, self-attention etc) as well as dropout rates to ensure better results. We show that with reasonable resources it is possible to discover novel neural network architectures for NMT, which achieve consistently better performances than Transformer [1], the state-of-the-art NMT model, across different tasks. 
On WMT’14 English-to-German translation, IWSLT’14 German-to-English translation and WMT’18 Finnish-to-English translation tasks, our discovered architectures could obtain 30.1, 36.1 and 26.4 BLEU scores, which are great improvement over Transformer baselines. We also empirically verify that the discovered model on one task can be transferred to other tasks." + }, + { + "title": "FedNAS: Federated Deep Learning via Neural Architecture Search", + "abstract": "Federated Learning (FL) has been proved to be an effective learning framework when data cannot be centralized due to privacy, communication costs, and regulatory restrictions. When training deep learning models under an FL setting, people employ the predefined model architecture discovered in the centralized environment. However, this predefined architecture may not be the optimal choice because it may not fit data with non-identical and independent distribution (non-IID). Thus, we advocate automating federated learning (AutoFL) to improve model accuracy and reduce the manual design effort. We specifically study AutoFL via Neural Architecture Search (NAS), which can automate the design process. We propose a Federated NAS (FedNAS) algorithm to help scattered workers collaboratively searching for a better architecture with higher accuracy. We also build a system based on FedNAS. Our experiments on non-IID dataset show that the architecture searched by FedNAS can outperform the manually predefined architecture." + }, + { + "title": "Surrogate-Assisted Evolutionary Deep Learning Using an End-to-End Random Forest-Based Performance Predictor", + "abstract": "Convolutional neural networks (CNNs) have shown remarkable performance in various real-world applications. Unfortunately, the promising performance of CNNs can be achieved only when their architectures are optimally constructed. 
The architectures of state-of-the-art CNNs are typically handcrafted with extensive expertise in both CNNs and the investigated data, which consequently hampers the widespread adoption of CNNs for less experienced users. Evolutionary deep learning (EDL) is able to automatically design the best CNN architectures without much expertise. However, the existing EDL algorithms generally evaluate the fitness of a new architecture by training from scratch, resulting in the prohibitive computational cost even operated on high-performance computers. In this paper, an end-to-end offline performance predictor based on the random forest is proposed to accelerate the fitness evaluation in EDL. The proposed performance predictor shows the promising performance in term of the classification accuracy and the consumed computational resources when compared with 18 state-of-the-art peer competitors by integrating into an existing EDL algorithm as a case study. The proposed performance predictor is also compared with the other two representatives of existing performance predictors. The experimental results show the proposed performance predictor not only significantly speeds up the fitness evaluations but also achieves the best prediction among the peer performance predictors." + }, + { + "title": "UniformAugment: A Search-free Probabilistic Data Augmentation Approach", + "abstract": "Augmenting training datasets has been shown to improve the learning effectiveness for several computer vision tasks. A good augmentation produces an augmented dataset that adds variability while retaining the statistical properties of the original dataset. Some techniques, such as AutoAugment and Fast AutoAugment, have introduced a search phase to find a set of suitable augmentation policies for a given model and dataset. This comes at the cost of great computational overhead, adding up to several thousand GPU hours. 
More recently RandAugment was proposed to substantially speedup the search phase by approximating the search space by a couple of hyperparameters, but still incurring non-negligible cost for tuning those. In this paper we show that, under the assumption that the augmentation space is approximately distribution invariant, a uniform sampling over the continuous space of augmentation transformations is sufficient to train highly effective models. Based on that result we propose UniformAugment, an automated data augmentation approach that completely avoids a search phase. In addition to discussing the theoretical underpinning supporting our approach, we also use the standard datasets, as well as established models for image classification, to show that UniformAugment's effectiveness is comparable to the aforementioned methods, while still being highly efficient by virtue of not requiring any search." + }, + { + "title": "MiLeNAS: Efficient Neural Architecture Search via Mixed-Level Reformulation", + "abstract": "Many recently proposed methods for Neural Architecture Search (NAS) can be formulated as bilevel optimization. For efficient implementation, its solution requires approximations of second-order methods. In this paper, we demonstrate that gradient errors caused by such approximations lead to suboptimality, in the sense that the optimization procedure fails to converge to a (locally) optimal solution. To remedy this, this paper proposes MiLeNAS, a mixed-level reformulation for NAS that can be optimized efficiently and reliably. It is shown that even when using a simple first-order method on the mixed-level formulation, MiLeNAS can achieve a lower validation error for NAS problems. Consequently, architectures obtained by our method achieve consistently higher accuracies than those obtained from bilevel optimization. Moreover, MiLeNAS proposes a framework beyond DARTS. 
It is upgraded via model size-based search and early stopping strategies to complete the search process in around 5 hours. Extensive experiments within the convolutional architecture search space validate the effectiveness of our approach." + }, + { + "title": "Are Labels Necessary for Neural Architecture Search?", + "abstract": null + }, + { + "title": "GreedyNAS: Towards Fast One-Shot NAS With Greedy Supernet", + "abstract": "Training a supernet matters for one-shot neural architecture search (NAS) methods since it serves as a basic performance estimator for different architectures (paths). Current methods mainly hold the assumption that a supernet should give a reasonable ranking over all paths. They thus treat all paths equally, and spare much effort to train paths. However, it is harsh for a single supernet to evaluate accurately on such a huge-scale search space (e.g., 7^21). In this paper, instead of covering all paths, we ease the burden of supernet by encouraging it to focus more on evaluation of those potentially-good ones, which are identified using a surrogate portion of validation data. Concretely, during training, we propose a multi-path sampling strategy with rejection, and greedily filter the weak paths. The training efficiency is thus boosted since the training space has been greedily shrunk from all paths to those potentially-good ones. Moreover, we further adopt an exploration and exploitation policy by introducing an empirical candidate path pool. Our proposed method GreedyNAS is easy-to-follow, and experimental results on ImageNet dataset indicate that it can achieve better Top-1 accuracy under same search space and FLOPs or latency level, but with only ~60% of supernet training cost. By searching on a larger space, our GreedyNAS can also obtain new state-of-the-art architectures." 
+ }, + { + "title": "BigNAS: Scaling Up Neural Architecture Search with Big Single-Stage Models", + "abstract": null + }, + { + "title": "Hyper-Parameter Optimization: A Review of Algorithms and Applications", + "abstract": "Since deep neural networks were developed, they have made huge contributions to everyday lives. Machine learning provides more rational advice than humans are capable of in almost every aspect of daily life. However, despite this achievement, the design and training of neural networks are still challenging and unpredictable procedures. To lower the technical thresholds for common users, automated hyper-parameter optimization (HPO) has become a popular topic in both academic and industrial areas. This paper provides a review of the most essential topics on HPO. The first section introduces the key hyper-parameters related to model training and structure, and discusses their importance and methods to define the value range. Then, the research focuses on major optimization algorithms and their applicability, covering their efficiency and accuracy especially for deep learning networks. This study next reviews major services and toolkits for HPO, comparing their support for state-of-the-art searching algorithms, feasibility with major deep learning frameworks, and extensibility for new modules designed by users. The paper concludes with problems that exist when HPO is applied to deep learning, a comparison between optimization algorithms, and prominent approaches for model evaluation with limited computational resources." + }, + { + "title": "Hierarchical Neural Architecture Search for Single Image Super-Resolution", + "abstract": "Deep neural networks have exhibited promising performance in image super-resolution (SR). Most SR models follow a hierarchical architecture that contains both the cell-level design of computational blocks and the network-level design of the positions of upsampling blocks. 
However, designing SR models heavily relies on human expertise and is very labor-intensive. More critically, these SR models often contain a huge number of parameters and may not meet the requirements of computation resources in real-world applications. To address the above issues, we propose a Hierarchical Neural Architecture Search (HNAS) method to automatically design promising architectures with different requirements of computation cost. To this end, we design a hierarchical SR search space and propose a hierarchical controller for architecture search. Such a hierarchical controller is able to simultaneously find promising cell-level blocks and network-level positions of upsampling layers. Moreover, to design compact architectures with promising performance, we build a joint reward by considering both the performance and computation cost to guide the search process. Extensive experiments on five benchmark datasets demonstrate the superiority of our method over existing methods." + }, + { + "title": "DADA: Differentiable Automatic Data Augmentation", + "abstract": null + }, + { + "title": "AutoML-Zero: Evolving Machine Learning Algorithms From Scratch", + "abstract": "Machine learning research has advanced in multiple aspects, including model structures and learning methods. The effort to automate such research, known as AutoML, has also made significant progress. However, this progress has largely focused on the architecture of neural networks, where it has relied on sophisticated expert-designed layers as building blocks---or similarly restrictive search spaces. Our goal is to show that AutoML can go further: it is possible today to automatically discover complete machine learning algorithms just using basic mathematical operations as building blocks. We demonstrate this by introducing a novel framework that significantly reduces human bias through a generic search space. 
Despite the vastness of this space, evolutionary search can still discover two-layer neural networks trained by backpropagation. These simple neural networks can then be surpassed by evolving directly on tasks of interest, e.g. CIFAR-10 variants, where modern techniques emerge in the top algorithms, such as bilinear interactions, normalized gradients, and weight averaging. Moreover, evolution adapts algorithms to different task types: e.g., dropout-like techniques appear when little data is available. We believe these preliminary successes in discovering machine learning algorithms from scratch indicate a promising new direction for the field." + }, + { + "title": "Real-Time Federated Evolutionary Neural Architecture Search", + "abstract": "Federated learning is a distributed machine learning approach to privacy preservation and two major technical challenges prevent a wider application of federated learning. One is that federated learning raises high demands on communication resources, since a large number of model parameters must be transmitted between the server and clients. The other challenge is that training large machine learning models such as deep neural networks in federated learning requires a large amount of computational resources, which may be unrealistic for edge devices such as mobile phones. The problem becomes worse when deep neural architecture search (NAS) is to be carried out in federated learning. To address the above challenges, we propose an evolutionary approach to real-time federated NAS that not only optimizes the model performance but also reduces the local payload. During the search, a double-sampling technique is introduced, in which for each individual, only a randomly sampled submodel is transmitted to a number of randomly sampled clients for training. This way, we effectively reduce computational and communication costs required for evolutionary optimization, making the proposed framework well suitable for real-time federated NAS." 
+ }, + { + "title": "AutoEmb: Automated Embedding Dimensionality Search in Streaming Recommendations", + "abstract": "Deep learning based recommender systems (DLRSs) often have embedding layers, which are utilized to lessen the dimensionality of categorical variables (e.g. user/item identifiers) and meaningfully transform them in the low-dimensional space. The majority of existing DLRSs empirically pre-define a fixed and unified dimension for all user/item embeddings. It is evident from recent researches that different embedding sizes are highly desired for different users/items according to their popularity. However, manually selecting embedding sizes in recommender systems can be very challenging due to the large number of users/items and the dynamic nature of their popularity. Thus, in this paper, we propose an AutoML based end-to-end framework (AutoEmb), which can enable various embedding dimensions according to the popularity in an automated and dynamic manner. To be specific, we first enhance a typical DLRS to allow various embedding dimensions; then we propose an end-to-end differentiable framework that can automatically select different embedding dimensions according to user/item popularity; finally we propose an AutoML based optimization algorithm in a streaming recommendation setting. The experimental results based on widely used benchmark datasets demonstrate the effectiveness of the AutoEmb framework." + }, + { + "title": "Semi-Supervised Neural Architecture Search", + "abstract": "Neural architecture search (NAS) relies on a good controller to generate better architectures or predict the accuracy of given architectures. However, training the controller requires both abundant and high-quality pairs of architectures and their accuracy, while it is costly to evaluate an architecture and obtain its accuracy. 
In this paper, we propose SemiNAS, a semi-supervised NAS approach that leverages numerous unlabeled architectures (without evaluation and thus nearly no cost). Specifically, SemiNAS 1) trains an initial accuracy predictor with a small set of architecture-accuracy data pairs; 2) uses the trained accuracy predictor to predict the accuracy of large amount of architectures (without evaluation); and 3) adds the generated data pairs to the original data to further improve the predictor. The trained accuracy predictor can be applied to various NAS algorithms by predicting the accuracy of candidate architectures for them. SemiNAS has two advantages: 1) It reduces the computational cost under the same accuracy guarantee. On NASBench-101 benchmark dataset, it achieves comparable accuracy with gradient-based method while using only 1/7 architecture-accuracy pairs. 2) It achieves higher accuracy under the same computational cost. It achieves 94.02% test accuracy on NASBench-101, outperforming all the baselines when using the same number of architectures. On ImageNet, it achieves 23.5% top-1 error rate (under 600M FLOPS constraint) using 4 GPU-days for search. We further apply it to LJSpeech text to speech task and it achieves 97% intelligibility rate in the low-resource setting and 15% test error rate in the robustness setting, with 9%, 7% improvements over the baseline respectively." + }, + { + "title": "DSNAS: Direct Neural Architecture Search Without Parameter Retraining", + "abstract": "If NAS methods are solutions, what is the problem? Most existing NAS methods require two-stage parameter optimization. However, performance of the same architecture in the two stages correlates poorly. In this work, we propose a new problem definition for NAS, task-specific end-to-end, based on this observation. 
We argue that given a computer vision task for which a NAS method is expected, this definition can reduce the vaguely-defined NAS evaluation to i) accuracy of this task and ii) the total computation consumed to finally obtain a model with satisfying accuracy. Seeing that most existing methods do not solve this problem directly, we propose DSNAS, an efficient differentiable NAS framework that simultaneously optimizes architecture and parameters with a low-biased Monte Carlo estimate. Child networks derived from DSNAS can be deployed directly without parameter retraining. Comparing with two-stage methods, DSNAS successfully discovers networks with comparable accuracy (74.4\\%) on ImageNet in 420 GPU hours, reducing the total time by more than 34\\%." + }, + { + "title": "On Robustness of Neural Architecture Search Under Label Noise", + "abstract": "Neural architecture search (NAS), which aims at automatically seeking proper neural architectures given a specific task, has attracted extensive attention recently in supervised learning applications. In most real-world situations, the class labels provided in the training data would be noisy due to many reasons, such as subjective judgments, inadequate information, and random human errors. Existing work has demonstrated the adverse effects of label noise on the learning of weights of neural networks. These effects could become more critical in NAS since the architectures are not only trained with noisy labels but are also compared based on their performances on noisy validation sets. In this paper, we systematically explore the robustness of NAS under label noise. We show that label noise in the training and/or validation data can lead to various degrees of performance variations. Through empirical experiments, using robust loss functions can mitigate the performance degradation under symmetric label noise as well as under a simple model of class conditional label noise. 
We also provide a theoretical justification for this. Both empirical and theoretical results provide a strong argument in favor of employing the robust loss function in NAS under high-level noise." + }, + { + "title": "A Style-Based Generator Architecture for Generative Adversarial Networks", + "abstract": "We propose an alternative generator architecture for generative adversarial networks, borrowing from style transfer literature. The new architecture leads to an automatically learned, unsupervised separation of high-level attributes (e.g., pose and identity when trained on human faces) and stochastic variation in the generated images (e.g., freckles, hair), and it enables intuitive, scale-specific control of the synthesis. The new generator improves the state-of-the-art in terms of traditional distribution quality metrics, leads to demonstrably better interpolation properties, and also better disentangles the latent factors of variation. To quantify interpolation quality and disentanglement, we propose two new, automated methods that are applicable to any generator architecture. Finally, we introduce a new, highly varied and high-quality dataset of human faces." + }, + { + "title": "Bayesian Neural Architecture Search using A Training-Free Performance Metric", + "abstract": null + }, + { + "title": "Fast Neural Network Adaptation via Parameter Remapping and Architecture Search", + "abstract": "Deep neural networks achieve remarkable performance in many computer vision tasks. Most state-of-the-art~(SOTA) semantic segmentation and object detection approaches reuse neural network architectures designed for image classification as the backbone, commonly pre-trained on ImageNet. However, performance gains can be achieved by designing network architectures specifically for detection and segmentation, as shown by recent neural architecture search (NAS) research for detection and segmentation. 
One major challenge though, is that ImageNet pre-training of the search space representation (a.k.a. super network) or the searched networks incurs huge computational cost. In this paper, we propose a Fast Neural Network Adaptation (FNA) method, which can adapt both the architecture and parameters of a seed network (e.g. a high performing manually designed backbone) to become a network with different depth, width, or kernels via a Parameter Remapping technique, making it possible to utilize NAS for detection/segmentation tasks a lot more efficiently. In our experiments, we conduct FNA on MobileNetV2 to obtain new networks for both segmentation and detection that clearly out-perform existing networks designed both manually and by NAS. The total computation cost of FNA is significantly less than SOTA segmentation/detection NAS approaches: 1737$\\times$ less than DPC, 6.8$\\times$ less than Auto-DeepLab and 7.4$\\times$ less than DetNAS. The code is available at https://github.com/JaminFong/FNA ." + }, + { + "title": "NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search", + "abstract": "Neural architecture search (NAS) has achieved breakthrough success in a great number of applications in the past few years.\nIt could be time to take a step back and analyze the good and bad aspects in the field of NAS. A variety of algorithms search architectures under different search space. These searched architectures are trained using different setups, e.g., hyper-parameters, data augmentation, regularization. This raises a comparability problem when comparing the performance of various NAS algorithms. NAS-Bench-101 has shown success to alleviate this problem. In this work, we propose an extension to NAS-Bench-101: NAS-Bench-201 with a different search space, results on multiple datasets, and more diagnostic information. NAS-Bench-201 has a fixed search space and provides a unified benchmark for almost any up-to-date NAS algorithms. 
The design of our search space is inspired by the one used in the most popular cell-based searching algorithms, where a cell is represented as a directed acyclic graph. Each edge here is associated with an operation selected from a predefined operation set. For it to be applicable for all NAS algorithms, the search space defined in NAS-Bench-201 includes all possible architectures generated by 4 nodes and 5 associated operation options, which results in 15,625 neural cell candidates in total. The training log using the same setup and the performance for each architecture candidate are provided for three datasets. This allows researchers to avoid unnecessary repetitive training for selected architecture and focus solely on the search algorithm itself. The training time saved for every architecture also largely improves the efficiency of most NAS algorithms and presents a more computational cost friendly NAS community for a broader range of researchers. We provide additional diagnostic information such as fine-grained loss and accuracy, which can give inspirations to new designs of NAS algorithms. In further support of the proposed NAS-Bench-201, we have analyzed it from many aspects and benchmarked 10 recent NAS algorithms, which verify its applicability." + }, + { + "title": "NAS evaluation is frustratingly hard", + "abstract": "Neural Architecture Search (NAS) is an exciting new field which promises to be as much as a game-changer as Convolutional Neural Networks were in 2012. Despite many great works leading to substantial improvements on a variety of tasks, comparison between different methods is still very much an open issue. While most algorithms are tested on the same datasets, there is no shared experimental protocol followed by all. As such, and due to the under-use of ablation studies, there is a lack of clarity regarding why certain methods are more effective than others. Our first contribution is a benchmark of 8 NAS methods on 5 datasets. 
To overcome the hurdle of comparing methods with different search spaces, we propose using a method’s relative improvement over the randomly sampled average architecture, which effectively removes advantages arising from expertly engineered search spaces or training protocols. Surprisingly, we find that many NAS techniques struggle to significantly beat the average architecture baseline. We perform further experiments with the commonly used DARTS search space in order to understand the contribution of each component in the NAS pipeline. These experiments highlight that: (i) the use of tricks in the evaluation protocol has a predominant impact on the reported performance of architectures; (ii) the cell-based search space has a very narrow accuracy range, such that the seed has a considerable impact on architecture rankings; (iii) the hand-designed macrostructure (cells) is more important than the searched micro-structure (operations); and (iv) the depth-gap is a real phenomenon, evidenced by the change in rankings between 8 and 20 cell architectures. To conclude, we suggest best practices, that we hope will prove useful for the community and help mitigate current NAS pitfalls, e.g. difficulties in reproducibility and comparison of search methods. We provide the code used for our experiments at link-to-come." + }, + { + "title": "Big Transfer (BiT): General Visual Representation Learning", + "abstract": null + }, + { + "title": "Adversarial AutoAugment", + "abstract": "Data augmentation (DA) has been widely utilized to improve generalization in training deep neural networks. Recently, human-designed data augmentation has been gradually replaced by automatically learned augmentation policy. Through finding the best policy in well-designed search space of data augmentation, AutoAugment can significantly improve validation accuracy on image classification tasks. However, this approach is not computationally practical for large-scale problems. 
In this paper, we develop an adversarial method to arrive at a computationally-affordable solution called Adversarial AutoAugment, which can simultaneously optimize target related object and augmentation policy search loss. The augmentation policy network attempts to increase the training loss of a target network through generating adversarial augmentation policies, while the target network can learn more robust features from harder examples to improve the generalization. In contrast to prior work, we reuse the computation in target network training for policy evaluation, and dispense with the retraining of the target network. Compared to AutoAugment, this leads to about 12x reduction in computing cost and 11x shortening in time overhead on ImageNet. We show experimental results of our approach on CIFAR-10/CIFAR-100, ImageNet, and demonstrate significant performance improvements over state-of-the-art. On CIFAR-10, we achieve a top-1 test error of 1.36%, which is the currently best performing single model. On ImageNet, we achieve a leading performance of top-1 accuracy 79.40% on ResNet-50 and 80.00% on ResNet-50-D without extra data." + }, + { + "title": "AtomNAS: Fine-Grained End-to-End Neural Architecture Search", + "abstract": "Designing of search space is a critical problem for neural architecture search (NAS) algorithms. We propose a fine-grained search space comprised of atomic blocks, a minimal search unit much smaller than the ones used in recent NAS algorithms. This search space facilitates direct selection of channel numbers and kernel sizes in convolutions. In addition, we propose a resource-aware architecture search algorithm which dynamically selects atomic blocks during training. The algorithm is further accelerated by a dynamic network shrinkage technique. Instead of a search-and-retrain two-stage paradigm, our method can simultaneously search and train the target architecture in an end-to-end manner. 
Our method achieves state-of-the-art performance under several FLOPS configurations on ImageNet with a negligible searching cost. We open our entire codebase at: https://github.com/meijieru/AtomNAS." + }, + { + "title": "IRS: A Large Synthetic Indoor Robotics Stereo Dataset for Disparity and Surface Normal Estimation", + "abstract": "Indoor robotics localization, navigation and interaction heavily rely on scene understanding and reconstruction. Compared to monocular vision which usually does not explicitly introduce any geometrical constraint, stereo vision based schemes are more promising and robust to produce accurate geometrical information, such as surface normal and depth/disparity. Besides, deep learning models trained with large-scale datasets have shown their superior performance in many stereo vision tasks. However, existing stereo datasets rarely contain the high-quality surface normal and disparity ground truth, which hardly satisfy the demand of training a prospective deep model for indoor scenes. To this end, we introduce a large-scale synthetic indoor robotics stereo (IRS) dataset with over 100K stereo RGB images and high-quality surface normal and disparity maps. Leveraging the advanced rendering techniques of our customized rendering engine, the dataset is considerably close to the real-world captured images and covers several visual effects, such as brightness changes, light reflection/transmission, lens flare, vivid shadow, etc. We compare the data distribution of IRS with existing stereo datasets to illustrate the typical visual attributes of indoor scenes. In addition, we present a new deep model DispNormNet to simultaneously infer surface normal and disparity from stereo images. Compared to existing models trained on other datasets, DispNormNet trained with IRS produces much better estimation of surface normal and disparity for indoor scenes." 
+ }, + { + "title": "PyTorch: An Imperative Style, High-Performance Deep Learning Library", + "abstract": "Deep learning frameworks have often focused on either usability or speed, but not both. PyTorch is a machine learning library that shows that these two goals are in fact compatible: it was designed from first principles to support an imperative and Pythonic programming style that supports code as a model, makes debugging easy and is consistent with other popular scientific computing libraries, while remaining efficient and supporting hardware accelerators such as GPUs. In this paper, we detail the principles that drove the implementation of PyTorch and how they are reflected in its architecture. We emphasize that every aspect of PyTorch is a regular Python program under the full control of its user. We also explain how the careful and pragmatic implementation of the key components of its runtime enables them to work together to achieve compelling performance. We demonstrate the efficiency of individual subsystems, as well as the overall speed of PyTorch on several commonly used benchmarks." + }, + { + "title": "EfficientDet: Scalable and Efficient Object Detection", + "abstract": "Model efficiency has become increasingly important in computer vision. In this paper, we systematically study neural network architecture design choices for object detection and propose several key optimizations to improve efficiency. First, we propose a weighted bi-directional feature pyramid network (BiFPN), which allows easy and fast multi-scale feature fusion; Second, we propose a compound scaling method that uniformly scales the resolution, depth, and width for all backbone, feature network, and box/class prediction networks at the same time. 
Based on these optimizations and EfficientNet backbones, we have developed a new family of object detectors, called EfficientDet, which consistently achieve much better efficiency than prior art across a wide spectrum of resource constraints. In particular, with single-model and single-scale, our EfficientDetD7 achieves state-of-the-art 52.2 AP on COCO test-dev with 52M parameters and 325B FLOPs, being 4x – 9x smaller and using 13x – 42x fewer FLOPs than previous detector." + }, + { + "title": "Faster AutoAugment: Learning Augmentation Strategies using Backpropagation", + "abstract": null + }, + { + "title": "Searching Learning Strategy with Reinforcement Learning for 3D Medical Image Segmentation", + "abstract": null + }, + { + "title": "Neural Architecture Search for Adversarial Medical Image Segmentation", + "abstract": null + }, + { + "title": "BANANAS: Bayesian Optimization with Neural Architectures for Neural Architecture Search", + "abstract": "Over the past half-decade, many methods have been considered for neural architecture search (NAS). Bayesian optimization (BO), which has long had success in hyperparameter optimization, has recently emerged as a very promising strategy for NAS when it is coupled with a neural predictor. Recent work has proposed different instantiations of this framework, for example, using Bayesian neural networks or graph convolutional networks as the predictive model within BO. However, the analyses in these papers often focus on the full-fledged NAS algorithm, so it is difficult to tell which individual components of the framework lead to the best performance.\n\nIn this work, we give a thorough analysis of the \"BO + neural predictor framework\" by identifying five main components: the architecture encoding, neural predictor, uncertainty calibration method, acquisition function, and acquisition function optimization. 
We test several different methods for each component and also develop a novel path-based encoding scheme for neural architectures, which we show theoretically and empirically scales better than other encodings. Using all of our analyses, we develop a final algorithm called BANANAS, which achieves state-of-the-art performance on NAS search spaces. We adhere to the NAS research checklist (Lindauer and Hutter 2019) to facilitate best practices, and our code is available at https://github.com/naszilla/naszilla." + }, + { + "title": "Auto-FPN: Automatic Network Architecture Adaptation for Object Detection Beyond Classification", + "abstract": "Abstract Neural architecture search (NAS) has shown great potential in automating the manual process of designing a good CNN architecture for image classification. In this paper, we study NAS for object detection, a core computer vision task that classifies and localizes object instances in an image. Existing works focus on transferring the searched architecture from classification task (ImageNet) to the detector backbone, while the rest of the architecture of the detector remains unchanged. However, this pipeline is not task-specific or data-oriented network search which cannot guarantee optimal adaptation to any dataset. Therefore, we propose an architecture search framework named Auto-FPN specifically designed for detection beyond simply searching a classification backbone. Specifically, we propose two auto search modules for detection: Auto-fusion to search a better fusion of the multi-level features; Auto-head to search a better structure for classification and bounding-box(bbox) regression. Instead of searching for one repeatable cell structure, we relax the constraint and allow different cells. The search space of both modules covers many popular designs of detectors and allows efficient gradient-based architecture search with resource constraint (2 days for COCO on 8 GPU cards). 
Extensive experiments on Pascal VOC, COCO, BDD, VisualGenome and ADE demonstrate the effectiveness of the proposed method, e.g. achieving around 5% improvement than FPN in terms of mAP while requiring around 50% fewer parameters on the searched modules." + }, + { + "title": "Towards modular and programmable architecture search", + "abstract": "Neural architecture search methods are able to find high performance deep learning architectures with minimal effort from an expert. However, current systems focus on specific use-cases (e.g. convolutional image classifiers and recurrent language models), making them unsuitable for general use-cases that an expert might wish to write. Hyperparameter optimization systems are general-purpose but lack the constructs needed for easy application to architecture search. In this work, we propose a formal language for encoding search spaces over general computational graphs. The language constructs allow us to write modular, composable, and reusable search space encodings and to reason about search space design. We use our language to encode search spaces from the architecture search literature. The language allows us to decouple the implementations of the search space and the search algorithm, allowing us to expose search spaces to search algorithms through a consistent interface. Our experiments show the ease with which we can experiment with different combinations of search spaces and search algorithms without having to implement each combination from scratch. We release an implementation of our language with this paper." + }, + { + "title": "Gradient Descent: The Ultimate Optimizer", + "abstract": "Working with any gradient-based machine learning algorithm involves the tedious task of tuning the optimizer's hyperparameters, such as the learning rate. There exist many techniques for automated hyperparameter optimization, but they typically introduce even more hyperparameters to control the hyperparameter optimization process. 
We propose to instead learn the hyperparameters themselves by gradient descent, and furthermore to learn the hyper-hyperparameters by gradient descent as well, and so on ad infinitum. As these towers of gradient-based optimizers grow, they become significantly less sensitive to the choice of top-level hyperparameters, hence decreasing the burden on the user to search for optimal values." + }, + { + "title": "Automatically Learning Data Augmentation Policies for Dialogue Tasks", + "abstract": "Automatic data augmentation (AutoAugment) (Cubuk et al., 2019) searches for optimal perturbation policies via a controller trained using performance rewards of a sampled policy on the target task, hence reducing data-level model bias. While being a powerful algorithm, their work has focused on computer vision tasks, where it is comparatively easy to apply imperceptible perturbations without changing an image’s semantic meaning. In our work, we adapt AutoAugment to automatically discover effective perturbation policies for natural language processing (NLP) tasks such as dialogue generation. We start with a pool of atomic operations that apply subtle semantic-preserving perturbations to the source inputs of a dialogue task (e.g., different POS-tag types of stopword dropout, grammatical errors, and paraphrasing). Next, we allow the controller to learn more complex augmentation policies by searching over the space of the various combinations of these atomic operations. Moreover, we also explore conditioning the controller on the source inputs of the target task, since certain strategies may not apply to inputs that do not contain that strategy’s required linguistic features. Empirically, we demonstrate that both our input-agnostic and input-aware controllers discover useful data augmentation policies, and achieve significant improvements over the previous state-of-the-art, including trained on manually-designed policies." 
+ }, + { + "title": "Understanding and Robustifying Differentiable Architecture Search", + "abstract": "Differentiable Architecture Search (DARTS) has attracted a lot of attention due to its simplicity and small search costs achieved by a continuous relaxation and an approximation of the resulting bi-level optimization problem. However, DARTS does not work robustly for new problems: we identify a wide range of search spaces for which DARTS yields degenerate architectures with very poor test performance. We study this failure mode and show that, while DARTS successfully minimizes validation loss, the found solutions generalize poorly when they coincide with high validation loss curvature in the architecture space. We show that by adding one of various types of regularization we can robustify DARTS to find solutions with less curvature and better generalization properties. Based on these observations, we propose several simple variations of DARTS that perform substantially more robustly in practice. Our observations are robust across five search spaces on three image classification tasks and also hold for the very different domains of disparity estimation (a dense regression task) and language modelling." + }, + { + "title": "Memory-Efficient Hierarchical Neural Architecture Search for Image Denoising", + "abstract": "Recently, neural architecture search (NAS) methods have attracted much attention and outperformed manually designed architectures on a few high-level vision tasks. In this paper, we propose HiNAS (Hierarchical NAS), an effort towards employing NAS to automatically design effective neural network architectures for image denoising. HiNAS adopts gradient based search strategies and employs operations with adaptive receptive field to build a flexible hierarchical search space. 
During the search stage, HiNAS shares cells across different feature levels to save memory and employ an early stopping strategy to avoid the collapse issue in NAS, and considerably accelerate the search speed. The proposed HiNAS is both memory and computation efficient, which takes only about 4.5 hours for searching using a single GPU. We evaluate the effectiveness of our proposed HiNAS on two different datasets, namely an additive white Gaussian noise dataset BSD500, and a realistic noise dataset SIM1800. Experimental results show that the architecture found by HiNAS has fewer parameters and enjoys a faster inference speed, while achieving highly competitive performance compared with state-of-the-art methods. We also present analysis on the architectures found by NAS. HiNAS also shows good performance on experiments for image de-raining." + }, + { + "title": "IR-NAS: Neural Architecture Search for Image Restoration", + "abstract": "Recently, neural architecture search (NAS) methods have attracted much attention and outperformed manually designed architectures on a few high-level vision tasks. In this paper, we propose IR-NAS, an effort towards employing NAS to automatically design effective neural network architectures for low-level image restoration tasks, and apply to two such tasks: image denoising and image de-raining. IR-NAS adopts a flexible hierarchical search space, including inner cell structures and outer layer widths. The proposed IR-NAS is both memory and computationally efficient, which takes only 6 hours for searching using a single GPU and saves memory by sharing cell weights across different feature levels. We evaluate the effectiveness of our proposed IR-NAS on three different datasets, including an additive white Gaussian noise dataset BSD500, a realistic noise dataset SIM1800 and a challenging de-raining dataset Rain800. 
Results show that the architectures found by IR-NAS have fewer parameters and enjoy a faster inference speed, while achieving highly competitive performance compared with state-of-the-art methods. We also present analysis on the architectures found by NAS." + }, + { + "title": "DARTS+: Improved Differentiable Architecture Search with Early Stopping", + "abstract": "Recently, there has been a growing interest in automating the process of neural architecture design, and the Differentiable Architecture Search (DARTS) method makes the process available within a few GPU days. However, the performance of DARTS is often observed to collapse when the number of search epochs becomes large. Meanwhile, lots of \"{\\em skip-connect}s\" are found in the selected architectures. In this paper, we claim that the cause of the collapse is that there exists overfitting in the optimization of DARTS. Therefore, we propose a simple and effective algorithm, named \"DARTS+\", to avoid the collapse and improve the original DARTS, by \"early stopping\" the search procedure when meeting a certain criterion. We also conduct comprehensive experiments on benchmark datasets and different search spaces and show the effectiveness of our DARTS+ algorithm, and DARTS+ achieves $2.32\\%$ test error on CIFAR10, $14.87\\%$ on CIFAR100, and $23.7\\%$ on ImageNet. We further remark that the idea of \"early stopping\" is implicitly included in some existing DARTS variants by manually setting a small number of search epochs, while we give an {\\em explicit} criterion for \"early stopping\"." + }, + { + "title": "Auto-GNN: Neural architecture search of graph neural networks", + "abstract": "Graph neural networks (GNNs) have been widely used in various graph analysis tasks. As the graph characteristics vary significantly in real-world systems, given a specific scenario, the architecture parameters need to be tuned carefully to identify a suitable GNN. 
Neural architecture search (NAS) has shown its potential in discovering the effective architectures for the learning tasks in image and language modeling. However, the existing NAS algorithms cannot be applied efficiently to GNN search problem because of two facts. First, the large-step exploration in the traditional controller fails to learn the sensitive performance variations with slight architecture modifications in GNNs. Second, the search space is composed of heterogeneous GNNs, which prevents the direct adoption of parameter sharing among them to accelerate the search progress. To tackle the challenges, we propose an automated graph neural networks (AGNN) framework, which aims to find the optimal GNN architecture efficiently. Specifically, a reinforced conservative controller is designed to explore the architecture space with small steps. To accelerate the validation, a novel constrained parameter sharing strategy is presented to regularize the weight transferring among GNNs. It avoids training from scratch and saves the computation time. Experimental results on the benchmark datasets demonstrate that the architecture identified by AGNN achieves the best performance and search efficiency, comparing with existing human-invented models and the traditional search methods." + }, + { + "title": "Resource Optimized Neural Architecture Search for 3D Medical Image Segmentation", + "abstract": null + }, + { + "title": "Automated deep learning design for medical image classification by health-care professionals with no coding experience: a feasibility study.", + "abstract": null + }, + { + "title": "Once for All: Train One Network and Specialize it for Efficient Deployment", + "abstract": "We address the challenging problem of efficient inference across many devices and resource constraints, especially on edge devices. 
Conventional approaches either manually design or use neural architecture search (NAS) to find a specialized neural network and train it from scratch for each case, which is computationally prohibitive (causing $CO_2$ emission as much as 5 cars' lifetime) thus unscalable. In this work, we propose to train a once-for-all (OFA) network that supports diverse architectural settings by decoupling training and search, to reduce the cost. We can quickly get a specialized sub-network by selecting from the OFA network without additional training. To efficiently train OFA networks, we also propose a novel progressive shrinking algorithm, a generalized pruning method that reduces the model size across many more dimensions than pruning (depth, width, kernel size, and resolution). It can obtain a surprisingly large number of sub-networks ($> 10^{19}$) that can fit different hardware platforms and latency constraints while maintaining the same level of accuracy as training independently. On diverse edge devices, OFA consistently outperforms state-of-the-art (SOTA) NAS methods (up to 4.0% ImageNet top1 accuracy improvement over MobileNetV3, or same accuracy but 1.5x faster than MobileNetV3, 2.6x faster than EfficientNet w.r.t measured latency) while reducing many orders of magnitude GPU hours and $CO_2$ emission. In particular, OFA achieves a new SOTA 80.0% ImageNet top-1 accuracy under the mobile setting ($<$600M MACs). OFA is the winning solution for the 3rd Low Power Computer Vision Challenge (LPCVC), DSP classification track and the 4th LPCVC, both classification track and detection track. Code and 50 pre-trained models (for many devices & many latency constraints) are released at this https URL." + }, + { + "title": "AutoGAN: Neural Architecture Search for Generative Adversarial Networks", + "abstract": "Neural architecture search (NAS) has witnessed prevailing success in image classification and (very recently) segmentation tasks. 
In this paper, we present the first preliminary study on introducing the NAS algorithm to generative adversarial networks (GANs), dubbed AutoGAN. The marriage of NAS and GANs faces its unique challenges. We define the search space for the generator architectural variations and use an RNN controller to guide the search, with parameter sharing and dynamic-resetting to accelerate the process. Inception score is adopted as the reward, and a multi-level search strategy is introduced to perform NAS in a progressive way. Experiments validate the effectiveness of AutoGAN on the task of unconditional image generation. Specifically, our discovered architectures achieve highly competitive performance compared to current state-of-the-art hand-crafted GANs, e.g., setting new state-of-the-art FID scores of 12.42 on CIFAR-10, and 31.01 on STL-10, respectively. We also conclude with a discussion of the current limitations and future potential of AutoGAN. The code is available at https://github.com/TAMU-VITA/AutoGAN" + }, + { + "title": "Greedy AutoAugment", + "abstract": null + }, + { + "title": "Deep neural network architecture search using network morphism", + "abstract": "The paper presents the results of the research on neural architecture search (NAS) algorithm. We utilized the hill climbing algorithm to search for well-performing structures of deep convolutional neural network. Moreover, we used the function preserving transformations which enabled the effective operation of the algorithm in a short period of time. The network obtained with the advantage of NAS was validated on skin lesion classification problem. We compared the parameters and performance of the automatically generated neural structure with the architectures selected manually, reported by the authors in previous papers. The obtained structure achieved comparable results to hand-designed networks, but with much fewer parameters then manually crafted architectures." 
+ }, + { + "title": "Multi-Fidelity Automatic Hyper-Parameter Tuning via Transfer Series Expansion", + "abstract": "Automatic machine learning (AutoML) aims at automatically choosing the best configuration for machine learning tasks. However, a configuration evaluation can be very time consuming particularly on learning tasks with large datasets. This limitation usually restrains derivative-free optimization from releasing its full power for a fine configuration search using many evaluations. To alleviate this limitation, in this paper, we propose a derivative-free optimization framework for AutoML using multi-fidelity evaluations. It uses many lowfidelity evaluations on small data subsets and very few highfidelity evaluations on the full dataset. However, the lowfidelity evaluations can be badly biased, and need to be corrected with only a very low cost. We thus propose the Transfer Series Expansion (TSE) that learns the low-fidelity correction predictor efficiently by linearly combining a set of base predictors. The base predictors can be obtained cheaply from down-scaled and experienced tasks. Experimental results on real-world AutoML problems verify that the proposed framework can accelerate derivative-free configuration search significantly by making use of the multi-fidelity evaluations." + }, + { + "title": "PC-DARTS: Partial Channel Connections for Memory-Efficient Architecture Search", + "abstract": "Differentiable architecture search (DARTS) provided a fast solution in finding effective network architectures, but suffered from large memory and computing overheads in jointly training a super-net and searching for an optimal architecture. In this paper, we present a novel approach, namely Partially-Connected DARTS, by sampling a small part of super-net to reduce the redundancy in exploring the network space, thereby performing a more efficient search without comprising the performance. 
In particular, we perform operation search in a subset of channels while bypassing the held out part in a shortcut. This strategy may suffer from an undesired inconsistency on selecting the edges of super-net caused by sampling different channels. We solve it by introducing edge normalization, which adds a new set of edge-level hyper-parameters to reduce uncertainty in search. Thanks to the reduced memory cost, PC-DARTS can be trained with a larger batch size and, consequently, enjoy both faster speed and higher training stability. Experiment results demonstrate the effectiveness of the proposed method. Specifically, we achieve an error rate of 2.57% on CIFAR10 within merely 0.1 GPU-days for architecture search, and a state-of-the-art top-1 error rate of 24.2% on ImageNet (under the mobile setting) within 3.8 GPU-days for search. Our code has been made available at https://www.dropbox.com/sh/on9lg3rpx1r6dkf/AABG5mt0sMHjnEJyoRnLEYW4a?dl=0." + }, + { + "title": "Video Action Recognition Via Neural Architecture Searching", + "abstract": "Deep neural networks have achieved great success for video analysis and understanding. However, designing a high-performance neural architecture requires substantial efforts and expertise. In this paper, we make the first attempt to let algorithm automatically design neural networks for video action recognition tasks. Specifically, a spatio-temporal network is developed in a differentiable space modeled by a directed acyclic graph, thus a gradient-based strategy can be performed to search an optimal architecture. Nonetheless, it is computationally expensive, since the computational burden to evaluate each architecture candidate is still heavy. To alleviate this issue, we, for the video input, introduce a temporal segment approach to reduce the computational cost without losing global video information. For the architecture, we explore in an efficient search space by introducing pseudo 3D operators. 
Experiments show that, our architecture outperforms popular neural architectures, under the training from scratch protocol, on the challenging UCF101 dataset, surprisingly, with only around one percentage of parameters of its manual-design counterparts." + }, + { + "title": "FairNAS: Rethinking Evaluation Fairness of Weight Sharing Neural Architecture Search", + "abstract": "One of the most critical problems in weight-sharing neural architecture search is the evaluation of candidate models within a predefined search space. In practice, a one-shot supernet is trained to serve as an evaluator. A faithful ranking certainly leads to more accurate searching results. However, current methods are prone to making misjudgments. In this paper, we prove that their biased evaluation is due to inherent unfairness in the supernet training. In view of this, we propose two levels of constraints: expectation fairness and strict fairness. Particularly, strict fairness ensures equal optimization opportunities for all choice blocks throughout the training, which neither overestimates nor underestimates their capacity. We demonstrate that this is crucial for improving the confidence of models’ ranking. Incorporating the one-shot supernet trained under the proposed fairness constraints with a multi-objective evolutionary search algorithm, we obtain various state-of-the-art models, e.g., FairNAS-A attains 77.5% top-1 validation accuracy on ImageNet." + }, + { + "title": "Google Dataset Search", + "abstract": null + }, + { + "title": "Evolving Robust Neural Architectures to Defend from Adversarial Attacks", + "abstract": "Neural networks are prone to misclassify slightly modified input images. Recently, many defences have been proposed, but none have improved the robustness of neural networks consistently. Here, we propose to use adversarial attacks as a function evaluation to search for neural architectures that can resist such attacks automatically. 
Experiments on neural architecture search algorithms from the literature show that although accurate, they are not able to find robust architectures. A significant reason for this lies in their limited search space. By creating a novel neural architecture search with options for dense layers to connect with convolution layers and vice-versa as well as the addition of concatenation layers in the search, we were able to evolve an architecture that is inherently accurate on adversarial samples. Interestingly, this inherent robustness of the evolved architecture rivals state-of-the-art defences such as adversarial training while being trained only on the non-adversarial samples. Moreover, the evolved architecture makes use of some peculiar traits which might be useful for developing even more robust ones. Thus, the results here confirm that more robust architectures exist as well as opens up a new realm of feasibilities for the development and exploration of neural networks. \nCode available at this http URL." + }, + { + "title": "Scalable Neural Architecture Search for 3D Medical Image Segmentation", + "abstract": null + }, + { + "title": "Continual and Multi-Task Architecture Search", + "abstract": "Architecture search is the process of automatically learning the neural model or cell structure that best suits the given task. Recently, this approach has shown promising performance improvements (on language modeling and image classification) with reasonable training speed, using a weight sharing strategy called Efficient Neural Architecture Search (ENAS). In our work, we first introduce a novel continual architecture search (CAS) approach, so as to continually evolve the model parameters during the sequential training of several tasks, without losing performance on previously learned tasks (via block-sparsity and orthogonality constraints), thus enabling life-long learning. 
Next, we explore a multi-task architecture search (MAS) approach over ENAS for finding a unified, single cell structure that performs well across multiple tasks (via joint controller rewards), and hence allows more generalizable transfer of the cell structure knowledge to an unseen new task. We empirically show the effectiveness of our sequential continual learning and parallel multi-task learning based architecture search approaches on diverse sentence-pair classification tasks (GLUE) and multimodal-generation based video captioning tasks. Further, we present several ablations and analyses on the learned cell structures." + }, + { + "title": "Automated Machine Learning: State-of-The-Art and Open Challenges", + "abstract": "With the continuous and vast increase in the amount of data in our digital world, it has been acknowledged that the number of knowledgeable data scientists can not scale to address these challenges. Thus, there was a crucial need for automating the process of building good machine learning models. In the last few years, several techniques and frameworks have been introduced to tackle the challenge of automating the process of Combined Algorithm Selection and Hyper-parameter tuning (CASH) in the machine learning domain. The main aim of these techniques is to reduce the role of the human in the loop and fill the gap for non-expert machine learning users by playing the role of the domain expert. \nIn this paper, we present a comprehensive survey for the state-of-the-art efforts in tackling the CASH problem. In addition, we highlight the research work of automating the other steps of the full complex machine learning pipeline (AutoML) from data understanding till model deployment. Furthermore, we provide comprehensive coverage for the various tools and frameworks that have been introduced in this domain. 
Finally, we discuss some of the research directions and open challenges that need to be addressed in order to achieve the vision and goals of the AutoML process." + }, + { + "title": "AutoAugment: Learning Augmentation Strategies From Data", + "abstract": "Data augmentation is an effective technique for improving the accuracy of modern image classifiers. However, current data augmentation implementations are manually designed. In this paper, we describe a simple procedure called AutoAugment to automatically search for improved data augmentation policies. In our implementation, we have designed a search space where a policy consists of many sub-policies, one of which is randomly chosen for each image in each mini-batch. A sub-policy consists of two operations, each operation being an image processing function such as translation, rotation, or shearing, and the probabilities and magnitudes with which the functions are applied. We use a search algorithm to find the best policy such that the neural network yields the highest validation accuracy on a target dataset. Our method achieves state-of-the-art accuracy on CIFAR-10, CIFAR-100, SVHN, and ImageNet (without additional data). On ImageNet, we attain a Top-1 accuracy of 83.5% which is 0.4% better than the previous record of 83.1%. On CIFAR-10, we achieve an error rate of 1.5%, which is 0.6% better than the previous state-of-the-art. Augmentation policies we find are transferable between datasets. The policy learned on ImageNet transfers well to achieve significant improvements on other datasets, such as Oxford Flowers, Caltech-101, Oxford-IIT Pets, FGVC Aircraft, and Stanford Cars." + }, + { + "title": "Searching for a Robust Neural Architecture in Four GPU Hours", + "abstract": "Conventional neural architecture search (NAS) approaches are usually based on reinforcement learning or evolutionary strategy, which take more than 1000 GPU hours to find a good model on CIFAR-10. 
We propose an efficient NAS approach, which learns the searching approach by gradient descent. Our approach represents the search space as a directed acyclic graph (DAG). This DAG contains thousands of sub-graphs, each of which indicates a kind of neural architecture. To avoid traversing all the possibilities of the sub-graphs, we develop a differentiable sampler over the DAG. This sampler is learnable and optimized by the validation loss after training the sampled architecture. In this way, our approach can be trained in an end-to-end fashion by gradient descent, named Gradient-based search using Differentiable Architecture Sampler (GDAS). In experiments, we can finish one searching procedure in four GPU hours on CIFAR-10, and the discovered model obtains a test error of 2.82% with only 2.5M parameters, which is on par with the state-of-the-art." + }, + { + "title": "AssembleNet: Searching for Multi-Stream Neural Connectivity in Video Architectures", + "abstract": "Learning to represent videos is a very challenging task both algorithmically and computationally. Standard video CNN architectures have been designed by directly extending architectures devised for image understanding to include the time dimension, using modules such as 3D convolutions, or by using two-stream design to capture both appearance and motion in videos. We interpret a video CNN as a collection of multi-stream convolutional blocks connected to each other, and propose the approach of automatically finding neural architectures with better connectivity and spatio-temporal interactions for video understanding. This is done by evolving a population of overly-connected architectures guided by connection weight learning. Architectures combining representations that abstract different input types (i.e., RGB and optical flow) at multiple temporal resolutions are searched for, allowing different types or sources of information to interact with each other. 
Our method, referred to as AssembleNet, outperforms prior approaches on public video datasets, in some cases by a great margin. We obtain 58.6% mAP on Charades and 34.27% accuracy on Moments-in-Time." + }, + { + "title": "Style transfer-based image synthesis as an efficient regularization technique in deep learning", + "abstract": "These days deep learning is the fastest-growing area in the field of Machine Learning. Convolutional Neural Networks are currently the main tool used for the image analysis and classification purposes. Although great achievements and perspectives, deep neural networks and accompanying learning algorithms have some relevant challenges to tackle. In this paper, we have focused on the most frequently mentioned problem in the field of machine learning, that is relatively poor generalization abilities. Partial remedies for this are regularization techniques e.g. dropout, batch normalization, weight decay, transfer learning, early stopping and data augmentation. In this paper we have focused on data augmentation. We propose to use a method based on a neural style transfer, which allows to generate new unlabeled images of high perceptual quality that combine the content of a base image with the appearance of another one. In a proposed approach, the newly created images are described with pseudo-labels, and then used as a training dataset. Real, labeled images are divided into the validation and test set. We validated proposed method on a challenging skin lesion classification case study. Four representative neural architectures are examined. Obtained results show the strong potential of the proposed approach." + }, + { + "title": "Improving Neural Language Modeling via Adversarial Training", + "abstract": "Recently, substantial progress has been made in language modeling by using deep neural networks. However, in practice, large scale neural language models have been shown to be prone to overfitting. 
In this paper, we present a simple yet highly effective adversarial training mechanism for regularizing neural language models. The idea is to introduce adversarial noise to the output embedding layer while training the models. We show that the optimal adversarial noise yields a simple closed-form solution, thus allowing us to develop a simple and time efficient algorithm. Theoretically, we show that our adversarial mechanism effectively encourages the diversity of the embedding vectors, helping to increase the robustness of models. Empirically, we show that our method improves on the single model state-of-the-art results for language modeling on Penn Treebank (PTB) and Wikitext-2, achieving test perplexity scores of 46.01 and 38.07, respectively. When applied to machine translation, our method improves over various transformer-based translation baselines in BLEU scores on the WMT14 English-German and IWSLT14 German-English tasks." + }, + { + "title": "EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks", + "abstract": "Convolutional Neural Networks (ConvNets) are commonly developed at a fixed resource budget, and then scaled up for better accuracy if more resources are available. In this paper, we systematically study model scaling and identify that carefully balancing network depth, width, and resolution can lead to better performance. Based on this observation, we propose a new scaling method that uniformly scales all dimensions of depth/width/resolution using a simple yet highly effective compound coefficient. We demonstrate the effectiveness of this method on scaling up MobileNets and ResNet. \nTo go even further, we use neural architecture search to design a new baseline network and scale it up to obtain a family of models, called EfficientNets, which achieve much better accuracy and efficiency than previous ConvNets. 
In particular, our EfficientNet-B7 achieves state-of-the-art 84.4% top-1 / 97.1% top-5 accuracy on ImageNet, while being 8.4x smaller and 6.1x faster on inference than the best existing ConvNet. Our EfficientNets also transfer well and achieve state-of-the-art accuracy on CIFAR-100 (91.7%), Flowers (98.8%), and 3 other transfer learning datasets, with an order of magnitude fewer parameters. Source code is at this https URL." + }, + { + "title": "Network Pruning via Transformable Architecture Search", + "abstract": "Network pruning reduces the computation costs of an over-parameterized network without performance damage. Prevailing pruning algorithms pre-define the width and depth of the pruned networks, and then transfer parameters from the unpruned network to pruned networks. To break the structure limitation of the pruned networks, we propose to apply neural architecture search to search directly for a network with flexible channel and layer sizes. The number of the channels/layers is learned by minimizing the loss of the pruned networks. The feature map of the pruned network is an aggregation of K feature map fragments (generated by K networks of different sizes), which are sampled based on the probability distribution.The loss can be back-propagated not only to the network weights, but also to the parameterized distribution to explicitly tune the size of the channels/layers. Specifically, we apply channel-wise interpolation to keep the feature map with different channel sizes aligned in the aggregation procedure. The maximum probability for the size in each distribution serves as the width and depth of the pruned network, whose parameters are learned by knowledge transfer, e.g., knowledge distillation, from the original networks. Experiments on CIFAR-10, CIFAR-100 and ImageNet demonstrate the effectiveness of our new perspective of network pruning compared to traditional network pruning algorithms. 
Various searching and knowledge transfer approaches are conducted to show the effectiveness of the two components. Code is at: this https URL." + }, + { + "title": "Population Based Augmentation: Efficient Learning of Augmentation Policy Schedules", + "abstract": "A key challenge in leveraging data augmentation for neural network training is choosing an effective augmentation policy from a large search space of candidate operations. Properly chosen augmentation policies can lead to significant generalization improvements; however, state-of-the-art approaches such as AutoAugment are computationally infeasible to run for the ordinary user. In this paper, we introduce a new data augmentation algorithm, Population Based Augmentation (PBA), which generates nonstationary augmentation policy schedules instead of a fixed augmentation policy. We show that PBA can match the performance of AutoAugment on CIFAR-10, CIFAR-100, and SVHN, with three orders of magnitude less overall compute. On CIFAR-10 we achieve a mean test error of 1.46%, which is a slight improvement upon the current state-of-the-art. The code for PBA is open source and is available at this https URL." + }, + { + "title": "Tabular Benchmarks for Joint Architecture and Hyperparameter Optimization", + "abstract": "Due to the high computational demands executing a rigorous comparison between hyperparameter optimization (HPO) methods is often cumbersome. The goal of this paper is to facilitate a better empirical evaluation of HPO methods by providing benchmarks that are cheap to evaluate, but still represent realistic use cases. We believe these benchmarks provide an easy and efficient way to conduct reproducible experiments for neural hyperparameter search. Our benchmarks consist of a large grid of configurations of a feed forward neural network on four different regression datasets including architectural hyperparameters and hyperparameters concerning the training pipeline. 
Based on this data, we performed an in-depth analysis to gain a better understanding of the properties of the optimization problem, as well as of the importance of different types of hyperparameters. Second, we exhaustively compared various different state-of-the-art methods from the hyperparameter optimization literature on these benchmarks in terms of performance and robustness." + }, + { + "title": "BayesNAS: A Bayesian Approach for Neural Architecture Search", + "abstract": "One-Shot Neural Architecture Search (NAS) is a promising method to significantly reduce search time without any separate training. It can be treated as a Network Compression problem on the architecture parameters from an over-parameterized network. However, there are two issues associated with most one-shot NAS methods. First, dependencies between a node and its predecessors and successors are often disregarded which result in improper treatment over zero operations. Second, architecture parameters pruning based on their magnitude is questionable. In this paper, we employ the classic Bayesian learning approach to alleviate these two issues by modeling architecture parameters using hierarchical automatic relevance determination (HARD) priors. Unlike other NAS methods, we train the over-parameterized network for only one epoch then update the architecture. Impressively, this enabled us to find the architecture on CIFAR-10 within only 0.2 GPU days using a single GPU. Competitive performance can be also achieved by transferring to ImageNet. As a byproduct, our approach can be applied directly to compress convolutional neural networks by enforcing structural sparsity which achieves extremely sparse networks without accuracy deterioration." 
+ }, + { + "title": "EENA: Efficient Evolution of Neural Architecture", + "abstract": "Latest algorithms for automatic neural architecture search perform remarkable but are basically directionless in search space and computational expensive in the training of every intermediate architecture. In this paper, we propose a method for efficient architecture search called EENA (Efficient Evolution of Neural Architecture). Due to the elaborately designed mutation and crossover operations, the evolution process can be guided by the information have already been learned. Therefore, less computational effort will be required while the searching and training time can be reduced significantly. On CIFAR-10 classification, EENA using minimal computational resources (0.65 GPU-days) can design highly effective neural architecture which achieves 2.56% test error with 8.47M parameters. Furthermore, the best architecture discovered is also transferable for CIFAR-100." + }, + { + "title": "A Survey on Neural Architecture Search", + "abstract": "The growing interest in both the automation of machine learning and deep learning has inevitably led to the development of a wide variety of automated methods for neural architecture search. The choice of the network architecture has proven to be critical, and many advances in deep learning spring from its immediate improvements. However, deep learning techniques are computationally intensive and their application requires a high level of domain knowledge. Therefore, even partial automation of this process helps to make deep learning more accessible to both researchers and practitioners. With this survey, we provide a formalism which unifies and categorizes the landscape of existing methods along with a detailed analysis that compares and contrasts the different approaches. 
We achieve this via a comprehensive discussion of the commonly adopted architecture search spaces and architecture optimization algorithms based on principles of reinforcement learning and evolutionary algorithms along with approaches that incorporate surrogate and one-shot models. Additionally, we address the new research directions which include constrained and multi-objective architecture search as well as automated data augmentation, optimizer and activation function search." + }, + { + "title": "Fast AutoAugment", + "abstract": "Data augmentation is an essential technique for improving generalization ability of deep learning models. Recently, AutoAugment has been proposed as an algorithm to automatically search for augmentation policies from a dataset and has significantly enhanced performances on many image recognition tasks. However, its search method requires thousands of GPU hours even for a relatively small dataset. In this paper, we propose an algorithm called Fast AutoAugment that finds effective augmentation policies via a more efficient search strategy based on density matching. In comparison to AutoAugment, the proposed algorithm speeds up the search time by orders of magnitude while achieves comparable performances on image recognition tasks with various models and datasets including CIFAR-10, CIFAR-100, SVHN, and ImageNet." + }, + { + "title": "Progressive Differentiable Architecture Search: Bridging the Depth Gap Between Search and Evaluation", + "abstract": "Recently, differentiable search methods have made major progress in reducing the computational costs of neural architecture search. However, these approaches often report lower accuracy in evaluating the searched architecture or transferring it to another dataset. This is arguably due to the large gap between the architecture depths in search and evaluation scenarios. 
In this paper, we present an efficient algorithm which allows the depth of searched architectures to grow gradually during the training procedure. This brings two issues, namely, heavier computational overheads and weaker search stability, which we solve using search space approximation and regularization, respectively. With a significantly reduced search time (~7 hours on a single GPU), our approach achieves state-of-the-art performance on both the proxy dataset (CIFAR10 or CIFAR100) and the target dataset (ImageNet). Code is available at https://github.com/chenxin061/pdarts" + }, + { + "title": "Benchmark and Survey of Automated Machine Learning Frameworks", + "abstract": "Machine learning (ML) has become a vital part in many aspects of our daily life. However, building well performing machine learning applications requires highly specialized data scientists and domain experts. Automated machine learning (AutoML) aims to reduce the demand for data scientists by enabling domain experts to automatically build machine learning applications without extensive knowledge of statistics and machine learning. This paper is a combination of a survey on current AutoML methods and a benchmark of popular AutoML frameworks on real data sets. Driven by the selected frameworks for evaluation, we summarize and review important AutoML techniques and methods concerning every step in building an ML pipeline. The selected AutoML frameworks are evaluated on 137 different data sets." + }, + { + "title": "AlphaClean: Automatic Generation of Data Cleaning Pipelines", + "abstract": "The analyst effort in data cleaning is gradually shifting away from the design of hand-written scripts to building and tuning complex pipelines of automated data cleaning libraries. Hyper-parameter tuning for data cleaning is very different than hyper-parameter tuning for machine learning since the pipeline components and objective functions have structure that tuning algorithms can exploit. 
This paper proposes a framework, called AlphaClean, that rethinks parameter tuning for data cleaning pipelines. AlphaClean provides users with a rich library to define data quality measures with weighted sums of SQL aggregate queries. AlphaClean applies generate-then-search framework where each pipelined cleaning operator contributes candidate transformations to a shared pool. Asynchronously, in separate threads, a search algorithm sequences them into cleaning pipelines that maximize the user-defined quality measures. This architecture allows AlphaClean to apply a number of optimizations including incremental evaluation of the quality measures and learning dynamic pruning rules to reduce the search space. Our experiments on real and synthetic benchmarks suggest that AlphaClean finds solutions of up-to 9x higher quality than naively applying state-of-the-art parameter tuning methods, is significantly more robust to straggling data cleaning methods and redundancy in the data cleaning library, and can incorporate state-of-the-art cleaning systems such as HoloClean as cleaning operators." + }, + { + "title": "Survey on Automated Machine Learning", + "abstract": "Machine learning has become a vital part in many aspects of our daily life. However, building well performing machine learning applications requires highly specialized data scientists and domain experts. Automated machine learning (AutoML) aims to reduce the demand for data scientists by enabling domain experts to automatically build machine learning applications without extensive knowledge of statistics and machine learning. In this survey, we summarize the recent developments in academy and industry regarding AutoML. First, we introduce a holistic problem formulation. Next, approaches for solving various subproblems of AutoML are presented. Finally, we provide an extensive empirical evaluation of the presented approaches on synthetic and real data." 
+ }, + { + "title": "NAS-FPN: Learning Scalable Feature Pyramid Architecture for Object Detection", + "abstract": "Current state-of-the-art convolutional architectures for object detection are manually designed. Here we aim to learn a better architecture of feature pyramid network for object detection. We adopt Neural Architecture Search and discover a new feature pyramid architecture in a novel scalable search space covering all cross-scale connections. The discovered architecture, named NAS-FPN, consists of a combination of top-down and bottom-up connections to fuse features across scales. NAS-FPN, combined with various backbone models in the RetinaNet framework, achieves better accuracy and latency tradeoff compared to state-of-the-art object detection models. NAS-FPN improves mobile detection accuracy by 2 AP compared to state-of-the-art SSDLite with MobileNetV2 model in [32] and achieves 48.3 AP which surpasses Mask R-CNN [10] detection accuracy with less computation time." + }, + { + "title": "Biomedical image augmentation using Augmentor", + "abstract": "MOTIVATION\nImage augmentation is a frequently used technique in computer vision and has been seeing increased interest since the popularity of deep learning. Its usefulness is becoming more and more recognised due to deep neural networks requiring larger amounts of data to train, and because in certain fields, such as biomedical imaging, large amounts of labelled data are difficult to come by or expensive to produce. In biomedical imaging, features specific to this domain need to be addressed.\n\n\nRESULTS\nHere we present the Augmentor software package for image augmentation. It provides a stochastic, pipeline-based approach to image augmentation with a number of features that are relevant to biomedical imaging, such as z-stack augmentation and randomised elastic distortions. 
The software has been designed to be highly extensible, meaning an operation that might be specific to a highly specialised task can easily be added to the library, even at runtime. Although it has been designed as a general software library, it has features that are particularly relevant to biomedical imaging and the techniques required for this domain.\n\n\nAVAILABILITY\nAugmentor is a Python package made available under the terms of the MIT licence. Source code can be found on GitHub under https://github.com/mdbloice/Augmentor and installation is via the pip package manager*.\n\n\nSUPPLEMENTARY INFORMATION\nThe GitHub repository contains supplementary information, code examples, and Jupyter notebooks. Extensive documentation is hosted on Read the Docs under https://augmentor.readthedocs.io. For continuous integration tests see https://travis-ci.org/mdbloice/Augmentor." + }, + { + "title": "Meta Filter Pruning to Accelerate Deep Convolutional Neural Networks", + "abstract": "Existing methods usually utilize pre-defined criterions, such as p-norm, to prune unimportant filters. There are two major limitations in these methods. First, the relations of the filters are largely ignored. The filters usually work jointly to make an accurate prediction in a collaborative way. Similar filters will have equivalent effects on the network prediction, and the redundant filters can be further pruned. Second, the pruning criterion remains unchanged during training. As the network updated at each iteration, the filter distribution also changes continuously. The pruning criterions should also be adaptively switched. In this paper, we propose Meta Filter Pruning (MFP) to solve the above problems. First, as a complement to the existing p-norm criterion, we introduce a new pruning criterion considering the filter relation via filter distance. 
Additionally, we build a meta pruning framework for filter pruning, so that our method could adaptively select the most appropriate pruning criterion as the filter distribution changes. Experiments validate our approach on two image classification benchmarks. Notably, on ILSVRC-2012, our MFP reduces more than 50% FLOPs on ResNet-50 with only 0.44% top-5 accuracy loss." + }, + { + "title": "Single-Path NAS: Designing Hardware-Efficient ConvNets in less than 4 Hours", + "abstract": null + }, + { + "title": "NAS-Unet: Neural Architecture Search for Medical Image Segmentation", + "abstract": "Neural architecture search (NAS) has significant progress in improving the accuracy of image classification. Recently, some works attempt to extend NAS to image segmentation which shows preliminary feasibility. However, all of them focus on searching architecture for semantic segmentation in natural scenes. In this paper, we design three types of primitive operation set on search space to automatically find two cell architecture DownSC and UpSC for semantic image segmentation especially medical image segmentation. Inspired by the U-net architecture and its variants successfully applied to various medical image segmentation, we propose NAS-Unet which is stacked by the same number of DownSC and UpSC on a U-like backbone network. The architectures of DownSC and UpSC updated simultaneously by a differential architecture strategy during the search stage. We demonstrate the good segmentation results of the proposed method on Promise12, Chaos, and ultrasound nerve datasets, which collected by magnetic resonance imaging, computed tomography, and ultrasound, respectively. Without any pretraining, our architecture searched on PASCAL VOC2012, attains better performances and much fewer parameters (about 0.8M) than U-net and one of its variants when evaluated on the above three types of medical image datasets." 
+ }, + { + "title": "DetNAS: Backbone Search for Object Detection", + "abstract": "Object detectors are usually equipped with backbone networks designed for image classification. It might be sub-optimal because of the gap between the tasks of image classification and object detection. In this work, we present DetNAS to use Neural Architecture Search (NAS) for the design of better backbones for object detection. It is non-trivial because detection training typically needs ImageNet pre-training while NAS systems require accuracies on the target detection task as supervisory signals. Based on the technique of one-shot supernet, which contains all possible networks in the search space, we propose a framework for backbone search on object detection. We train the supernet under the typical detector training schedule: ImageNet pre-training and detection fine-tuning. Then, the architecture search is performed on the trained supernet, using the detection task as the guidance. This framework makes NAS on backbones very efficient. In experiments, we show the effectiveness of DetNAS on various detectors, for instance, one-stage RetinaNet and the two-stage FPN. We empirically find that networks searched on object detection shows consistent superiority compared to those searched on ImageNet classification. The resulting architecture achieves superior performance than hand-crafted networks on COCO with much less FLOPs complexity." + }, + { + "title": "sharpDARTS: Faster and More Accurate Differentiable Architecture Search", + "abstract": "Neural Architecture Search (NAS) has been a source of dramatic improvements in neural network design, with recent results meeting or exceeding the performance of hand-tuned architectures. However, our understanding of how to represent the search space for neural net architectures and how to search that space efficiently are both still in their infancy. 
\nWe have performed an in-depth analysis to identify limitations in a widely used search space and a recent architecture search method, Differentiable Architecture Search (DARTS). These findings led us to introduce novel network blocks with a more general, balanced, and consistent design; a better-optimized Cosine Power Annealing learning rate schedule; and other improvements. Our resulting sharpDARTS search is 50% faster with a 20-30% relative improvement in final model error on CIFAR-10 when compared to DARTS. Our best single model run has 1.93% (1.98+/-0.07) validation error on CIFAR-10 and 5.5% error (5.8+/-0.3) on the recently released CIFAR-10.1 test set. To our knowledge, both are state of the art for models of similar size. This model also generalizes competitively to ImageNet at 25.1% top-1 (7.8% top-5) error. \nWe found improvements for existing search spaces but does DARTS generalize to new domains? We propose Differentiable Hyperparameter Grid Search and the HyperCuboid search space, which are representations designed to leverage DARTS for more general parameter optimization. Here we find that DARTS fails to generalize when compared against a human's one shot choice of models. We look back to the DARTS and sharpDARTS search spaces to understand why, and an ablation study reveals an unusual generalization gap. We finally propose Max-W regularization to solve this problem, which proves significantly better than the handmade design. Code will be made available." + }, + { + "title": "NAS-Bench-101: Towards Reproducible Neural Architecture Search", + "abstract": "Recent advances in neural architecture search (NAS) demand tremendous computational resources, which makes it difficult to reproduce experiments and imposes a barrier-to-entry to researchers without access to large-scale computation. We aim to ameliorate these problems by introducing NAS-Bench-101, the first public architecture dataset for NAS research. 
To build NAS-Bench-101, we carefully constructed a compact, yet expressive, search space, exploiting graph isomorphisms to identify 423k unique convolutional architectures. We trained and evaluated all of these architectures multiple times on CIFAR-10 and compiled the results into a large dataset of over 5 million trained models. This allows researchers to evaluate the quality of a diverse range of models in milliseconds by querying the pre-computed dataset. We demonstrate its utility by analyzing the dataset as a whole and by benchmarking a range of architecture optimization algorithms." + }, + { + "title": "Evaluating the Search Phase of Neural Architecture Search", + "abstract": "Neural Architecture Search (NAS) aims to facilitate the design of deep networks for new tasks. Existing techniques rely on two stages: searching over the architecture space and validating the best architecture. NAS algorithms are currently compared solely based on their results on the downstream task. While intuitive, this fails to explicitly evaluate the effectiveness of their search strategies. In this paper, we propose to evaluate the NAS search phase. To this end, we compare the quality of the solutions obtained by NAS search policies with that of random architecture selection. We find that: (i) On average, the state-of-the-art NAS algorithms perform similarly to the random policy; (ii) the widely-used weight sharing strategy degrades the ranking of the NAS candidates to the point of not reflecting their true performance, thus reducing the effectiveness of the search process. We believe that our evaluation framework will be key to designing NAS strategies that consistently discover architectures superior to random ones." 
+ }, + { + "title": "Overcoming Multi-Model Forgetting", + "abstract": "We identify a phenomenon, which we refer to as multi-model forgetting, that occurs when sequentially training multiple deep networks with partially-shared parameters; the performance of previously-trained models degrades as one optimizes a subsequent one, due to the overwriting of shared parameters. To overcome this, we introduce a statistically-justified weight plasticity loss that regularizes the learning of a model's shared parameters according to their importance for the previous models, and demonstrate its effectiveness when training two models sequentially and for neural architecture search. Adding weight plasticity in neural architecture search preserves the best models to the end of the search and yields improved results in both natural language processing and computer vision tasks." + }, + { + "title": "Random Search and Reproducibility for Neural Architecture Search", + "abstract": "Neural architecture search (NAS) is a promising research direction that has the potential to replace expert-designed networks with learned, task-specific architectures. In this work, in order to help ground the empirical results in this field, we propose new NAS baselines that build off the following observations: (i) NAS is a specialized hyperparameter optimization problem; and (ii) random search is a competitive baseline for hyperparameter optimization. Leveraging these observations, we evaluate both random search with early-stopping and a novel random search with weight-sharing algorithm on two standard NAS benchmarks---PTB and CIFAR-10. Our results show that random search with early-stopping is a competitive NAS baseline, e.g., it performs at least as well as ENAS, a leading NAS method, on both benchmarks. Additionally, random search with weight-sharing outperforms random search with early-stopping, achieving a state-of-the-art NAS result on PTB and a highly competitive result on CIFAR-10. 
Finally, we explore the existing reproducibility issues of published NAS results. We note the lack of source material needed to exactly reproduce these results, and further discuss the robustness of published results given the various sources of variability in NAS experimental setups. Relatedly, we provide all information (code, random seeds, documentation) needed to exactly reproduce our results, and report our random search with weight-sharing results for each benchmark on multiple runs." + }, + { + "title": "Improving NeuroEvolution Efficiency by Surrogate Model-based Optimization with Phenotypic Distance Kernels", + "abstract": null + }, + { + "title": "Fixup Initialization: Residual Learning Without Normalization", + "abstract": "Normalization layers are a staple in state-of-the-art deep neural network architectures. They are widely believed to stabilize training, enable higher learning rate, accelerate convergence and improve generalization, though the reason for their effectiveness is still an active research topic. In this work, we challenge the commonly-held beliefs by showing that none of the perceived benefits is unique to normalization. Specifically, we propose fixed-update initialization (Fixup), an initialization motivated by solving the exploding and vanishing gradient problem at the beginning of training via properly rescaling a standard initialization. We find training residual networks with Fixup to be as stable as training with normalization -- even for networks with 10,000 layers. Furthermore, with proper regularization, Fixup enables residual networks without normalization to achieve state-of-the-art performance in image classification and machine translation." + }, + { + "title": "Fast, Accurate and Lightweight Super-Resolution with Neural Architecture Search", + "abstract": "Deep convolutional neural networks demonstrate impressive results in the super-resolution domain. 
A series of studies concentrate on improving peak signal noise ratio (PSNR) by using much deeper layers, which are not friendly to constrained resources. Pursuing a trade-off between the restoration capacity and the simplicity of models is still non-trivial. Recent contributions are struggling to manually maximize this balance, while our work achieves the same goal automatically with neural architecture search. Specifically, we handle super-resolution with a multi-objective approach. We also propose an elastic search tactic at both micro and macro level, based on a hybrid controller that profits from evolutionary computation and reinforcement learning. Quantitative experiments help us to draw a conclusion that our generated models dominate most of the state-of-the-art methods with respect to the individual FLOPS." + } + ] + }, + "author_data": { + "05915a15-bc1c-44d3-9b5e-99f2677a2c62": { + "pk": "05915a15-bc1c-44d3-9b5e-99f2677a2c62", + "project_name": null, + "name": "Xiaowen Chu", + "bio": "I am a researcher with a strong focus on optimizing computational efficiency in deep learning and network systems. My work has primarily revolved around enhancing the scalability of distributed training methods, particularly through the development of the merged-gradient wait-free backpropagation (MG-WFBP) algorithm, which significantly improves communication efficiency in deep neural networks. This research was implemented in the B-Caffe platform and demonstrated superior scaling performance on GPU clusters.\n\nIn addition to deep learning, I have explored various aspects of network design, including a restorable routing algorithm for Wavelength Division Multiplexing (WDM) networks that minimizes blocking probability. 
My interest in cloud storage has led me to investigate its advantages over traditional systems, emphasizing high data availability and reliability.\n\nI have also contributed to the bioinformatics field by developing GPU-BLASTN, a GPU-accelerated version of the widely used BLAST software, achieving remarkable speed improvements for sequence alignment tasks. My work extends to multiple-precision integer operations, where I designed a GPU library that significantly outperforms traditional CPU implementations.\n\nFurthermore, I have researched congestion control mechanisms for self-similar traffic in networks, proposing an adaptive wavelet and probability-based scheme (AWP) that enhances the quality of service for real-time multimedia applications. My diverse research interests reflect my commitment to pushing the boundaries of computational efficiency and performance across various domains.", + "collaborators": [ + "Kaiyong Zhao", + "S. Shi", + "N. Kaur", + "Raman Kumar", + "Shree Prakash Singh", + "A. Wason", + "R. Kaler", + "H. L. Minh", + "Zabih Ghassemlooy", + "Wai Pang Ng", + "Amir Askarian", + "S. Subramaniam", + "Yanting Luo", + "Yongjun Zhang", + "W. Gu", + "Wonhyuk Lee", + "Kwangjong Cho", + "Tianming Bu", + "Xiangyang Li", + "Lei Guo", + "Jin Cao", + "Hongfang Yu", + "Le Min Li", + "Bo Li", + "Jiangchuan Liu", + "Hai Liu", + "Y. Leung", + "Zongpeng Li", + "Min Lei", + "Qinjian Li", + "Chengwen Zhong", + "Kai Li", + "Guangyong Zhang", + "Xiaowei Lu", + "Qing Zhang", + "H. Yin", + "Chuang Lin", + "G. Min", + "Zhensheng Zhang", + "B. 
Li", + "Ya-Qin Zhang" + ], + "pub_titles": [ + "MG-WFBP: Efficient Data Communication for Distributed Synchronous SGD Algorithms", + "Design of Restorable Routing Algorithm in Optical Networks", + "User-assisted cloud storage system: Opportunities and challenges", + "GPU-BLASTN: Accelerating Nucleotide Sequence Alignment by GPUs", + "GPUMP: A Multiple-Precision Integer Library for GPUs", + "Effective congestion control for QoS enhancement of self-similar multimedia traffic" + ], + "pub_abstracts": [ + "Distributed synchronous stochastic gradient descent has been widely used to train deep neural networks on computer clusters. With the increase of computational power, network communications have become one limiting factor on the system scalability. In this paper, we observe that many deep neural networks have a large number of layers with only a small amount of data to be communicated. Based on the fact that merging some short communication tasks into a single one may reduce the overall communication time, we formulate an optimization problem to minimize the training iteration time. We develop an optimal solution named merged-gradient wait-free backpropagation (MG-WFBP) and implement it in our open-source deep learning platform B-Caffe. Our experimental results on an 8-node GPU cluster with 10GbE interconnect and trace-based simulation results on a 64-node cluster both show that the MG-WFBP algorithm can achieve much better scaling efficiency than existing methods WFBP and SyncEASGD.", + "In current WDM networks, it is possible to support hundreds of WDM channels on a single fiber. Therefore, the cost of the transmitters and the receivers, and hence the number of light paths, is becoming the main factor in determining the cost of a WDM network. Researchers tried to keep the number of light paths required to implement a topology as low as possible. 
This paper presents a restorable routing algorithm that reduces blocking probability and suggested a mathematical model and compares the proposed work with the conventional algorithms such as first fit and best fit routing algorithm. In this work, we have presented a quick and efficient heuristic for restorable routing", + "1. Introduction Cloud storage has recently attracted a substantial amount of attention from both industry and academia. Notable commercial cloud storage services include Amazon S3, Google Drive, Dropbox, Microsoft Skydrive, and Apple's iCloud. Compared with traditional storage systems, cloud storage offers several desirable advantages, including high data availability, high data reliability, and dynamic storage space.", + "The BLAST software package for sequence alignment is one of the most fundamental and widely used bioinformatics tools [1] [2]. Given the large population of BLAST users, any improvement in the execution speed of BLAST will bring significant benefits to the bioinformatics community. Some research groups have used GPUs to accelerate the speed of BLAST. E.g., GPU-BLAST uses GPUs to accelerate BLASTP, and it achieves 3 to 4 times of speedup over single-thread CPU based NCBI-BLASTP [3]. GPUs have also been successfully used to accelerate other sequence alignment tools, e.g., [4]. In this poster, we show our design, implementation, optimization, and experimental results of GPU-BLASTN, a GPU-accelerated version of the widely used NCBI-BLASTN. To the best of our knowledge, this is the first work that provides a complete solution for accelerating BLASTN by GPUs. GPU-BLASTN can obtain identical results as NCBI-BLASTN, and its speed on a contemporary Nvidia GTX680 GPU card is about 10 to 20 times faster than the speed of single-thread NCBI-BLASTN running on Xeon E5620. We evaluate GPU-BLASTN by running sequence search experiments against human build 36 and mouse build 36 genome databases that have been masked with WindowMasker. 
We use six sets of query sequences with different lengths ranging from hundreds to hundreds of thousands. We compare the results and running time of GPU-BLASTN with those of NCBIBLASTN on both single-thread CPU and multi-thread CPU. The GPU-BLASTN will be open source and freely available to the bioinformatics community.", + "Multiple-precision integer operations are key components of many security applications; but unfortunately they are computationally expensive on contemporary CPUs. In this paper, we present our design and implementation of a multiple-precision integer library for GPUs which is implemented by CUDA. We report our experimental results which show that a significant speedup can be achieved by GPUs as compared with the GNU MP library on CPUs.", + "Owing to the existence of noticeable concentrated periods of contention and idleness, self-similar traffic can greatly increase packet delay and loss probability and thus reduce system resource utilisation. The development of efficient congestion control mechanisms plays a central role in the improvement of network quality of service (QoS), in particular for real-time multimedia applications. By exploiting the property of scale-invariant burstiness and correlation inherent in self-similar traffic, the authors propose an effective congestion control scheme, named adaptive wavelet and probability-based scheme (AWP), which concurrently operates over multiple time scales. AWP adopts the extended multifractal wavelet model (EMWM) for analysing estimated traffic volume across multiple time scales. Furthermore, a new auto-correction algorithm based on Bayes’ theory for confidence analysis is employed to examine the validity of the predicted information. The analysis results can be used to enhance the adaptability of the prediction algorithm. 
In particular, the AWP framework can be easily extended to more than two time scales by increasing the level of wavelet transforms, which brings AWP a natural advantage in implementation and scalability. A series of simulation experiments have demonstrated that the proposed AWP scheme is superior to TCP and TFRC as it can greatly improve the QoS of multimedia data transmission while avoiding congestion collapse on the network." + ], + "domain": [ + "Deep Learning", + "Cloud Computing", + "Bioinformatics", + "Network Optimization" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + } + }, + "reference_proposal": "### [Question 1] - What is the problem?\nHow can we effectively automate the design and optimization of deep learning architectures to improve performance across various AI tasks?\n\n### [Question 2] - Why is it interesting and important?\nSolving this problem has significant implications for the research community as it can lead to the development of more efficient and effective machine learning models, reducing the reliance on expert knowledge and manual tuning. This advancement could democratize access to powerful AI tools, enabling researchers and practitioners from diverse fields to leverage deep learning without extensive expertise. Furthermore, automating architecture design could accelerate the pace of innovation in AI, leading to breakthroughs in applications such as natural language processing, computer vision, and beyond.\n\n### [Question 3] - Why is it hard?\nThe challenges in automating deep learning architecture design stem from the vast search space of possible architectures, which makes it computationally expensive and time-consuming to explore. Naive approaches may fail due to overfitting, underfitting, or simply not discovering optimal configurations because they lack a systematic way to evaluate and refine architectures. 
Additionally, the complexity of interactions between different components of a neural network adds to the difficulty, requiring sophisticated methods to balance performance, efficiency, and generalization.\n\n### [Question 4] - Why hasn't it been solved before?\nPrevious research has often focused on specific architectures or tasks, leading to a lack of generalizable methods for architecture search. Limitations in computational resources and the absence of robust evaluation metrics have also hindered progress. Existing solutions may not adequately address the trade-offs between model complexity and performance, resulting in suboptimal designs. Our approach aims to integrate advanced techniques such as neural architecture search (NAS) and meta-learning to create a more holistic framework that overcomes these barriers and improves upon prior work.\n\n### [Question 5] - What are the key components of my approach and results?\nOur proposed methodology involves a multi-faceted approach that combines neural architecture search with reinforcement learning to optimize model design. We will utilize benchmark datasets such as CIFAR-10 and ImageNet for evaluation, focusing on metrics like accuracy, computational efficiency, and model size. The expected outcomes include the identification of novel architectures that outperform existing state-of-the-art models, as well as a framework that can be applied to various tasks beyond image classification, potentially leading to significant advancements in the field of machine learning." + }, + "1902.08160": { + "paper_data": { + "title": "Topology of Learning in Artificial Neural Networks", + "url": "http://arxiv.org/abs/1902.08160v4", + "arxiv_id": "1902.08160", + "authors": [ + "Maxime Gabella" + ], + "abstract": "Understanding how neural networks learn remains one of the central challenges in machine learning research. 
From random at the start of training, the weights of a neural network evolve in such a way as to be able to perform a variety of tasks, like classifying images. Here we study the emergence of structure in the weights by applying methods from topological data analysis. We train simple feedforward neural networks on the MNIST dataset and monitor the evolution of the weights. When initialized to zero, the weights follow trajectories that branch off recurrently, thus generating trees that describe the growth of the effective capacity of each layer. When initialized to tiny random values, the weights evolve smoothly along two-dimensional surfaces. We show that natural coordinates on these learning surfaces correspond to important factors of variation.", + "introduction": " Introduction Deep artificial neural networks perform spectacularly on many machine learning tasks, including image classification, speech recognition, translation, and game playing. A neural network can for example be trained to detect the presence of a dog in an image by analyzing a large number of images that have been labeled as “dog” or “no dog,” each time adjusting its parameters, in particular the weights assigned to edges connecting pairs of neurons. Such a neural network is generally organized in layers that learn increasingly abstract features, from simple combinations of input pixels all the way to full-fledged models of a dog [1]. Despite these empirical successes, neural networks are still poorly understood theoretically. One open question is why they generalize so well to unseen samples, whereas the multitude of their parameters would lead one to expect them to overfit the training set [ 2,3]. Finding the answer would make neural networks more interpretable and provide a principled approach to designing their architecture. 
The main intuition behind this paper is that whatever the structure that arises during training may be, it should be possible to capture it with topology, roughly defined as the mathematical study of qualitative shapes and structures. More specifically, before starting the training the weights are typically initialized randomly and are therefore structureless. However, as the training progresses the weights learn to adjust towards certain distributions of values, whose structure ultimately encodes the knowledge of the neural network about the task at hand. It is the emergence of such learning structures that we wish to exhibit in this paper. Our approach is to monitor the evolution of the weights during training. For a fixed layer of the neural network, we consider each of its neurons at each training step as a vector of incoming weights. This provides us in the end with a cloud of points in a high-dimensional vector space. We study the shape of this point cloud by applying techniques from topological data analysis [ 4]. This allows us to represent the evolution of the weights as a graph that encodes the topological structure of the point cloud. The code for this paper uses the Keras library [5] and is available as a Jupyter Notebook [6]. Preprint. Work in progress.arXiv:1902.08160v4 [cs.LG] 27 Oct 2020This work was originally motivated by the papers [ 7,8], where topological results, see the companion notebook [ 6]). For simplicity we turn off all biases and only consider weights as parameters. We take sigmoid activation functions for the internal layers, and softmax for the output layer, with the cross-entropy as the loss function. Initializing all the weights of a neural network to the same value is generally not recommended in practice [ 13]. One of the main guidelines for initialization is indeed to break the symmetry between the weights, so that the neurons can learn different functions. 
However, since our goal in this paper is not to reach optimal performance but rather to gain some insight into how weights learn, we start by studying the simplest case where all weights are initialized to zero. We find that the evolution of the weights during training describes a tree, with subsets of weights occasionally branching off from each other. While this can already be deduced from inspection of the", + "references": [ + { + "title": "Topological Approaches to Deep Learning", + "abstract": null + }, + { + "title": "A look at the topology of convolutional neural networks", + "abstract": "Convolutional neural networks (CNN’s) are powerful and widely used tools. However, their interpretability is far from ideal. In this paper we use topological data analysis to investigate what various CNN’s learn. We show that the weights of convolutional layers at depths from 1 through 13 learn simple global structures. We also demonstrate the change of the simple structures over the course of training. In particular, we define and analyze the spaces of spatial filters of convolutional layers and show the recurrence, among all networks, depths, and during training, of a simple circle consisting of rotating edges, as well as a less recurring unanticipated complex circle that combines lines, edges, and non-linear patterns. We train over a thousand CNN’s on MNIST and CIFAR-10, as well as use VGG-networks pretrained on ImageNet." + }, + { + "title": "Generalization in Deep Learning", + "abstract": "This paper provides non-vacuous and numerically-tight generalization guarantees for deep learning, as well as theoretical insights into why and how deep learning can generalize well, despite its large capacity, complexity, possible algorithmic instability, nonrobustness, and sharp minima, responding to an open question in the literature. We also propose new open problems and discuss the limitations of our results." 
+ }, + { + "title": "Understanding deep learning requires rethinking generalization", + "abstract": "Despite their massive size, successful deep artificial neural networks can exhibit a remarkably small difference between training and test performance. Conventional wisdom attributes small generalization error either to properties of the model family, or to the regularization techniques used during training. \nThrough extensive systematic experiments, we show how these traditional approaches fail to explain why large neural networks generalize well in practice. Specifically, our experiments establish that state-of-the-art convolutional networks for image classification trained with stochastic gradient methods easily fit a random labeling of the training data. This phenomenon is qualitatively unaffected by explicit regularization, and occurs even if we replace the true images by completely unstructured random noise. We corroborate these experimental findings with a theoretical construction showing that simple depth two neural networks already have perfect finite sample expressivity as soon as the number of parameters exceeds the number of data points as it usually does in practice. \nWe interpret our experimental findings by comparison with traditional models." + }, + { + "title": "In Search of the Real Inductive Bias: On the Role of Implicit Regularization in Deep Learning", + "abstract": "We present experiments demonstrating that some other form of capacity control, different from network size, plays a central role in learning multilayer feed-forward networks. We argue, partially through analogy to matrix factorization, that this is an inductive bias that can help shed light on deep learning." + }, + { + "title": "Auto-Encoding Variational Bayes", + "abstract": "Abstract: How can we perform efficient inference and learning in directed probabilistic models, in the presence of continuous latent variables with intractable posterior distributions, and large datasets? 
We introduce a stochastic variational inference and learning algorithm that scales to large datasets and, under some mild differentiability conditions, even works in the intractable case. Our contributions is two-fold. First, we show that a reparameterization of the variational lower bound yields a lower bound estimator that can be straightforwardly optimized using standard stochastic gradient methods. Second, we show that for i.i.d. datasets with continuous latent variables per datapoint, posterior inference can be made especially efficient by fitting an approximate inference model (also called a recognition model) to the intractable posterior using the proposed lower bound estimator. Theoretical advantages are reflected in experimental results." + }, + { + "title": "Extracting insights from the shape of complex data using topology", + "abstract": null + }, + { + "title": "Topology and data", + "abstract": "An important feature of modern science and engineering is that data of various kinds is being produced at an unprecedented rate. This is so in part because of new experimental methods, and in part because of the increase in the availability of high powered computing technology. It is also clear that the nature of the data we are obtaining is significantly different. For example, it is now often the case that we are given data in the form of very long vectors, where all but a few of the coordinates turn out to be irrelevant to the questions of interest, and further that we don’t necessarily know which coordinates are the interesting ones. A related fact is that the data is often very high-dimensional, which severely restricts our ability to visualize it. The data obtained is also often much noisier than in the past and has more missing information (missing data). This is particularly so in the case of biological data, particularly high throughput data from microarray or other sources. 
Our ability to analyze this data, both in terms of quantity and the nature of the data, is clearly not keeping pace with the data being produced. In this paper, we will discuss how geometry and topology can be applied to make useful contributions to the analysis of various kinds of data. Geometry and topology are very natural tools to apply in this direction, since geometry can be regarded as the study of distance functions, and what one often works with are distance functions on large finite sets of data. The mathematical formalism which has been developed for incorporating geometric and topological techniques deals with point clouds, i.e. finite sets of points equipped with a distance function. It then adapts tools from the various branches of geometry to the study of point clouds. The point clouds are intended to be thought of as finite samples taken from a geometric object, perhaps with noise. Here are some of the key points which come up when applying these geometric methods to data analysis. • Qualitative information is needed: One important goal of data analysis is to allow the user to obtain knowledge about the data, i.e. to understand how it is organized on a large scale. For example, if we imagine that we are looking at a data set constructed somehow from diabetes patients, it would be important to develop the understanding that there are two types of the disease, namely the juvenile and adult onset forms. Once that is established, one of course wants to develop quantitative methods for distinguishing them, but the first insight about the distinct forms of the disease is key." + }, + { + "title": "Deep Learning", + "abstract": "Machine-learning technology powers many aspects of modern society: from web searches to content filtering on social networks to recommendations on e-commerce websites, and it is increasingly present in consumer products such as cameras and smartphones. 
Machine-learning systems are used to identify objects in images, transcribe speech into text, match news items, posts or products with users’ interests, and select relevant results of search. Increasingly, these applications make use of a class of techniques called deep learning. Conventional machine-learning techniques were limited in their ability to process natural data in their raw form. For decades, constructing a pattern-recognition or machine-learning system required careful engineering and considerable domain expertise to design a feature extractor that transformed the raw data (such as the pixel values of an image) into a suitable internal representation or feature vector from which the learning subsystem, often a classifier, could detect or classify patterns in the input. Representation learning is a set of methods that allows a machine to be fed with raw data and to automatically discover the representations needed for detection or classification. Deep-learning methods are representation-learning methods with multiple levels of representation, obtained by composing simple but non-linear modules that each transform the representation at one level (starting with the raw input) into a representation at a higher, slightly more abstract level. With the composition of enough such transformations, very complex functions can be learned. For classification tasks, higher layers of representation amplify aspects of the input that are important for discrimination and suppress irrelevant variations. An image, for example, comes in the form of an array of pixel values, and the learned features in the first layer of representation typically represent the presence or absence of edges at particular orientations and locations in the image. The second layer typically detects motifs by spotting particular arrangements of edges, regardless of small variations in the edge positions. 
The third layer may assemble motifs into larger combinations that correspond to parts of familiar objects, and subsequent layers would detect objects as combinations of these parts. The key aspect of deep learning is that these layers of features are not designed by human engineers: they are learned from data using a general-purpose learning procedure. Deep learning is making major advances in solving problems that have resisted the best attempts of the artificial intelligence community for many years. It has turned out to be very good at discovering intricate structures in high-dimensional data and is therefore applicable to many domains of science, business and government. In addition to beating records in image recognition and speech recognition, it has beaten other machine-learning techniques at predicting the activity of potential drug molecules, analysing particle accelerator data, reconstructing brain circuits, and predicting the effects of mutations in non-coding DNA on gene expression and disease. Perhaps more surprisingly, deep learning has produced extremely promising results for various tasks in natural language understanding, particularly topic classification, sentiment analysis, question answering and language translation. We think that deep learning will have many more successes in the near future because it requires very little engineering by hand, so it can easily take advantage of increases in the amount of available computation and data. New learning algorithms and architectures that are currently being developed for deep neural networks will only accelerate this progress." + }, + { + "title": "Gradient-based learning applied to document recognition", + "abstract": "Multilayer neural networks trained with the back-propagation algorithm constitute the best example of a successful gradient based learning technique. 
Given an appropriate network architecture, gradient-based learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns, such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional neural networks, which are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. Real-life document recognition systems are composed of multiple modules including field extraction, segmentation recognition, and language modeling. A new learning paradigm, called graph transformer networks (GTN), allows such multimodule systems to be trained globally using gradient-based methods so as to minimize an overall performance measure. Two systems for online handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of graph transformer networks. A graph transformer network for reading a bank cheque is also described. It uses convolutional neural network character recognizers combined with global training techniques to provide record accuracy on business and personal cheques. It is deployed commercially and reads several million cheques per day." + }, + { + "title": "Eurographics Symposium on Point-based Graphics (2007) Topological Methods for the Analysis of High Dimensional Data Sets and 3d Object Recognition", + "abstract": "We present a computational method for extracting simple descriptions of high dimensional data sets in the form of simplicial complexes. Our method, called Mapper, is based on the idea of partial clustering of the data guided by a set of functions defined on the data. The proposed method is not dependent on any particular clustering algorithm, i.e. any clustering algorithm may be used with Mapper. 
We implement this method and present a few sample applications in which simple descriptions of the data present important information about its structure." + } + ] + }, + "author_data": { + "14c8c2b6-eea8-4ef4-8807-2a0af5fbb76a": { + "pk": "14c8c2b6-eea8-4ef4-8807-2a0af5fbb76a", + "project_name": null, + "name": "Maxime Gabella", + "bio": "I am a researcher deeply engaged in the intersection of knowledge representation, complex systems, and theoretical physics. My work explores how knowledge can be structured and classified, particularly through the lens of large databases like Wikipedia. I have demonstrated that implicit classification hierarchies emerge organically within these networks, revealing cultural influences on knowledge organization across different languages.\n\nIn addition to my work on knowledge structures, I delve into advanced topics in string theory and supergravity. I have developed graphical methods to analyze BPS spectra in class S theories and explored the generalized geometry of AdS/CFT dualities. My research has led to significant insights into the properties of solutions in type IIB string theory, including the relationship between contact volumes and central charges in dual conformal field theories.\n\nI am particularly interested in the implications of these theoretical frameworks for understanding supersymmetric solutions and their physical interpretations. My investigations into probe M5-branes and their classifications have opened new avenues for exploring the connections between geometry and field theory. Through my work, I aim to bridge the gap between abstract theoretical concepts and their practical applications in understanding the universe's fundamental structures.", + "collaborators": [ + "I. Bah", + "Nick Halmagyi", + "A. Ashmore", + "M. Graña", + "M. Petrini", + "D. Waldram", + "Tudor Dimofte", + "A. Goncharov", + "I. Coman", + "J. Teschner", + "D. Martelli", + "A. Passias", + "J. Sparks", + "Pietro Longhi", + "Chan Y. 
Park", + "M. Yamazaki" + ], + "pub_titles": [ + "Cultural Structures of Knowledge from Wikipedia Networks of First Links", + "Structures of Knowledge from Wikipedia Networks", + "BPS spectra from BPS graphs", + "The AdS/CFT correspondence and generalized geometry", + "Communications in Mathematical Physics N = 2 Supersymmetric AdS 4 Solutions of M-theory", + "Punctures from probe M5-branes and N = 1 superconformal field theories" + ], + "pub_abstracts": [ + "Knowledge is useless without structure. While the classification of knowledge has been an enduring philosophical enterprise, it recently found applications in computer science, notably for artificial intelligence. The availability of large databases allowed for complex ontologies to be built automatically, for example by extracting structured content from Wikipedia. However, this approach is subject to manual categorization decisions made by online editors. Here we show that an implicit classification hierarchy emerges spontaneously on Wikipedia. We study the network of first links between articles, and find that it centers on a core cycle involving concepts of fundamental classifying importance. We argue that this structure is rooted in cultural history. For European languages, articles like Philosophy and Science are central, whereas Human and Earth dominate for East Asian languages. This reflects the differences between ancient Greek thought and Chinese tradition. Our results reveal the powerful influence of culture on the intrinsic architecture of complex data sets.", + "Knowledge is useless without structure. While the classification of knowledge has been an enduring philosophical enterprise, it recently found applications in computer science, notably for artificial intelligence. The availability of large databases allowed for complex ontologies to be built automatically, for example by extracting structured content from Wikipedia. 
However, this approach is subject to manual categorization decisions made by online editors. Here we show that an implicit classification system emerges spontaneously on Wikipedia. We study the network of first links between articles, and find that it centers on a core cycle involving concepts of fundamental classifying importance. We argue that this structure is rooted in cultural history. For European languages, articles like Philosophy and Science are central, whereas Human and Earth dominate for East Asian languages. This reflects the differences between ancient Greek thought and Chinese tradition. Our results reveal the powerful influence of culture on the intrinsic architecture of complex data sets.", + "I present a simple graphical method to find the BPS spectra of $A_1$ theories of class S. BPS graphs provide a bridge between spectral networks and BPS quivers, the two main frameworks for the study of BPS states. Here I show how to essentially read off from a BPS graph the quantum spectrum generator (or BPS monodromy), expressed as a product of quantum dilogarithms. Thanks to the framed wall-crossing phenomenon for line defects, the determination of the BPS spectrum reduces to the computation of quantum parallel transport across the edges of the BPS graph.", + "The most general AdS$_5 \\times Y$ solutions of type IIB string theory that are AdS/CFT dual to superconformal field theories in four dimensions can be fruitfully described in the language of generalized geometry, a powerful hybrid of complex and symplectic geometry. We show that the cone over the compact five-manifold $Y$ is generalized Calabi-Yau and carries a generalized holomorphic Killing vector field $\\xi$, dual to the R-symmetry. Remarkably, this cone always admits a symplectic structure, which descends to a contact structure on $Y$, with $\\xi$ as Reeb vector field. 
Moreover, the contact volumes of $Y$, which can be computed by localization, encode essential properties of the dual CFT, such as the central charge and the conformal dimensions of BPS operators corresponding to wrapped D3-branes. We then define a notion of ``generalized Sasakian geometry'', which can be characterized by a simple differential system of three symplectic forms on a four-dimensional transverse space. The correct Reeb vector field for an AdS$_5$ solution within a given family of generalized Sasakian manifolds can be determined---without the need of the explicit metric---by a variational procedure. The relevant functional to minimize is the type IIB supergravity action restricted to the space of generalized Sasakian manifolds, which turns out to be just the contact volume. We conjecture that this contact volume is equal to the inverse of the trial central charge whose maximization determines the R-symmetry of the dual superconformal field theory. The power of this volume minimization is illustrated by the calculation of the contact volumes for a new infinite family of solutions, in perfect agreement with the results of $a$-maximization in the dual mass-deformed generalized conifold theories.", + "We analyse the most general N = 2 supersymmetric solutions of D = 11 supergravity consisting of a warped product of four-dimensional anti-de-Sitter space with a seven-dimensional Riemannian manifold Y7. We show that the necessary and sufficient conditions for supersymmetry can be phrased in terms of a local SU (2)-structure on Y7. Solutions with non-zero M2-brane charge also admit a canonical contact structure, in terms of which many physical quantities can be expressed, including the free energy and the scaling dimensions of operators dual to supersymmetric wrapped M5-branes. We show that a special class of solutions is singled out by imposing an additional symmetry, for which the problem reduces to solving a second order non-linear ODE. 
As well as recovering a known class of solutions, that includes the IR fixed point of a mass deformation of the ABJM theory, we also find new solutions which are dual to cubic deformations. In particular, we find a new supersymmetric warped AdS4 × S7 solution with non-trivial four-form flux.", + ": We study probe M5-branes in N = 1 AdS 5 solutions of M-theory that arise from M5-branes wrapped on a Riemann surface. Using the BPS condition from κ -symmetry, we classify supersymmetric probe M5-branes that extend along all of AdS 5 and intersect the Riemann surface at points. These can be viewed as punctures in the dual N = 1 superconformal field theories. We find M5-branes that correspond to the two types of simple punctures previously studied in field theory. In addition, when the central charge is rational, we find a new class of M5-branes with a moduli space that includes two internal dimensions in addition to the Riemann surface. These new M5-branes have the essential characteristic of fractional branes, in that a single one at a generic point of its moduli space becomes multiple M5-branes at special points. M-Theory" + ], + "domain": [ + "M-Theory", + "Supersymmetry", + "Geometry", + "Knowledge Representation" + ], + "institute": null, + "embed": null, + "is_leader_candidate": true, + "is_member_candidate": true, + "is_reviewer_candidate": true, + "is_chair_candidate": true + } + }, + "reference_proposal": "**[Question 1] - What is the problem?** \nWhy do neural networks generalize well to unseen samples despite having a large number of parameters that could lead to overfitting?\n\n**[Question 2] - Why is it interesting and important?** \nSolving this problem is crucial for the research community as it would enhance the interpretability of neural networks, providing insights into their architecture and functioning. 
Understanding the generalization capabilities of neural networks could lead to the development of more robust models and inform best practices in model design. This knowledge could also have practical applications in various fields, such as healthcare, finance, and autonomous systems, where reliable predictions are essential.\n\n**[Question 3] - Why is it hard?** \nThe challenge lies in the complex interplay between the vast number of parameters in neural networks and their training dynamics. Naive approaches may fail because they do not account for the intricate structures that emerge during training. Theoretical obstacles include a lack of understanding of how weight distributions evolve and how these distributions relate to the network's performance. Additionally, the high-dimensional nature of the weight space complicates the analysis, making it difficult to capture the qualitative shapes and structures that emerge.\n\n**[Question 4] - Why hasn't it been solved before?** \nPrevious research has often focused on empirical performance rather than the underlying mechanisms of weight evolution. There has been a lack of methodologies that effectively combine topological data analysis with neural network training dynamics. Barriers include the complexity of high-dimensional data and the absence of a framework to systematically study the topological structures of weight distributions. This paper's approach differs by explicitly monitoring weight evolution and applying topological analysis to uncover the emergent structures during training.\n\n**[Question 5] - What are the key components of my approach and results?** \nThe proposed methodology involves monitoring the evolution of weights in a neural network during training, treating each neuron's incoming weights as a vector in a high-dimensional space. The analysis will utilize topological data analysis techniques to study the shape of the resulting point cloud. 
The dataset will consist of standard image classification tasks, and the metric for evaluation will focus on the topological features of the weight distributions. The expected outcome is a clearer understanding of how weights learn and evolve, represented as a graph that encodes the topological structure of the point cloud, ultimately revealing insights into the generalization capabilities of neural networks." + }, + "1512.06293": { + "paper_data": { + "title": "A Mathematical Theory of Deep Convolutional Neural Networks for Feature Extraction", + "url": "http://arxiv.org/abs/1512.06293v3", + "arxiv_id": "1512.06293", + "authors": [ + "Thomas Wiatowski", + "Helmut Bölcskei" + ], + "abstract": "Deep convolutional neural networks have led to breakthrough results in numerous practical machine learning tasks such as classification of images in the ImageNet data set, control-policy-learning to play Atari games or the board game Go, and image captioning. Many of these applications first perform feature extraction and then feed the results thereof into a trainable classifier. The mathematical analysis of deep convolutional neural networks for feature extraction was initiated by Mallat, 2012. Specifically, Mallat considered so-called scattering networks based on a wavelet transform followed by the modulus non-linearity in each network layer, and proved translation invariance (asymptotically in the wavelet scale parameter) and deformation stability of the corresponding feature extractor. 
This paper complements Mallat's results by developing a theory that encompasses general convolutional transforms, or in more technical parlance, general semi-discrete frames (including Weyl-Heisenberg filters, curvelets, shearlets, ridgelets, wavelets, and learned filters), general Lipschitz-continuous non-linearities (e.g., rectified linear units, shifted logistic sigmoids, hyperbolic tangents, and modulus functions), and general Lipschitz-continuous pooling operators emulating, e.g., sub-sampling and averaging. In addition, all of these elements can be different in different network layers. For the resulting feature extractor we prove a translation invariance result of vertical nature in the sense of the features becoming progressively more translation-invariant with increasing network depth, and we establish deformation sensitivity bounds that apply to signal classes such as, e.g., band-limited functions, cartoon functions, and Lipschitz functions.", + "introduction": " Introduction to shearlets,” in Shearlets: Multiscale analysis for multivariate data [36], pp. 1–38. [53] I. Daubechies, Ten lectures on wavelets . Society for Industrial and Applied Mathematics, 1992. [54] D. Ellis, Z. Zeng, and J. McDermott, “Classifying soundtracks with audio texture features,” in Proc. of IEEE International Conference on Acoust., Speech, and Signal Process. (ICASSP) , pp. 5880–5883, 2011. [55] G. Tzanetakis and P. Cook, “Musical genre classification of audio signals,” IEEE Trans. Speech Audio Process. , vol. 10, no. 5, pp. 293– 302, 2002. [56] J. Lin and L. Qu, “Feature extraction based on Morlet wavelet and its application for mechanical fault diagnosis,” J. Sound Vib. , vol. 234, no. 1, pp. 135–148, 2000. [57] G. Y . Chen, T. D. Bui, and A. Kr ˙zyzak, “Rotation invariant pattern recognition using ridgelets, wavelet cycle-spinning and Fourier features,” Pattern Recognition , vol. 38, no. 12, pp. 2314–2322, 2005. [58] Y . L. Qiao, C. Y . Song, and C. H. 
Zhao, “M-band ridgelet transform based texture classification,” Pattern Recognition Letters , vol. 31, no. 3, pp. 244–249, 2010. [59] S. Arivazhagan, L. Ganesan, and T. S. Kumar, “Texture classification using ridgelet transform,” Pattern Recognition Letters , vol. 27, no. 16, pp. 1875–1883, 2006. [60] J. Ma and G. Plonka, “The curvelet transform,” IEEE Signal Process. Mag. , vol. 27, no. 2, pp. 118–133, 2010.22 [61] L. Dettori and L. Semler, “A comparison of wavelet, ridgelet, and curvelet-based texture classification algorithms in computed tomography,” Computers in Biology and Medicine , vol. 37, no. 4, pp. 486–498, 2007. [62] P. P. Vaidyanathan, Multirate systems and filter banks . Prentice Hall, 1993. [63] L. Grafakos, Classical Fourier analysis . Springer, 2nd ed., 2008. [64] T. Wiatowski, M. Tschannen, A. Stani ´c, P. Grohs, and H. B ¨olcskei, “Discrete deep feature extraction: A theory and new architectures,” inProc. of International Conference on Machine Learning (ICML) , pp. 2149–2158, 2016. [65] D. L. Donoho, “Sparse components of images and optimal atomic decompositions,” Constructive Approximation , vol. 17, no. 3, pp. 353– 382, 2001. [66] T. Wiatowski, P. Grohs, and H. B ¨olcskei, “Energy propagation in deep convolutional neural networks,” IEEE Transactions on Information The- ory, to appear. [67] A. J. E. M. Janssen, “The duality condition for Weyl-Heisenberg frames,” inGabor analysis: Theory and applications (H. G. Feichtinger and T. Strohmer, eds.), pp. 33–84, Birkh ¨auser, 1998. [68] A. Ron and Z. Shen, “Frames and stable bases for shift-invariant subspaces of L2(Rd),”Canad. J. Math. , vol. 47, no. 5, pp. 1051–1094, 1995. [69] M. Frazier, B. Jawerth, and G. Weiss, Littlewood-Paley theory and the study of function spaces . American Mathematical Society, 1991. [70] A. W. Naylor and G. R. Sell, Linear operator theory in engineering and science . Springer, 1982. [71] H. B ¨olcskei, F. Hlawatsch, and H. G. 
Feichtinger, “Frame-theoretic analysis of oversampled filter banks,” IEEE Trans. Signal Process. , vol. 46, no. 12, pp. 3256–3268, 1998. [72] A. J. E. M. Janssen, “Duality and biorthogonality for Weyl-Heisenberg frames,” J. Fourier Anal. Appl. , vol. 1, no. 4, pp. 403–436, 1995. [73] I. Daubechies, H. J. Landau, and Z. Landau, “Gabor time-frequency lattices and the Wexler-Raz identity,” J. Fourier Anal. Appl. , vol. 1, no. 4, pp. 438–478, 1995. [74] K. Gr ¨ochening, Foundations of time-frequency analysis . Birkh ¨auser, 2001. [75] I. Daubechies, A. Grossmann, and Y . Meyer, “Painless nonorthogonal expansions,” J. Math. Phys. , vol. 27, no. 5, pp. 1271–1283, 1986. [76] K. Gr", + "references": [ + { + "title": "Parseval Networks: Improving Robustness to Adversarial Examples", + "abstract": "We introduce Parseval networks, a form of deep neural networks in which the Lipschitz constant of linear, convolutional and aggregation layers is constrained to be smaller than 1. Parseval networks are empirically and theoretically motivated by an analysis of the robustness of the predictions made by deep neural networks when their input is subject to an adversarial perturbation. The most important feature of Parseval networks is to maintain weight matrices of linear and convolutional layers to be (approximately) Parseval tight frames, which are extensions of orthogonal matrices to non-square matrices. We describe how these constraints can be maintained efficiently during SGD. We show that Parseval networks match the state-of-the-art in terms of accuracy on CIFAR-10/100 and Street View House Numbers (SVHN) while being more robust than their vanilla counterpart against adversarial examples. Incidentally, Parseval networks also tend to train faster and make a better usage of the full capacity of the networks." 
+ }, + { + "title": "Energy Propagation in Deep Convolutional Neural Networks", + "abstract": "Many practical machine learning tasks employ very deep convolutional neural networks. Such large depths pose formidable computational challenges in training and operating the network. It is therefore important to understand how fast the energy contained in the propagated signals (a.k.a. feature maps) decays across layers. In addition, it is desirable that the feature extractor generated by the network be informative in the sense of the only signal mapping to the all-zeros feature vector being the zero input signal. This “trivial null-set” property can be accomplished by asking for “energy conservation” in the sense of the energy in the feature vector being proportional to that of the corresponding input signal. This paper establishes conditions for energy conservation (and thus for a trivial null-set) for a wide class of deep convolutional neural network-based feature extractors and characterizes corresponding feature map energy decay rates. Specifically, we consider general scattering networks employing the modulus non-linearity and we find that under mild analyticity and high-pass conditions on the filters (which encompass, inter alia, various constructions of Weyl-Heisenberg filters, wavelets, ridgelets, ( $\\alpha $ )-curvelets, and shearlets) the feature map energy decays at least polynomially fast. For broad families of wavelets and Weyl-Heisenberg filters, the guaranteed decay rate is shown to be exponential. Moreover, we provide handy estimates of the number of layers needed to have at least $((1-\\varepsilon )\\cdot 100)\\%$ of the input signal energy be contained in the feature vector." 
+ }, + { + "title": "Analysis of time-frequency scattering transforms", + "abstract": null + }, + { + "title": "Uniform Covering Frames For Scattering", + "abstract": null + }, + { + "title": "Discrete Deep Feature Extraction: A Theory and New Architectures", + "abstract": "First steps towards a mathematical theory of deep convolutional neural networks for feature extraction were made---for the continuous-time case---in Mallat, 2012, and Wiatowski and Bolcskei, 2015. This paper considers the discrete case, introduces new convolutional neural network architectures, and proposes a mathematical framework for their analysis. Specifically, we establish deformation and translation sensitivity results of local and global nature, and we investigate how certain structural properties of the input signal are reflected in the corresponding feature vectors. Our theory applies to general filters and general Lipschitz-continuous non-linearities and pooling operators. Experiments on handwritten digit classification and facial landmark detection---including feature importance evaluation---complement the theoretical findings." + }, + { + "title": "Deep convolutional neural networks on cartoon functions", + "abstract": "Wiatowski and Bölcskei, 2015, proved that deformation stability and vertical translation invariance of deep convolutional neural network-based feature extractors are guaranteed by the network structure per se rather than the specific convolution kernels and non-linearities. While the translation invariance result applies to square-integrable functions, the deformation stability bound holds for band-limited functions only. Many signals of practical relevance (such as natural images) exhibit, however, sharp and curved discontinuities and are hence not band-limited. The main contribution of this paper is a deformation stability result that takes these structural properties into account. 
Specifically, we establish deformation stability bounds for the class of cartoon functions introduced by Donoho, 2001." + }, + { + "title": "Deep Residual Learning for Image Recognition", + "abstract": "Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers - 8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions1, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation." + }, + { + "title": "Deep convolutional neural networks based on semi-discrete frames", + "abstract": "Deep convolutional neural networks have led to breakthrough results in practical feature extraction applications. The mathematical analysis of these networks was pioneered by Mallat [1]. 
Specifically, Mallat considered so-called scattering networks based on identical semi-discrete wavelet frames in each network layer, and proved translation-invariance as well as deformation stability of the resulting feature extractor. The purpose of this paper is to develop Mallat's theory further by allowing for different and, most importantly, general semi-discrete frames (such as, e.g., Gabor frames, wavelets, curvelets, shearlets, ridgelets) in distinct network layers. This allows to extract wider classes of features than point singularities resolved by the wavelet transform. Our generalized feature extractor is proven to be translation-invariant, and we develop deformation stability results for a larger class of deformations than those considered by Mallat. For Mallat's wavelet-based feature extractor, we get rid of a number of technical conditions. The mathematical engine behind our results is continuous frame theory, which allows us to completely detach the invariance and deformation stability proofs from the particular algebraic structure of the underlying frames." + }, + { + "title": "Note on best possible bounds for determinants of matrices close to the identity matrix", + "abstract": null + }, + { + "title": "Deep roto-translation scattering for object classification", + "abstract": "Dictionary learning algorithms or supervised deep convolution networks have considerably improved the efficiency of predefined feature representations such as SIFT. We introduce a deep scattering convolution network, with complex wavelet filters over spatial and angular variables. This representation brings an important improvement to results previously obtained with predefined features over object image databases such as Caltech and CIFAR. The resulting accuracy is comparable to results obtained with unsupervised deep learning and dictionary based representations. 
This shows that refining image representations by using geometric priors is a promising direction to improve image classification and its understanding." + }, + { + "title": "Cartoon Approximation with α\\documentclass[12pt]{minimal} \\usepackage{amsmath} \\usepackage{wasysym} \\usepackage{amsfonts} \\usepackage{amssymb} \\usepackage{amsbsy} \\usepackage{mathrsfs} \\usepackage{upgreek} \\setlength{\\oddsidemargin}{-69pt} \\begin{document}$$\\alpha $$\\end{document}-Curvelets", + "abstract": null + }, + { + "title": "Rigid-Motion Scattering for Texture Classification", + "abstract": "A rigid-motion scattering computes adaptive invariants along translations and rotations, with a deep convolutional network. Convolutions are calculated on the rigid-motion group, with wavelets defined on the translation and rotation variables. It preserves joint rotation and translation information, while providing global invariants at any desired scale. Texture classification is studied, through the characterization of stationary processes from a single realization. State-of-the-art results are obtained on multiple texture data bases, with important rotation and scaling variabilities." + }, + { + "title": "Deep Scattering Spectrum", + "abstract": "A scattering transform defines a locally translation invariant representation which is stable to time-warping deformation. It extends MFCC representations by computing modulation spectrum coefficients of multiple orders, through cascades of wavelet convolutions and modulus operators. Second-order scattering coefficients characterize transient phenomena such as attacks and amplitude modulation. A frequency transposition invariant representation is obtained by applying a scattering transform along log-frequency. State-the-of-art classification results are obtained for musical genre and phone classification on GTZAN and TIMIT databases, respectively." 
+ }, + { + "title": "Pattern Classification", + "abstract": null + }, + { + "title": "Representation Learning: A Review and New Perspectives", + "abstract": "The success of machine learning algorithms generally depends on data representation, and we hypothesize that this is because different representations can entangle and hide more or less the different explanatory factors of variation behind the data. Although specific domain knowledge can be used to help design representations, learning with generic priors can also be used, and the quest for AI is motivating the design of more powerful representation-learning algorithms implementing such priors. This paper reviews recent work in the area of unsupervised feature learning and deep learning, covering advances in probabilistic models, autoencoders, manifold learning, and deep networks. This motivates longer term unanswered questions about the appropriate objectives for learning good representations, for computing representations (i.e., inference), and the geometrical connections between representation learning, density estimation, and manifold learning." + }, + { + "title": "Shearlets: Multiscale Analysis for Multivariate Data", + "abstract": "Over the last20 years, multiscale methods and wavelets have revolutionized the field of applied mathematicsby providing an efficient means ofencoding isotropic phenomena. Directional multiscale systems, particularly shearlets,are now having the same dramatic impact on the encoding of multidimensional signals. Since its introduction about five years ago, the theory of shearlets has rapidly developed and gained wide recognitionasthe superior way of achievinga truly unified treatment in both a continuous and a digital setting. By now, it has reached maturity as a research field, with rich mathematics, efficient numerical methods, and various important applications." 
+ }, + { + "title": "Invariant Scattering Convolution Networks", + "abstract": "A wavelet scattering network computes a translation invariant image representation, which is stable to deformations and preserves high frequency information for classification. It cascades wavelet transform convolutions with non-linear modulus and averaging operators. The first network layer outputs SIFT-type descriptors whereas the next layers provide complementary invariant information which improves classification. The mathematical analysis of wavelet scattering networks explain important properties of deep convolution networks for classification. A scattering representation of stationary processes incorporates higher order moments and can thus discriminate textures having same Fourier power spectrum. State of the art classification results are obtained for handwritten digits and texture discrimination, with a Gaussian kernel SVM and a generative PCA classifier." + }, + { + "title": "Ridgelet-type Frame Decompositions for Sobolev Spaces related to Linear Transport", + "abstract": null + }, + { + "title": "Characterization of Signals from Multiscale Edges", + "abstract": "A multiscale Canny edge detection is equivalent to finding the local maxima of a wavelet transform. The authors study the properties of multiscale edges through the wavelet theory. For pattern recognition, one often needs to discriminate different types of edges. They show that the evolution of wavelet local maxima across scales characterize the local shape of irregular structures. Numerical descriptors of edge types are derived. The completeness of a multiscale edge representation is also studied. The authors describe an algorithm that reconstructs a close approximation of 1-D and 2-D signals from their multiscale edges. For images, the reconstruction errors are below visual sensitivity. 
As an application, a compact image coding algorithm that selects important edges and compresses the image data by factors over 30 has been implemented. >" + }, + { + "title": "Deep Sparse Rectifier Neural Networks", + "abstract": "While logistic sigmoid neurons are more biologically plausible than hyperbolic tangent neurons, the latter work better for training multi-layer neural networks. This paper shows that rectifying neurons are an even better model of biological neurons and yield equal or better performance than hyperbolic tangent networks in spite of the hard non-linearity and non-dierentiabil ity" + }, + { + "title": "Classifying soundtracks with audio texture features", + "abstract": "Sound textures may be defined as sounds whose character depends on statistical properties as much as the specific details of each individually-perceived event. Recent work has devised a set of statistics that, when synthetically imposed, allow listeners to identify a wide range of environmental sound textures. In this work, we investigate using these statistics for automatic classification of a set of environmental sound classes defined over a set of web videos depicting “multimedia events”. We show that the texture statistics perform as well as our best conventional statistics (based on MFCC covariance). We further examine the relative contributions of the different statistics, showing the importance of modulation spectra and cross-band envelope correlations." + }, + { + "title": "Group Invariant Scattering", + "abstract": "This paper constructs translation‐invariant operators on $\\font\\open=msbm10 at 10pt\\def\\R{\\hbox{\\open R}}{\\bf L}^2({{{\\R}}}^d)$, which are Lipschitz‐continuous to the action of diffeomorphisms. A scattering propagator is a path‐ordered product of nonlinear and noncommuting operators, each of which computes the modulus of a wavelet transform. 
A local integration defines a windowed scattering transform, which is proved to be Lipschitz‐continuous to the action of C2 diffeomorphisms. As the window size increases, it converges to a wavelet scattering transform that is translation invariant. Scattering coefficients also provide representations of stationary processes. Expected values depend upon high‐order moments and can discriminate processes having the same power spectrum. Scattering operators are extended on L2(G), where G is a compact Lie group, and are invariant under the action of G. Combining a scattering on $\\font\\open=msbm10 at 10pt\\def\\R{\\hbox{\\open R}}{\\bf L}^2({{{\\R}}}^d)$ and on L2(SO(d)) defines a translation‐ and rotation‐invariant scattering on $\\font\\open=msbm10 at 10pt\\def\\R{\\hbox{\\open R}}{\\bf L}^2({{{\\R}}}^d)$. © 2012 Wiley Periodicals, Inc." + }, + { + "title": "Classical Fourier Analysis", + "abstract": null + }, + { + "title": "Convolutional networks and applications in vision", + "abstract": "Intelligent tasks, such as visual perception, auditory perception, and language understanding require the construction of good internal representations of the world (or \"features\")? which must be invariant to irrelevant variations of the input while, preserving relevant information. A major question for Machine Learning is how to learn such good features automatically. Convolutional Networks (ConvNets) are a biologically-inspired trainable architecture that can learn invariant features. Each stage in a ConvNets is composed of a filter bank, some nonlinearities, and feature pooling layers. With multiple stages, a ConvNet can learn multi-level hierarchies of features. While ConvNets have been successfully deployed in many commercial applications from OCR to video surveillance, they require large amounts of labeled training samples. We describe new unsupervised learning algorithms, and new non-linear stages that allow ConvNets to be trained with very few labeled samples. 
Applications to visual object recognition and vision navigation for off-road mobile robots are described." + }, + { + "title": "Rectified Linear Units Improve Restricted Boltzmann Machines", + "abstract": "Restricted Boltzmann machines were developed using binary stochastic hidden units. These can be generalized by replacing each binary unit by an infinite number of copies that all have the same weights but have progressively more negative biases. The learning and inference rules for these \"Stepped Sigmoid Units\" are unchanged. They can be approximated efficiently by noisy, rectified linear units. Compared with binary units, these units learn features that are better for object recognition on the NORB dataset and face verification on the Labeled Faces in the Wild dataset. Unlike binary units, rectified linear units preserve information about relative intensities as information travels through multiple layers of feature detectors." + }, + { + "title": "DAISY: An Efficient Dense Descriptor Applied to Wide-Baseline Stereo", + "abstract": "In this paper, we introduce a local image descriptor, DAISY, which is very efficient to compute densely. We also present an EM-based algorithm to compute dense depth and occlusion maps from wide-baseline image pairs using this descriptor. This yields much better results in wide-baseline situations than the pixel and correlation-based algorithms that are commonly used in narrow-baseline stereo. Also, using a descriptor makes our algorithm robust against many photometric and geometric transformations. Our descriptor is inspired from earlier ones such as SIFT and GLOH but can be computed much faster for our purposes. Unlike SURF, which can also be computed efficiently at every pixel, it does not introduce artifacts that degrade the matching performance when used densely. 
It is important to note that our approach is the first algorithm that attempts to estimate dense depth maps from wide-baseline image pairs, and we show that it is a good one at that with many experiments for depth estimation accuracy, occlusion detection, and comparing it against other descriptors on laser-scanned ground truth scenes. We also tested our approach on a variety of indoor and outdoor scenes with different photometric and geometric transformations and our experiments support our claim to being robust against these." + }, + { + "title": "Microlocal Analysis of the Geometric Separation Problem", + "abstract": "Image data are often composed of two or more geometrically distinct constituents; in galaxy catalogs, for instance, one sees a mixture of pointlike structures (galaxy superclusters) and curvelike structures (filaments). It would be ideal to process a single image and extract two geometrically “pure” images, each one containing features from only one of the two geometric constituents. This seems to be a seriously underdetermined problem but recent empirical work achieved highly persuasive separations." + }, + { + "title": "Understanding the difficulty of training deep feedforward neural networks", + "abstract": "Whereas before 2006 it appears that deep multilayer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experimental results showing the superiority of deeper vs less deep architectures. All these experimental results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We first observe the influence of the non-linear activations functions. 
We find that the logistic sigmoid activation is unsuited for deep networks with random initialization because of its mean value, which can drive especially the top hidden layer into saturation. Surprisingly, we find that saturated units can move out of saturation by themselves, albeit slowly, and explaining the plateaus sometimes seen when training neural networks. We find that a new non-linearity that saturates less can often be beneficial. Finally, we study how activations and gradients vary across layers and during training, with the idea that training may be more difficult when the singular values of the Jacobian associated with each layer are far from 1. Based on these considerations, we propose a new initialization scheme that brings substantially faster convergence. 1 Deep Neural Networks Deep learning methods aim at learning feature hierarchies with features from higher levels of the hierarchy formed by the composition of lower level features. They include Appearing in Proceedings of the 13 International Conference on Artificial Intelligence and Statistics (AISTATS) 2010, Chia Laguna Resort, Sardinia, Italy. Volume 9 of JMLR: WC Weston et al., 2008). Much attention has recently been devoted to them (see (Bengio, 2009) for a review), because of their theoretical appeal, inspiration from biology and human cognition, and because of empirical success in vision (Ranzato et al., 2007; Larochelle et al., 2007; Vincent et al., 2008) and natural language processing (NLP) (Collobert & Weston, 2008; Mnih & Hinton, 2009). Theoretical results reviewed and discussed by Bengio (2009), suggest that in order to learn the kind of complicated functions that can represent high-level abstractions (e.g. in vision, language, and other AI-level tasks), one may need deep architectures. 
Most of the recent experimental results with deep architecture are obtained with models that can be turned into deep supervised neural networks, but with initialization or training schemes different from the classical feedforward neural networks (Rumelhart et al., 1986). Why are these new algorithms working so much better than the standard random initialization and gradient-based optimization of a supervised training criterion? Part of the answer may be found in recent analyses of the effect of unsupervised pretraining (Erhan et al., 2009), showing that it acts as a regularizer that initializes the parameters in a “better” basin of attraction of the optimization procedure, corresponding to an apparent local minimum associated with better generalization. But earlier work (Bengio et al., 2007) had shown that even a purely supervised but greedy layer-wise procedure would give better results. So here instead of focusing on what unsupervised pre-training or semi-supervised criteria bring to deep architectures, we focus on analyzing what may be going wrong with good old (but deep) multilayer neural networks. Our analysis is driven by investigative experiments to monitor activations (watching for saturation of hidden units) and gradients, across layers and across training iterations. We also evaluate the effects on these of choices of activation function (with the idea that it might affect saturation) and initialization procedure (since unsupervised pretraining is a particular form of initialization and it has a drastic impact)." + }, + { + "title": "The Curvelet Transform", + "abstract": "Multiresolution methods are deeply related to image processing, biological and computer vision, and scientific computing. The curvelet transform is a multiscale directional transform that allows an almost optimal nonadaptive sparse representation of objects with edges. It has generated increasing interest in the community of applied mathematics and signal processing over the years. 
In this article, we present a review on the curvelet transform, including its history beginning from wavelets, its logical relationship to other multiresolution multidirectional methods like contourlets and shearlets, its basic theory and discrete algorithm. Further, we consider recent applications in image/video processing, seismic exploration, fluid mechanics, simulation of partial different equations, and compressed sensing." + }, + { + "title": "M-band ridgelet transform based texture classification", + "abstract": null + }, + { + "title": "What is the best multi-stage architecture for object recognition?", + "abstract": "In many recent object recognition systems, feature extraction stages are generally composed of a filter bank, a non-linear transformation, and some sort of feature pooling layer. Most systems use only one stage of feature extraction in which the filters are hard-wired, or two stages where the filters in one or both stages are learned in supervised or unsupervised mode. This paper addresses three questions: 1. How does the non-linearities that follow the filter banks influence the recognition accuracy? 2. does learning the filter banks in an unsupervised or supervised manner improve the performance over random filters or hardwired filters? 3. Is there any advantage to using an architecture with two stages of feature extraction, rather than one? We show that using non-linearities that include rectification and local contrast normalization is the single most important ingredient for good accuracy on object recognition benchmarks. We show that two stages of feature extraction yield better accuracy than one. Most surprisingly, we show that a two-stage system with random filters can yield almost 63% recognition rate on Caltech-101, provided that the proper non-linearities and pooling layers are used. 
Finally, we show that with supervised refinement, the system achieves state-of-the-art performance on NORB dataset (5.6%) and unsupervised pre-training followed by supervised refinement produces good accuracy on Caltech-101 (≫ 65%), and the lowest known error rate on the undistorted, unprocessed MNIST dataset (0.53%)." + }, + { + "title": "Automatic Music Genre Classification Based on Modulation Spectral Analysis of Spectral and Cepstral Features", + "abstract": "In this paper, we will propose an automatic music genre classification approach based on long-term modulation spectral analysis of spectral (OSC and MPEG-7 NASE) as well as cepstral (MFCC) features. Modulation spectral analysis of every feature value will generate a corresponding modulation spectrum and all the modulation spectra can be collected to form a modulation spectrogram which exhibits the time-varying or rhythmic information of music signals. Each modulation spectrum is then decomposed into several logarithmically-spaced modulation subbands. The modulation spectral contrast (MSC) and modulation spectral valley (MSV) are then computed from each modulation subband. Effective and compact features are generated from statistical aggregations of the MSCs and MSVs of all modulation subbands. An information fusion approach which integrates both feature level fusion method and decision level combination method is employed to improve the classification accuracy. Experiments conducted on two different music datasets have shown that our proposed approach can achieve higher classification accuracy than other approaches with the same experimental setup." + }, + { + "title": "Why is Real-World Visual Object Recognition Hard?", + "abstract": "Progress in understanding the brain mechanisms underlying vision requires the construction of computational models that not only emulate the brain's anatomy and physiology, but ultimately match its performance on visual tasks. 
In recent years, “natural” images have become popular in the study of vision and have been used to show apparently impressive progress in building such models. Here, we challenge the use of uncontrolled “natural” images in guiding that progress. In particular, we show that a simple V1-like model—a neuroscientist's “null” model, which should perform poorly at real-world visual object recognition tasks—outperforms state-of-the-art object recognition systems (biologically inspired and otherwise) on a standard, ostensibly natural image recognition test. As a counterpoint, we designed a “simpler” recognition test to better span the real-world variation in object pose, position, and scale, and we show that this test correctly exposes the inadequacy of the V1-like model. Taken together, these results demonstrate that tests based on uncontrolled natural images can be seriously misleading, potentially guiding progress in the wrong direction. Instead, we reexamine what it means for images to be natural and argue for a renewed focus on the core problem of object recognition—real-world image variation." + }, + { + "title": "Unsupervised Learning of Invariant Feature Hierarchies with Applications to Object Recognition", + "abstract": "We present an unsupervised method for learning a hierarchy of sparse feature detectors that are invariant to small shifts and distortions. The resulting feature extractor consists of multiple convolution filters, followed by a feature-pooling layer that computes the max of each filter output within adjacent windows, and a point-wise sigmoid non-linearity. A second level of larger and more invariant features is obtained by training the same algorithm on patches of features from the first level. Training a supervised classifier on these features yields 0.64% error on MNIST, and 54% average recognition rate on Caltech 101 with 30 training samples per category. 
While the resulting architecture is similar to convolutional networks, the layer-wise unsupervised training procedure alleviates the over-parameterization problems that plague purely supervised learning procedures, and yields good performance with very few labeled training samples." + }, + { + "title": "A comparison of wavelet, ridgelet, and curvelet-based texture classification algorithms in computed tomography", + "abstract": null + }, + { + "title": "Efficient Learning of Sparse Representations with an Energy-Based Model", + "abstract": "We describe a novel unsupervised method for learning sparse, overcomplete features. The model uses a linear encoder, and a linear decoder preceded by a sparsifying non-linearity that turns a code vector into a quasi-binary sparse code vector. Given an input, the optimal code minimizes the distance between the output of the decoder and the input patch while being as similar as possible to the encoder output. Learning proceeds in a two-phase EM-like fashion: (1) compute the minimum-energy code vector, (2) adjust the parameters of the encoder and decoder so as to decrease the energy. The model produces \"stroke detectors\" when trained on handwritten numerals, and Gabor-like filters when trained on natural image patches. Inference and learning are very fast, requiring no preprocessing, and no expensive sampling. Using the proposed unsupervised method to initialize the first layer of a convolutional network, we achieved an error rate slightly lower than the best reported result on the MNIST dataset. Finally, an extension of the method is described to learn topographical filter maps." 
+ }, + { + "title": "Pattern Recognition and Machine Learning", + "abstract": null + }, + { + "title": "Large-scale Learning with SVM and Convolutional for Generic Object Categorization", + "abstract": "The detection and recognition of generic object categories with invariance to viewpoint, illumination, and clutter requires the combination of a feature extractor and a classifier. We show that architectures such as convolutional networks are good at learning invariant features, but not always optimal for classification, while Support Vector Machines are good at producing decision surfaces from wellbehaved feature vectors, but cannot learn complicated invariances. We present a hybrid system where a convolutional network is trained to detect and recognize generic objects, and a Gaussian-kernel SVM is trained from the features learned by the convolutional network. Results are given on a large generic object recognition task with six categories (human figures, four-legged animals, airplanes, trucks, cars, and \"none of the above\"), with multiple instances of each object category under various poses, illuminations, and backgrounds. On the test set, which contains different object instances than the training set, an SVM alone yields a 43.3% error rate, a convolutional net alone yields 7.2% and an SVM on top of features produced by the convolutional net yields 5.9%." + }, + { + "title": "Multiclass Object Recognition with Sparse, Localized Features", + "abstract": "We apply a biologically inspired model of visual object recognition to the multiclass object categorization problem. Our model modifies that of Serre, Wolf, and Poggio. As in that work, we first apply Gabor filters at all positions and scales; feature complexity and position/scale invariance are then built up by alternating template matching and max pooling operations. We refine the approach in several biologically plausible ways, using simple versions of sparsification and lateral inhibition. 
We demonstrate the value of retaining some position and scale information above the intermediate feature level. Using feature selection we arrive at a model that performs better with fewer features. Our final model is tested on the Caltech 101 object categories and the UIUC car localization task, in both cases achieving state-of-the-art performance. The results strengthen the case for using this class of model in computer vision." + }, + { + "title": "The dual-tree complex wavelet transform", + "abstract": "The paper discusses the theory behind the dual-tree transform, shows how complex wavelets with good properties can be designed, and illustrates a range of applications in signal and image processing. The authors use the complex number symbol C in CWT to avoid confusion with the often-used acronym CWT for the (different) continuous wavelet transform. The four fundamentals, intertwined shortcomings of wavelet transform and some solutions are also discussed. Several methods for filter design are described for dual-tree CWT that demonstrates with relatively short filters, an effective invertible approximately analytic wavelet transform can indeed be implemented using the dual-tree approach." + }, + { + "title": "Rotation invariant pattern recognition using ridgelets, wavelet cycle-spinning and Fourier features", + "abstract": null + }, + { + "title": "Continuous curvelet transform: II. Discretization and frames", + "abstract": null + }, + { + "title": "Texture classification using ridgelet transform", + "abstract": null + }, + { + "title": "Object recognition with features inspired by visual cortex", + "abstract": "We introduce a novel set of features for robust object recognition. Each element of this set is a complex feature obtained by combining position- and scale-tolerant edge-detectors over neighboring positions and multiple orientations. Our system's architecture is motivated by a quantitative model of visual cortex. 
We show that our approach exhibits excellent recognition performance and outperforms several state-of-the-art systems on a variety of image datasets including many different object categories. We also demonstrate that our system is able to learn from very few examples. The performance of the approach constitutes a suggestive plausibility proof for a class of feedforward models of object recognition in cortex." + }, + { + "title": "Distinctive Image Features from Scale-Invariant Keypoints", + "abstract": null + }, + { + "title": "Two-Dimensional Wavelets and their Relatives: References", + "abstract": "Two-dimensional wavelets offer a number of advantages over discrete wavelet transforms when processing rapidly varying functions and signals. In particular, they offer benefits for real-time applications such as medical imaging, fluid dynamics, shape recognition, image enhancement and target tracking. This book introduces 2-D wavelets via 1-D continuous wavelet transforms. The authors then describe the underlying mathematics before progressing to more advanced topics such as matrix geometry of wavelet analysis and three-dimensional wavelets. Practical applications and illustrative examples are employed extensively throughout, ensuring the book's value to engineers, physicists and mathematicians. Two-dimensional wavelets offer a number of advantages over discrete wavelet transforms, in particular, for analysis of real-time signals in such areas as medical imaging, fluid dynamics, shape recognition, image enhancement and target tracking." + }, + { + "title": "New tight frames of curvelets and optimal representations of objects with piecewise C2 singularities", + "abstract": "This paper introduces new tight frames of curvelets to address the problem of finding optimally sparse representations of objects with discontinuities along piecewise C2 edges. 
Conceptually, the curvelet transform is a multiscale pyramid with many directions and positions at each length scale, and needle‐shaped elements at fine scales. These elements have many useful geometric multiscale features that set them apart from classical multiscale representations such as wavelets. For instance, curvelets obey a parabolic scaling relation which says that at scale 2−j, each element has an envelope that is aligned along a “ridge” of length 2−j/2 and width 2−j." + }, + { + "title": "Factors in automatic musical genre classification of audio signals", + "abstract": "Automatic musical genre classification is an important tool for organizing the large collections of music that are becoming available to the average user. In addition, it provides a structured way of evaluating musical content features that does not require extensive user studies. The paper provides a detailed comparative analysis of various factors affecting automatic classification performance, such as choice of features and classifiers. Using recent machine learning techniques, such as support vector machines, we improve on previously published results using identical data collections and features." + }, + { + "title": "Directional dyadic wavelet transforms: design and algorithms", + "abstract": "We propose a simple and efficient technique for designing translation invariant dyadic wavelet transforms (DWTs) in two dimensions. Our technique relies on an extension of the work of Duval-Destin et al. where dyadic decompositions are constructed starting from the continuous wavelet transform. The main advantage of this framework is that it allows for a lot of freedom in designing two-dimensional (2-D) dyadic wavelets. We use this property to construct directional wavelets, whose orientation filtering capabilities are very important in image processing. We address the efficient implementation of these decompositions by constructing approximate QMFs through an L(2) optimization. 
We also propose and study an efficient implementation in the Fourier domain for dealing with large filters." + }, + { + "title": "Foundations of Time-Frequency Analysis", + "abstract": null + }, + { + "title": "Nonlinear Approximation with Local Fourier Bases", + "abstract": null + }, + { + "title": "Feature Extraction Based on Morlet Wavelet and its Application for Mechanical Fault Diagnosis", + "abstract": "Abstract The vibration signals of a machine always carry the dynamic information of the machine. These signals are very useful for the feature extraction and fault diagnosis. However, in many cases, because these signals have very low signal-to-noise ratio (SNR), to extract feature components becomes difficult and the applicability of information drops down. Wavelet analysis in an effective tool for signal processing and feature extraction. In this paper, a denoising method based on wavelet analysis is applied to feature extraction for mechanical vibration signals. This method is an advanced version of the famous “soft-thresholding denoising method” proposed by Donoho and Johnstone. Based on the Morlet wavelet, the time-frequency resolution can be adapted to different signals of interest. In this paper, this denoising method is introduced in detail. The results of the application in rolling bearing diagnosis and gear-box diagnosis are satisfactory." + }, + { + "title": "Ridgelets: a key to higher-dimensional intermittency?", + "abstract": "In dimensions two and higher, wavelets can efficiently represent only a small range of the full diversity of interesting behaviour. In effect, wavelets are well adapted for point–like phenomena, whereas in dimensions greater than one, interesting phenomena can be organized along lines, hyperplanes and other non–point–like structures, for which wavelets are poorly adapted. 
We discuss in this paper a new subject, ridgelet analysis, which can effectively deal with line–like phenomena in dimension 2, plane–like phenomena in dimension 3 and so on. It encompasses a collection of tools which all begin from the idea of analysis by ridge functions ψ(u1x1 + ⋯ + unxn) whose ridge profiles ψ are wavelets, or alternatively from performing a wavelet analysis in the Radon domain. The paper reviews recent work on the continuous ridgelet transform (CRT), ridgelet frames, ridgelet orthonormal bases, ridgelets and edges and describes a new notion of smoothness naturally attached to this new representation." + }, + { + "title": "Frame-theoretic analysis of oversampled filter banks", + "abstract": "We provide a frame-theoretic analysis of oversampled finite impulse response (FIR) and infinite impulse response (IIR) uniform filter banks (FBs). Our analysis is based on a new relationship between the FBs polyphase matrices and the frame operator corresponding to an FB. For a given oversampled analysis FB, we present a parameterization of all synthesis FBs providing perfect reconstruction. We find necessary and sufficient conditions for an oversampled FB to provide a frame expansion. A new frame-theoretic procedure for the design of paraunitary FBs from given nonparaunitary FBs is formulated. We show that the frame bounds of an FB can be obtained by an eigen-analysis of the polyphase matrices. The relevance of the frame bounds as a characterization of important numerical properties of an FB is assessed by means of a stochastic sensitivity analysis. We consider special cases in which the calculation of the frame bounds and synthesis filters is simplified. Finally, simulation results are presented."
+ }, + { + "title": "A Friendly Guide To Wavelets", + "abstract": null + }, + { + "title": "Image Representation Using 2D Gabor Wavelets", + "abstract": "This paper extends to two dimensions the frame criterion developed by Daubechies for one-dimensional wavelets, and it computes the frame bounds for the particular case of 2D Gabor wavelets. Completeness criteria for 2D Gabor image representations are important because of their increasing role in many computer vision applications and also in modeling biological vision, since recent neurophysiological evidence from the visual cortex of mammalian brains suggests that the filter response profiles of the main class of linearly-responding cortical neurons (called simple cells) are best modeled as a family of self-similar 2D Gabor wavelets. We therefore derive the conditions under which a set of continuous 2D Gabor wavelets will provide a complete representation of any image, and we also find self-similar wavelet parametrization which allow stable reconstruction by summation as though the wavelets formed an orthonormal basis. Approximating a \"tight frame\" generates redundancy which allows low-resolution neural responses to represent high-resolution images." + }, + { + "title": "Texture classification and segmentation using wavelet frames", + "abstract": "This paper describes a new approach to the characterization of texture properties at multiple scales using the wavelet transform. The analysis uses an overcomplete wavelet decomposition, which yields a description that is translation invariant. It is shown that this representation constitutes a tight frame of l(2) and that it has a fast iterative algorithm. A texture is characterized by a set of channel variances estimated at the output of the corresponding filter bank. Classification experiments with l(2) Brodatz textures indicate that the discrete wavelet frame (DWF) approach is superior to a standard (critically sampled) wavelet transform feature extraction. 
These results also suggest that this approach should perform better than most traditional single resolution techniques (co-occurrences, local linear transform, and the like). A detailed comparison of the classification performance of various orthogonal and biorthogonal wavelet transforms is also provided. Finally, the DWF feature extraction technique is incorporated into a simple multicomponent texture segmentation algorithm, and some illustrative examples are presented." + }, + { + "title": "Frames and Stable Bases for Shift-Invariant Subspaces of L2(ℝd)", + "abstract": "Abstract Let X be a countable fundamental set in a Hilbert space H, and let T be the operator Whenever T is well-defined and bounded, X is said to be a Bessel sequence. If, in addition, ran T is closed, then X is a frame. Finally, a frame whose corresponding T is injective is a stable basis (also known as a Riesz basis). This paper considers the above three properties for subspaces H of L2(ℝd), and for sets X of the form with Φ either a singleton, a finite set, or, more generally, a countable set. The analysis is performed on the Fourier domain, where the two operators TT* and T* T are decomposed into a collection of simpler \"fiber\" operators. The main theme of the entire analysis is the characterization of each of the above three properties in terms of the analogous property of these simpler operators." + }, + { + "title": "Support-Vector Networks", + "abstract": null + }, + { + "title": "Gabor Time-Frequency Lattices and the Wexler-Raz Identity", + "abstract": null + }, + { + "title": "Continuous Frames in Hilbert Space", + "abstract": "Abstract The standard theory of frames in Hilbert spaces, using discrete bases, is generalized to one where the basis vectors may be labelled using discrete, continuous, or a mixture of the two types of indices. A comprehensive analysis of such frames is presented and various notions of equivalence among frames are introduced. 
A consideration of the relationschip between reproducing kernel Hilbert spaces and frames leads to an exhaustive construction for all possible frames in a separable Hilbert space. Generalizations of the theory are indicated and illustrated by an example drawn from the afline group." + }, + { + "title": "Multirate Systems And Filter Banks", + "abstract": "1. Introduction 2. Review of Discrete-Time Systems 3. Review of Digital Filters 4. Fundamentals of Multirate Systems 5. Maximally Decimated Filter Banks 6. Paraunitary Perfect Reconstruction Filter Banks 7. Linear Phase Perfect Reconstruction QMF Banks 8. Cosine Modulated Filter Banks 9. Finite Word Length Effects 10. Multirate Filter Bank Theory and Related Topics 11. The Wavelet Transform and Relation to Multirate Filter Banks 12. Multidimensional Multirate Systems 13. Review of Discrete-Time Multi-Input Multi-Output LTI Systems 14. Paraunitary and Lossless Systems Appendices Bibliography Index" + }, + { + "title": "Ten Lectures on Wavelets", + "abstract": "Introduction Preliminaries and notation The what, why, and how of wavelets The continuous wavelet transform Discrete wavelet transforms: Frames Time-frequency density and orthonormal bases Orthonormal bases of wavelets and multiresolutional analysis Orthonormal bases of compactly supported wavelets More about the regularity of compactly supported wavelets Symmetry for compactly supported wavelet bases Characterization of functional spaces by means of wavelets Generalizations and tricks for orthonormal wavelet bases References Indexes." + }, + { + "title": "Littlewood-Paley Theory and the Study of Function Spaces", + "abstract": "Calderon's formula and a decomposition of $L^2(\\mathbb R^n)$ Decomposition of Lipschitz spaces Minimality of $\\dot B^0,1_1$ Littlewood-Paley theory The Besov and Triebel-Lizorkin spaces The $\\varphi$ -transform Wavelets Calderon-Zygmund operators Potential theory and a result of Muckenhoupt-Wheeden Further applications." 
+ }, + { + "title": "Orthogonal least squares learning algorithm for radial basis function networks", + "abstract": "The radial basis function network offers a viable alternative to the two-layer neural network in many applications of signal processing. A common learning algorithm for radial basis function networks is based on first choosing randomly some data points as radial basis function centers and then using singular-value decomposition to solve for the weights of the network. Such a procedure has several drawbacks, and, in particular, an arbitrary selection of centers is clearly unsatisfactory. The authors propose an alternative learning procedure based on the orthogonal least-squares method. The procedure chooses radial basis function centers one by one in a rational way until an adequate network has been constructed. In the algorithm, each selected center maximizes the increment to the explained variance or energy of the desired output and does not suffer numerical ill-conditioning problems. The orthogonal least-squares learning strategy provides a simple and efficient means for fitting radial basis function networks. This is illustrated using examples taken from two different signal processing applications." + }, + { + "title": "A Computational Approach to Edge Detection", + "abstract": "This paper describes a computational approach to edge detection. The success of the approach depends on the definition of a comprehensive set of goals for the computation of edge points. These goals must be precise enough to delimit the desired behavior of the detector while making minimal assumptions about the form of the solution. We define detection and localization criteria for a class of edges, and present mathematical forms for these criteria as functionals on the operator impulse response. A third criterion is then added to ensure that the detector has only one response to a single edge. 
We use the criteria in numerical optimization to derive detectors for several common image features, including step edges. On specializing the analysis to step edges, we find that there is a natural uncertainty principle between detection and localization performance, which are the two main goals. With this principle we derive a single operator shape which is optimal at any scale. The optimal detector has a simple approximate implementation in which edges are marked at maxima in gradient magnitude of a Gaussian-smoothed image. We extend this simple detector using operators of several widths to cope with different signal-to-noise ratios in the image. We present a general method, called feature synthesis, for the fine-to-coarse integration of information from operators at different scales. Finally we show that step edge detector performance improves considerably as the operator point spread function is extended along the edge." + }, + { + "title": "PAINLESS NONORTHOGONAL EXPANSIONS", + "abstract": "In a Hilbert space H, discrete families of vectors {hj} with the property that f=∑j〈hj‖ f〉hj for every f in H are considered. This expansion formula is obviously true if the family is an orthonormal basis of H, but also can hold in situations where the hj are not mutually orthogonal and are ‘‘overcomplete.’’ The two classes of examples studied here are (i) appropriate sets of Weyl–Heisenberg coherent states, based on certain (non‐Gaussian) fiducial vectors, and (ii) analogous families of affine coherent states. It is believed, that such ‘‘quasiorthogonal expansions’’ will be a useful tool in many areas of theoretical physics and applied mathematics." 
+ }, + { + "title": "Learning internal representations by error propagation", + "abstract": null + }, + { + "title": "Comparison of Parametric Representations for Monosyllabic Word Recognition in Continuously Spoken Sentences", + "abstract": "Several parametric representations of the acoustic signal were compared with regard to word recognition performance in a syllable-oriented continuous speech recognition system. The vocabulary included many phonetically similar monosyllabic words, therefore the emphasis was on the ability to retain phonetically significant acoustic information in the face of syntactic and duration variations. For each parameter set (based on a mel-frequency cepstrum, a linear frequency cepstrum, a linear prediction cepstrum, a linear prediction spectrum, or a set of reflection coefficients), word templates were generated using an efficient dynamic warping method, and test data were time registered with the templates. A set of ten mel-frequency cepstrum coefficients computed every 6.4 ms resulted in the best performance, namely 96.5 percent and 95.0 percent recognition with each of two speakers. The superior performance of the mel-frequency cepstrum coefficients may be attributed to the fact that they better represent the perceptually relevant aspects of the short-term speech spectrum."
+ }, + { + "title": "Real and complex analysis", + "abstract": "Preface Prologue: The Exponential Function Chapter 1: Abstract Integration Set-theoretic notations and terminology The concept of measurability Simple functions Elementary properties of measures Arithmetic in [0, ] Integration of positive functions Integration of complex functions The role played by sets of measure zero Exercises Chapter 2: Positive Borel Measures Vector spaces Topological preliminaries The Riesz representation theorem Regularity properties of Borel measures Lebesgue measure Continuity properties of measurable functions Exercises Chapter 3: Lp-Spaces Convex functions and inequalities The Lp-spaces Approximation by continuous functions Exercises Chapter 4: Elementary Hilbert Space Theory Inner products and linear functionals Orthonormal sets Trigonometric series Exercises Chapter 5: Examples of Banach Space Techniques Banach spaces Consequences of Baire's theorem Fourier series of continuous functions Fourier coefficients of L1-functions The Hahn-Banach theorem An abstract approach to the Poisson integral Exercises Chapter 6: Complex Measures Total variation Absolute continuity Consequences of the Radon-Nikodym theorem Bounded linear functionals on Lp The Riesz representation theorem Exercises Chapter 7: Differentiation Derivatives of measures The fundamental theorem of Calculus Differentiable transformations Exercises Chapter 8: Integration on Product Spaces Measurability on cartesian products Product measures The Fubini theorem Completion of product measures Convolutions Distribution functions Exercises Chapter 9: Fourier Transforms Formal properties The inversion theorem The Plancherel theorem The Banach algebra L1 Exercises Chapter 10: Elementary Properties of Holomorphic Functions Complex differentiation Integration over paths The local Cauchy theorem The power series representation The open mapping theorem The global Cauchy theorem The calculus of residues Exercises Chapter 11: 
Harmonic Functions The Cauchy-Riemann equations The Poisson integral The mean value property Boundary behavior of Poisson integrals Representation theorems Exercises Chapter 12: The Maximum Modulus Principle Introduction The Schwarz lemma The Phragmen-Lindelof method An interpolation theorem A converse of the maximum modulus theorem Exercises Chapter 13: Approximation by Rational Functions Preparation Runge's theorem The Mittag-Leffler theorem Simply connected regions Exercises Chapter 14: Conformal Mapping Preservation of angles Linear fractional transformations Normal families The Riemann mapping theorem The class L Continuity at the boundary Conformal mapping of an annulus Exercises Chapter 15: Zeros of Holomorphic Functions Infinite Products The Weierstrass factorization theorem An interpolation problem Jensen's formula Blaschke products The Muntz-Szas theorem Exercises Chapter 16: Analytic Continuation Regular points and singular points Continuation along curves The monodromy theorem Construction of a modular function The Picard theorem Exercises Chapter 17: Hp-Spaces Subharmonic functions The spaces Hp and N The theorem of F. and M. Riesz Factorization theorems The shift operator Conjugate functions Exercises Chapter 18: Elementary Theory of Banach Algebras Introduction The invertible elements Ideals and homomorphisms Applications Exercises Chapter 19: Holomorphic Fourier Transforms Introduction Two theorems of Paley and Wiener Quasi-analytic classes The Denjoy-Carleman theorem Exercises Chapter 20: Uniform Approximation by Polynomials Introduction Some lemmas Mergelyan's theorem Exercises Appendix: Hausdorff's Maximality Theorem Notes and Comments Bibliography List of Special Symbols Index" + }, + { + "title": "Functional Analysis", + "abstract": "In this paper we investigate the L p boundedness of the lacunary maximal function M lac H n associated to the spherical means A r f taken over Koranyi spheres on the Heisenberg group. 
Closely following an approach used by M. Lacey in the Euclidean case, we obtain sparse bounds for these maximal functions leading to new unweighted and weighted estimates. The key ingredients in the proof are the L p improving property of the operator A r f and a continuity property of the difference A r f − τ y A r f , where τ y f ( x ) = f ( xy − 1 ) is the right translation operator." + }, + { + "title": "Wavelet transform modulus : phase retrieval and scattering", + "abstract": "Les tâches qui consistent a comprendre automatiquement le contenu d’un signal naturel, comme une image ou un son, sont en general difficiles. En effet, dans leur representation naive, les signaux sont des objets compliques, appartenant a des espaces de grande dimension. Representes differemment, ils peuvent en revanche etre plus faciles a interpreter. Cette these s’interesse a une representation frequemment utilisee dans ce genre de situations, notamment pour analyser des signaux audio : le module de la transformee en ondelettes. Pour mieux comprendre son comportement, nous considerons, d’un point de vue theorique et algorithmique, le probleme inverse correspondant : la reconstruction d’un signal a partir du module de sa transformee en ondelettes. Ce probleme appartient a une classe plus generale de problemes inverses : les problemes de reconstruction de phase. Dans un premier chapitre, nous decrivons un nouvel algorithme, PhaseCut, qui resout numeriquement un probleme de reconstruction de phase generique. Comme l’algorithme similaire PhaseLift, PhaseCut utilise une relaxation convexe, qui se trouve en l’occurence etre de la meme forme que les relaxations du probleme abondamment etudie MaxCut. Nous comparons les performances de PhaseCut et PhaseLift, en termes de precision et de rapidite. Dans les deux chapitres suivants, nous etudions le cas particulier de la reconstruction de phase pour la transformee en ondelettes. 
Nous montrons que toute fonction sans frequence negative est uniquement determinee (a une phase globale pres) par le module de sa transformee en ondelettes, mais que la reconstruction a partir du module n’est pas stable au bruit, pour une definition forte de la stabilite. On demontre en revanche une propriete de stabilite locale. Nous presentons egalement un nouvel algorithme de reconstruction de phase, non-convexe, qui est specifique a la transformee en ondelettes, et etudions numeriquement ses performances. Enfin, dans les deux derniers chapitres, nous etudions une representation plus sophistiquee, construite a partir du module de transformee en ondelettes : la transformee de scattering. Notre but est de comprendre quelles proprietes d’un signal sont caracterisees par sa transformee de scattering. On commence par demontrer un theoreme majorant l’energie des coefficients de scattering d’un signal, a un ordre donne, en fonction de l’energie du signal initial, convole par un filtre passe-haut qui depend de l’ordre. On etudie ensuite une generalisation de la transformee de scattering, qui s’applique a des processus stationnaires. On montre qu’en dimension finie, cette transformee generalisee preserve la norme. En dimension un, on montre egalement que les coefficients de scattering generalises d’un processus caracterisent la queue de distribution du processus." + }, + { + "title": "Comparison of Parametric Representation for Monosyllabic Word Recognition in Continuously Spoken Sentences", + "abstract": "Haeb-Umbach et al (R. Haeb-Umbach, X. Aubert, P. Bey erlein, D. Klakow, M. Ullrich, A. Wendemuth, P. Wilcox, “Acoustic Modeling in the Philips Hub-4 Continu ous-Speech Recognition System,” DARPA Broadcast News, Transcription & Understanding Workshop, Feb. 1998.* Parrott (Parrott Systems, Inc., Internet web page “http:// www.Say-parrot.com/us/technology/algorithms/recogni tion/index.html.” Feb. 2000).* Padmanabhan et al (M. Padmanabhan, L.R. Bahl, D. Naha moo, M. 
Picheny, “Speaker Clustering and Transformation for Speaker Adaptation in Speech Recognition Systems', IEEE Transactions on Speech and Audio Processing, Jan. 1998).* Bahl et al., “Performance of the IBM Large Vocabulary Continuous Speech Recognition System on the ARPA Wall Street Journal Task, ICASSP-95, 1995. Davis et al., “Comparison of Parametric Representation for Monosyllabic Word Recognition in Continuously Spoken Sentences\", IEEE Trans. on ASSP, vol. 28, pp. 357-366, 1980." + }, + { + "title": "Deep Learning", + "abstract": null + }, + { + "title": "Foundations Of Time Frequency Analysis", + "abstract": null + }, + { + "title": "Cartoon Approximation with $\\alpha$-Curvelets", + "abstract": "It is well-known that curvelets provide optimal approximations for so-called cartoon images which are defined as piecewise C2-functions, separated by a C2 singularity curve. In this paper, we consider the more general case of piecewise Cβ-functions, separated by a Cβ singularity curve for β ∈ (1, 2]. We first prove a benchmark result for the possibly achievable best N-term approximation rate for this more general signal model. Then we introduce what we call α-curvelets, which are systems that interpolate between wavelet systems on the one hand (α = 1) and curvelet systems on the other hand (α = 1/2). Our main result states that those frames achieve this optimal rate for α = 1/β, up to log-factors." + }, + { + "title": "Acoustic Modeling Using Deep Belief Networks", + "abstract": "Gaussian mixture models are currently the dominant technique for modeling the emission distribution of hidden Markov models for speech recognition. We show that better phone recognition on the TIMIT dataset can be achieved by replacing Gaussian mixture models by deep neural networks that contain many layers of features and a very large number of parameters.
These networks are first pre-trained as a multi-layer generative model of a window of spectral feature vectors without making use of any discriminative information. Once the generative pre-training has designed the features, we perform discriminative fine-tuning using backpropagation to adjust the features slightly to make them better at predicting a probability distribution over the states of monophone hidden Markov models." + }, + { + "title": "Introduction to Shearlets", + "abstract": null + }, + { + "title": "He joined ETH Zurich in 2013, where he graduated with the Dr. sc. degree in 2017. His research interests are in deep machine learning, mathematical signal processing", + "abstract": null + }, + { + "title": "A Wavelet Tour of Signal Processing - The Sparse Way, 3rd Edition", + "abstract": null + }, + { + "title": "A Wavelet Tour of Signal Processing : The Sparse Way", + "abstract": "Mallat's book is the undisputed reference in this field - it is the only one that covers the essential material in such breadth and depth. - Laurent Demanet, Stanford University The new edition of this classic book gives all the major concepts, techniques and applications of sparse representation, reflecting the key role the subject plays in today's signal processing. The book clearly presents the standard representations with Fourier, wavelet and time-frequency transforms, and the construction of orthogonal bases with fast algorithms. The central concept of sparsity is explained and applied to signal compression, noise reduction, and inverse problems, while coverage is given to sparse representations in redundant dictionaries, super-resolution and compressive sensing applications. 
Features: * Balances presentation of the mathematics with applications to signal processing * Algorithms and numerical examples are implemented in WaveLab, a MATLAB toolbox * Companion website for instructors and selected solutions and code available for students New in this edition * Sparse signal representations in dictionaries * Compressive sensing, super-resolution and source separation * Geometric image processing with curvelets and bandlets * Wavelets for computer graphics with lifting on surfaces * Time-frequency audio processing and denoising * Image compression with JPEG-2000 * New and updated exercises A Wavelet Tour of Signal Processing: The Sparse Way, third edition, is an invaluable resource for researchers and R&D engineers wishing to apply the theory in fields such as image processing, video processing and compression, bio-sensing, medical imaging, machine vision and communications engineering. Stephane Mallat is Professor in Applied Mathematics at cole Polytechnique, Paris, France. From 1986 to 1996 he was a Professor at the Courant Institute of Mathematical Sciences at New York University, and between 2001 and 2007, he co-founded and became CEO of an image processing semiconductor company. Companion website: A Numerical Tour of Signal Processing * Includes all the latest developments since the book was published in 1999, including its application to JPEG 2000 and MPEG-4 * Algorithms and numerical examples are implemented in Wavelab, a MATLAB toolbox * Balances presentation of the mathematics with applications to signal processing" + }, + { + "title": "CONTINUOUS FRAMES IN HILBERT SPACES", + "abstract": "In this paper we introduce a mean of a continuous frame which is a generalization of discrete frames. Since a discrete frame is a special case of these frames, we expect that some of the results that occur in the frame theory will be generalized to these frames. 
For such a generalization, after giving some basic results and theorems about these frames, we discuss the following: dual to these frames, perturbation of continuous frames and robustness of these frames to an erasure of some elements." + }, + { + "title": "Sparse Multidimensional Representations using Anisotropic Dilation and Shear Operators", + "abstract": "Recent advances in applied mathematics and signal processing have shown that, in order to obtain sparse representations of multi-dimensional functions and signals, one has to use representation elements distributed not only at various scales and locations – as in classical wavelet theory – but also at various directions. In this paper, we show that we obtain a construction having exactly these properties by using the framework of affine systems. The representation elements that we obtain are generated by translations, dilations, and shear transformations of a single mother function, and are called shearlets. The shearlets provide optimally sparse representations for 2-D functions that are smooth away from discontinuities along curves. Another benefit of this approach is that, thanks to their mathematical structure, these systems provide a Multiresolution analysis similar to the one associated with classical wavelets, which is very useful for the development of fast algorithmic implementations." + }, + { + "title": "The mnist database of handwritten digits", + "abstract": "Disclosed is an improved articulated bar flail having shearing edges for efficiently shredding materials. An improved shredder cylinder is disclosed with a plurality of these flails circumferentially spaced and pivotally attached to the periphery of a rotatable shaft. 
Also disclosed is an improved shredder apparatus which has a pair of these shredder cylinders mounted to rotate about spaced parallel axes which cooperates with a conveyer apparatus which has a pair of inclined converging conveyer belts with one of the belts mounted to move with respect to the other belt to allow the transport of articles of various sizes therethrough." + }, + { + "title": "Multirate Systems and Filter Banks", + "abstract": "The outline of this chapter is as follows. Section 2 reviews various types of existing finite impulse response (FIR) and infinite impulse response (IIR) two-channel filter banks. The basic operations of these filter banks are considered and the requirements are stated for alias-free, perfect-reconstruction (PR), and nearly perfect-reconstruction (NPR) filter banks. Also some efficient synthesis techniques are referred to. Furthermore, examples are included to compare various two-channel filter banks with each other. Section 3 concentrates on the design of multi-channel (M-channel) uniform filter banks. The main emphasis is laid on designing these banks using tree-structured filter banks with the aid of two-channel filter banks and on generating the overall bank with the aid of a single prototype filter and a proper cosine-modulation or MDFT technique. In Section 4, it is shown how octave filter banks can be generated using a single two-channel filter bank as the basic building block. Also, the relations between the frequency-selective octave filter banks and discrete-time wavelet banks are briefly discussed. Finally, concluding remarks are given in Section 5." 
+ }, + { + "title": "Sparse Components of Images and Optimal Atomic Decompositions", + "abstract": null + }, + { + "title": "The duality condition for Weyl-Heisenberg frames", + "abstract": null + }, + { + "title": "Gradient-based learning applied to document recognition", + "abstract": "Multilayer neural networks trained with the back-propagation algorithm constitute the best example of a successful gradient based learning technique. Given an appropriate network architecture, gradient-based learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns, such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional neural networks, which are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. Real-life document recognition systems are composed of multiple modules including field extraction, segmentation recognition, and language modeling. A new learning paradigm, called graph transformer networks (GTN), allows such multimodule systems to be trained globally using gradient-based methods so as to minimize an overall performance measure. Two systems for online handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of graph transformer networks. A graph transformer network for reading a bank cheque is also described. It uses convolutional neural network character recognizers combined with global training techniques to provide record accuracy on business and personal cheques. It is deployed commercially and reads several million cheques per day." 
+ }, + { + "title": "Candès , Ridgelets : Theory and applications", + "abstract": null + }, + { + "title": "Duality and Biorthogonality for Weyl-Heisenberg Frames", + "abstract": null + }, + { + "title": "METRIC SPACES", + "abstract": "As calculus developed, eventually turning into analysis, concepts first explored on the real line (e.g., a limit of a sequence of real numbers) eventually extended to other spaces (e.g., a limit of a sequence of vectors or of functions), and in the early 20th century a general setting for analysis was formulated, called a metric space. It is a set on which a notion of distance between each pair of elements is defined, and in which notions from calculus in R (open and closed intervals, convergent sequences, continuous functions) can be studied. Many of the fundamental types of spaces used in analysis are metric spaces (e.g., Hilbert spaces and Banach spaces), so metric spaces are one of the first abstractions that has to be mastered in order to learn analysis." + }, + { + "title": "Handwritten Digit Recognition with a Back-Propagation Network", + "abstract": "We present an application of back-propagation networks to handwritten digit recognition. Minimal preprocessing of the data was required, but architecture of the network was highly constrained and specifically designed for the task. The input of the network consists of normalized images of isolated digits. The method has 1% error rate and about a 9% reject rate on zipcode digits provided by the U.S. Postal Service." + }, + { + "title": "Linear Operator Theory in Engineering and Science", + "abstract": null + }, + { + "title": "Ieee Transactions on Image Processing the Contourlet Transform: an Efficient Directional Multiresolution Image Representation", + "abstract": "— The limitations of commonly used separable extensions of one-dimensional transforms, such as the Fourier and wavelet transforms, in capturing the geometry of image edges are well known. 
In this paper, we pursue a \" true \" two-dimensional transform that can capture the intrinsic geometrical structure that is key in visual information. The main challenge in exploring geometry in images comes from the discrete nature of the data. Thus, unlike other approaches, such as curvelets, that first develop a transform in the continuous domain and then discretize for sampled data, our approach starts with a discrete-domain construction and then studies its convergence to an expansion in the continuous domain. Specifically, we construct a discrete-domain multiresolution and multidirection expansion using non-separable filter banks, in much the same way that wavelets were derived from filter banks. This construction results in a flexible multiresolution, local, and directional image expansion using contour segments, and thus it is named the contourlet transform. The discrete contourlet transform has a fast iterated filter bank algorithm that requires an order N operations for N-pixel images. Furthermore, we establish a precise link between the developed filter bank and the associated continuous-domain contourlet expansion via a directional multiresolution analysis framework. We show that with parabolic scaling and sufficient directional vanishing moments, contourlets achieve the optimal approximation rate for piecewise smooth functions with discontinuities along twice continuously differentiable curves. Finally, we show some numerical experiments demonstrating the potential of contourlets in several image processing applications." 
+ }, + { + "title": "from Vienna University of Technology, Vienna, Austria, in 1994 and respectively", + "abstract": null + }, + { + "title": "Helmut Bölcskei was born in Mödling, Austria, on May 29, 1970, and received the", + "abstract": null + } + ] + }, + "author_data": {}, + "reference_proposal": "### [Question 1] - What is the problem?\nHow can advanced multiscale analysis techniques, such as shearlets, improve the classification and feature extraction of complex multivariate data in machine learning applications?\n\n### [Question 2] - Why is it interesting and important?\nSolving this problem is crucial for the research community as it can lead to significant advancements in the field of machine learning, particularly in areas requiring high-dimensional data analysis, such as image and audio processing. Improved classification methods can enhance the accuracy and efficiency of machine learning models, leading to better performance in real-world applications like medical imaging, automated surveillance, and audio recognition. This research could pave the way for new methodologies that integrate multiscale analysis with deep learning, potentially transforming how we approach data representation and feature extraction.\n\n### [Question 3] - Why is it hard?\nThe challenges in addressing this problem stem from the inherent complexity of multiscale data and the limitations of existing feature extraction techniques. Naive approaches may fail due to their inability to capture the intricate structures and patterns present in high-dimensional data. Technical obstacles include the need for robust algorithms that can efficiently handle the computational demands of shearlet transforms and the integration of these techniques with existing machine learning frameworks. 
Theoretical challenges also arise in ensuring that the extracted features maintain their discriminative power across various applications.\n\n### [Question 4] - Why hasn't it been solved before?\nPrevious research has often focused on traditional wavelet and Fourier transforms, which may not adequately address the unique characteristics of multivariate data. Limitations in computational resources and the lack of effective algorithms for shearlet transforms have hindered progress. Additionally, many existing solutions do not leverage the full potential of multiscale analysis in conjunction with modern machine learning techniques. My approach aims to bridge this gap by developing novel algorithms that integrate shearlet-based feature extraction with state-of-the-art machine learning models, thus enhancing classification performance.\n\n### [Question 5] - What are the key components of my approach and results?\nMy proposed methodology involves the following key components: \n1. **Method**: Implementing shearlet transforms for feature extraction from multivariate datasets, followed by the application of machine learning classifiers (e.g., support vector machines, neural networks).\n2. **Dataset**: Utilizing benchmark datasets from audio and image domains that require complex feature extraction for classification tasks.\n3. 
**Metric**: Evaluating performance using metrics such as accuracy, precision, recall, and F1-score to assess the effectiveness of the shearlet-based features compared to traditional methods.\n" + } +} \ No newline at end of file diff --git a/research_bench/oodbench/oodbench_paper_titles.txt b/research_bench/oodbench/oodbench_paper_titles.txt new file mode 100644 index 00000000..e9b18c9d --- /dev/null +++ b/research_bench/oodbench/oodbench_paper_titles.txt @@ -0,0 +1,200 @@ +Momentum Contrast for Unsupervised Visual Representation Learning +A Style-Based Generator Architecture for Generative Adversarial Networks +High-Resolution Image Synthesis With Latent Diffusion Models +ArcFace: Additive Angular Margin Loss for Deep Face Recognition +EfficientDet: Scalable and Efficient Object Detection +Dual Attention Network for Scene Segmentation +Analyzing and Improving the Image Quality of StyleGAN +Masked Autoencoders Are Scalable Vision Learners +YOLOv7: Trainable Bag-of-Freebies Sets New State-of-the-Art for Real-Time Object Detectors +ECA-Net: Efficient Channel Attention for Deep Convolutional Neural Networks +nuScenes: A Multimodal Dataset for Autonomous Driving +Generalized Intersection Over Union: A Metric and a Loss for Bounding Box Regression +Deep High-Resolution Representation Learning for Human Pose Estimation +A ConvNet for the 2020s +Exploring Simple Siamese Representation Learning +MnasNet: Platform-Aware Neural Architecture Search for Mobile +PointPillars: Fast Encoders for Object Detection From Point Clouds +DeepSDF: Learning Continuous Signed Distance Functions for Shape Representation +Coordinate Attention for Efficient Mobile Network Design +Semantic Image Synthesis With Spatially-Adaptive Normalization +PyTorch: an imperative style, high-performance deep learning library +Language models are few-shot learners +Denoising diffusion probabilistic models +XLNet: generalized autoregressive pretraining for language understanding +Training language models to 
follow instructions with human feedback +Bootstrap your own latent a new approach to self-supervised learning +Fully convolutional one-stage 3D object detection on LiDAR range images +Chain-of-thought prompting elicits reasoning in large language models +Diffusion models beat GANs on image synthesis +wav2vec 2.0: a framework for self-supervised learning of speech representations +Supervised contrastive learning +Photorealistic text-to-image diffusion models with deep language understanding +Unsupervised learning of visual features by contrasting cluster assignments +SegFormer: simple and efficient design for semantic segmentation with transformers +ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks +RandAugment: practical automated data augmentation with a reduced search space +MixMatch: a holistic approach to semi-supervised learning +FixMatch: simplifying semi-supervised learning with consistency and confidence +Generative modeling by estimating gradients of the data distribution +Retrieval-augmented generation for knowledge-intensive NLP tasks +EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks +Learning Transferable Visual Models From Natural Language Supervision +A simple framework for contrastive learning of visual representations +Training data-efficient image transformers & distillation through attention +Self-Attention Generative Adversarial Networks +Zero-Shot Text-to-Image Generation +Simplifying Graph Convolutional Networks +Parameter-Efficient Transfer Learning for NLP +Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision +Theoretically Principled Trade-off between Robustness and Accuracy +EfficientNetV2: Smaller Models and Faster Training +BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation +BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models 
+GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models +Improved Denoising Diffusion Probabilistic Models +SCAFFOLD: stochastic controlled averaging for federated learning +Barlow Twins: Self-Supervised Learning via Redundancy Reduction +Robust speech recognition via large-scale weak supervision +Certified Adversarial Robustness via Randomized Smoothing +PEGASUS: pre-training with extracted gap-sentences for abstractive summarization +Random Erasing Data Augmentation +Energy and Policy Considerations for Modern Deep Learning Research +Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression +Regularized Evolution for Image Classifier Architecture Search +Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting +CheXpert: A Large Chest Radiograph Dataset with Uncertainty Labels and Expert Comparison +Attention Based Spatial-Temporal Graph Convolutional Networks for Traffic Flow Forecasting +Graph Convolutional Networks for Text Classification +Weisfeiler and Leman Go Neural: Higher-Order Graph Neural Networks +Session-Based Recommendation with Graph Neural Networks +GMAN: A Graph Multi-Attention Network for Traffic Prediction +Hypergraph Neural Networks +FFA-Net: Feature Fusion Attention Network for Single Image Dehazing +TabNet: Attentive Interpretable Tabular Learning +Improved Knowledge Distillation via Teacher Assistant +Measuring and Relieving the Over-Smoothing Problem for Graph Neural Networks from the Topological View +EvolveGCN: Evolving Graph Convolutional Networks for Dynamic Graphs +Is BERT Really Robust? 
A Strong Baseline for Natural Language Attack on Text Classification and Entailment +Deep Interest Evolution Network for Click-Through Rate Prediction +WinoGrande: An Adversarial Winograd Schema Challenge at Scale +BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension +Unsupervised Cross-lingual Representation Learning at Scale +Transformer-XL: Attentive Language Models beyond a Fixed-Length Context +Energy and Policy Considerations for Deep Learning in NLP +Prefix-Tuning: Optimizing Continuous Prompts for Generation +Don’t Stop Pretraining: Adapt Language Models to Domains and Tasks +Stanza: A Python Natural Language Processing Toolkit for Many Human Languages +ERNIE: Enhanced Language Representation with Informative Entities +BERT Rediscovers the Classical NLP Pipeline +Making Pre-trained Language Models Better Few-shot Learners +How Multilingual is Multilingual BERT? +DIALOGPT: Large-Scale Generative Pre-training for Conversational Response Generation +What Does BERT Learn about the Structure of Language? 
+Multi-Task Deep Neural Networks for Natural Language Understanding +Multimodal Transformer for Unaligned Multimodal Language Sequences +Right for the Wrong Reasons: Diagnosing Syntactic Heuristics in Natural Language Inference +BLEURT: Learning Robust Metrics for Text Generation +Self-Instruct: Aligning Language Models with Self-Generated Instructions +Analyzing Multi-Head Self-Attention: Specialized Heads Do the Heavy Lifting, the Rest Can Be Pruned +CamemBERT: a Tasty French Language Model +Rethinking Semantic Segmentation From a Sequence-to-Sequence Perspective With Transformers +GhostNet: More Features From Cheap Operations +Occupancy Networks: Learning 3D Reconstruction in Function Space +Self-Training With Noisy Student Improves ImageNet Classification +Scalability in Perception for Autonomous Driving: Waymo Open Dataset +Selective Kernel Networks +PointRCNN: 3D Object Proposal Generation and Detection From Point Cloud +AutoAugment: Learning Augmentation Strategies From Data +Class-Balanced Loss Based on Effective Number of Samples +SiamRPN++: Evolution of Siamese Visual Tracking With Very Deep Networks +Deformable ConvNets V2: More Deformable, Better Results +BDD100K: A Diverse Driving Dataset for Heterogeneous Multitask Learning +Taming Transformers for High-Resolution Image Synthesis +SuperGlue: Learning Feature Matching With Graph Neural Networks +PV-RCNN: Point-Voxel Feature Set Abstraction for 3D Object Detection +ResNeSt: Split-Attention Networks +PointConv: Deep Convolutional Networks on 3D Point Clouds +StarGAN v2: Diverse Image Synthesis for Multiple Domains +Bag of Tricks for Image Classification with Convolutional Neural Networks +Designing Network Design Spaces +Open graph benchmark: datasets for machine learning on graphs +MLP-mixer: an all-MLP architecture for vision +Flamingo: a visual language model for few-shot learning +Unsupervised data augmentation for consistency training +Large language models are zero-shot reasoners +Big 
self-supervised models are strong semi-supervised learners +Implicit neural representations with periodic activation functions +Deep leakage from gradients +When does label smoothing help? +InstructBLIP: towards general-purpose vision-language models with instruction tuning +SuperGLUE: a stickier benchmark for general-purpose language understanding systems +Big bird: transformers for longer sequences +Adversarial examples are not bugs, they are features +Fourier features let networks learn high frequency functions in low dimensional domains +LAION-5B: an open large-scale dataset for training next generation image-text models +Training generative adversarial networks with limited data +Graph contrastive learning with augmentations +Can you trust your model's uncertainty? evaluating predictive uncertainty under dataset shift +Unified language model pre-training for natural language understanding and generation +Generating diverse high-fidelity images with VQ-VAE-2 +RotatE: Knowledge Graph Embedding by Relational Rotation in Complex Space +On the Convergence of FedAvg on Non-IID Data +Deberta: decoding-Enhanced Bert with Disentangled Attention +On the Variance of the Adaptive Learning Rate and Beyond +ProxylessNAS: Direct Neural Architecture Search on Target Task and Hardware +Deep Graph Infomax +A Closer Look at Few-shot Classification +Fourier Neural Operator for Parametric Partial Differential Equations +Robustness May Be at Odds with Accuracy +VL-BERT: Pre-training of Generic Visual-Linguistic Representations +Predict then Propagate: Graph Neural Networks meet Personalized PageRank +Rethinking the Value of Network Pruning +Meta-Learning with Latent Embedding Optimization +Deep Anomaly Detection with Outlier Exposure +Distributionally Robust Neural Networks +Measuring Massive Multitask Language Understanding +Rethinking Attention with Performers +DropEdge: Towards Deep Graph Convolutional Networks on Node Classification +Efficient Lifelong Learning with A-GEM 
+Exploration by random network distillation +An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale +Decoupled Weight Decay Regularization +Measuring and Improving the Use of Graph Information in Graph Neural Networks +ALBERT: A Lite BERT for Self-supervised Learning of Language Representations +Large Scale GAN Training for High Fidelity Natural Image Synthesis +DARTS: Differentiable Architecture Search +LoRA: Low-Rank Adaptation of Large Language Models +Deformable DETR: Deformable Transformers for End-to-End Object Detection +BERTScore: Evaluating Text Generation with BERT +ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators +Denoising Diffusion Implicit Models +Score-Based Generative Modeling through Stochastic Differential Equations +The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks +Benchmarking Neural Network Robustness to Common Corruptions and Perturbations +Learning deep representations by mutual information estimation and maximization +ImageNet-trained CNNs are biased towards texture; increasing shape bias improves accuracy and robustness +The Curious Case of Neural Text Degeneration +Reformer: The Efficient Transformer +BEiT: BERT Pre-Training of Image Transformers +Finetuned Language Models are Zero-Shot Learners +Is Space-Time Attention All You Need for Video Understanding? +Reliable evaluation of adversarial robustness with an ensemble of diverse parameter-free attacks +Do ImageNet Classifiers Generalize to ImageNet? 
+Understanding contrastive representation learning through alignment and uniformity on the hypersphere +Generative pretraining from pixels +REALM: retrieval-augmented language model pre-training +A Convergence Theory for Deep Learning via Over-Parameterization +Data-efficient image recognition with contrastive predictive coding +Challenging Common Assumptions in the Unsupervised Learning of Disentangled Representations +Off-Policy Deep Reinforcement Learning without Exploration +Learning Latent Dynamics for Planning from Pixels +Simple and deep graph convolutional networks +ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision +Manifold Mixup: Better Representations by Interpolating Hidden States +Transformers are RNNs: fast autoregressive transformers with linear attention +Gradient Descent Finds Global Minima of Deep Neural Networks +Self-Attention Graph Pooling +WILDS: A Benchmark of in-the-Wild Distribution Shifts +Graph U-Nets +Contrastive multi-view representation learning on graphs diff --git a/research_bench/oodbench_arxiv_links.txt b/research_bench/oodbench_arxiv_links.txt new file mode 100644 index 00000000..35909614 --- /dev/null +++ b/research_bench/oodbench_arxiv_links.txt @@ -0,0 +1,200 @@ +http://arxiv.org/abs/1911.05722v3 +No exact title match found +No exact title match found +http://arxiv.org/abs/1801.07698v4 +http://arxiv.org/abs/1911.09070v7 +http://arxiv.org/abs/1809.02983v4 +http://arxiv.org/abs/1912.04958v2 +http://arxiv.org/abs/2111.06377v3 +No exact title match found +http://arxiv.org/abs/1910.03151v4 +http://arxiv.org/abs/1903.11027v5 +http://arxiv.org/abs/1902.09630v2 +No exact title match found +http://arxiv.org/abs/2201.03545v2 +http://arxiv.org/abs/2011.10566v1 +No exact title match found +http://arxiv.org/abs/1812.05784v2 +http://arxiv.org/abs/1901.05103v1 +http://arxiv.org/abs/2103.02907v1 +No exact title match found +No exact title match found +No exact title match found +http://arxiv.org/abs/2006.11239v2 
+http://arxiv.org/abs/1906.08237v2 +http://arxiv.org/abs/2203.02155v1 +No exact title match found +No exact title match found +No exact title match found +http://arxiv.org/abs/2105.05233v4 +No exact title match found +http://arxiv.org/abs/2004.11362v5 +No exact title match found +http://arxiv.org/abs/2006.09882v5 +http://arxiv.org/abs/2105.15203v3 +No exact title match found +http://arxiv.org/abs/1909.13719v2 +No exact title match found +No exact title match found +http://arxiv.org/abs/1907.05600v3 +No exact title match found +http://arxiv.org/abs/1905.11946v5 +http://arxiv.org/abs/2103.00020v1 +http://arxiv.org/abs/2002.05709v3 +No exact title match found +No exact title match found +No exact title match found +http://arxiv.org/abs/1902.07153v2 +No exact title match found +No exact title match found +No exact title match found +http://arxiv.org/abs/2104.00298v3 +No exact title match found +No exact title match found +No exact title match found +http://arxiv.org/abs/2102.09672v1 +http://arxiv.org/abs/1910.06378v4 +No exact title match found +No exact title match found +http://arxiv.org/abs/1902.02918v2 +No exact title match found +http://arxiv.org/abs/1708.04896v2 +No exact title match found +No exact title match found +http://arxiv.org/abs/1802.01548v7 +No exact title match found +http://arxiv.org/abs/1901.07031v1 +No exact title match found +http://arxiv.org/abs/1809.05679v3 +No exact title match found +No exact title match found +No exact title match found +http://arxiv.org/abs/1809.09401v3 +http://arxiv.org/abs/1911.07559v2 +http://arxiv.org/abs/1908.07442v5 +http://arxiv.org/abs/1902.03393v2 +No exact title match found +http://arxiv.org/abs/1902.10191v3 +http://arxiv.org/abs/1907.11932v6 +No exact title match found +http://arxiv.org/abs/1907.10641v2 +No exact title match found +No exact title match found +No exact title match found +http://arxiv.org/abs/1906.02243v1 +http://arxiv.org/abs/2101.00190v1 +No exact title match found 
+http://arxiv.org/abs/2003.07082v2 +http://arxiv.org/abs/1905.07129v3 +http://arxiv.org/abs/1905.05950v2 +No exact title match found +http://arxiv.org/abs/1906.01502v1 +No exact title match found +No exact title match found +No exact title match found +http://arxiv.org/abs/1906.00295v1 +http://arxiv.org/abs/1902.01007v4 +http://arxiv.org/abs/2004.04696v5 +No exact title match found +No exact title match found +http://arxiv.org/abs/1911.03894v3 +No exact title match found +http://arxiv.org/abs/1911.11907v2 +http://arxiv.org/abs/1812.03828v2 +No exact title match found +http://arxiv.org/abs/1912.04838v7 +http://arxiv.org/abs/1903.06586v2 +http://arxiv.org/abs/1812.04244v2 +No exact title match found +No exact title match found +http://arxiv.org/abs/1812.11703v1 +http://arxiv.org/abs/1811.11168v2 +http://arxiv.org/abs/1805.04687v2 +No exact title match found +http://arxiv.org/abs/1911.11763v2 +No exact title match found +No exact title match found +http://arxiv.org/abs/1811.07246v3 +http://arxiv.org/abs/1912.01865v2 +http://arxiv.org/abs/1812.01187v2 +http://arxiv.org/abs/2003.13678v1 +http://arxiv.org/abs/2005.00687v7 +No exact title match found +No exact title match found +http://arxiv.org/abs/1904.12848v6 +No exact title match found +No exact title match found +http://arxiv.org/abs/2006.09661v1 +http://arxiv.org/abs/1906.08935v2 +http://arxiv.org/abs/1906.02629v3 +No exact title match found +No exact title match found +http://arxiv.org/abs/2007.14062v2 +http://arxiv.org/abs/1905.02175v4 +http://arxiv.org/abs/2006.10739v1 +No exact title match found +http://arxiv.org/abs/2006.06676v2 +http://arxiv.org/abs/2010.13902v3 +http://arxiv.org/abs/1906.02530v2 +No exact title match found +No exact title match found +http://arxiv.org/abs/1902.10197v1 +No exact title match found +No exact title match found +http://arxiv.org/abs/1908.03265v4 +http://arxiv.org/abs/1812.00332v2 +http://arxiv.org/abs/1809.10341v2 +No exact title match found +http://arxiv.org/abs/2010.08895v3 
+http://arxiv.org/abs/1805.12152v5 +No exact title match found +http://arxiv.org/abs/1810.05997v6 +http://arxiv.org/abs/1810.05270v2 +No exact title match found +http://arxiv.org/abs/1812.04606v3 +No exact title match found +http://arxiv.org/abs/2009.03300v3 +http://arxiv.org/abs/2009.14794v4 +http://arxiv.org/abs/1907.10903v4 +http://arxiv.org/abs/1812.00420v2 +http://arxiv.org/abs/1810.12894v1 +http://arxiv.org/abs/2010.11929v2 +http://arxiv.org/abs/1711.05101v3 +http://arxiv.org/abs/2206.13170v1 +No exact title match found +http://arxiv.org/abs/1809.11096v2 +http://arxiv.org/abs/1806.09055v2 +No exact title match found +No exact title match found +http://arxiv.org/abs/1904.09675v3 +No exact title match found +http://arxiv.org/abs/2010.02502v4 +No exact title match found +http://arxiv.org/abs/1803.03635v5 +http://arxiv.org/abs/1903.12261v1 +http://arxiv.org/abs/1808.06670v5 +No exact title match found +http://arxiv.org/abs/1904.09751v2 +http://arxiv.org/abs/2001.04451v2 +No exact title match found +No exact title match found +No exact title match found +No exact title match found +http://arxiv.org/abs/1902.10811v2 +http://arxiv.org/abs/2005.10242v10 +No exact title match found +No exact title match found +No exact title match found +No exact title match found +http://arxiv.org/abs/1811.12359v4 +No exact title match found +http://arxiv.org/abs/1811.04551v5 +http://arxiv.org/abs/2007.02133v1 +No exact title match found +http://arxiv.org/abs/1806.05236v7 +http://arxiv.org/abs/2006.16236v3 +http://arxiv.org/abs/1811.03804v4 +No exact title match found +No exact title match found +http://arxiv.org/abs/1905.05178v1 +No exact title match found diff --git a/research_bench/paper_section_stat.py b/research_bench/paper_section_stat.py new file mode 100644 index 00000000..31bad383 --- /dev/null +++ b/research_bench/paper_section_stat.py @@ -0,0 +1,174 @@ +import argparse +import json +import os +from multiprocessing import Lock, Pool +from typing import Any, Dict, List, 
Tuple + +from tqdm import tqdm + +from research_bench.eval import compute_proposal_metrics +from research_bench.proposal_writing import write_proposal +from research_bench.utils import load_benchmark +from research_town.configs import Config +from research_town.data import Profile +from research_town.utils.logger import logger +import random +from collections import defaultdict + + +def inference( + paper_id: str, + paper_data: Dict[str, Any], + author_data: Dict[str, Any], + ref_proposal: str, + mode: str, + config: Config, +) -> Tuple[Dict[str, str], Dict[str, float]]: + profiles = [Profile(**data) for data in author_data.values()] + ref_abstracts_full = [] + for ref in paper_data.get('references', []): + if ref['abstract'] is None: + continue + else: + ref_abstracts_full.append(ref['abstract']) + ''' + if ref['reference_section'] is None or ref['abstract'] is None: + continue + reference_sections = [section.lower() for section in ref['reference_section']] + + exclude_signal = False + for section in reference_sections: + #if 'related work' in section: + # ref_abstracts_full.append(ref['abstract']) + # break + #if 'introduction' in section: + # ref_abstracts_full.append(ref['abstract']) + # break + #if 'introduction' in section or 'related work' in section: + # ref_abstracts_full.append(ref['abstract']) + # break + + #if 'related work' in section: + # exclude_signal = True + # break + #elif 'introduction' in section: + # exclude_signal = True + # break + + #if exclude_signal is False: + # ref_abstracts_full.append(ref['abstract']) + ''' + return len(ref_abstracts_full) + +def load_papers(input_path: str, output_path: str) -> Any: + dataset = load_benchmark(input_path) + + if os.path.exists(output_path): + with open(output_path, 'r') as f: + processed_ids = {json.loads(line)['paper_id'] for line in f} + return {k: v for k, v in dataset.items() if k not in processed_ids} + + return dataset + + +def save_results( + results: Dict[str, Any], metrics: Dict[str, float], 
output_path: str, lock: Any +) -> None: + with lock: + with open(output_path, 'a') as f: + json.dump({**results, **metrics}, f) + f.write('\n') + + +def process_task( + task: Tuple[str, Dict[str, Any], Dict[str, Any], str, str, Config], +) -> Tuple[Dict[str, Any], Dict[str, float]]: + return inference(*task) + + +def main() -> None: + parser = argparse.ArgumentParser(description='Research Proposal Generator') + parser.add_argument( + '--input_path', type=str, required=True, help='Input JSON file path' + ) + parser.add_argument( + '--output_path', type=str, required=True, help='Output JSONL file path' + ) + parser.add_argument( + '--mode', + type=str, + required=True, + choices=[ + 'zero_shot', + 'author_only', + 'citation_only', + 'author_citation', + 'research_town', + 'sakana_ai_scientist', + 'debug', + 'fake_research_town', + 'fake_research_town_twice', + ], + help='Processing mode', + ) + parser.add_argument( + '--config_path', + type=str, + default='../configs', + help='Path to the configuration directory', + ) + parser.add_argument( + '--num_processes', + type=int, + default=os.cpu_count(), + help='Number of parallel processes to use', + ) + args = parser.parse_args() + + config = Config(args.config_path) + dataset = load_papers(args.input_path, args.output_path) + logger.info(f'Processing {len(dataset)} papers') + + metrics_summary: Dict[str, List[float]] = { + metric: [] + for metric in [ + 'bleu', + 'rouge_l', + 'gpt_metric_score', + 'bert_score', + 'embedding_similarity', + ] + } + + lock = Lock() + with Pool(processes=args.num_processes) as pool: + tasks = [ + ( + paper_id, + data['paper_data'], + data['author_data'], + data['reference_proposal'], + args.mode, + config, + ) + for paper_id, data in dataset.items() + ] + for results, metrics in tqdm( + pool.imap_unordered(process_task, tasks), + total=len(tasks), + desc='Processing papers', + ): + save_results(results, metrics, args.output_path, lock) + with lock: + for metric, scores in 
metrics_summary.items(): + scores.append(metrics.get(metric, 0)) + + # Report average metrics + for metric, scores in metrics_summary.items(): + if scores: + average = sum(scores) / len(scores) + logger.info(f"Average {metric.replace('_', ' ').upper()}: {average:.4f}") + + +if __name__ == '__main__': + main() diff --git a/research_bench/plot_agent_number_ablation.py b/research_bench/plot_agent_number_ablation.py new file mode 100644 index 00000000..860e66fc --- /dev/null +++ b/research_bench/plot_agent_number_ablation.py @@ -0,0 +1,80 @@ +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np + +# Example data +researcher_numbers = [1, 2, 3, 4, 5] +openai_similarity_scores = [47.59, 51.54, 52.36, 52.59, 52.75] +voyageai_similarity_scores = [53.01, 55.05, 56.36, 56.77, 57.18] + +# Set a seaborn style for consistency +sns.set_theme(style="whitegrid") + +# Create the plot +fig, ax = plt.subplots(figsize=(6, 4)) + +# Line plot for OpenAI scores +sns.lineplot( + x=researcher_numbers, + y=openai_similarity_scores, + marker='o', + color=sns.color_palette("pastel")[1], + ax=ax, + label='text-embedding-large-3', + linewidth=4, # Make the line thicker + markersize=10 # Make the dots larger +) + +# Line plot for VoyageAI scores +sns.lineplot( + x=researcher_numbers, + y=voyageai_similarity_scores, + marker='o', + color=sns.color_palette("pastel")[0], + ax=ax, + label='voyage-3', + linewidth=4, # Make the line thicker + markersize=10 # Make the dots larger +) + +# Add titles and labels +ax.set_xlabel("Researcher Number for Paper Writing", fontsize=20) +ax.set_ylabel("Similarity Score", fontsize=20) + +# Add y-axis limits for consistency with the bar plot +ax.set_ylim(47, 59) +ax.set_xlim(0.5, 5.5) + +# Tweak the x-axis to have integer ticks +ax.set_xticks(researcher_numbers) +ax.set_xticklabels(researcher_numbers, fontsize=15) + +# Add a legend +ax.legend(fontsize=12) + +# Add labels near each point for OpenAI scores +for x, y in zip(researcher_numbers, 
openai_similarity_scores): + ax.annotate(f'{y:.1f}', + xy=(x, y), + xytext=(0, 5), + textcoords="offset points", + ha='center', + fontsize=15) + +# Add labels near each point for VoyageAI scores +for x, y in zip(researcher_numbers, voyageai_similarity_scores): + ax.annotate(f'{y:.1f}', + xy=(x, y), + xytext=(0, 5), + textcoords="offset points", + ha='center', + fontsize=15) + +# Tweak y-axis ticks for a clean look +ax.set_yticks(np.arange(47, 59.1, 2)) +ax.grid(False) + +# Adjust layout and save the figure +plt.tight_layout() +plt.savefig("ablation_study_on_agent_number_with_seaborn_voyage.pdf") +plt.show() diff --git a/research_bench/plot_agent_number_ablation2.py b/research_bench/plot_agent_number_ablation2.py new file mode 100644 index 00000000..7b351135 --- /dev/null +++ b/research_bench/plot_agent_number_ablation2.py @@ -0,0 +1,88 @@ +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np + +# Example data +agent_numbers = [1, 3, 5] +voyage_strength = [65.14, 65.83, 66.01] +voyage_weakness = [60.95, 61.29, 61.39] + +# Set a seaborn style for consistency +sns.set_theme(style="whitegrid") + +# Create the plot +fig, ax1 = plt.subplots(figsize=(6, 4)) + +# Plot strengths +sns.lineplot( + x=agent_numbers, + y=voyage_strength, + marker='o', + color=sns.color_palette("pastel")[2], + ax=ax1, + legend=False, # Suppress automatic legend + linewidth=4, # Make the line thicker + markersize=10 # Make the dots larger +) + +# Add y-axis on the right +ax2 = ax1.twinx() + +# Plot weaknesses +sns.lineplot( + x=agent_numbers, + y=voyage_weakness, + marker='o', + color=sns.color_palette("pastel")[3], + ax=ax2, + legend=False, # Suppress automatic legend + linewidth=4, # Make the line thicker + markersize=10 # Make the dots larger +) + +# Add titles and labels +ax1.set_xlabel("Researcher Number for Review Writing", fontsize=20) +ax1.set_ylabel("Strength Scores", fontsize=20, color="black") +ax2.set_ylabel("Weakness Scores", fontsize=20, color="black") + +# Add 
y-axis limits +ax1.set_ylim(65, 66.5) +ax2.set_ylim(60, 61.5) +ax1.set_xlim(0.5, 5.5) +ax2.set_xlim(0.5, 5.5) +ax1.set_yticks(np.arange(65, 66.6, 0.5)) +ax2.set_yticks(np.arange(60, 61.6, 0.5)) +ax1.tick_params(axis='both', which='major', labelsize=15) +ax2.tick_params(axis='y', labelsize=15) + +# Add a single legend manually +custom_lines = [ + plt.Line2D([0], [0], color=sns.color_palette("pastel")[2], marker='o', label='voyage-3 (strength)'), + plt.Line2D([0], [0], color=sns.color_palette("pastel")[3], marker='o', label='voyage-3 (weakness)') +] +ax1.legend(handles=custom_lines, fontsize=12, loc='lower right') + +# Annotate scores + +for x, y in zip(agent_numbers, voyage_strength): + ax1.annotate(f'{y:.1f}', + xy=(x, y), + xytext=(0, -15), + textcoords="offset points", + ha='center', + fontsize=15) + +for x, y in zip(agent_numbers, voyage_weakness): + ax2.annotate(f'{y:.1f}', + xy=(x, y), + xytext=(0, -15), + textcoords="offset points", + ha='center', + fontsize=15) +ax1.grid(False) +ax2.grid(False) + +# Adjust layout and save the figure +plt.tight_layout() +plt.savefig("number_ablation_study_voyage_only.pdf") +plt.show() diff --git a/research_bench/plot_paper_number_ablation.py b/research_bench/plot_paper_number_ablation.py new file mode 100644 index 00000000..e4a1a46e --- /dev/null +++ b/research_bench/plot_paper_number_ablation.py @@ -0,0 +1,61 @@ +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns + +# Use Seaborn's style +sns.set_style("whitegrid") + +# Data +settings = ["all", "related work", "introduction", "other"] +openai_means = np.array([0.6732, 0.6749, 0.6591, 0.5889]) +voyage_means = np.array([0.6995, 0.7012, 0.6875, 0.6276]) + +# Number of bars (categories) +x = np.arange(len(settings)) # the label locations + +# Set width of each bar +width = 0.35 + +# Create the plot +fig, ax = plt.subplots(figsize=(6, 4)) + +# Add bars using Seaborn-like colors for consistency +rects1 = ax.bar(x - width/2, voyage_means * 100, width, 
label='voyage-3', color=sns.color_palette("pastel")[0]) +rects2 = ax.bar(x + width/2, openai_means * 100, width, label='text-embedding-large-3', color=sns.color_palette("pastel")[1]) + +# Set y-axis limits +ax.set_ylim(58, 73) + +# Add labels, title, and tick labels +ax.set_ylabel('Similarity score', fontsize=20) +ax.set_xticks(x) +ax.set_xticklabels(settings, fontsize=15) +ax.legend(fontsize=12) + +# Add a grid with Seaborn style +ax.grid(False) + +plt.tick_params(axis='both', which='major', labelsize=15) + +plt.xlabel("Cited paper for aggregation", fontsize=20) + +# Function to add labels above bars +def autolabel(rects): + for rect in rects: + height = rect.get_height() + ax.annotate(f'{height:.1f}', + xy=(rect.get_x() + rect.get_width()/2, height), + xytext=(0, 3), # Offset + textcoords="offset points", + ha='center', va='bottom', fontsize=15) + +# Add bar labels +autolabel(rects1) +autolabel(rects2) + +ax.set_yticks(np.arange(58, 73, 4)) + +# Tight layout and save the figure +plt.tight_layout() +plt.savefig("ablation_study_on_paper_number_with_seaborn.pdf") +plt.show() diff --git a/research_bench/proposal_writing.py b/research_bench/proposal_writing.py index 38ff556b..82d9edfa 100644 --- a/research_bench/proposal_writing.py +++ b/research_bench/proposal_writing.py @@ -1,5 +1,5 @@ from typing import List - +import random from research_town.agents import AgentManager from research_town.configs import Config from research_town.data import Profile @@ -28,14 +28,20 @@ def write_proposal_researchtown( agent_manager=agent_manager, ) - leader_profile = profile_db.get(name=profiles[0].name)[0] - print('leader_profile', leader_profile) + leader_profile = profiles[0] leader = agent_manager.create_agent(leader_profile, role='leader') + members = [] + for member_profile in profiles[1:]: + member = agent_manager.create_agent(member_profile, role='member') + members.append(member) if not leader_profile: raise ValueError('Failed to create leader agent') + ref_contents = [ref 
for ref in ref_contents if ref is not None] + assert None not in ref_contents env.on_enter( leader=leader, + members=members, contexts=ref_contents, ) @@ -48,7 +54,7 @@ def write_proposal_researchtown( # Exit the environment and retrieve the generated proposal exit_status, exit_dict = env.on_exit() - proposal = exit_dict.get('proposal') + proposal = exit_dict.get('proposals')[0] if proposal and proposal.content: return str(proposal.content) else: @@ -77,7 +83,7 @@ def write_proposal_zero_shot(config: Config) -> str: 'Discuss any barriers that have prevented this problem from being solved until now.\n' 'Explain how your approach differs from or improves upon prior work. MAKE IT CLEAR.\n\n' '[Question 5] - What are the key components of my approach and results?\n\n' - 'Outline your proposed methodology in detail, including the method, dataset, metric that you plan to use.\n' + 'Outline your proposed methodology in detail, including the method, dataset, metric that you plan to use. But you must include these in one paragraph and not use subtitles.\n' 'Describe the expected outcomes. MAKE IT CLEAR.\n\n' 'Please provide the five core questions contents for a brand new future research that you think are the most promising one.' ), @@ -88,42 +94,63 @@ def write_proposal_zero_shot(config: Config) -> str: def write_proposal_with_only_profiles(profiles: List[Profile], config: Config) -> str: - bio_strs = '\n'.join([profile.bio for profile in profiles]) - prompt = [ - { - 'role': 'user', - 'content': ( - 'Here is a high-level summarized insight of a research field Machine Learning.\n\n' - 'Here are the five core questions:\n\n' - '[Question 1] - What is the problem?\n\n' - 'Formulate the specific research question you aim to address. 
Only output one question and do not include any more information.\n\n' - '[Question 2] - Why is it interesting and important?\n\n' - 'Explain the broader implications of solving this problem for the research community.\n' - 'Discuss how such paper will affect the future research.\n' - 'Discuss how addressing this question could advance knowledge or lead to practical applications.\n\n' - '[Question 3] - Why is it hard?\n\n' - 'Discuss the challenges and complexities involved in solving this problem.\n' - 'Explain why naive or straightforward approaches may fail.\n' - 'Identify any technical, theoretical, or practical obstacles that need to be overcome. MAKE IT CLEAR.\n\n' - "[Question 4] - Why hasn't it been solved before?\n\n" - 'Identify gaps or limitations in previous research or existing solutions.\n' - 'Discuss any barriers that have prevented this problem from being solved until now.\n' - 'Explain how your approach differs from or improves upon prior work. MAKE IT CLEAR.\n\n' - '[Question 5] - What are the key components of my approach and results?\n\n' - 'Outline your proposed methodology in detail, including the method, dataset, metric that you plan to use.\n' - 'Describe the expected outcomes. MAKE IT CLEAR.\n\n' - f'Author biographies and personas:\n{bio_strs}\n\n' - 'You are the profiles of this paper. Please provide the five core questions contents for a brand new future research based on the above biographies.' 
- ), - } - ] - response = model_prompting(config.param.base_llm, prompt, max_token_num=config.param.max_token_num)[0] - return response + fiveq_candidates = [] + profiles = profiles[:2] + print(len(profiles)) + for profile in profiles: + prompt = [ + { + 'role': 'user', + 'content': 'Who are you?', + }, + { + 'role': 'assistant', + 'content': profile.bio, + }, + { + 'role': 'user', + 'content': ( + "You need to write a research proposal for a paper in the field of Machine Learning based on your previous experience.\n" + "The research proposal should be based on your previous research experience. The proposal should be practical, reliable and potentially impactful.\n" + "Please focus on the direction that you think is the most promising one.\n" + "Here is a high-level summarized insight of a research field Machine Learning.\n\n" + "Here are the five core questions:\n\n" + "[Question 1] - What is the problem?\n\n" + "Formulate the specific research question you aim to address.\n" + "Only output one question and do not include any more information.\n" + "[Question 2] - Why is it interesting and important?\n\n" + "Explain the broader implications of solving this problem for the research community.\n" + "Discuss how such paper will affect the future research.\n" + "Discuss how addressing this question could advance knowledge or lead to practical applications.\n\n" + "[Question 3] - Why is it hard?\n\n" + "Discuss the challenges and complexities involved in solving this problem.\n" + "Explain why naive or straightforward approaches may fail.\n" + "Identify any technical, theoretical, or practical obstacles that need to be overcome. MAKE IT CLEAR.\n\n" + "[Question 4] - Why hasn't it been solved before?\n\n" + "Identify gaps or limitations in previous research or existing solutions.\n" + "Discuss any barriers that have prevented this problem from being solved until now.\n" + "Explain how your approach differs from or improves upon prior work. 
MAKE IT CLEAR.\n\n" + '[Question 5] - What are the key components of my approach and results?\n\n' + 'Outline your proposed methodology in detail, including the method, dataset, metric that you plan to use. But you must include these in one paragraph and not use subtitles.\n' + 'Describe the expected outcomes. MAKE IT CLEAR.\n\n' + 'Please brainstorm a following proposal with the given format.' + ), + } + ] + fiveq_response = model_prompting(config.param.base_llm, prompt, max_token_num=config.param.max_token_num)[0] + fiveq_candidates.append(fiveq_response) + + fused_fiveq = fuse_questions(fiveq_candidates, None, config) + return fused_fiveq def write_proposal_with_only_citations(ref_contents: List[str], config: Config) -> str: - ref_strs = '\n'.join([ref for ref in ref_contents if ref is not None]) + ref_strs = '' + for idx, ref in enumerate(ref_contents): + if ref is None: + continue + ref_strs += f'paper {idx + 1}. {ref}\n\n' prompt = [ { @@ -146,10 +173,10 @@ def write_proposal_with_only_citations(ref_contents: List[str], config: Config) 'Discuss any barriers that have prevented this problem from being solved until now.\n' 'Explain how your approach differs from or improves upon prior work. MAKE IT CLEAR.\n\n' '[Question 5] - What are the key components of my approach and results?\n\n' - 'Outline your proposed methodology in detail, including the method, dataset, metric that you plan to use.\n' + 'Outline your proposed methodology in detail, including the method, dataset, metric that you plan to use. But you must include these in one paragraph and not use subtitles.\n' 'Describe the expected outcomes. MAKE IT CLEAR.\n\n' f'Contents collect from cited papers:\n{ref_strs}\n\n' - 'Please provide the five core questions contents based on the above cited contents.' + 'Please brainstorm a following proposal with the given format.' 
), } ] @@ -315,12 +342,331 @@ def write_proposal_sakana_ai_scientist( else: return conversation[-1]['content'].split('I am done')[0] +def write_proposal_debug(profiles: List[Profile], ref_contents: List[str], config: Config) -> str: + random.seed(0) + random.shuffle(ref_contents) + ref_strs = '' + for idx, ref in enumerate(ref_contents): + if ref is None: + continue + ref_strs += f'paper {idx + 1}. {ref}\n' + profile_str = '\n'.join([profile.bio for profile in profiles]) + + prompt = [ + { + 'role': 'user', + 'content': ( + 'Here is a high-level summarized insight of a research field Machine Learning.\n\n' + 'Here are the five core questions:\n\n' + '[Question 1] - What is the problem?\n\n' + 'Formulate the specific research question you aim to address. Only output one question and do not include any more information.\n\n' + '[Question 2] - Why is it interesting and important?\n\n' + 'Explain the broader implications of solving this problem for the research community.\n' + 'Discuss how such paper will affect the future research.\n' + 'Discuss how addressing this question could advance knowledge or lead to practical applications.\n\n' + '[Question 3] - Why is it hard?\n\n' + 'Discuss the challenges and complexities involved in solving this problem.\n' + 'Explain why naive or straightforward approaches may fail.\n' + 'Identify any technical, theoretical, or practical obstacles that need to be overcome. MAKE IT CLEAR.\n\n' + "[Question 4] - Why hasn't it been solved before?\n\n" + 'Identify gaps or limitations in previous research or existing solutions.\n' + 'Discuss any barriers that have prevented this problem from being solved until now.\n' + 'Explain how your approach differs from or improves upon prior work. MAKE IT CLEAR.\n\n' + '[Question 5] - What are the key components of my approach and results?\n\n' + 'Outline your proposed methodology in detail, including the method, dataset, metric that you plan to use. 
But you must include these in one paragraph and not use subtitles.\n' + 'Describe the expected outcomes. MAKE IT CLEAR.\n\n' + f'Contents collect from cited papers:\n{ref_strs}\n\n' + 'Please brainstorm a following proposal with the given format.' + ), + } + ] + generated_5q = model_prompting(config.param.base_llm, prompt, max_token_num=config.param.max_token_num)[0] + # delete all things after [Question 5] + generated_4q = generated_5q.split('[Question 5]')[0] + + prompt = [ + { + 'role': 'user', + 'content': ( + 'Here is a high-level summarized insight of a research field Machine Learning.\n\n' + 'Here are the five core questions:\n\n' + '[Question 1] - What is the problem?\n\n' + 'Formulate the specific research question you aim to address. Only output one question and do not include any more information.\n\n' + '[Question 2] - Why is it interesting and important?\n\n' + 'Explain the broader implications of solving this problem for the research community.\n' + 'Discuss how such paper will affect the future research.\n' + 'Discuss how addressing this question could advance knowledge or lead to practical applications.\n\n' + '[Question 3] - Why is it hard?\n\n' + 'Discuss the challenges and complexities involved in solving this problem.\n' + 'Explain why naive or straightforward approaches may fail.\n' + 'Identify any technical, theoretical, or practical obstacles that need to be overcome. MAKE IT CLEAR.\n\n' + "[Question 4] - Why hasn't it been solved before?\n\n" + 'Identify gaps or limitations in previous research or existing solutions.\n' + 'Discuss any barriers that have prevented this problem from being solved until now.\n' + 'Explain how your approach differs from or improves upon prior work. MAKE IT CLEAR.\n\n' + '[Question 5] - What are the key components of my approach and results?\n\n' + 'Outline your proposed methodology in detail, including the method, dataset, metric that you plan to use. 
But you must include these in one paragraph and not use subtitles.\n' + 'Describe the expected outcomes. MAKE IT CLEAR.\n\n' + 'This is the generated [Question 1] to [Question 4] based on the citation papers.\n' + f'{generated_4q}\n\n' + 'You have a group of researchers who the bio is as follows:\n' + f'{profile_str}\n\n' + 'When you are generating [Question 5], you can think how to do the thing based on your bio information.\n' + 'Please brainstorm a following proposal with the given format. You should still start with [Question 1] to [Question 5]. But the content from [Question 1] to [Question 4] is already given and you just copy them as part of the output.' + ), + } + ] + response = model_prompting(config.param.base_llm, prompt, max_token_num=config.param.max_token_num)[0] + return response + +import random +from typing import List + +def fuse_questions(fiveq_candidates: List[str], ref_strs: str, config: Config) -> str: + fiveq_candidate_strs = '\n'.join([f'Proposal {idx + 1}.\n\n {candidate}\n\n' for idx, candidate in enumerate(fiveq_candidates)]) + prompt = [ + { + 'role': 'user', + 'content': ( + f"Here is a high-level summarized insight of a research field: Machine Learning.\n\n" + f"Here are the five core questions to consider:\n\n" + f"[Question 1] - What is the problem?\n" + f"[Question 2] - Why is it interesting and important?\n" + f"[Question 3] - Why is it hard?\n" + f"[Question 4] - Why hasn't it been solved before?\n" + f"[Question 5] - What are the key components of my approach and results?\n\n" + f"Multiple proposals have been generated for the above questions:\n{fiveq_candidate_strs}\n\n" + f"Your task is to summarize and select the key insights that are suitable from these proposals.\n" + f"1. Identify shared themes and common points among the proposals.\n" + f"2. Highlight and select any valuable perspectives or contrasting elements and combine them into one proposal.\n" + f"3. 
Provide a concise proposal for each question based on the proposal candidates.\n\n" + f"Output the result in the provided five question format.\n\n" + f"Ensure the generated proposal is clear, concise, and avoids repeating full proposals verbatim." + ), + } + ] + summarized_response = model_prompting(config.param.base_llm, prompt, max_token_num=config.param.max_token_num) + return summarized_response[0].strip() + + +def write_proposal_fake_researchtown( + profiles: List[Profile], + ref_contents: List[str], + config: Config, +) -> str: + fiveq_candidates = [] + profiles = profiles[:1] + + ref_contents = [ref for ref in ref_contents if ref is not None] + ref_strs = '\n'.join([f'paper {idx + 1}. {ref}' for idx, ref in enumerate(ref_contents) if ref]) + + + for idx, profile in enumerate(profiles): + print(f'profile {idx + 1}!') + + + prompt = [ + { + 'role': 'user', + 'content': 'Who are you?', + }, + { + 'role': 'assistant', + 'content': profile.bio, + }, + { + 'role': 'user', + 'content': ( + f'Here is the content collected from related papers:\n{ref_strs}\n\n' + "You need to write a research proposal for a paper in the field of Machine Learning based on these related papers.\n" + "The research proposal should more rely on the cited paper not based on your research experience.\n" + "Your research experience should be utilized to select the most useful and valuable papers included in the related papers for proposal writing.\n" + "Here is a high-level summarized insight of a research field Machine Learning.\n\n" + "Here are the five core questions:\n\n" + "[Question 1] - What is the problem?\n\n" + "Formulate the specific research question you aim to address.\n" + "Only output one question and do not include any more information.\n" + "[Question 2] - Why is it interesting and important?\n\n" + "Explain the broader implications of solving this problem for the research community.\n" + "Discuss how such paper will affect the future research.\n" + "Discuss how addressing 
this question could advance knowledge or lead to practical applications.\n\n" + "[Question 3] - Why is it hard?\n\n" + "Discuss the challenges and complexities involved in solving this problem.\n" + "Explain why naive or straightforward approaches may fail.\n" + "Identify any technical, theoretical, or practical obstacles that need to be overcome. MAKE IT CLEAR.\n\n" + "[Question 4] - Why hasn't it been solved before?\n\n" + "Identify gaps or limitations in previous research or existing solutions.\n" + "Discuss any barriers that have prevented this problem from being solved until now.\n" + "Explain how your approach differs from or improves upon prior work. MAKE IT CLEAR.\n\n" + '[Question 5] - What are the key components of my approach and results?\n\n' + 'Outline your proposed methodology in detail, including the method, dataset, metric that you plan to use. But you must include these in one paragraph and not use subtitles.\n' + 'Describe the expected outcomes. MAKE IT CLEAR.\n\n' + 'Please brainstorm a following proposal with the given format' + ), + } + ] + fiveq_response = model_prompting(config.param.base_llm, prompt, max_token_num=config.param.max_token_num)[0] + fiveq_candidates.append(fiveq_response) + ''' + + ref_strs = '\n'.join([f'paper {idx + 1}. 
{ref}' for idx, ref in enumerate(ref_contents) if ref]) + prompt = [ + { + 'role': 'user', + 'content': ( + f'Here is the content collected from related papers:\n{ref_strs}\n\n' + "You need to write a research proposal for a paper in the field of Machine Learning based on these related papers.\n" + "The research proposal should more rely on the cited paper not based on your research experience.\n" + "Your research experience should be utilized to select the most useful and valuable papers included in the related papers for proposal writing.\n" + "Here is a high-level summarized insight of a research field Machine Learning.\n\n" + "Here are the five core questions:\n\n" + "[Question 1] - What is the problem?\n\n" + "Formulate the specific research question you aim to address.\n" + "Only output one question and do not include any more information.\n" + "[Question 2] - Why is it interesting and important?\n\n" + "Explain the broader implications of solving this problem for the research community.\n" + "Discuss how such paper will affect the future research.\n" + "Discuss how addressing this question could advance knowledge or lead to practical applications.\n\n" + "[Question 3] - Why is it hard?\n\n" + "Discuss the challenges and complexities involved in solving this problem.\n" + "Explain why naive or straightforward approaches may fail.\n" + "Identify any technical, theoretical, or practical obstacles that need to be overcome. MAKE IT CLEAR.\n\n" + "[Question 4] - Why hasn't it been solved before?\n\n" + "Identify gaps or limitations in previous research or existing solutions.\n" + "Discuss any barriers that have prevented this problem from being solved until now.\n" + "Explain how your approach differs from or improves upon prior work. MAKE IT CLEAR.\n\n" + '[Question 5] - What are the key components of my approach and results?\n\n' + 'Outline your proposed methodology in detail, including the method, dataset, metric that you plan to use. 
But you must include these in one paragraph and not use subtitles.\n' + 'Describe the expected outcomes. MAKE IT CLEAR.\n\n' + 'Please brainstorm a following proposal with the given format' + ), + } + ] + fiveq_candidates = model_prompting(config.param.base_llm, prompt, max_token_num=config.param.max_token_num, return_num=len(profiles), temperature=1., top_p=0.8) + print('len(fiveq_candidates)', len(fiveq_candidates)) + ''' + + fused_fiveq = fuse_questions(fiveq_candidates, ref_strs, config) + return fused_fiveq, fiveq_candidates + + +def write_proposal_fake_researchtown_twice( + profiles: List[Profile], + ref_contents: List[str], + config: Config, +) -> str: + + fiveq_candidates = [] + for profile in profiles: + ref_contents = [ref for ref in ref_contents if ref is not None] + ref_strs = '\n'.join([f'paper {idx + 1}. {ref}' for idx, ref in enumerate(ref_contents) if ref]) + + prompt = [ + { + 'role': 'user', + 'content': 'Who are you?', + }, + { + 'role': 'assistant', + 'content': profile.bio, + }, + { + 'role': 'user', + 'content': ( + f'Here is the content collected from related papers:\n{ref_strs}\n\n' + "You need to write a research proposal for a paper in the field of Machine Learning based on these related papers.\n" + "The research proposal should more rely on the cited paper not based on your research experience.\n" + "Your research experience should be utilized to select the most useful and valuable papers included in the related papers for proposal writing.\n" + "Here is a high-level summarized insight of a research field Machine Learning.\n\n" + "Here are the five core questions:\n\n" + "[Question 1] - What is the problem?\n\n" + "Formulate the specific research question you aim to address.\n" + "Only output one question and do not include any more information.\n" + "[Question 2] - Why is it interesting and important?\n\n" + "Explain the broader implications of solving this problem for the research community.\n" + "Discuss how such paper will affect the 
future research.\n" + "Discuss how addressing this question could advance knowledge or lead to practical applications.\n\n" + "[Question 3] - Why is it hard?\n\n" + "Discuss the challenges and complexities involved in solving this problem.\n" + "Explain why naive or straightforward approaches may fail.\n" + "Identify any technical, theoretical, or practical obstacles that need to be overcome. MAKE IT CLEAR.\n\n" + "[Question 4] - Why hasn't it been solved before?\n\n" + "Identify gaps or limitations in previous research or existing solutions.\n" + "Discuss any barriers that have prevented this problem from being solved until now.\n" + "Explain how your approach differs from or improves upon prior work. MAKE IT CLEAR.\n\n" + '[Question 5] - What are the key components of my approach and results?\n\n' + 'Outline your proposed methodology in detail, including the method, dataset, metric that you plan to use. But you must include these in one paragraph and not use subtitles.\n' + 'Describe the expected outcomes. MAKE IT CLEAR.\n\n' + 'Please brainstorm a following proposal with the given format.' + ), + } + ] + fiveq_response = model_prompting(config.param.base_llm, prompt, max_token_num=config.param.max_token_num)[0] + fiveq_candidates.append(fiveq_response) + + fused_fiveq = fuse_questions(fiveq_candidates, ref_strs, config) + + fiveq_candidates = [] + for profile in profiles: + ref_contents = [ref for ref in ref_contents if ref is not None] + ref_strs = '\n'.join([f'paper {idx + 1}. 
{ref}' for idx, ref in enumerate(ref_contents) if ref]) + + prompt = [ + { + 'role': 'user', + 'content': 'Who are you?', + }, + { + 'role': 'assistant', + 'content': profile.bio, + }, + { + 'role': 'user', + 'content': ( + f'Here is the content collected from related papers:\n{ref_strs}\n\n' + f'Here is the research proposal generated from the previous round:\n{fused_fiveq}\n\n' + "You need to refine a research proposal for a paper in the field of Machine Learning based on these related papers.\n" + "The research proposal should more rely on the cited paper and the previous research proposal not based on your research experience.\n" + "Your research experience should be utilized to select the most useful and valuable papers included in the related papers for proposal writing.\n" + "Here is a high-level summarized insight of a research field Machine Learning.\n\n" + "Here are the five core questions:\n\n" + "[Question 1] - What is the problem?\n\n" + "Formulate the specific research question you aim to address.\n" + "Only output one question and do not include any more information.\n" + "[Question 2] - Why is it interesting and important?\n\n" + "Explain the broader implications of solving this problem for the research community.\n" + "Discuss how such paper will affect the future research.\n" + "Discuss how addressing this question could advance knowledge or lead to practical applications.\n\n" + "[Question 3] - Why is it hard?\n\n" + "Discuss the challenges and complexities involved in solving this problem.\n" + "Explain why naive or straightforward approaches may fail.\n" + "Identify any technical, theoretical, or practical obstacles that need to be overcome. 
MAKE IT CLEAR.\n\n" + "[Question 4] - Why hasn't it been solved before?\n\n" + "Identify gaps or limitations in previous research or existing solutions.\n" + "Discuss any barriers that have prevented this problem from being solved until now.\n" + "Explain how your approach differs from or improves upon prior work. MAKE IT CLEAR.\n\n" + '[Question 5] - What are the key components of my approach and results?\n\n' + 'Outline your proposed methodology in detail, including the method, dataset, metric that you plan to use. But you must include these in one paragraph and not use subtitles.\n' + 'Describe the expected outcomes. MAKE IT CLEAR.\n\n' + 'Please brainstorm a following proposal with the given format.' + ), + } + ] + fiveq_response = model_prompting(config.param.base_llm, prompt, max_token_num=config.param.max_token_num)[0] + fiveq_candidates.append(fiveq_response) + + fused_fiveq = fuse_questions(fiveq_candidates, ref_strs, config) + + return fused_fiveq + + def write_proposal( mode: str, profiles: List[Profile], ref_contents: List[str], config: Config, + target_paper_title: str, ) -> str: if mode == 'zero_shot': return write_proposal_zero_shot(config=config) @@ -328,13 +674,13 @@ def write_proposal( return write_proposal_with_only_profiles(profiles=profiles, config=config) elif mode == 'citation_only': return write_proposal_with_only_citations( - ref_contents=ref_contents, config=config + ref_contents=ref_contents, config=config, ) elif mode == 'author_citation': return write_proposal_with_profiles_and_citations( profiles=profiles, ref_contents=ref_contents, config=config ) - elif mode == 'textgnn': + elif mode == 'research_town': return write_proposal_researchtown( profiles=profiles, ref_contents=ref_contents, config=config ) @@ -342,5 +688,17 @@ def write_proposal( return write_proposal_sakana_ai_scientist( ref_contents=ref_contents, config=config, num_reflections=5 ) + elif mode == 'debug': + return write_proposal_debug( + profiles=profiles, 
import json

# Bookkeeping keys stripped from every author/reviewer profile when
# building the released benchmark files.
_PROFILE_KEYS_TO_DROP = (
    'project_name',
    'bio',
    'institute',
    'embed',
    'is_leader_candidate',
    'is_member_candidate',
    'is_reviewer_candidate',
    'is_chair_candidate',
)


def _clean_profiles(profiles):
    """Normalize a {profile_id: profile_dict} mapping in place.

    Paired ``pub_titles``/``pub_abstracts`` lists are folded into a single
    ``publications`` list (entries with a missing abstract are dropped), and
    internal bookkeeping keys are removed.  Uses ``dict.pop`` with a default
    so a profile missing one of those keys no longer aborts the conversion
    with a KeyError.
    """
    for profile in profiles.values():
        pub_titles = profile.pop('pub_titles', [])
        pub_abstracts = profile.pop('pub_abstracts', [])
        for title, abstract in zip(pub_titles, pub_abstracts):
            if abstract is None:
                continue
            # Only create 'publications' when there is at least one usable pub,
            # matching the original output schema.
            profile.setdefault('publications', []).append(
                {'title': title, 'abstract': abstract}
            )
        for key in _PROFILE_KEYS_TO_DROP:
            profile.pop(key, None)


def _filter_references(paper_data):
    """Drop references without an abstract and strip 'reference_section' (in place)."""
    filtered_references = []
    for reference in paper_data['references']:
        reference.pop('reference_section', None)
        if reference['abstract'] is not None:
            filtered_references.append(reference)
    paper_data['references'] = filtered_references


def process_value(value, with_reviews=False):
    """Convert one raw paper-bench entry into the released format (in place).

    Moves ``reference_proposal`` under ``paper_data['evaluation_format']``,
    filters out references that have no abstract, and compacts the author
    profiles.  Reviews are removed unless ``with_reviews`` is True.

    Returns the mutated ``value`` for call-chaining.
    """
    value['paper_data']['evaluation_format'] = {
        'model': 'gpt-4o-mini',
        'content': value['reference_proposal'],
    }
    del value['reference_proposal']
    if not with_reviews:
        value.pop('reviews', None)
    _filter_references(value['paper_data'])
    _clean_profiles(value['author_data'])
    return value


def process_review_bench_value(value):
    """Convert one raw review-bench entry into the released format (in place).

    Same paper/author normalization as :func:`process_value`, plus:
    renames ``reviews`` -> ``review_data``, hoists ``full_content`` under
    ``paper_data``, attaches each reviewer's assignment similarity to their
    profile, and renames the per-review bullet fields used for evaluation.
    """
    value['paper_data']['evaluation_format'] = {
        'model': 'gpt-4o-mini',
        'content': value['reference_proposal'],
    }
    del value['reference_proposal']
    _filter_references(value['paper_data'])

    value['review_data'] = value.pop('reviews')
    _clean_profiles(value['author_data'])
    _clean_profiles(value['reviewer_data'])

    value['paper_data']['full_content'] = value.pop('full_content')

    # Attach each reviewer's assignment similarity to their profile.
    reviewer_assign_similarity = value.pop('reviewer_assign_similarity')
    for reviewer_id, reviewer in value['reviewer_data'].items():
        reviewer['reviewer_match_similarity'] = reviewer_assign_similarity[reviewer_id]

    # Rename the bullet fields to the names the evaluation harness expects.
    for review in value['review_data']:
        review['strength_evaluation_format'] = review.pop('strengths_bullet')
        review['weakness_evaluation_format'] = review.pop('weaknesses_bullet')
    return value
def resplit_dataset_by_ids(dataset, easy_ids, mid_ids, hard_ids):
    """Re-bucket `dataset` rows into (easy, mid, hard) by paper_id membership.

    dataset: a list of dicts, each containing "paper_id".
    easy_ids, mid_ids, hard_ids: IDs used to form the new splits.
    IDs absent from `dataset` are silently skipped, so the returned lists
    may be shorter than the corresponding ID lists.
    """
    # Dictionary for O(1) lookup by paper id.
    by_id = {d['paper_id']: d for d in dataset}
    easy_data = [by_id[i] for i in easy_ids if i in by_id]
    mid_data = [by_id[i] for i in mid_ids if i in by_id]
    hard_data = [by_id[i] for i in hard_ids if i in by_id]
    return easy_data, mid_data, hard_data


def _read_jsonl(path):
    """Read every row of a JSONL file into a list."""
    # Local import: `jsonlines` is a third-party dependency used only here.
    import jsonlines
    with jsonlines.open(path, 'r') as reader:
        return list(reader)


def _write_jsonl(path, rows):
    """Write `rows` to a JSONL file."""
    import jsonlines
    with jsonlines.open(path, 'w') as writer:
        writer.write_all(rows)


def _load_mode_results(mode):
    """Concatenate the mid/hard/easy result shards for one writing mode."""
    return (
        _read_jsonl(f'./results/paper_bench_mid_500_result_4o_mini_{mode}_with_nv_filtered.jsonl')
        + _read_jsonl(f'./results/paper_bench_hard_500_result_4o_mini_{mode}_with_nv_filtered.jsonl')
        + _read_jsonl(f'./results/paper_bench_easy_500_result_4o_mini_{mode}_with_nv_filtered.jsonl')
    )


def _write_resplit(mode, easy, mid, hard):
    """Persist the three re-split shards for one writing mode."""
    _write_jsonl(f'./results/paper_bench_easy_500_result_4o_mini_{mode}_resplit.jsonl', easy)
    _write_jsonl(f'./results/paper_bench_mid_500_result_4o_mini_{mode}_resplit.jsonl', mid)
    _write_jsonl(f'./results/paper_bench_hard_500_result_4o_mini_{mode}_resplit.jsonl', hard)


def main():
    # 1-2. Load the citation-only shards and rank papers by average
    # OpenAI similarity (mean of the five per-question scores), descending.
    citation_dataset = _load_mode_results('citation_only')
    for data in citation_dataset:
        data['avg_openai_sim'] = sum(data[f'openai_sim_q{i}'] for i in range(1, 6)) / 5
    citation_dataset.sort(key=lambda x: x['avg_openai_sim'], reverse=True)

    # 3. Highest-similarity third is "easy", middle third "mid", rest "hard".
    new_easy = citation_dataset[:333]
    new_mid = citation_dataset[333:667]  # next 334
    new_hard = citation_dataset[667:]
    _write_resplit('citation_only', new_easy, new_mid, new_hard)

    # Show average similarity for sanity check.
    print('Citation-only splits:')
    print('Easy avg:', sum(d['avg_openai_sim'] for d in new_easy) / len(new_easy))
    print('Mid avg:', sum(d['avg_openai_sim'] for d in new_mid) / len(new_mid))
    print('Hard avg:', sum(d['avg_openai_sim'] for d in new_hard) / len(new_hard))

    # 4. ID-based membership in each split drives every other mode's re-split.
    easy_ids = [d['paper_id'] for d in new_easy]
    mid_ids = [d['paper_id'] for d in new_mid]
    hard_ids = [d['paper_id'] for d in new_hard]

    # 6. Re-split the remaining modes with the citation-derived IDs.
    for mode in ('fake_research_town', 'author_only', 'zero_shot'):
        dataset = _load_mode_results(mode)
        easy, mid, hard = resplit_dataset_by_ids(dataset, easy_ids, mid_ids, hard_ids)
        _write_resplit(mode, easy, mid, hard)

    # 7. Rebuild the benchmark JSON files with the new membership.
    data = {}
    for split in ('easy', 'mid', 'hard'):
        with open(f'./paper_bench/paper_bench_{split}_500_filtered_1205.json', 'r') as f:
            data.update(json.load(f))

    # Sets give O(1) membership tests instead of scanning 333-item lists.
    easy_id_set, mid_id_set, hard_id_set = set(easy_ids), set(mid_ids), set(hard_ids)
    buckets = {'easy': {}, 'mid': {}, 'hard': {}}
    for paper_id, value in data.items():
        value = process_value(value)  # defined earlier in this module
        if paper_id in easy_id_set:
            buckets['easy'][paper_id] = value
        elif paper_id in mid_id_set:
            buckets['mid'][paper_id] = value
        elif paper_id in hard_id_set:
            buckets['hard'][paper_id] = value
    for split, bucket in buckets.items():
        with open(f'./paper_bench/paper_bench_{split}.json', 'w') as f:
            json.dump(bucket, f, indent=4)

    # 8. OOD (high-impact) bench: same per-entry conversion, no re-split.
    with open('./oodbench/oodbench_1203_filtered.json', 'r') as f:
        high_impact_data = json.load(f)
    new_high_impact = {pid: process_value(v) for pid, v in high_impact_data.items()}
    with open('./oodbench/high_impact_paper_bench.json', 'w') as f:
        json.dump(new_high_impact, f, indent=4)

    # 9. ICLR review bench uses the review-specific conversion.
    with open('./iclrbench/iclrbench_reviewers_filtered_bullets.json', 'r') as f:
        iclr_data = json.load(f)
    new_iclr = {pid: process_review_bench_value(v) for pid, v in iclr_data.items()}
    with open('./iclrbench/review_bench.json', 'w') as f:
        json.dump(new_iclr, f, indent=4)
    # NOTE: removed the trailing `import pdb; pdb.set_trace()` debug leftover.


if __name__ == '__main__':
    main()
+ ''' + if ref['reference_section'] is None or ref['abstract'] is None: + continue + reference_sections = [section.lower() for section in ref['reference_section']] + + exclude_signal = False + for section in reference_sections: + #if 'related work' in section: + # ref_abstracts_full.append(ref['abstract']) + # break + #if 'introduction' in section: + # ref_abstracts_full.append(ref['abstract']) + # break + #if 'introduction' in section or 'related work' in section: + # ref_abstracts_full.append(ref['abstract']) + # break + + #if 'related work' in section: + # exclude_signal = True + # break + #elif 'introduction' in section: + # exclude_signal = True + # break + + #if exclude_signal is False: + # ref_abstracts_full.append(ref['abstract']) + ''' + print(len(ref_abstracts_full)) + paper_title = paper_data['title'] + if mode == 'fake_research_town': + gen_proposal, gen_proposals_each_agent = write_proposal(mode, profiles, ref_abstracts_full, config, paper_title) + else: + gen_proposal = write_proposal(mode, profiles, ref_abstracts_full, config, paper_title) + + if mode == 'fake_research_town': + overall_metrics = defaultdict(list) + # assert each agent gen proposal is the same + for idx, gen_proposal in enumerate(gen_proposals_each_agent): + metrics = compute_proposal_metrics(ref_proposal, gen_proposal) + for metric, score in metrics.items(): + overall_metrics[metric + '_per_agent'].append(score) + metrics = compute_proposal_metrics(ref_proposal, gen_proposal) + for metric, score in metrics.items(): + overall_metrics[metric] = score + results = { + 'paper_id': paper_id, + 'ref_proposal': ref_proposal, + 'gen_proposal': gen_proposal, + } + return results, overall_metrics + else: + metrics = compute_proposal_metrics(ref_proposal, gen_proposal) + results = { + 'paper_id': paper_id, + 'ref_proposal': ref_proposal, + 'gen_proposal': gen_proposal, + } + return results, metrics def load_papers(input_path: str, output_path: str) -> Any: @@ -79,8 +134,11 @@ def main() -> None: 
'author_only', 'citation_only', 'author_citation', - 'textgnn', + 'research_town', 'sakana_ai_scientist', + 'debug', + 'fake_research_town', + 'fake_research_town_twice', ], help='Processing mode', ) @@ -113,17 +171,6 @@ def main() -> None: ] } - for paper_id, data in tqdm(dataset.items(), desc='Processing papers'): - paper_data = data['paper_data'] - author_data = data['author_data'] - reference_proposal = data['reference_proposal'] - - results, metrics = inference( - paper_id, paper_data, author_data, reference_proposal, args.mode, config - ) - lock = Lock() - save_results(results, metrics, args.output_path, lock) - lock = Lock() with Pool(processes=args.num_processes) as pool: tasks = [ diff --git a/research_bench/run_eval.sh b/research_bench/run_eval.sh index e8663b5c..6ad8043a 100755 --- a/research_bench/run_eval.sh +++ b/research_bench/run_eval.sh @@ -1,15 +1,15 @@ #!/bin/bash # Define the input and output paths, along with the modes to test -INPUT_PATH="./mlbench/mlbench.json" +INPUT_PATH="./paper_bench/agent_number_ablation_paper_bench_with_relatedness.json" OUTPUT_DIR="./results" -MODES=("citation_only") -NUM_PROCESSES=4 +MODES=("author_only") +NUM_PROCESSES=6 # Loop through each mode and run the evaluation for MODE in "${MODES[@]}" do - OUTPUT_PATH="${OUTPUT_DIR}/mlbench_result_4o_mini_${MODE}.jsonl" + OUTPUT_PATH="${OUTPUT_DIR}/agent_number_ablation_two_author_result_4o_mini_${MODE}.jsonl" echo "Running evaluation for mode: $MODE" poetry run python run_eval.py --input "$INPUT_PATH" --output "$OUTPUT_PATH" --mode "$MODE" --num_processes "$NUM_PROCESSES" echo "Finished evaluation for mode: $MODE" diff --git a/research_bench/split_paper_bench.py b/research_bench/split_paper_bench.py new file mode 100644 index 00000000..9b12bfdd --- /dev/null +++ b/research_bench/split_paper_bench.py @@ -0,0 +1,137 @@ +import json +import jsonlines +import matplotlib.pyplot as plt +import numpy as np +from scipy.stats import pearsonr +from collections import defaultdict 
import random

# Fixed seed so the random "mid" sample is reproducible across runs.
random.seed(42)


def load_paper_bench(filepath):
    """Load the full paper-bench mapping {paper_id: entry} from a JSON file."""
    with open(filepath, 'r') as f:
        return json.load(f)


def load_filtered_dataset(filepaths, paper_ids):
    """Concatenate result rows from JSONL files, keeping only known paper_ids."""
    wanted = set(paper_ids)  # O(1) membership instead of scanning a list per row
    dataset = []
    for file_path in filepaths:
        with jsonlines.open(file_path, 'r') as f:
            dataset.extend(obj for obj in f if obj['paper_id'] in wanted)
    return dataset


def calculate_openai_similarity(data):
    """Unweighted mean of the five per-question OpenAI similarity scores."""
    return sum(data[f'openai_sim_q{i}'] for i in range(1, 6)) / 5.0


def add_openai_sim_to_dataset(dataset):
    """Annotate each row in place with its 'openai_sim_5q_avg'."""
    for data in dataset:
        data['openai_sim_5q_avg'] = calculate_openai_similarity(data)


def sort_dataset_by_similarity(dataset):
    """Return the rows sorted ascending by 'openai_sim_5q_avg'."""
    return sorted(dataset, key=lambda x: x['openai_sim_5q_avg'])


def analyze_categories(dataset, paper_bench_data):
    """Group each row's similarity under every arXiv category of its paper."""
    category_openai_sim_5q_avg = defaultdict(list)
    for data in dataset:
        paper_data = paper_bench_data[data['paper_id']]
        for category in paper_data['paper_data']['categories']:
            category_openai_sim_5q_avg[category].append(data['openai_sim_5q_avg'])
    return category_openai_sim_5q_avg


def calculate_category_stats(category_data):
    """Return ({category: mean similarity}, {category: paper count})."""
    category_avg = {}
    category_num = {}
    for category, sim_values in category_data.items():
        category_avg[category] = np.mean(sim_values)
        category_num[category] = len(sim_values)
    return category_avg, category_num


def print_category_stats(category_avg, category_num, min_papers=10):
    """Print categories (ascending mean similarity) with more than `min_papers`."""
    for category, avg in sorted(category_avg.items(), key=lambda x: x[1]):
        if category_num[category] > min_papers:
            print(f"{category}: {avg:.2f} ({category_num[category]} papers)")


def extract_ratings(dataset, paper_bench_data):
    """Pair each reviewed paper's mean reviewer rating with its similarity."""
    checks = []
    for data in dataset:
        entry = paper_bench_data[data['paper_id']]
        if 'reviews' not in entry:
            continue
        reviews = entry['reviews']
        # Ratings look like '6: marginally above ...'; keep the leading number.
        rating = sum(int(review['rating'].split(':')[0]) for review in reviews) / len(reviews)
        checks.append({
            'rating': rating,
            'paper_id': data['paper_id'],
            'openai_sim_5q_avg': data['openai_sim_5q_avg'],
        })
    return checks


def calculate_correlation(ratings, similarities):
    """Pearson correlation (coefficient, p-value) between ratings and similarities."""
    return pearsonr(ratings, similarities)


def save_dataset(entries, paper_bench_data, output_path):
    """Write {paper_id: full benchmark entry} for the given result rows."""
    saved_data = {entry['paper_id']: paper_bench_data[entry['paper_id']] for entry in entries}
    with open(output_path, 'w') as f:
        json.dump(saved_data, f, indent=4)
    print(f"Saved entries to {output_path}")


def main():
    paper_bench_file = './paper_bench/paper_bench_full.json'
    result_files = [
        './results/paper_bench_result_4o_mini_citation_only_part1.jsonl',
        './results/paper_bench_result_4o_mini_citation_only_part2.jsonl',
    ]
    paper_bench_full = load_paper_bench(paper_bench_file)
    paper_ids = list(paper_bench_full.keys())

    dataset = load_filtered_dataset(result_files, paper_ids)
    add_openai_sim_to_dataset(dataset)

    # Deduplicate: keep only the first result row seen per paper_id.
    # (Set-based; the old list.remove() approach was O(n^2).)
    seen = set()
    filtered_dataset = []
    for data in dataset:
        if data['paper_id'] not in seen:
            seen.add(data['paper_id'])
            filtered_dataset.append(data)

    filtered_dataset_sorted = sort_dataset_by_similarity(filtered_dataset)

    # Lowest-similarity 500 are "hard", highest 500 "easy", random 500 "mid".
    bottom_500 = filtered_dataset_sorted[:500]
    top_500 = filtered_dataset_sorted[-500:]
    mid_500 = random.sample(filtered_dataset_sorted[500:-500], 500)

    category_data = analyze_categories(filtered_dataset, paper_bench_full)
    category_avg, category_num = calculate_category_stats(category_data)
    print_category_stats(category_avg, category_num)

    checks = extract_ratings(filtered_dataset, paper_bench_full)
    ratings = [entry['rating'] for entry in checks]
    similarities = [entry['openai_sim_5q_avg'] for entry in checks]

    corr_coefficient, p_value = calculate_correlation(ratings, similarities)
    print(f"Pearson Correlation Coefficient: {corr_coefficient:.2f}")
    print(f"P-value: {p_value:.4f}")

    plt.figure(figsize=(8, 6))
    plt.scatter(similarities, ratings, alpha=0.7)
    plt.title('Correlation between Rating and OpenAI Similarity')
    plt.xlabel('OpenAI Similarity (5Q Avg)')
    plt.ylabel('Rating')
    plt.grid(True)
    plt.show()

    # Previously this dropped into pdb on a malformed entry; fail loudly instead.
    for key, data in paper_bench_full.items():
        if 'paper_data' not in data:
            raise KeyError(f"paper_bench entry {key} is missing 'paper_data'")

    # Uncomment to materialize the three difficulty splits:
    # save_dataset(bottom_500, paper_bench_full, './paper_bench/paper_bench_hard_500.json')
    # save_dataset(top_500, paper_bench_full, './paper_bench/paper_bench_easy_500.json')
    # save_dataset(mid_500, paper_bench_full, './paper_bench/paper_bench_mid_500.json')


if __name__ == "__main__":
    main()
plt.title('Correlation between Rating and OpenAI Similarity') + plt.xlabel('OpenAI Similarity (5Q Avg)') + plt.ylabel('Rating') + plt.grid(True) + plt.show() + + for key, data in paper_bench_full.items(): + if 'paper_data' not in data.keys(): + import pdb; pdb.set_trace() + #save_dataset(bottom_500, paper_bench_full, './paper_bench/paper_bench_hard_500.json') + #save_dataset(top_500, paper_bench_full, './paper_bench/paper_bench_easy_500.json') + #save_dataset(mid_500, paper_bench_full, './paper_bench/paper_bench_mid_500.json') + +if __name__ == "__main__": + main() diff --git a/research_bench/utils.py b/research_bench/utils.py index e5ca8aad..53f0cc46 100644 --- a/research_bench/utils.py +++ b/research_bench/utils.py @@ -142,11 +142,15 @@ def get_proposal_from_paper(arxiv_id: str, intro: str, config: Config) -> str: @with_cache(cache_dir='author_data') def get_author_data( - arxiv_id: str, authors: List[str], title: str, config: Config + arxiv_id: str, authors: List[str], title: str, config: Config, with_year_limit: bool = False ) -> Dict[str, Any]: + if with_year_limit: + before_year = int('20' + arxiv_id.split('.')[0][:2]) + else: + before_year = None profile_db = ProfileDB(config.database) profile_pks = profile_db.pull_profiles( - names=authors, config=config, known_paper_titles=[title] + names=authors, config=config, known_paper_titles=[title], before_year=before_year ) author_data = {} for pk in profile_pks: diff --git a/research_bench/visualize_score_distribution.py b/research_bench/visualize_score_distribution.py new file mode 100644 index 00000000..13341fad --- /dev/null +++ b/research_bench/visualize_score_distribution.py @@ -0,0 +1,67 @@ +import json +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np + +# Path to your JSON lines file +# jsonl_file = "./results/crossbench_1205_filtered_result_4o_mini_fake_research_town.jsonl" +jsonl_file = "./results/oodbench_1203_filtered_result_4o_mini_fake_research_town.jsonl" + +# Use seaborn's 
default style +sns.set_style("whitegrid") + +averages = [] + +# Read the JSON lines file +with open(jsonl_file, "r") as f: + for line in f: + data = json.loads(line) + + # Extract the openai_sim values for q1 to q5 + q1 = data.get("openai_sim_q1") + q2 = data.get("openai_sim_q2") + q3 = data.get("openai_sim_q3") + q4 = data.get("openai_sim_q4") + q5 = data.get("openai_sim_q5") + + # If all values are present, compute the average + if None not in [q1, q2, q3, q4, q5]: + avg_score = (q1 + q2 + q3 + q4 + q5) / 5.0 + averages.append(avg_score) + +plt.figure(figsize=(6, 4)) + +# Define custom bin edges +bin_edges = [0.35, 0.45, 0.55, 0.65, 0.75, 0.85] + +# Plot a histogram of the averages with custom bins +ax = sns.histplot(averages, bins=bin_edges, kde=True, color="#2ca02c", edgecolor='white') + +# Add labels and title +plt.xlabel("Similarity score", fontsize=20) +plt.ylabel("Frequency", fontsize=20) + +# Adjust the size of tick labels +plt.tick_params(axis='both', which='major', labelsize=15) + +# Add frequencies above each bar +for patch in ax.patches: + height = patch.get_height() + if height > 0: # Only annotate non-empty bins + ax.annotate(f'{int(height)}', + (patch.get_x() + patch.get_width() / 2, height + 1), + ha='center', va='bottom', fontsize=14, color='black') + +# Make spines visible to create a box +for spine in ax.spines.values(): + spine.set_visible(True) # Ensure all spines are visible + spine.set_linewidth(1.5) # Set line width for the box + +# Disable grid for this plot +ax.grid(False) +ax.set_yticks(np.arange(0, 80, 20)) +ax.set_ylim(0, 62) + +plt.tight_layout() +plt.savefig("similarity_score_distribution_with_frequencies.pdf") +plt.show() diff --git a/research_town/dbs/db_profile.py b/research_town/dbs/db_profile.py index 360a23a3..168c47d0 100644 --- a/research_town/dbs/db_profile.py +++ b/research_town/dbs/db_profile.py @@ -37,13 +37,14 @@ def pull_profiles( names: List[str], config: Config, known_paper_titles: Optional[List[str]] = None, + 
before_year: Optional[int] = None, ) -> List[str]: profiles: List[Profile] = [] for name in names: try: pub_abstracts, pub_titles, collaborators = ( collect_publications_and_coauthors( - name, paper_max_num=20, known_paper_titles=known_paper_titles + name, paper_max_num=20, known_paper_titles=known_paper_titles, before_year=before_year ) ) logger.info(f'Collected publications for {name}: {pub_titles}') diff --git a/research_town/envs/env_proposal_writing_without_rag.py b/research_town/envs/env_proposal_writing_without_rag.py index 8ba970bc..3a7588fa 100644 --- a/research_town/envs/env_proposal_writing_without_rag.py +++ b/research_town/envs/env_proposal_writing_without_rag.py @@ -62,17 +62,13 @@ def run(self) -> Generator[Tuple[Progress, Agent], None, None]: contexts=self.contexts, config=self.config, ) - yield insight, researcher - insights.append(insight) - - # Step 3: Researchers brainstorm ideas based on their insights - for researcher in researchers: - idea = researcher.brainstorm_idea(insights=insights, config=self.config) + idea = researcher.brainstorm_idea(insights=[insight], config=self.config) yield idea, researcher + insights.append(insight) ideas.append(idea) - # Step 4: Leader summarizes ideas and writes proposals + # Step 2: Leader summarizes ideas and writes proposals idea_combos = sample_ideas(ideas, self.config.param.proposal_num) for idea_combo in idea_combos: summarized_idea = self.leader.summarize_idea( diff --git a/research_town/utils/model_prompting.py b/research_town/utils/model_prompting.py index 522d52cd..24bd81a9 100644 --- a/research_town/utils/model_prompting.py +++ b/research_town/utils/model_prompting.py @@ -30,6 +30,5 @@ def model_prompting( temperature=temperature, stream=stream, ) - content = completion.choices[0].message.content - content_l = [content] + content_l = [completion.choices[i].message.content for i in range(return_num)] return content_l diff --git a/research_town/utils/paper_collector.py 
@api_calling_error_exponential_backoff(retries=5, base_wait_time=1)
def get_papers_from_author_id(
    author_id: str, paper_max_num: int = 20, before_year: Optional[int] = None
) -> List[Dict[str, Any]]:
    """Fetch up to ``paper_max_num`` papers for a Semantic Scholar author.

    Args:
        author_id: Semantic Scholar author id.
        paper_max_num: maximum number of papers to return.
        before_year: when given, drop papers published after this year
            before truncating to ``paper_max_num``.

    Returns:
        A (possibly empty) list of paper dicts with title/abstract/authors/year.
    """
    semantic_client = SemanticScholar()
    author_data: Dict[str, Any] = semantic_client.get_author(
        author_id,
        fields=[
            'papers.title',
            'papers.abstract',
            'papers.authors',
            'papers.year',
        ],
    )

    papers = author_data['papers']
    if not isinstance(papers, list):
        return []

    if before_year is None:
        return papers[:paper_max_num]

    # Filter by publication year without mutating the API payload.
    # NOTE(review): papers with an unknown year were previously rewritten in
    # place to year 2024; we keep the same filtering rule (unknown year is
    # treated as 2024, so such papers pass only when before_year >= 2024)
    # but leave the original dicts untouched.
    filtered_papers = []
    for paper in papers:
        year = paper['year'] if paper['year'] is not None else 2024
        if year <= before_year:
            filtered_papers.append(paper)
    return filtered_papers[:paper_max_num]