Initial commit

AI-sandbox · Nov 8, 2024 · 7577647 · 7577647
commit 7577647
Show file tree

Hide file tree

Showing 105 changed files with 14,799 additions and 0 deletions.
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -0,0 +1,46 @@
+name: docs
+
+# Build the documentation whenever new commits land on main.
+on:
+  push:
+    branches:
+      - main
+    # Alternative: only build for tags.
+    # tags:
+    #   - '*'
+
+# Security: restrict the default token permissions of CI jobs to read-only repository contents.
+permissions:
+  contents: read
+
+jobs:
+  # Build the documentation and upload the static HTML files as an artifact.
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - run: pip install -e '.[docs]'  # editable install of this repo with its "docs" extra
+      - run: DOC_ALLOW_EXEC=1 pdoc --docformat google -o docs/ snputils  # NOTE(review): pdoc documents PDOC_ALLOW_EXEC; confirm this name
+
+      - uses: actions/upload-pages-artifact@v3
+        with:
+          path: docs/  # the directory pdoc writes to via `-o docs/`
+
+  # Deploy the uploaded artifact to GitHub Pages.
+  # This is a separate job so that only actions/deploy-pages has the necessary permissions.
+  deploy:
+    needs: build
+    runs-on: ubuntu-latest
+    permissions:
+      pages: write
+      id-token: write
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - id: deployment
+        uses: actions/deploy-pages@v4
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,55 @@
+name: tests
+
+# Run the tox test suite for every pull request that targets main.
+on:
+  # push:
+  #   branches:
+  #     - main
+  #     - fix_tests
+  #   tags:
+  #     - "v*" # Push events to matching v*, i.e. v1.0, v20.15.10
+  pull_request:
+    branches:
+      - main
+
+# Security: the test job only checks out and runs code, so restrict the
+# default token permissions to read-only repository contents.
+permissions:
+  contents: read
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        # Quoted so YAML keeps the versions as strings rather than floats.
+        python-version: ["3.8", "3.13"]
+
+    steps:
+    # The "Debug - ..." steps only echo progress markers into the job log.
+    - name: Debug - Starting workflow
+      run: echo "Starting workflow"
+
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Debug - Code checked out
+      run: echo "Code checked out"
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Debug - Python setup completed
+      run: echo "Python setup completed"
+
+    # Only the test tooling is installed here; tox is left to install the
+    # package and its test dependencies (presumably via the project's tox
+    # configuration -- confirm).
+    - name: Install dependencies
+      run: |
+        echo "Installing dependencies"
+        python -m pip install --upgrade pip
+        python -m pip install setuptools tox tox-gh-actions
+
+    - name: Debug - Dependencies installed
+      run: echo "Dependencies installed"
+
+    - name: Run tests
+      run: |
+        echo "Running tests with tox"
+        python -m tox
+
+    - name: Debug - Tests completed
+      run: echo "Tests completed"
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,148 @@
+/data
+/docs
+/benchmark/sbatch
+/benchmark/results
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Testing
+testing.py
+testing_pca.py
+
+# Editors
+.vscode
+.idea
+
+# MyPy
+.mypy_cache/
+
+# macOS Finder metadata
+.DS_Store
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,28 @@
+BSD 3-Clause License
+
+Copyright (c) 2024, Ioannidis Lab
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
@@ -0,0 +1,114 @@
+<p align="center">
+  <a href="https://snputils.org">
+    <img src="assets/logo.png" width="300" alt="snputils logo">
+  </a>
+</p>
+
+# snputils: A Python library for processing diverse genomes
+
+[![License BSD-3](https://img.shields.io/pypi/l/snputils.svg?color=green)](https://github.com/ai-sandbox/snputils/raw/main/LICENSE)
+[![PyPI](https://img.shields.io/pypi/v/snputils.svg?color=green)](https://pypi.org/project/snputils)
+[![Python Version](https://img.shields.io/pypi/pyversions/snputils.svg?color=green)](https://python.org)
+
+**snputils** is a Python package designed to ease the processing and analysis of common and diverse genomic datasets, while efficiently handling the complexities of diverse genome formats and operations. The library provides robust tools for handling sequencing and ancestry data, with a focus on performance, ease of use, and advanced visualization capabilities.
+
+Developed in collaboration between Stanford University's Department of Biomedical Data Science, UC Santa Cruz Genomics Institute, and more collaborators worldwide.
+
+This is an early-access release; parts of the code are likely to change significantly in the coming weeks.
+
+## Installation
+
+Basic installation using pip:
+```bash
+pip install snputils
+```
+
+Optionally, for GPU-accelerated functionalities, install the package with the `[gpu]` extra:
+```bash
+pip install "snputils[gpu]"
+```
+
+## Key Features
+
+### Ease of Use
+
+**snputils** is designed to be user-friendly and intuitive, with a simple API that allows you to quickly load, process, and visualize genomic data. For example, reading a whole genome VCF file is as simple as:
+```python
+import snputils as su
+snpobj = su.read_snp("path/to/file.vcf.gz")
+```
+
+Similarly, reading BED or PGEN filesets is straightforward:
+```python
+snpobj = su.read_snp("path/to/file.pgen")
+```
+
+Working with ancestry files, performing processing operations, and creating visualizations is just as straightforward. See the [demos directory](demos/) for examples.
+
+### File Format Support
+**snputils** aims to provide the fastest available readers and writers for various genomic data formats:
+- **VCF**: Support for `.vcf` and `.vcf.gz` files
+- **PLINK1**: Support for `.bed`, `.bim`, `.fam` filesets
+- **PLINK2**: Support for `.pgen`, `.pvar`, `.psam` filesets
+- **Local Ancestry**: Handle `.msp` local ancestry format
+- **Admixture**: Read and write `.Q` and `.P` files
+
+### Processing Tools
+- **Basic Data Manipulation**
+  - Filter variants and samples
+  - Correct SNP flips
+  - Filter out ambiguous SNPs
+
+- **Dimensionality Reduction**
+  - Standard PCA with optional GPU acceleration
+  - Missing-DNA PCA (mdPCA)
+  - Multi-array ancestry-specific MDS (maasMDS)
+
+- **Admixture Mapping**
+
+### Visualization
+- Interactive global ancestry bar plots
+- Detailed scatter plots of PCA, mdPCA, and maasMDS
+- Admixture mapping Manhattan plots
+- Local ancestry visualization
+  - Chromosome painting (with [Tagore](https://github.com/jordanlab/tagore))
+  - Dataset-level
+
+<p align="center">
+    <img src="assets/lai_dataset_level.png" width="800">
+</p>
+
+
+### Performance
+
+- Fast file I/O through built-in methods or optimized wrappers (e.g., [Pgenlib](https://pypi.org/project/Pgenlib/) for PLINK files)
+- Memory-efficient operations using [NumPy](https://numpy.org) and [Polars](https://pola.rs)
+- Optional GPU acceleration via [PyTorch](https://pytorch.org) for computationally intensive tasks
+- Support for large-scale genomic datasets through efficient memory management
+
+Our benchmark demonstrates superior performance compared to existing tools:
+
+<p align="center">
+    <img src="benchmark/benchmark.png" width="800">
+</p>
+
+*Reading performance comparison for chromosome 22 data across different tools. See the [benchmark directory](benchmark/) for detailed methodology and results.*
+
+The **snputils** package is continuously updated with new features and improvements. Future releases will include support for statistical computations, admixture simulations, command-line tools, and more.
+
+## Documentation & Support
+
+- **API Reference**: Visit our comprehensive documentation at [docs.snputils.org](https://docs.snputils.org).
+- **Tutorials & Examples**: Check out our demos in the [demos directory](demos/).
+- **Issues & Support**: [GitHub Issues](https://github.com/AI-sandbox/snputils/issues).
+
+## Acknowledgments
+
+We would like to thank the open-source Python packages that make **snputils** possible: matplotlib, NumPy, pandas, Pgenlib, polars, pong, PyTorch, scikit-allel, scikit-learn, Tagore.
+
+## Citation
+
+If you use **snputils** in your research, please cite:
+
+> Bonet, D.\*, Comajoan Cara, M.\*, Barrabés, M.\*, Smeriglio, R., Agrawal, D., Dominguez Mantes, A., López, C., Thomassin, C., Calafell, A., Luis, A., Saurina, J., Franquesa, M., Perera, M., Geleta, M., Jaras, A., Sabat, B. O., Abante, J., Moreno-Grau, S., Mas Montserrat, D., Ioannidis, A. G., snputils: A Python library for processing diverse genomes. Annual Meeting of The American Society of Human Genetics, November 2024, Denver, Colorado, USA. \* Equal contribution.
+
+Journal paper coming soon!
diff --git a/assets/lai_dataset_level.png b/assets/lai_dataset_level.png
diff --git a/assets/logo.png b/assets/logo.png