From f28bb81f9075031121a54e1837cd320c6758b358 Mon Sep 17 00:00:00 2001 From: Matthijs Pon Date: Mon, 9 Dec 2024 13:53:33 +0100 Subject: [PATCH] working version --- .editorconfig | 9 + dev/data_single_cell_expression.txt | 4 + dev/meta_single_cell_expression.txt | 8 + scripts/importer/allowed_data_types.txt | 1 + scripts/importer/cbioportal_common.py | 17 +- .../mskcc/cbio/portal/dao/DaoCancerStudy.java | 1 + .../portal/dao/DaoSingleCellExpression.java | 66 +++++ .../portal/model/GeneticAlterationType.java | 3 +- .../portal/model/SingleCellExpression.java | 69 +++++ .../portal/scripts/ImportProfileData.java | 11 +- .../ImportSingleCellExpressionData.java | 272 ++++++++++++++++++ 11 files changed, 458 insertions(+), 3 deletions(-) create mode 100644 .editorconfig create mode 100644 dev/data_single_cell_expression.txt create mode 100644 dev/meta_single_cell_expression.txt create mode 100644 src/main/java/org/mskcc/cbio/portal/dao/DaoSingleCellExpression.java create mode 100644 src/main/java/org/mskcc/cbio/portal/model/SingleCellExpression.java create mode 100644 src/main/java/org/mskcc/cbio/portal/scripts/ImportSingleCellExpressionData.java diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 00000000..f9037de4 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,9 @@ +root = true + +[*] +end_of_line = lf +insert_final_newline = true + +[*.{java,py}] +indent_style = space +indent_size = 4 diff --git a/dev/data_single_cell_expression.txt b/dev/data_single_cell_expression.txt new file mode 100644 index 00000000..16bb6c18 --- /dev/null +++ b/dev/data_single_cell_expression.txt @@ -0,0 +1,4 @@ +Sample_Id Entrez_Gene_Id Hugo_Symbol Cell_Type Tissue Expression_Value +SHAH_H000004_T09_01_WG01 TP53 CELLTYPE TISSUE 57.4 +SHAH_H000004_T09_01_WG01 7157 CELLTYPE2 TISSUE 57.4 +SAMPLE1 7157 TP53 CELLTYPE2 TISSUE 57.4 diff --git a/dev/meta_single_cell_expression.txt b/dev/meta_single_cell_expression.txt new file mode 100644 index 00000000..22bc6262 --- /dev/null +++ b/dev/meta_single_cell_expression.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: msk_spectrum_tme_2022 +genetic_alteration_type: SINGLE_CELL_EXPRESSION +datatype: SINGLE_CELL_EXPRESSION +stable_id: single_cell_expression +show_profile_in_analysis_tab: false +profile_name: single cell expression +profile_description: single cell expression test +data_filename: data_single_cell_expression.txt diff --git a/scripts/importer/allowed_data_types.txt b/scripts/importer/allowed_data_types.txt index 671bc447..4d56b09a 100644 --- a/scripts/importer/allowed_data_types.txt +++ b/scripts/importer/allowed_data_types.txt @@ -68,3 +68,4 @@ GENERIC_ASSAY LIMIT-VALUE * GENERIC_ASSAY BINARY * GENERIC_ASSAY CATEGORICAL * STRUCTURAL_VARIANT SV structural_variants +SINGLE_CELL_EXPRESSION SINGLE_CELL_EXPRESSION * diff --git a/scripts/importer/cbioportal_common.py b/scripts/importer/cbioportal_common.py index 8e6c97ae..2d018dde 100644 --- a/scripts/importer/cbioportal_common.py +++ b/scripts/importer/cbioportal_common.py @@ -66,6 +66,7 @@ class MetaFileTypes(object): CNA_CONTINUOUS = 'meta_contCNA' SEG = 'meta_segment' EXPRESSION = 'meta_expression' + SINGLE_CELL_EXPRESSION = 'meta_single_cell_expression' MUTATION = 'meta_mutations_extended' MUTATION_UNCALLED = 'meta_mutations_uncalled' METHYLATION = 'meta_methylation' @@ -214,6 +215,18 @@ class MetaFileTypes(object): 'data_filename': True, 'gene_panel': False }, + MetaFileTypes.SINGLE_CELL_EXPRESSION: { + 'cancer_study_identifier': True, + 'genetic_alteration_type': True, + 'datatype': True, + 'stable_id': True, + 'source_stable_id': False, + 'show_profile_in_analysis_tab': True, + 'profile_name': True, + 'profile_description': True, + 'data_filename': True, + 'gene_panel': False + }, MetaFileTypes.METHYLATION: { 'cancer_study_identifier': True, 'genetic_alteration_type': True, @@ -399,6 +412,7 @@ class MetaFileTypes(object): MetaFileTypes.CNA_CONTINUOUS: "org.mskcc.cbio.portal.scripts.ImportProfileData", MetaFileTypes.SEG: "org.mskcc.cbio.portal.scripts.ImportCopyNumberSegmentData", MetaFileTypes.EXPRESSION: "org.mskcc.cbio.portal.scripts.ImportProfileData", + MetaFileTypes.SINGLE_CELL_EXPRESSION: "org.mskcc.cbio.portal.scripts.ImportProfileData", MetaFileTypes.MUTATION: "org.mskcc.cbio.portal.scripts.ImportProfileData", MetaFileTypes.MUTATION_UNCALLED: "org.mskcc.cbio.portal.scripts.ImportProfileData", MetaFileTypes.METHYLATION: "org.mskcc.cbio.portal.scripts.ImportProfileData", @@ -692,7 +706,8 @@ def get_meta_file_type(meta_dictionary, logger, filename): ("GENESET_SCORE", "P-VALUE"): MetaFileTypes.GSVA_PVALUES, ("GENERIC_ASSAY", "LIMIT-VALUE"): MetaFileTypes.GENERIC_ASSAY_CONTINUOUS, ("GENERIC_ASSAY", "BINARY"): MetaFileTypes.GENERIC_ASSAY_BINARY, - ("GENERIC_ASSAY", "CATEGORICAL"): MetaFileTypes.GENERIC_ASSAY_CATEGORICAL + ("GENERIC_ASSAY", "CATEGORICAL"): MetaFileTypes.GENERIC_ASSAY_CATEGORICAL, + ("SINGLE_CELL_EXPRESSION", "SINGLE_CELL_EXPRESSION"): MetaFileTypes.SINGLE_CELL_EXPRESSION } result = None if 'genetic_alteration_type' in meta_dictionary and 'datatype' in meta_dictionary: diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java index 64e9ca59..dc37ed1b 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java @@ -570,6 +570,7 @@ public static void deleteCancerStudy(int internalCancerStudyId) throws DaoExcept "DELETE FROM patient WHERE CANCER_STUDY_ID=?", "DELETE FROM sample_list WHERE CANCER_STUDY_ID=?", "DELETE FROM structural_variant WHERE GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_profile WHERE CANCER_STUDY_ID=?)", + "DELETE FROM single_cell_expression WHERE GENETIC_PROFILE_ID IN (SELECT GENETIC_PROFILE_ID FROM genetic_profile WHERE CANCER_STUDY_ID=?)", "DELETE FROM genetic_profile_link WHERE REFERRED_GENETIC_PROFILE_ID IN (select GENETIC_PROFILE_ID FROM genetic_profile where CANCER_STUDY_ID=?)", "DELETE FROM genetic_profile WHERE CANCER_STUDY_ID=?", "DELETE FROM gistic_to_gene WHERE GISTIC_ROI_ID IN (SELECT GISTIC_ROI_ID FROM gistic WHERE CANCER_STUDY_ID=?)", diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoSingleCellExpression.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoSingleCellExpression.java new file mode 100644 index 00000000..bea176f5 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoSingleCellExpression.java @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024 The Hyve B.V. + * This code is licensed under the GNU Affero General Public License (AGPL), + * version 3, or (at your option) any later version. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +/* + * @author Matthijs Pon +*/ + +package org.mskcc.cbio.portal.dao; + +import java.sql.*; +import org.mskcc.cbio.portal.model.*; + +public class DaoSingleCellExpression { + + private DaoSingleCellExpression() { + } + + public static void addSingleCellExpression(SingleCellExpression singleCellExpression) throws DaoException { + Connection connection = null; + PreparedStatement preparedStatement = null; + ResultSet resultSet = null; + + try { + // Open connection to database + connection = JdbcUtil.getDbConnection(DaoGeneticProfileLink.class); + + // Prepare SQL statement + preparedStatement = connection.prepareStatement("INSERT INTO single_cell_expression " + + "(GENETIC_PROFILE_ID, SAMPLE_ID, TISSUE, CELL_TYPE, ENTREZ_GENE_ID, EXPRESSION_VALUE) VALUES (?,?,?,?,?,?)"); + // Fill in statement + preparedStatement.setInt(1, singleCellExpression.getGeneticProfileId()); + preparedStatement.setInt(2, singleCellExpression.getSampleId()); + preparedStatement.setString(3, singleCellExpression.getTissue()); + preparedStatement.setString(4, singleCellExpression.getCellType()); + preparedStatement.setLong(5, singleCellExpression.getGene().getEntrezGeneId()); + preparedStatement.setString(6, singleCellExpression.getExpressionValue()); + + // Execute statement + preparedStatement.execute(); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoGeneticProfileLink.class, connection, preparedStatement, resultSet); + } + } +} diff --git a/src/main/java/org/mskcc/cbio/portal/model/GeneticAlterationType.java b/src/main/java/org/mskcc/cbio/portal/model/GeneticAlterationType.java index eb650336..db6d8595 100644 --- a/src/main/java/org/mskcc/cbio/portal/model/GeneticAlterationType.java +++ b/src/main/java/org/mskcc/cbio/portal/model/GeneticAlterationType.java @@ -52,5 +52,6 @@ public enum GeneticAlterationType { PROTEIN_ARRAY_PROTEIN_LEVEL, PROTEIN_ARRAY_PHOSPHORYLATION, GENESET_SCORE, - GENERIC_ASSAY + GENERIC_ASSAY, + SINGLE_CELL_EXPRESSION }; diff --git a/src/main/java/org/mskcc/cbio/portal/model/SingleCellExpression.java b/src/main/java/org/mskcc/cbio/portal/model/SingleCellExpression.java new file mode 100644 index 00000000..fb708e0e --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/model/SingleCellExpression.java @@ -0,0 +1,69 @@ +package org.mskcc.cbio.portal.model; + +public class SingleCellExpression { + private int sampleId; + private int geneticProfileId; + private CanonicalGene gene; + private String cellType; + private String tissue; + private String expressionValue; + + public SingleCellExpression(int sampleId, int geneticProfileId, CanonicalGene gene, String cellType, String tissue, + String expressionValue) { + this.sampleId = sampleId; + this.geneticProfileId = geneticProfileId; + this.gene = gene; + this.cellType = cellType; + this.tissue = tissue; + this.expressionValue = expressionValue; + } + + public int getSampleId() { + return sampleId; + } + + public void setSampleId(int sampleId) { + this.sampleId = sampleId; + } + + public int getGeneticProfileId() { + return geneticProfileId; + } + + public void setGeneticProfileId(int geneticProfileId) { + this.geneticProfileId = geneticProfileId; + } + + public CanonicalGene getGene() { + return gene; + } + + public void setGene(CanonicalGene gene) { + this.gene = gene; + } + + public String getCellType() { + return cellType; + } + + public void setCellType(String cellType) { + this.cellType = cellType; + } + + public String getTissue() { + return tissue; + } + + public void setTissue(String tissue) { + this.tissue = tissue; + } + + public String getExpressionValue() { + return expressionValue; + } + + public void setExpressionValue(String expressionValue) { + this.expressionValue = expressionValue; + } + +} diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index a35e8c29..66fcd13b 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -136,7 +136,8 @@ public void run() { ); genericAssayProfileImporter.importData(); } - } else if( + } + else if( geneticProfile.getGeneticAlterationType() == GeneticAlterationType.COPY_NUMBER_ALTERATION && DISCRETE_LONG.name().equals(geneticProfile.getOtherMetaDataField("datatype")) ) { @@ -150,7 +151,15 @@ public void run() { overwriteExisting ); importer.importData(); + } else if (geneticProfile.getGeneticAlterationType() == GeneticAlterationType.SINGLE_CELL_EXPRESSION) { + ImportSingleCellExpressionData importer = new ImportSingleCellExpressionData( + dataFile, + geneticProfile.getGeneticProfileId(), + daoGene + ); + importer.importData(); } else { + // All other files go through this ImportTabDelimData importer = new ImportTabDelimData( dataFile, geneticProfile.getTargetLine(), diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportSingleCellExpressionData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportSingleCellExpressionData.java new file mode 100644 index 00000000..91b7f16f --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportSingleCellExpressionData.java @@ -0,0 +1,272 @@ +package org.mskcc.cbio.portal.scripts; + +import java.io.BufferedReader; +import java.io.Console; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; + +import org.apache.commons.lang3.math.NumberUtils; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSingleCellExpression; +import org.mskcc.cbio.portal.dao.JdbcUtil; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.SingleCellExpression; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; +import org.mskcc.cbio.portal.util.TsvUtil; + +public class ImportSingleCellExpressionData { + + private File dataFile; + private int numLines; + + private int geneticProfileId; + private DaoGeneOptimized daoGene; + + /** + * Constructor. + * + * @param dataFile Generic Assay Patient Level data file + * @param geneticProfileId GeneticProfile ID. + */ + public ImportSingleCellExpressionData( + File dataFile, + int geneticProfileId, + DaoGeneOptimized daoGene + ) { + this.dataFile = dataFile; + this.geneticProfileId = geneticProfileId; + this.daoGene = daoGene; + } + + /** + * Import the SingleCellExpression Data + */ + public void importData() { + JdbcUtil.getTransactionTemplate().execute(status -> { + try { + doImportData(); + } catch (Throwable e) { + status.setRollbackOnly(); + throw new RuntimeException(e); + } + return null; + }); + } + + private int sampleIdIndex; + private int hugoSymbolIndex; + private int entrezGeneIdIndex; + private int cellTypeIndex; + private int tissueIndex; + private int valueIndex; + private GeneticProfile geneticProfile; + + private void doImportData() throws IOException, DaoException { + try { + this.numLines = FileUtil.getNumLines(dataFile); + } catch (IOException e) { + throw new RuntimeException(e); + } + System.out.println("In doImportData singleCellExpression"); + ProgressMonitor.setMaxValue(numLines); + FileReader reader = new FileReader(dataFile); + BufferedReader buf = new BufferedReader(reader); + String headerLine = buf.readLine(); + String[] headerParts = TsvUtil.splitTsvLine(headerLine); + + processHeader(headerParts); + System.out.println("Header processed"); + + int numRecordsToAdd = 0; + int entriesSkipped = 0; + String line; + geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); + while ((line = buf.readLine()) != null) { + System.out.println("processing line: " + line); + ProgressMonitor.incrementCurValue(); + ConsoleUtil.showProgress(); + + if (TsvUtil.isDataLine(line)) { + System.out.println("is data line"); + String[] parts = TsvUtil.splitTsvLine(line); + if (saveLine(parts)) { + System.out.println("Saved line"); + numRecordsToAdd++; + } else { + System.out.println("skipped"); + entriesSkipped++; + } + } + + } + buf.close(); + + if (entriesSkipped > 0) { + ProgressMonitor.setCurrentMessage(" --> total number of data entries skipped (see table below): " + entriesSkipped); + } + + if (numRecordsToAdd <= 0) { + throw new DaoException ("Something has gone wrong! I did not save any records" + + " to the database!"); + } + } + + private boolean saveLine(String[] line) throws DaoException { + SingleCellExpression singleCellExpression; + System.out.println("Saving line: " + line); + try { + singleCellExpression = parseSingleCellExpression(line); + } catch (DaoException e) { + return false; + } + System.out.println("SingleCellExpression made: " + singleCellExpression.toString()); + + DaoSingleCellExpression.addSingleCellExpression(singleCellExpression); + System.out.println("Added to db"); + return true; + } + + + private SingleCellExpression parseSingleCellExpression(String[] line) throws DaoException { + String hugoSymbol = (hugoSymbolIndex < 0) ? "" : line[hugoSymbolIndex]; + String entrezGeneId = (entrezGeneIdIndex < 0) ? "" : line[entrezGeneIdIndex]; + CanonicalGene gene = parseGene(hugoSymbol, entrezGeneId); + // skip the record if a gene was expected but not identified + if (gene == null) { + ProgressMonitor.logWarning("Gene not found. Skipping line."); + throw new DaoException("Gene not found"); + } + Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(geneticProfile.getCancerStudyId(), line[sampleIdIndex]); + if (sample == null) { + ProgressMonitor.logWarning("Sample \'" + line[sampleIdIndex] + "\' not found in sample file. Skipping line."); + throw new DaoException("Sample not found"); + } + + return new SingleCellExpression( + sample.getInternalId(), + geneticProfileId, + gene, + line[cellTypeIndex], + line[tissueIndex], + line[valueIndex] + ); + } + + + private CanonicalGene parseGene(String geneSymbol, String entrezId) { + // Assume we are dealing with Entrez Gene Ids (this is the best / most stable option) + + CanonicalGene gene = null; + // try to parse entrez if it is not empty nor 0: + if (!(entrezId.isEmpty() || + entrezId.equals("0"))) { + Long entrezGeneId; + try { + entrezGeneId = Long.parseLong(entrezId); + } catch (NumberFormatException e) { + entrezGeneId = null; + } + //non numeric values or negative values should not be allowed: + if (entrezGeneId == null || entrezGeneId < 0) { + ProgressMonitor.logWarning( + "Ignoring line with invalid Entrez_Id " + + entrezId); + return gene; + } else { + gene = daoGene.getGene(entrezGeneId); + if (gene == null) { + //skip if not in DB: + ProgressMonitor.logWarning( + "Entrez gene ID " + entrezGeneId + + " not found. Record will be skipped."); + return gene; + } + } + } + + // If Entrez Gene ID Fails, try Symbol. + if (gene == null && + !(geneSymbol.equals("") || + geneSymbol.equals("Unknown"))) { + gene = daoGene.getNonAmbiguousGene(geneSymbol, true); + } + + if (gene == null) { + ProgressMonitor.logWarning( + "Ambiguous or missing gene: " + geneSymbol + + " ["+ entrezId + + "] or ambiguous alias. Ignoring it " + + "and all mutation data associated with it!"); + } + return gene; + } + + private void processHeader(String[] header) { + String error = "Missing"; + boolean issueWithHeader = false; + + sampleIdIndex = getColIndexByName(header, "Sample_Id"); + if (sampleIdIndex < 0) { + error += " \'Sample_Id\'"; + issueWithHeader = true; + } + hugoSymbolIndex = getColIndexByName(header, "Hugo_Symbol"); + entrezGeneIdIndex = getColIndexByName(header, "Entrez_Gene_Id"); + if (hugoSymbolIndex < 0 && entrezGeneIdIndex < 0) { + if (issueWithHeader) { + error += ","; + } + error += " \'Hugo_Symbol\' or \'Entrez_Gene_Id\'"; + issueWithHeader = true; + } + cellTypeIndex = getColIndexByName(header, "Cell_Type"); + if (cellTypeIndex < 0) { + if (issueWithHeader) { + error += ","; + } + error += " \'Cell_Type\'"; + issueWithHeader = true; + } + tissueIndex = getColIndexByName(header, "Tissue"); + if (tissueIndex < 0) { + if (issueWithHeader) { + error += ","; + } + error += " \'Tissue\'"; + issueWithHeader = true; + } + valueIndex = getColIndexByName(header, "Expression_Value"); + if (valueIndex < 0) { + if (issueWithHeader) { + error += ","; + } + error += " \'Expression_Value\'"; + issueWithHeader = true; + } + if (issueWithHeader) { + throw new RuntimeException(error + "columns. Please fix your file."); + } + } + + // helper function for finding the index of a column by name + private int getColIndexByName(String[] headers, String colName) { + for (int i = 0; i < headers.length; i++) { + if (headers[i].equalsIgnoreCase(colName)) { + return i; + } + } + return -1; + } + +}