From 735b0d166e42334d2aba562ed386f9483c3df344 Mon Sep 17 00:00:00 2001 From: mourisl Date: Mon, 8 Apr 2024 11:23:06 -0400 Subject: [PATCH] Add the cell_id information to the smartseq output. --- README.md | 2 +- run-trust4 | 4 ++-- trust-smartseq.pl | 13 +++++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 40f0e19..eeb9d82 100644 --- a/README.md +++ b/README.md @@ -42,12 +42,12 @@ TRUST4 is also available form [Bioconda](https://anaconda.org/bioconda/trust4). -t INT: number of threads (default: 1) -k INT: the starting k-mer size for indexing contigs (default: 9) --barcode STRING: if -b, bam field for barcode; if -1 -2/-u, file containing barcodes (defaul: not used) + --barcodeLevel STRING: barcode is for cell or molecule (default: cell) --barcodeWhitelist STRING: path to the barcode whitelist (default: not used) --barcodeTranslate STRING: path to the barcode translate file (default: not used) --UMI STRING: if -b, bam field for 10x Genomics-like UMI; if -1 -2/-u, file containing 10x Genomics-like UMIs (default: not used) --readFormat STRING: format for read, barcode and UMI files (example: r1:0:-1,r2:0:-1,bc:0:15,um:16:-1 for paired-end files with barcode and UMI) --repseq: the data is from TCR-seq or BCR-seq (default: not set) - --barcodeLevel STRING: barcode is for cell or molecule (default: cell) --minHitLen INT: the minimal hit length for a valid overlap (default: auto) --mateIdSuffixLen INT: the suffix length in read id for mate. (default: not used) --skipMateExtension: do not extend assemblies with mate information, useful for SMART-seq (default: not used) diff --git a/run-trust4 b/run-trust4 index deef25c..bab3302 100755 --- a/run-trust4 +++ b/run-trust4 @@ -7,7 +7,7 @@ use Cwd qw(cwd abs_path) ; use File::Basename ; use File::Path qw(make_path) ; -die "TRUST4 v1.1.0-r499 usage: ./run-trust4 [OPTIONS]:\n". +die "TRUST4 v1.1.0-r500 usage: ./run-trust4 [OPTIONS]:\n". "Required:\n". #"\t[Input]:\n". 
"\t-b STRING: path to bam file\n". @@ -22,6 +22,7 @@ die "TRUST4 v1.1.0-r499 usage: ./run-trust4 [OPTIONS]:\n". "\t-k INT: the starting k-mer size for indexing contigs (default: 9)\n". #"\t-h: print help message and exit.\n" "\t--barcode STRING: if -b, bam field for barcode; if -1 -2/-u, file containing barcodes (default: not used)\n". + "\t--barcodeLevel STRING: barcode is for cell or molecule (default: cell)\n". #"\t--barcodeRange INT INT CHAR: start, end(-1 for length-1), strand in a barcode is the true barcode (default: 0 -1 +)\n". "\t--barcodeWhitelist STRING: path to the barcode whitelist (default: not used)\n". "\t--barcodeTranslate STRING: path to the barcode translate file (default: not used)\n". @@ -31,7 +32,6 @@ die "TRUST4 v1.1.0-r499 usage: ./run-trust4 [OPTIONS]:\n". #"\t--umiRange INT INT CHAR: start, end(-1 for lenght-1), strand in a UMI is the true UMI (default: 0 -1 +)\n". "\t--readFormat STRING: format for read, barcode and UMI files (example: r1:0:-1,r2:0:-1,bc:0:15,um:16:-1 for paired-end files with barcode and UMI)\n". "\t--repseq: the data is from TCR-seq or BCR-seq (default: not set)\n". - "\t--barcodeLevel STRING: barcode is for cell or molecule (default: cell)\n". "\t--minHitLen INT: the minimal hit length for a valid overlap (default: auto)\n". "\t--mateIdSuffixLen INT: the suffix length in read id for mate. (default: not used)\n". "\t--skipMateExtension: do not extend assemblies with mate information, useful for SMART-seq (default: not used)\n". 
diff --git a/trust-smartseq.pl b/trust-smartseq.pl index 8bc4cd6..178e01d 100644 --- a/trust-smartseq.pl +++ b/trust-smartseq.pl @@ -232,13 +232,22 @@ sub GetPairChainType # Process the AIRR file open FPairr, "tmp_smartseq_airr.tsv" ; my $lineCnt = 0 ; + my %airrNameToCol ; while (<FPairr>) { chomp ; if ($cellProcessed == 0 && $lineCnt == 0) { print FPfinalairr $_,"\n" ; + + chomp ; + my @cols = split /\t/ ; + for ($i = 0 ; $i < scalar(@cols) ; ++$i) + { + $airrNameToCol{ $cols[$i] } = $i ; + } } + if ($lineCnt == 0) { ++$lineCnt ; @@ -254,6 +263,10 @@ sub GetPairChainType if ($matchedCols[2] eq $cols[13]) { $cols[0] = ${cellPrefix}."_".$cols[0] ; + + # Add the cell id + $cols[ $airrNameToCol{"cell_id"} ] = $cellPrefix ; + print FPfinalairr join("\t", @cols), "\n" ; } }