diff --git a/README.md b/README.md index 673d009..3317d6d 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Efficient, high-quality [streaming](https://docs.oracle.com/en/java/javase/21/docs/api/java.base/java/util/stream/Stream.html) parsers and writers for 16 text-based formats used in bioinformatics. -The goal is to have the best possible parsers for the most problematic ancient formats. +The goal is to have the best possible parsers for the most hated and problematic formats. **Supported formats:** @@ -35,22 +35,23 @@ The goal is to have the best possible parsers for the most problematic ancient f - Reads and writes Java Streams, keeping only essential metadata in memory. - Parses every part of a format, leaving nothing as text unnecessarily. -- Has a consistent API. Coordinates are always 0-indexed and text is always escaped as per the specification. +- Has a consistent API. + Coordinates are always 0-indexed and text is always escaped as per the specification. - Immutable, thread-safe, null-pointer-safe (`Optional<>`), and arbitrary-precision. -- All methods are either exposed through interfaces, or reside in records, enums, and final classes +- All methods are in interfaces, or in records, enums, or final classes #### Example: This example reads, filters, and writes a VCF file. ```java -import org.pharmgkb.parsers.vcf; +import org.pharmgkb.parsers.vcf.*; +import org.pharmgkb.parsers.vcf.model.*; -Stream goodMitochondrialCalls = new VcfDataParser().parseFile(path) - .filter(p -> p.chromosome.isMitochondial()) - .filter(VcfFilters.qualityAtLeast(10)); // converts to BigDecimal +Stream mitochondrialCalls = new VcfDataParser().parseFile(path) + .filter(p -> p.chromosome().isMitochondial()); -new VcfDataWriter().writeToFile(goodMitochondrialCalls, filteredPath); +new VcfDataWriter().writeToFile(mitochondrialCalls, filteredPath); ``` ## Build/install @@ -100,20 +101,22 @@ functions (`parallel()`, `collect`, `flatMap`, etc.) 
```java // Store GFF3 (or GVF, or GTF) features into a list -List features = new Gff3Parser().collectAll(inputFile); +List features = new GffParser.Builder().build().collectAll(inputFile); features.get(0).getType(); // the parser unescaped this string // Now write the lines: -new Gff3Writer().writeToFile(outputFile); +new Gff3Writer.Builder().build().writeToFile(outputFile); // The writer percent-encodes GFF3 fields as necessary ``` ```java // From a BED file, get distinct chromosome names that start with "chr", in parallel -Files.lines(file).map(new BedParser()) +Files.lines(file) + .map(new BedParser()) .parallel() - .map(BedFeature::getChromosome).distinct() - .filter(chr -> chr.startsWith("chr")) + .map(BedFeature::getChromosome) + .distinct() + .filter(chr -> chr.startsWith("chr")); // You can also use new BedParser().parseAll(file) ``` @@ -129,7 +132,7 @@ NavigableSet children = pedigree.getFamily("Johnsons") // Traverse through a family pedigree in topological order Pedigree pedigree = new PedigreeParser.Builder().build().apply(Files.lines(file)); Stream = pedigree.getFamily("Johnsons") - .topologicalOrderStream(); + .topologicalOrder(); ``` ```java @@ -139,17 +142,17 @@ GenomeChain chain = new GenomeChainParser().apply(Files.lines(hg19ToGrch38ChainF List liftedOver = lociList.parallelStream() .map(chain) .filter(Optional::isPresent) - .collect(Collectors.toList()); + .toList(); // You can also use new GenomeChainParser().parse(hg19ToGrch38ChainFile) ``` ```java // Print formal species names from a GenBank file Path input = Paths.get("plasmid.genbank"); -properties = new GenbankParser().parseAll(input) +new GenbankParser().parseAll(input) .filter(record -> record instanceof SourceAnnotation) .map(record -> record.getFormalName()) - .forEach(System.out::println) + .forEach(System.out::println); ``` ```java @@ -163,22 +166,7 @@ Set properties = new GenbankParser().parseAll(input) .flatMap(feature -> feature.getProperties().entrySet().stream()) .filter(prop -> 
prop.getKey().equals("color")) .map(prop -> prop.getValue()) - .collect(Collectors.toSet()) -``` - -```java -// Parse a GenBank file -// Get the set of "color" properties of features on the complement starting before the sequence -Path input = Paths.get("plasmid.genbank"); -Set properties = new GenbankParser().parseAll(input) - .filter(record -> record instanceof FeaturesAnnotation) - .flatMap(record -> record.getFeatures()) - .filter(feature -> record.range.isComplement()); - .filter(feature -> record.range.start() < 0); - .flatMap(feature -> feature.getProperties().entrySet().stream()) - .filter(prop -> prop.getKey().equals("color")) - .map(prop -> prop.getValue()) - .collect(Collectors.toSet()) + .collect(Collectors.toSet()); ``` ```java @@ -190,7 +178,8 @@ char base = stream.read("gene_1", 58523); ``` ```java -// Suppose you have a 2GB FASTA file and a method smithWaterman that returns AlignmentResults +// Suppose you have a 2GB FASTA file +// and a method smithWaterman that returns AlignmentResults // Align each sequence and get the top 10 results, in parallel MultilineFastaSequenceParser parser = new MultilineFastaSequenceParser.Builder().build(); List topScores = parser.parseAll(Files.lines(fastaFile)) @@ -210,23 +199,33 @@ List topScores = parser.parseAll(Files.lines(fastaFile)) "hasSynonym" . 
*/ Stream input = null; -try (BufferedReader reader = new BufferedReader(new InputStreamReader((HttpURLConnection) myUrl.openConnection()).getInputStream()))) { +try ( + BufferedReader reader = new BufferedReader( + new InputStreamReader(((HttpURLConnection) myUrl.openConnection()).getInputStream()) + ) +) { input = reader.lines(); } -TripleParser parser = new TripleParser(true); // usePrefixes=true will replace prefixes +// usePrefixes=true will replace prefixes +TripleParser parser = new TripleParser(true); Stream stream = input.map(new TripleParser()); -// contains: List[ https://abc#cat belongsTo https://abc#owner , https://abc#cat hasSynonym https://abc#feline ] +// contains: List[ https://abc#cat belongsTo https://abc#owner , \ +// https://abc#cat hasSynonym https://abc#feline ] List prefixes = parser.getPrefixes(); ``` ```java -// Parse VCF, validate it, and write a new VCF file containing only positions whose QUAL field +// Parse VCF, validate it, +// and write a new VCF file containing only positions whose QUAL field // is at least 10, each with its FILTER field cleared -VcfMetadataCollection metadata = new VcfMetadataParser().parse(input); // short-circuits during read +// short-circuits during read: +VcfMetadataCollection metadata = new VcfMetadataParser().parse(input); Stream data = new VcfDataParser().parseAll(input) - .filter(p -> p.getQuality().isPresent() && p.getQuality().get().greaterThanOrEqual("10")) - .map(p -> new VcfPosition.Builder(p).clearFilters().build()) - .peek(new VcfValidator.Builder(metadata).warnOnly().build()); // verify consistent with metadata + .filter(p -> + p.getQuality().isPresent() && p.getQuality().get().greaterThanOrEqual("10") + ).map(p -> new VcfPosition.Builder(p).clearFilters().build()) + // verify consistent with metadata: + .peek(new VcfValidator.Builder(metadata).warnOnly().build()); new VcfMetadataWriter().writeToFile(metadata.getLines(), output); new VcfDataWriter().appendToFile(data, output); ``` @@ -242,28 
+241,31 @@ Map genotypeCounts = new VcfDataParser().parseAll(input) ``` ```java -Stream org.pharmgkb.parsers.text.MatrixParserI.tabs().parseAll(file).map(BigDecimal::new); +Stream MatrixParserI.tabs().parseAll(file).map(GeneralizedBigDecimal::new); ``` -### Guiding principles +### Principles 1. Where possible, a parser is a `Function` or `Function, R>`, and writer is a `Function` or `Function>`. [Java 8+ Streams](https://www.oracle.com/technetwork/articles/java/ma14-java-se-8-streams-2177646.html) are expected to be used. -2. Null values are generally banned from public methods in favor of +2. Null values are banned from public methods in favor of [`Optional`](https://download.java.net/java/early_access/jdk16/docs/api/java.base/java/util/Optional.html). See https://www.oracle.com/technetwork/articles/java/java8-optional-2175753.html for more information. -3. Most operations are thread-safe. Thread safety is annotated using `javax.annotation.concurrent`. -4. Top-level data classes are immutable, as annotated by or `javax.annotation.concurrent.Immutable`. -5. The builder pattern is used for non-trivial classes. Each builder has a copy constructor. -6. Links to specifications are provided. Any choice made in an ambiguous specification is documented. -7. Parsing and writing is _moderately_ strict. Severe violations throw a `BadDataFormatException`, - and milder violations are logged as warnings using SLF4J. +3. Most operations are thread-safe. + Thread safety is annotated using `javax.annotation.concurrent`. +4. Top-level data classes are immutable, as annotated by `javax.annotation.concurrent.Immutable`. +5. The builder pattern is used for non-trivial classes. + Each builder has a copy constructor. +6. Links to specifications are provided. + Any choice made in an ambiguous specification is documented. +7. Parsing and writing is _moderately_ strict. + Severe violations throw a `BadDataFormatException`, and milder violations are logged as SLF4J warnings. 
Not every aspect of a specification is validated. 8. For specification-mandated escape sequences, encoding and decoding is automatic. 9. Coordinates are _always 0-based_, even for 1-based formats. - This is to ensure consistency as well as arithmetic simplicity. + This is to ensure consistency and arithmetic simplicity. ### Pitfalls diff --git a/vcf/src/main/java/org/pharmgkb/parsers/vcf/VcfMetadataParser.java b/vcf/src/main/java/org/pharmgkb/parsers/vcf/VcfMetadataParser.java index 1eab61b..b537fc5 100644 --- a/vcf/src/main/java/org/pharmgkb/parsers/vcf/VcfMetadataParser.java +++ b/vcf/src/main/java/org/pharmgkb/parsers/vcf/VcfMetadataParser.java @@ -40,10 +40,7 @@ public VcfMetadataParser() { @Nonnull @Override - public VcfMetadataCollection apply( - @Nonnull - Stream stream - ) throws BadDataFormatException { + public VcfMetadataCollection apply(@Nonnull Stream stream) throws BadDataFormatException { Objects.requireNonNull(stream, "Stream cannot be null"); final VcfMetadataCollection.Builder builder = new VcfMetadataCollection.Builder(); stream.takeWhile(s -> s.startsWith("#"))