Skip to content

Commit

Permalink
lib: update the repeats builder by removing the hardcoded filenames a…
Browse files Browse the repository at this point in the history
…nd taking them from the configuration file; update JUnit test and improve log messages, #TASK-5564
  • Loading branch information
jtarraga committed Apr 22, 2024
1 parent cd94452 commit 148814f
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 50 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildComma
* @throws CellBaseException Exception
*/
public void execute() throws CellBaseException {
String buildOption = null;
String data = null;
try {
// Check data sources
List<String> dataList = checkDataSources();
Expand Down Expand Up @@ -125,8 +125,9 @@ public void execute() throws CellBaseException {
makeDir(buildFolder);
}

for (String data : dataList) {
CellBaseBuilder parser;
CellBaseBuilder parser;
for (int i = 0; i < dataList.size(); i++) {
data = dataList.get(i);
switch (data) {
case GENOME_DATA:
parser = buildGenomeSequence();
Expand Down Expand Up @@ -171,22 +172,22 @@ public void execute() throws CellBaseException {
parser = buildPharmacogenomics();
break;
default:
throw new IllegalArgumentException("Value '" + buildOption + "' is not allowed for the data parameter."
throw new IllegalArgumentException("Value '" + data + "' is not allowed for the data parameter."
+ " Valid values are: " + StringUtils.join(VALID_SOURCES_TO_BUILD, ",") + "; or use 'all' to build"
+ " everything");
}

if (parser != null) {
logger.info("Building '{}' data ...", buildOption);
logger.info(CellBaseBuilder.BUILDING_LOG_MESSAGE, data);
parser.parse();
logger.info("Building '{}' data. Done.", buildOption);
logger.info(CellBaseBuilder.BUILDING_DONE_LOG_MESSAGE, data);
parser.disconnect();
}
}
} catch (Exception e) {
String msg = "Error executing the command 'build'";
if (StringUtils.isNotEmpty(buildOption)) {
msg += ". The last data being built was '" + buildOption + "'";
if (StringUtils.isNotEmpty(data)) {
msg += ". The last data being built was '" + data + "'";
}
throw new CellBaseException(msg + ": " + e.getMessage(), e);
}
Expand All @@ -202,7 +203,7 @@ private CellBaseBuilder buildRepeats() throws CellBaseException {

// Create serializer and return the repeats builder
CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(REPEATS_SUBDIRECTORY), REPEATS_DATA);
return new RepeatsBuilder(repeatsDownloadPath, serializer);
return new RepeatsBuilder(repeatsDownloadPath, serializer, configuration);
}

private CellBaseBuilder buildObo() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ public class EtlCommons {
// Genome (Ensembl)
public static final String GENOME_NAME = "Genome";
public static final String GENOME_DATA = "genome";
public static final String GENOME_SUBDIRECTORY = "genome";
public static final String GENOME_VERSION_FILENAME = "genome" + SUFFIX_VERSION_FILENAME;
public static final String GENOME_SUBDIRECTORY = GENOME_DATA;
public static final String GENOME_VERSION_FILENAME = GENOME_DATA + SUFFIX_VERSION_FILENAME;

// Gene (Ensembl)
public static final String GENE_DATA = "gene";
Expand Down Expand Up @@ -209,24 +209,19 @@ public class EtlCommons {
// Repeats
public static final String REPEATS_NAME = "Repeats";
public static final String REPEATS_DATA = "repeats";
public static final String REPEATS_SUBDIRECTORY = "genome";
public static final String REPEATS_SUBDIRECTORY = GENOME_SUBDIRECTORY;
@Deprecated
public static final String REPEATS_JSON = "repeats";
// Simple repeats
public static final String TRF_NAME = "Tandem Repeats Finder";
@Deprecated
public static final String TRF_FILE = "simpleRepeat.txt.gz";
public static final String TRF_VERSION_FILENAME = "simpleRepeat" + SUFFIX_VERSION_FILENAME;
public static final String SIMPLE_REPEATS_FILE_ID = "SIMPLE_REPEATS";
// Genomic super duplications
public static final String GSD_NAME = "Genomic Super Duplications";
@Deprecated
public static final String GSD_FILE = "genomicSuperDups.txt.gz";
public static final String GSD_VERSION_FILENAME = "genomicSuperDups" + SUFFIX_VERSION_FILENAME;
public static final String GENOMIC_SUPER_DUPS_FILE_ID = "GENOMIC_SUPER_DUPS";
// Window masker
public static final String WM_NAME = "Window Masker";
@Deprecated
public static final String WM_FILE = "windowmaskerSdust.txt.gz";
public static final String WM_VERSION_FILENAME = "windowMasker" + SUFFIX_VERSION_FILENAME;
public static final String WINDOW_MASKER_FILE_ID = "WINDOW_MASKER";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ public abstract class CellBaseBuilder {

protected Logger logger;

/**
 * Logger message template emitted when a build step starts; the single {@code {}} placeholder
 * receives the name of the data being built.
 */
public static final String BUILDING_LOG_MESSAGE = "Building {} ...";
/**
 * Logger message template emitted when a build step has finished; the single {@code {}} placeholder
 * receives the name of the data that was built.
 */
public static final String BUILDING_DONE_LOG_MESSAGE = "Building {} done!";


public CellBaseBuilder(CellBaseSerializer serializer) {
logger = LoggerFactory.getLogger(this.getClass());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

import org.opencb.biodata.models.core.Region;
import org.opencb.biodata.models.variant.avro.Repeat;
import org.opencb.cellbase.core.config.CellBaseConfiguration;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.lib.EtlCommons;
import org.opencb.cellbase.core.serializer.CellBaseFileSerializer;
import org.opencb.commons.ProgressLogger;
Expand All @@ -27,55 +29,74 @@
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

import static org.opencb.cellbase.lib.EtlCommons.*;

/**
* Created by fjlopez on 05/05/17.
*/
public class RepeatsBuilder extends CellBaseBuilder {
private static final String TRF = "trf";
private static final String GSD = "genomicSuperDup";
private static final String WM = "windowMasker";

private CellBaseConfiguration configuration;

private final Path filesDir;

public RepeatsBuilder(Path filesDir, CellBaseFileSerializer serializer) {
public RepeatsBuilder(Path filesDir, CellBaseFileSerializer serializer, CellBaseConfiguration configuration) {
super(serializer);
this.filesDir = filesDir;
this.configuration = configuration;
}


@Override
public void parse() throws Exception {
logger.info(BUILDING_LOG_MESSAGE, EtlCommons.REPEATS_NAME);

logger.info("Parsing repeats...");
if (Files.exists(filesDir.resolve(EtlCommons.TRF_FILE))) {
parseTrfFile(filesDir.resolve(EtlCommons.TRF_FILE));
} else {
logger.warn("No TRF file found {}", EtlCommons.TRF_FILE);
logger.warn("Skipping TRF file parsing. TRF data models will not be built.");
// Check Simple Repeats (TRF) filename
String trfFilename = Paths.get(configuration.getDownload().getSimpleRepeats().getFiles().get(SIMPLE_REPEATS_FILE_ID)).getFileName()
.toString();
if (!Files.exists(filesDir.resolve(trfFilename))) {
throw new CellBaseException(TRF_NAME + " file " + trfFilename + " does not exist at " + filesDir);
}

if (Files.exists(filesDir.resolve(EtlCommons.GSD_FILE))) {
parseGsdFile(filesDir.resolve(EtlCommons.GSD_FILE));
} else {
logger.warn("No Genomic Super Duplications file found {}", EtlCommons.GSD_FILE);
logger.warn("Skipping Genomic Super Duplications file parsing. "
+ "Genomic Super Duplications data models will not be built.");
// Check Genomic Super Duplications (GSD) file
String gsdFilename = Paths.get(configuration.getDownload().getGenomicSuperDups().getFiles().get(GENOMIC_SUPER_DUPS_FILE_ID))
.getFileName().toString();
if (!Files.exists(filesDir.resolve(gsdFilename))) {
throw new CellBaseException(GSD_NAME + " file " + gsdFilename + " does not exist at " + filesDir);
}

if (Files.exists(filesDir.resolve(EtlCommons.WM_FILE))) {
parseWmFile(filesDir.resolve(EtlCommons.WM_FILE));
} else {
logger.warn("No WindowMasker file found {}", EtlCommons.WM_FILE);
logger.warn("Skipping WindowMasker file parsing. WindowMasker data models will not be built.");
// Check Window Masker (WM) file
String wmFilename = Paths.get(configuration.getDownload().getWindowMasker().getFiles().get(WINDOW_MASKER_FILE_ID)).getFileName()
.toString();
if (!Files.exists(filesDir.resolve(wmFilename))) {
throw new CellBaseException(WM_NAME + " file " + wmFilename + " does not exist at " + filesDir);
}
logger.info("Done.");

// Parse TRF file
logger.info(BUILDING_LOG_MESSAGE, TRF_NAME);
parseTrfFile(filesDir.resolve(trfFilename));
logger.info(BUILDING_DONE_LOG_MESSAGE, TRF_NAME);

// Parse GSD file
logger.info(BUILDING_LOG_MESSAGE, GSD_NAME);
parseGsdFile(filesDir.resolve(gsdFilename));
logger.info(BUILDING_DONE_LOG_MESSAGE, GSD_NAME);

// Parse WM file
logger.info(BUILDING_LOG_MESSAGE, WM_NAME);
parseWmFile(filesDir.resolve(wmFilename));
logger.info(BUILDING_DONE_LOG_MESSAGE, WM_NAME);

logger.info(BUILDING_DONE_LOG_MESSAGE, EtlCommons.REPEATS_NAME);
}

private void parseTrfFile(Path filePath) throws IOException {
try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) {
String line = bufferedReader.readLine();

ProgressLogger progressLogger = new ProgressLogger("Parsed TRF lines:",
ProgressLogger progressLogger = new ProgressLogger("Parsed " + TRF_NAME + " lines:",
() -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000);
while (line != null) {
serializer.serialize(parseTrfLine(line));
Expand All @@ -90,14 +111,14 @@ private Repeat parseTrfLine(String line) {

return new Repeat(null, Region.normalizeChromosome(parts[1]), Integer.valueOf(parts[2]) + 1,
Integer.valueOf(parts[3]), Integer.valueOf(parts[5]), Integer.valueOf(parts[7]),
Float.valueOf(parts[6]), Float.valueOf(parts[8]) / 100, Float.valueOf(parts[10]), parts[16], TRF);
Float.valueOf(parts[6]), Float.valueOf(parts[8]) / 100, Float.valueOf(parts[10]), parts[16], TRF_NAME);
}

private void parseGsdFile(Path filePath) throws IOException {
try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) {
String line = bufferedReader.readLine();

ProgressLogger progressLogger = new ProgressLogger("Parsed GSD lines:",
ProgressLogger progressLogger = new ProgressLogger("Parsed " + GSD_NAME + " lines:",
() -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000);
while (line != null) {
serializer.serialize(parseGSDLine(line));
Expand All @@ -112,15 +133,15 @@ private Repeat parseGSDLine(String line) {

return new Repeat(parts[11], Region.normalizeChromosome(parts[1]), Integer.valueOf(parts[2]) + 1,
Integer.valueOf(parts[3]), null, null, 2f, Float.valueOf(parts[26]), null,
null, GSD);
null, GSD_NAME);

}

private void parseWmFile(Path filePath) throws IOException {
try (BufferedReader bufferedReader = FileUtils.newBufferedReader(filePath)) {
String line = bufferedReader.readLine();

ProgressLogger progressLogger = new ProgressLogger("Parsed WM lines:",
ProgressLogger progressLogger = new ProgressLogger("Parsed " + WM_NAME + " lines:",
() -> EtlCommons.countFileLines(filePath), 200).setBatchSize(10000);
while (line != null) {
serializer.serialize(parseWmLine(line));
Expand All @@ -134,6 +155,6 @@ private Repeat parseWmLine(String line) {
String[] parts = line.split("\t");

return new Repeat(parts[4].replace("\t", ""), Region.normalizeChromosome(parts[1]),
Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, null, null, null, null, WM);
Integer.valueOf(parts[2]) + 1, Integer.valueOf(parts[3]), null, null, null, null, null, null, WM_NAME);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.junit.jupiter.api.Test;
import org.eclipse.jetty.util.ajax.JSON;
import org.opencb.biodata.models.variant.avro.Repeat;
import org.opencb.cellbase.core.config.CellBaseConfiguration;
import org.opencb.cellbase.core.serializer.CellBaseFileSerializer;
import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer;
import org.opencb.commons.utils.FileUtils;
Expand All @@ -46,9 +47,10 @@ public RepeatsBuilderTest() {

@Test
public void testParse() throws Exception {
// The leading '/' makes the resource lookup absolute (classpath root). Without it the name is resolved
// relative to this test class's package, getResourceAsStream returns null and loading the configuration
// fails with: argument "src" is null.
CellBaseConfiguration configuration = CellBaseConfiguration.load(getClass().getResourceAsStream("/configuration.test.yaml"));

Check failure on line 50 in cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/RepeatsBuilderTest.java

View workflow job for this annotation

GitHub Actions / Surefire tests report

RepeatsBuilderTest.testParse

argument "src" is null
Raw output
java.lang.IllegalArgumentException: argument "src" is null
	at org.opencb.cellbase.lib.builders.RepeatsBuilderTest.testParse(RepeatsBuilderTest.java:50)
Path repeatsFilesDir = Paths.get(getClass().getResource("/repeats").getPath());
CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), "repeats.test");
(new RepeatsBuilder(repeatsFilesDir, serializer)).parse();
(new RepeatsBuilder(repeatsFilesDir, serializer, configuration)).parse();
serializer.close();
assertEquals(loadRepeatSet(Paths.get(getClass().getResource("/repeats/repeats.test.json.gz").getFile())),
loadRepeatSet(Paths.get("/tmp/repeats.test.json.gz")));
Expand Down
17 changes: 14 additions & 3 deletions cellbase-lib/src/test/resources/configuration.test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,23 @@ download:
host: http://docm.genome.wustl.edu
dgv:
host: http://dgv.tcag.ca/v106/docs

simpleRepeats:
host: http://hgdownload.cse.ucsc.edu/goldenPath
host: http://hgdownload.cse.ucsc.edu/
files:
## The CellBase downloader will replace put_assembly_here with the assembly name, e.g. hg38
SIMPLE_REPEATS: goldenPath/put_assembly_here/database/simpleRepeat.txt.gz
windowMasker:
host: http://hgdownload.cse.ucsc.edu/goldenPath
host: http://hgdownload.cse.ucsc.edu/
files:
## The CellBase downloader will replace put_assembly_here with the assembly name, e.g. hg38
WINDOW_MASKER: goldenPath/put_assembly_here/database/windowmaskerSdust.txt.gz
genomicSuperDups:
host: http://hgdownload.cse.ucsc.edu/goldenPath
host: http://hgdownload.cse.ucsc.edu/
files:
## The CellBase downloader will replace put_assembly_here with the assembly name, e.g. hg38
GENOMIC_SUPER_DUPS: goldenPath/put_assembly_here/database/genomicSuperDups.txt.gz

gwasCatalog:
host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2016/09/28/gwas-catalog-associations.tsv
hpo:
Expand Down

0 comments on commit 148814f

Please sign in to comment.