From aee3307061cc5ae11256eb27e4ab8d67cf9009d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 3 Feb 2026 09:10:40 +0100 Subject: [PATCH] lib: update GWAS indexer according to biodata changes, and add JUnit test, #TASK-8139 --- .../clinical/variant/GwasIndexer.java | 10 +- .../variant/ClinicalVariantBuilderTest.java | 147 ++++++++++++++++-- 2 files changed, 141 insertions(+), 16 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java index dc6d7715c..1d235256d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java @@ -263,19 +263,19 @@ private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReade GwasAssociationStudyTraitScores scores = new GwasAssociationStudyTraitScores(); if (StringUtils.isNotEmpty(values[27])) { try { - scores.setPValue(Double.parseDouble(values[27])); + scores.setPvalue(Double.parseDouble(values[27])); } catch (NumberFormatException e) { logger.warn(e.getMessage() + ". Parsing pValue: " + values[27]); } } if (StringUtils.isNotEmpty(values[28])) { try { - scores.setPValueMlog(Double.parseDouble(values[28])); + scores.setPvalueMlog(Double.parseDouble(values[28])); } catch (NumberFormatException e) { logger.warn(e.getMessage() + ". Parsing pValue mlog: " + values[28]); } } - scores.setPValueText(values[29]); + scores.setPvalueText(values[29]); if (StringUtils.isNotEmpty(values[30])) { try { scores.setOrBeta(Double.parseDouble(values[30])); @@ -463,8 +463,8 @@ private void checkAndAddGwasAssociation(String key, GwasAssociation newGwas, Map GwasAssociationStudyTraitScores newScores = newTrait.getScores().get(0); for (GwasAssociationStudyTraitScores scores : currTrait.getScores()) { if (scores.getOrBeta() == newScores.getOrBeta() - && scores.getPValueMlog() == newScores.getPValueMlog() - && scores.getPValue() == newScores.getPValue()) { + && scores.getPvalueMlog() == newScores.getPvalueMlog() + && scores.getPvalue() == newScores.getPvalue()) { currScores = scores; break; } diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java index 247580a2c..dcffb275b 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java @@ -21,6 +21,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; import org.hamcrest.CoreMatchers; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Assumptions; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.opencb.biodata.models.variant.Variant; @@ -28,10 +30,15 @@ import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; import org.opencb.cellbase.core.serializer.CellBaseSerializer; import org.opencb.commons.utils.FileUtils; +import org.rocksdb.Options; +import org.rocksdb.RocksDB; +import org.rocksdb.RocksDBException; +import org.rocksdb.RocksIterator; import java.io.BufferedReader; import java.io.IOException; import java.net.URISyntaxException; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; @@ -586,6 +593,90 @@ public void parse() throws Exception { } + @Test + public void testGwasIndexer() throws RocksDBException, IOException { + Path gwasDataDir = Paths.get("/opt/gwas-data/"); + Assumptions.assumeTrue(Files.exists(gwasDataDir)); + + Path gwasFile = gwasDataDir.resolve("gwas_catalog_v1.0.2-associations_e105_r2022-04-07.tsv"); + Path dbSnpTabixFile = gwasDataDir.resolve("All.vcf.gz"); + Path genomeSequenceFilePath = gwasDataDir.resolve("Homo_sapiens.GRCh38.fa"); + String assembly = "grch38"; + + Path outputDir = Paths.get("/tmp"); + Path rocksDbDir = outputDir.resolve("integration.idx"); + Object[] dbConnection = getDBConnection(rocksDbDir.toAbsolutePath().toString(), true); + RocksDB rdb = (RocksDB) dbConnection[0]; + + GwasIndexer gwasIndexer = new GwasIndexer(gwasFile, dbSnpTabixFile, genomeSequenceFilePath, assembly, rdb); + gwasIndexer.index(); + + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(outputDir, CLINICAL_VARIANT_DATA, true); + // DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's + // named "iterator" + RocksIterator rocksIterator = rdb.newIterator(); + + ObjectMapper mapper = new ObjectMapper(); + System.out.println("Reading from RocksDB index and serializing to " + serializer.getOutdir().resolve(serializer.getFileName())); + int counter = 0; + for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) { + Variant variant = parseVariantFromVariantId(new String(rocksIterator.key())); + if (variant != null) { + VariantAnnotation variantAnnotation = mapper.readValue(rocksIterator.value(), VariantAnnotation.class); + variant.setAnnotation(variantAnnotation); + serializer.serialize(variant); + counter++; + if (counter % 10000 == 0) { + System.out.printf(counter + " written"); + } + } + } + serializer.close(); + System.out.println("Done."); + serializer.close(); + + Path clinicalVariantFile = outputDir.resolve(CLINICAL_VARIANT_DATA + JSON_GZ_EXTENSION); + Assertions.assertTrue(Files.exists(clinicalVariantFile)); + + // Read serialized variants and check some of them + List variantList = loadSerializedVariants(clinicalVariantFile.toAbsolutePath().toString()); + Assertions.assertFalse(variantList.isEmpty()); + Assertions.assertEquals(93, variantList.size()); + boolean found = false; + for (Variant variant : variantList) { + assertNotNull(variant.getAnnotation().getGwas()); + Assertions.assertEquals("EBI GWAS catalog", variant.getAnnotation().getGwas().get(0).getSource(), "Source"); + if (variant.getChromosome().equals("11") && variant.getStart().equals(27658369) && variant.getReference().equals("C") + && variant.getAlternate().equals("T")) { + found = true; + Assertions.assertEquals("rs6265", variant.getAnnotation().getGwas().get(0).getSnpId()); + Assertions.assertEquals(3.0E-10, variant.getAnnotation().getGwas().get(0).getStudies().get(0).getTraits().get(0).getScores().get(0).getPvalue()); + Assertions.assertEquals(9.522878745280337, variant.getAnnotation().getGwas().get(0).getStudies().get(0).getTraits().get(0).getScores().get(0).getPvalueMlog()); + } + } + Assertions.assertTrue(found, "Expected GWAS variant not found in serialized variants."); + + // Clean and delete directories/files + rdb.close(); + org.apache.commons.io.FileUtils.deleteDirectory(rocksDbDir.toFile()); + Files.deleteIfExists(clinicalVariantFile); + } + + private Variant parseVariantFromVariantId(String variantId) { + try { + String[] parts = variantId.split(":", -1); // -1 to include empty fields + if (parts[1].contains("-")) { + String[] pos = parts[1].split("-"); + return new Variant(parts[0].trim(), Integer.parseInt(pos[0].trim()), Integer.parseInt(pos[1].trim()), parts[2], parts[3]); + } else { + return new Variant(parts[0].trim(), Integer.parseInt(parts[1].trim()), parts[2], parts[3]); + } + } catch (Exception e) { + System.out.printf("{}. Impossible to create the variant object from the variant ID: {}", e.getMessage(), variantId); + return null; + } + } + private void cleanUp() throws URISyntaxException, IOException { // Clean up temporary files/directories/indexes org.apache.commons.io.FileUtils.deleteDirectory(Paths.get("/tmp/clinicalVariant1/").toFile()); @@ -726,17 +817,17 @@ private List loadSerializedVariants(String fileName) { // // EvidenceEntry entry = buildEvidenceEntry(info); // System.out.println(variant.toStringSimple() + " : " + entry.toString()); -//// if (variant != null) { -//// boolean success = updateRocksDB(variant); -//// // updateRocksDB may fail (false) if normalisation process fails -//// if (success) { -//// numberIndexedRecords++; -//// } -//// } -//// totalNumberRecords++; -//// if (totalNumberRecords % 1000 == 0) { -//// logger.info("{} records parsed", totalNumberRecords); -//// } + //// if (variant != null) { + //// boolean success = updateRocksDB(variant); + //// // updateRocksDB may fail (false) if normalisation process fails + //// if (success) { + //// numberIndexedRecords++; + //// } + //// } + //// totalNumberRecords++; + //// if (totalNumberRecords % 1000 == 0) { + //// logger.info("{} records parsed", totalNumberRecords); + //// } // } // } // } @@ -811,4 +902,38 @@ public void testVariant() { System.out.println(v.toStringSimple()); } + private Object[] getDBConnection(String dbLocation, boolean forceCreate) { + boolean indexingNeeded = forceCreate || !Files.exists(Paths.get(dbLocation)); + // a static method that loads the RocksDB C++ library. + RocksDB.loadLibrary(); + // the Options class contains a set of configurable DB options + // that determines the behavior of a database. + Options options = new Options().setCreateIfMissing(true); + +// options.setMaxBackgroundCompactions(4); +// options.setMaxBackgroundFlushes(1); +// options.setCompressionType(CompressionType.NO_COMPRESSION); +// options.setMaxOpenFiles(-1); +// options.setIncreaseParallelism(4); +// options.setCompactionStyle(CompactionStyle.LEVEL); +// options.setLevelCompactionDynamicLevelBytes(true); + + RocksDB db = null; + try { + // a factory method that returns a RocksDB instance + if (indexingNeeded) { + db = RocksDB.open(options, dbLocation); + } else { + db = RocksDB.openReadOnly(options, dbLocation); + } + // do something + } catch (RocksDBException e) { + // do some error handling + e.printStackTrace(); + System.exit(1); + } + + return new Object[]{db, options, dbLocation, indexingNeeded}; + + } }