From 7a86737fc8adc6570ee4fc71257357c126220215 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Wed, 6 May 2026 15:06:15 -0700 Subject: [PATCH] Mark a graph as a subgraph of another --- deps/gbwtgraph | 2 +- src/subcommand/gbwt_main.cpp | 35 +++++++++++++++++++++++++++++++---- test/t/37_vg_gbwt.t | 14 +++++++++++--- 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/deps/gbwtgraph b/deps/gbwtgraph index 17dda01d5f..4dfaeba336 160000 --- a/deps/gbwtgraph +++ b/deps/gbwtgraph @@ -1 +1 @@ -Subproject commit 17dda01d5fee04a39c3ba54e3d6422f67c0ca712 +Subproject commit 4dfaeba3361149ae491f2585cd180e8818621825 diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index 9b5336c6a6..feae32758f 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -55,6 +55,7 @@ struct GBWTConfig { // GBZ construction. bool set_pggname = false; + std::string supergraph_filename; bool unset_pggname = false; bool gbz_v1 = false; @@ -307,6 +308,7 @@ void help_gbwt(char** argv) { std::cerr << " --translation FILE write the segment to node translation table to FILE" << std::endl; std::cerr << " -Z, --gbz-input use GBZ as input GBWT and input graph (one input arg)" << std::endl; std::cerr << " --set-pggname compute the pggname for the GBZ if not already present" << std::endl; + std::cerr << " --subgraph-of FILE mark the GBZ as a subgraph of the other GBZ in FILE" << std::endl; std::cerr << " --unset-pggname clear the stored pggname for the GBZ if present" << std::endl; std::cerr << " -E, --index-paths index the embedded non-alt paths in the graph" << std::endl; std::cerr << " (requires -x, no input args)" << std::endl; @@ -443,8 +445,9 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { constexpr int OPT_PATH_FIELDS = 1116; constexpr int OPT_TRANSLATION = 1117; constexpr int OPT_SET_PGGNAME = 1118; - constexpr int OPT_UNSET_PGGNAME = 1119; - constexpr int OPT_GAM_FORMAT = 1120; + constexpr int OPT_SUBGRAPH_OF = 1119; + constexpr int OPT_UNSET_PGGNAME = 1120; + constexpr int OPT_GAM_FORMAT = 1121; constexpr int OPT_CHUNK_SIZE = 1200; constexpr int OPT_POS_BUFFER = 1201; constexpr int OPT_THREAD_BUFFER = 1202; @@ -512,6 +515,7 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { // Input GBWT construction: GBZ { "gbz-input", no_argument, 0, 'Z' }, { "set-pggname", no_argument, 0, OPT_SET_PGGNAME }, + { "subgraph-of", required_argument, 0, OPT_SUBGRAPH_OF }, { "unset-pggname", no_argument, 0, OPT_UNSET_PGGNAME }, // Input GBWT construction: paths @@ -724,6 +728,9 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { case OPT_SET_PGGNAME: config.set_pggname = true; break; + case OPT_SUBGRAPH_OF: + config.supergraph_filename = require_exists(config.logger, optarg); + break; case OPT_UNSET_PGGNAME: config.unset_pggname = true; break; @@ -951,7 +958,7 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { //---------------------------------------------------------------------------- void validate_gbwt_config(GBWTConfig& config) { - // We can either write GBWT in SDSL format to a separate file or as part of a GBZ graph. + // We can either write GBWT to a separate file or as part of a GBZ graph. // However, `--parse-only` uses `gbwt_output` for other purposes. bool has_gbwt_output = (!config.gbwt_output.empty() || (!config.graph_output.empty() && !config.parse_only)); @@ -1351,7 +1358,7 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co config.logger.info() << "Input type: GBZ" << std::endl; } graphs.load_gbz(gbwts, config); - if (config.set_pggname) { + if (config.set_pggname || !config.supergraph_filename.empty()) { std::string pggname = graphs.gbz_graph->pggname(); if (pggname.empty()) { if (config.show_progress) { @@ -1363,6 +1370,26 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co if (config.show_progress) { config.logger.info() << "Graph name: " << pggname << std::endl; } + if (!config.supergraph_filename.empty()) { + gbwtgraph::GraphName supergraph; + try { + gbwt::Tags tags = gbwtgraph::GBZ::simple_sds_load_tags(config.supergraph_filename); + supergraph = gbwtgraph::GraphName(tags); + } catch (const std::runtime_error& e) { + config.logger.error() << "Failed to load supergraph tags from " << config.supergraph_filename << ": " << e.what() << std::endl; + } + if (supergraph.name().empty()) { + config.logger.warn() << "Supergraph " << config.supergraph_filename << " does not have a pggname" << std::endl; + } else { + if (config.show_progress) { + config.logger.info() << "Supergraph name: " << supergraph.name() << std::endl; + } + gbwtgraph::GraphName subgraph = graphs.gbz_graph->graph_name(); + subgraph.add_subgraph(subgraph.name(), supergraph.name()); + subgraph.add_relationships(supergraph); + subgraph.set_tags(graphs.gbz_graph->tags); + } + } } if (config.unset_pggname) { gbwtgraph::GraphName empty; diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t index bcd6673089..0479a76141 100644 --- a/test/t/37_vg_gbwt.t +++ b/test/t/37_vg_gbwt.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 164 +plan tests 168 # Build vg graphs for two chromosomes @@ -240,6 +240,16 @@ vg gbwt --set-pggname -Z x.gbz -g x.gbz is $? 0 "Graph name can be set" is "$(vg describe x.gbz | grep -c 'pggname =')" 1 "GBZ contains graph name after setting" +# Set graph name and a subgraph relationship +vg gbwt -g supergraph.gbz -G graphs/components_walks.gfa +is $? 0 "GBZ construction for supergraph" +vg chunk --gbz --contig A -x supergraph.gbz +is $? 0 "Graph component extraction" +vg gbwt --subgraph-of supergraph.gbz -g subgraph.gbz -Z chunk_0_component_0.gbz +is $? 0 "Subgraph relationship can be set" +is "$(vg describe subgraph.gbz | grep -c 'subgraph =')" 1 "GBZ contains subgraph relationship" +rm -f supergraph.gbz chunk_0_component_0.gbz subgraph.gbz + # Build and serialize GBZ from VCF vg gbwt -x x.vg -g x2.gbz -v small/xy2.vcf.gz is $? 0 "GBZ construction from VCF" @@ -430,5 +440,3 @@ vg gbwt -g gfa.gbz -G graphs/gfa_two_part_reference.gfa is "$(vg paths -M --reference-paths -x gfa.gbz | grep -v "^#" | cut -f4 | grep NO_HAPLOTYPE | wc -l)" "2" "GBZ can represent reference paths without haplotype numbers" rm -f gfa.gbz - -