diff --git a/Makefile b/Makefile index a275405..36bfa21 100644 --- a/Makefile +++ b/Makefile @@ -33,8 +33,8 @@ ${sonLibDir}/cuTest.a : sonLib stTafDependencies = ${sonLibDir}/sonLib.a ${sonLibDir}/cuTest.a ${LIBDIR}/libabpoa.a -${LIBDIR}/libstTaf.a : ${libTests} ${libHeaders} ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o ${libHeaders} ${stTafDependencies} - ${AR} rc libstTaf.a ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o +${LIBDIR}/libstTaf.a : ${libTests} ${libHeaders} ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o ${srcDir}/block_reader.o ${libHeaders} ${stTafDependencies} + ${AR} rc libstTaf.a ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o ${srcDir}/block_reader.o mv libstTaf.a ${LIBDIR}/ ${srcDir}/alignment_block.o : ${srcDir}/alignment_block.c ${libHeaders} @@ -67,6 +67,9 @@ ${srcDir}/tai.o : ${srcDir}/tai.c ${libHeaders} ${srcDir}/prefix_sort.o : ${srcDir}/prefix_sort.c ${libHeaders} ${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/prefix_sort.o -c ${srcDir}/prefix_sort.c +${srcDir}/block_reader.o : ${srcDir}/block_reader.c ${libHeaders} + ${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/block_reader.o -c ${srcDir}/block_reader.c + ${srcDir}/wiggle.o : ${srcDir}/wiggle.c ${libHeaders} ${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/wiggle.o -c ${srcDir}/wiggle.c diff --git a/taf_add_gap_bases.cpp b/taf_add_gap_bases.cpp index 645d0fb..373f38c 100644 --- a/taf_add_gap_bases.cpp +++ b/taf_add_gap_bases.cpp @@ -5,6 +5,7 @@ */ extern "C" { #include "taf.h" +#include "block_reader.h" #include "sonLib.h" } #include "bioioC.h" @@ -20,7 +21,7 @@ static int64_t maximum_gap_string_length = 50; static void usage() { fprintf(stderr, "taffy add_gap_bases SEQ_FILExN [options]\n"); fprintf(stderr, "Add interstitial gap strings to taf file\n"); - fprintf(stderr, "-i --inputFile : Input taf file to normalize. If not specified reads from stdin\n"); + fprintf(stderr, "-i --inputFile : Input TAF or MAF file. If not specified reads from stdin\n"); fprintf(stderr, "-o --outputFile : Output taf file. If not specified outputs to stdout\n"); fprintf(stderr, "-a --halFile : HAL file for extracting gap sequence (MAF must be created with hal2maf *without* --onlySequenceNames)\n"); fprintf(stderr, "-m --maximumGapStringLength : The maximum size of a gap string to add, be default: %" PRIi64 "\n", @@ -143,13 +144,23 @@ int taf_add_gap_bases_main(int argc, char *argv[]) { LW *output = LW_construct(outputFile == NULL ? stdout : fopen(outputFile, "w"), use_compression); LI *li = LI_construct(input); - // Pass the header line to determine parameters and write the updated taf header - Tag *tag = taf_read_header_2(li, &run_length_encode_bases); + // Open a format-agnostic reader. For MAF input the reader auto-links adjacent + // blocks so alignment_add_gap_strings sees the inter-block coordinate continuity + // it needs. Output is always TAF (interstitial gap strings are a TAF-only feature). + BlockReader *reader = block_reader_open(li); + if (reader == NULL) { + LW_destruct(output, outputFile != NULL); + LI_destruct(li); + if (inputFile != NULL) fclose(input); + return 1; + } + run_length_encode_bases = block_reader_run_length_encoded(reader); + Tag *tag = block_reader_take_header(reader); taf_write_header(tag, output); tag_destruct(tag); Alignment *alignment, *p_alignment = NULL; - while((alignment = taf_read_block(p_alignment, run_length_encode_bases, li)) != NULL) { + while ((alignment = block_reader_next(reader, p_alignment)) != NULL) { // Add in the gap strings if there is a previous block if(p_alignment != NULL) { alignment_add_gap_strings(p_alignment, alignment, fastas, hal_handle, hal_species, maximum_gap_string_length); @@ -172,6 +183,7 @@ int taf_add_gap_bases_main(int argc, char *argv[]) { // Cleanup ////////////////////////////////////////////// + block_reader_destruct(reader); LI_destruct(li); if(inputFile != NULL) { fclose(input); diff --git a/taf_annotate.c b/taf_annotate.c index 78ffbf5..6489de2 100644 --- a/taf_annotate.c +++ b/taf_annotate.c @@ -6,6 +6,7 @@ #include "taf.h" #include "tai.h" +#include "block_reader.h" #include "sonLib.h" #include #include @@ -15,7 +16,7 @@ static int64_t repeat_coordinates_every_n_columns = 10000; static void usage(void) { fprintf(stderr, "taffy annotate [options]\n"); fprintf(stderr, "Annotate the columns of a taf file using wiggle file\n"); - fprintf(stderr, "-i --inputFile : Input TAF file. If not specified reads from stdin\n"); + fprintf(stderr, "-i --inputFile : Input TAF or MAF file. If not specified reads from stdin. Output is always TAF (column tags have no MAF representation)\n"); fprintf(stderr, "-w --wiggle [FILE_NAME] : REQUIRED The input wiggle file\n"); fprintf(stderr, "-t --tagName [STRING] : REQUIRED: The name of the tag to annotate for the given wiggle\n"); fprintf(stderr, "-s --repeatCoordinatesEveryNColumns : Repeat coordinates of each sequence at least every n columns. By default: %" PRIi64 "\n", repeat_coordinates_every_n_columns); @@ -143,23 +144,23 @@ int taf_annotate_main(int argc, char *argv[]) { // Open the inputs and outputs and parse the labels to annotate ////////////////////////////////////////////// - // load the input + // load the input (TAF or MAF -- BlockReader sniffs and dispatches) FILE *taf_fh = taf_file == NULL ? stdin : fopen(taf_file, "r"); if (taf_fh == NULL) { - fprintf(stderr, "Unable to open input TAF file: %s\n", taf_file); + fprintf(stderr, "Unable to open input TAF/MAF file: %s\n", taf_file); return 1; } LI *li = LI_construct(taf_fh); - - // Check is a taf file - if (check_input_format(LI_peek_at_next_line(li)) != 0) { - fprintf(stderr, "Input not supported: requires #taf header\n"); + BlockReader *reader = block_reader_open(li); + if (reader == NULL) { + LI_destruct(li); + if (taf_file != NULL) fclose(taf_fh); return 1; } - - // Check if run_length_encode_bases is set and read header - bool run_length_encode_bases = 0; - Tag *tag = taf_read_header_2(li, &run_length_encode_bases); + // Output is TAF regardless of input format. For MAF input we won't run-length encode the + // body (annotate has no flag to opt in); for TAF input we preserve the input's RLE setting. + bool run_length_encode_bases = block_reader_run_length_encoded(reader); + Tag *tag = block_reader_take_header(reader); // Load the wiggle file, making coordinates 0 based stHash *labels = wig_parse(wig_file, ref_prefix, 1); @@ -183,7 +184,7 @@ int taf_annotate_main(int argc, char *argv[]) { Alignment *alignment = NULL; Alignment *p_alignment = NULL; // Keep reading blocks while available - while((alignment = taf_read_block(p_alignment, run_length_encode_bases, li)) != NULL) { + while ((alignment = block_reader_next(reader, p_alignment)) != NULL) { label_alignment(alignment, labels, tag_name); // Make any changes to the alignment for output // Write back the labelled taf @@ -204,6 +205,7 @@ int taf_annotate_main(int argc, char *argv[]) { // Cleanup ////////////////////////////////////////////// + block_reader_destruct(reader); LI_destruct(li); if(taf_file != NULL) { fclose(taf_fh); diff --git a/taf_coverage.cpp b/taf_coverage.cpp index d2cee40..bd2b800 100644 --- a/taf_coverage.cpp +++ b/taf_coverage.cpp @@ -6,6 +6,7 @@ extern "C" { #include "taf.h" +#include "block_reader.h" #include "sonLib.h" } #include @@ -57,7 +58,7 @@ static void print_coverage_tsv(const ContigCoverageMap& contig_cov_map, const se static void usage() { fprintf(stderr, "taffy coverage [options]\n"); fprintf(stderr, "Compute very basic pairwise coverage stats as fraction and bp for a TAF file\n"); - fprintf(stderr, "-i --inputFile : Input taf file to normalize. If not specified reads from stdin\n"); + fprintf(stderr, "-i --inputFile : Input TAF or MAF file. If not specified reads from stdin\n"); fprintf(stderr, "-r --reference : Name of reference genome. If note specified used first row in block\n"); fprintf(stderr, "-g --genomeNames : List of genome names (quoted, space-separated), ex from \"$(halStats --genomes aln.hal)\". This can help contig name parsing which otherwise uses everything up to first . as genome name\n"); fprintf(stderr, "-a, --gapThreshold : Breakdown rows using given gap threshold, to restrict aligned bp to exclude gaps>threshold. Multiple allowed. \n"); @@ -161,29 +162,32 @@ int taf_coverage_main(int argc, char *argv[]) { stList_destruct(tokens); } - // Open TAF + // Open input (MAF or TAF -- BlockReader sniffs and dispatches) FILE *input = inputFile == NULL ? stdin : fopen(inputFile, "r"); LI *li = LI_construct(input); + BlockReader *reader = block_reader_open(li); + if (reader == NULL) { + LI_destruct(li); + if (inputFile != NULL) fclose(input); + return 1; + } + tag_destruct(block_reader_take_header(reader)); - // Parse the header - bool run_length_encode_bases; - Tag *tag = taf_read_header_2(li, &run_length_encode_bases); - tag_destruct(tag); - Alignment *alignment, *p_alignment = NULL; - while((alignment = taf_read_block(p_alignment, run_length_encode_bases, li)) != NULL) { + while ((alignment = block_reader_next(reader, p_alignment)) != NULL) { // update the coverage update_block_coverage(alignment, p_alignment, reference, genome_names_hash, contig_coverage_map); // Clean up the previous alignment - if(p_alignment != NULL) { + if (p_alignment != NULL) { alignment_destruct(p_alignment, 1); } p_alignment = alignment; // Update the previous alignment } - if(p_alignment != NULL) { // Clean up the final alignment + if (p_alignment != NULL) { // Clean up the final alignment alignment_destruct(p_alignment, 1); } + block_reader_destruct(reader); // add gaps from last covered base to ends of contigs add_final_gap(contig_coverage_map); diff --git a/taf_norm.c b/taf_norm.c index 49114cd..0e2feee 100644 --- a/taf_norm.c +++ b/taf_norm.c @@ -5,6 +5,7 @@ */ #include "taf.h" +#include "block_reader.h" #include "sonLib.h" #include #include @@ -21,8 +22,8 @@ static void usage(void) { fprintf(stderr, "Note, taffy norm will resort the rows alpha-numerically according to sequence name, " "as is necessary to successfully merge all mergeable rows. Is the resorting is undesired, pipe the" "result to taffy sort to resort.\n"); - fprintf(stderr, "-i --inputFile : Input taf file to normalize. If not specified reads from stdin\n"); - fprintf(stderr, "-o --outputFile : Output taf file. If not specified outputs to stdout\n"); + fprintf(stderr, "-i --inputFile : Input TAF or MAF file to normalize. If not specified reads from stdin\n"); + fprintf(stderr, "-o --outputFile : Output taf file (or maf if -k is given). If not specified outputs to stdout\n"); fprintf(stderr, "-l --logLevel : Set the log level\n"); fprintf(stderr, "-k --maf : Print maf output instead of taf\n"); fprintf(stderr, "-m --maximumBlockLengthToMerge : Only merge together any two adjacent blocks if one or both is less than this many bases long, by default: %" PRIi64 "\n", maximum_block_length_to_merge); @@ -37,13 +38,12 @@ static void usage(void) { fprintf(stderr, "-h --help : Print this help message\n"); } -static Alignment *get_next_taf_block(LI *li, bool run_length_encode_bases) { +static Alignment *get_next_block(BlockReader *reader) { static Alignment *alignments[3]; static int64_t alignment_index=0; assert(alignment_index >= 0); while(alignment_index < 3) { - alignments[alignment_index] = taf_read_block(alignment_index == 0 ? NULL : alignments[alignment_index-1], - run_length_encode_bases, li); // Read a block + alignments[alignment_index] = block_reader_next(reader, alignment_index == 0 ? NULL : alignments[alignment_index-1]); if(alignments[alignment_index] == NULL) { // The read block is empty break; } @@ -325,8 +325,18 @@ int taf_norm_main(int argc, char *argv[]) { LW *output = LW_construct(outputFile == NULL ? stdout : fopen(outputFile, "w"), use_compression); LI *li = LI_construct(input); - // Pass the header line to determine parameters and write the updated taf header - Tag *tag = taf_read_header_2(li, &run_length_encode_bases); + // Open a format-agnostic reader; for MAF input the reader transparently links adjacent + // blocks via alignment_link_adjacent so downstream merging logic sees the same coordinate + // continuity it gets from a TAF input. + BlockReader *reader = block_reader_open(li); + if (reader == NULL) { + LW_destruct(output, outputFile != NULL); + LI_destruct(li); + if (inputFile != NULL) fclose(input); + return 1; + } + run_length_encode_bases = block_reader_run_length_encoded(reader); + Tag *tag = block_reader_take_header(reader); if(output_maf && run_length_encode_bases) { // Remove this tag from the maf output as not relevant tag = tag_remove(tag, "run_length_encode_bases"); } @@ -334,7 +344,7 @@ int taf_norm_main(int argc, char *argv[]) { tag_destruct(tag); Alignment *alignment, *p_alignment = NULL, *p_p_alignment = NULL; - while((alignment = get_next_taf_block(li, run_length_encode_bases)) != NULL) { + while((alignment = get_next_block(reader)) != NULL) { // First resort the rows to be alphabetical and then realign with any previous block. This ensures // we will not have any mergeable rows unlinked. Note: // We do not allow row substitutions when linking two blocks to merge (see last parameter of function call), @@ -396,6 +406,8 @@ int taf_norm_main(int argc, char *argv[]) { // Cleanup ////////////////////////////////////////////// + block_reader_destruct(reader); + LI_destruct(li); if(inputFile != NULL) { fclose(input); } diff --git a/taf_sort.c b/taf_sort.c index 5645621..4cef025 100644 --- a/taf_sort.c +++ b/taf_sort.c @@ -6,6 +6,7 @@ #include "taf.h" #include "tai.h" +#include "block_reader.h" #include "sonLib.h" #include #include @@ -193,17 +194,24 @@ int taf_sort_main(int argc, char *argv[]) { stList *prefixes_to_sort_by = load_sort_file(sort_file); stList *prefixes_to_dup_filter = load_sort_file(dup_filter_file); - // Parse the header - bool run_length_encode_bases; - Tag *tag = taf_read_header_2(li, &run_length_encode_bases); + // Open a format-agnostic reader (TAF or MAF input) + BlockReader *reader = block_reader_open(li); + if (reader == NULL) { + LW_destruct(output, output_file != NULL); + LI_destruct(li); + if (input_file != NULL) fclose(input); + return 1; + } + bool run_length_encode_bases = block_reader_run_length_encoded(reader); + Tag *tag = block_reader_take_header(reader); - // Write the header + // For now, output is always TAF (a MAF output mode could be added later -- see pass-2 plan). taf_write_header(tag, output); tag_destruct(tag); // Write the alignment blocks Alignment *alignment, *p_alignment = NULL, *pp_alignment = NULL; - while((alignment = taf_read_block(p_alignment, run_length_encode_bases, li)) != NULL) { + while ((alignment = block_reader_next(reader, p_alignment)) != NULL) { process_alignment_block(pp_alignment, p_alignment, prefixes_to_filter_by, prefixes_to_pad, prefixes_to_sort_by, prefixes_to_dup_filter, run_length_encode_bases, ignore_first_row, output); pp_alignment = p_alignment; @@ -220,6 +228,7 @@ int taf_sort_main(int argc, char *argv[]) { ////////////////////////////////////////////// + block_reader_destruct(reader); LI_destruct(li); if(input_file != NULL) { fclose(input); diff --git a/taf_stats.c b/taf_stats.c index 9e5f7f5..d740c58 100644 --- a/taf_stats.c +++ b/taf_stats.c @@ -6,6 +6,7 @@ #include "taf.h" #include "tai.h" +#include "block_reader.h" #include "sonLib.h" #include #include @@ -105,19 +106,30 @@ int taf_stats_main(int argc, char *argv[]) { } LI *li = LI_construct(taf_fh); - // sniff the format - int input_format = check_input_format(LI_peek_at_next_line(li)); - if (input_format == 2) { - fprintf(stderr, "Input not supported: unable to detect ##maf or #taf header\n"); - return 1; - } else if (input_format != 0 && seq_intervals) { - fprintf(stderr, "MAF input detected but -b only works with TAF input. Please use taffy view to convert\n"); - return 1; - } - // parse the header - bool run_length_encode_bases = 0; - if(input_format == 0) { // Is taf, check if run_length_encode_bases is set - tag_destruct(taf_read_header_2(li, &run_length_encode_bases)); + // open a format-agnostic reader (handles MAF or TAF) + BlockReader *reader = NULL; + bool input_is_maf = false; + if (!seq_lengths) { + // seq_lengths goes through tai_sequence_lengths which manages its own header reading; + // for the other paths we need to consume the header up-front via BlockReader + reader = block_reader_open(li); + if (reader == NULL) { + LI_destruct(li); + if (taf_fn != NULL) fclose(taf_fh); + return 1; + } + input_is_maf = block_reader_is_maf(reader); + tag_destruct(block_reader_take_header(reader)); + } else { + // for -s we still need to know the format to load the index correctly + int input_format = check_input_format(LI_peek_at_next_line(li)); + if (input_format != 0 && input_format != 1) { + fprintf(stderr, "Input not supported: unable to detect ##maf or #taf header\n"); + LI_destruct(li); + if (taf_fn != NULL) fclose(taf_fh); + return 1; + } + input_is_maf = (input_format == 1); } // load the index if it's required by the given options @@ -136,7 +148,7 @@ int taf_stats_main(int argc, char *argv[]) { fprintf(stderr, "Required index %s not found. Please run taffy index first\n", tai_fn); return 1; } - tai = tai_load(tai_fh, input_format == 1); + tai = tai_load(tai_fh, input_is_maf); } // do the stats @@ -155,7 +167,7 @@ int taf_stats_main(int argc, char *argv[]) { char *cur_seq = NULL; int64_t cur_start = -1; int64_t cur_end = 0; - while((alignment = taf_read_block(p_alignment, run_length_encode_bases, li)) != NULL) { + while ((alignment = block_reader_next(reader, p_alignment)) != NULL) { if (alignment->row_number > 0) { if (!cur_seq || strcmp(cur_seq, alignment->row->sequence_name) != 0 || alignment->row->start != cur_end) { if (cur_seq) { @@ -187,16 +199,7 @@ int taf_stats_main(int argc, char *argv[]) { if(alignment_stats) { int64_t total_blocks = 0, total_columns = 0, total_aligned_bases = 0, total_gaps = 0, total_column_depth = 0; Alignment *alignment, *p_alignment = NULL; - while(1) { - if(input_format == 0) { - alignment = taf_read_block(p_alignment, run_length_encode_bases, li); - } - else { - alignment = maf_read_block(li); - } - if(!alignment) { // No more blocks - break; - } + while ((alignment = block_reader_next(reader, p_alignment)) != NULL) { total_blocks++; total_columns += alignment_length(alignment); total_column_depth += alignment_length(alignment) * alignment->row_number; @@ -237,7 +240,8 @@ int taf_stats_main(int argc, char *argv[]) { free(tai_fn); tai_destruct(tai); } - + + block_reader_destruct(reader); LI_destruct(li); if(taf_fn != NULL) { fclose(taf_fh); diff --git a/taffy/impl/block_reader.c b/taffy/impl/block_reader.c new file mode 100644 index 0000000..7d44bab --- /dev/null +++ b/taffy/impl/block_reader.c @@ -0,0 +1,60 @@ +#include "block_reader.h" +#include "sonLib.h" + +struct _BlockReader { + LI *li; + bool is_maf; + bool run_length_encode_bases; + Tag *header; // owned until block_reader_take_header is called +}; + +BlockReader *block_reader_open(LI *li) { + int input_format = check_input_format(LI_peek_at_next_line(li)); + if (input_format != 0 && input_format != 1) { + fprintf(stderr, "Input not supported: unable to detect ##maf or #taf header\n"); + return NULL; + } + BlockReader *r = (BlockReader *) st_calloc(1, sizeof(BlockReader)); + r->li = li; + r->is_maf = (input_format == 1); + if (r->is_maf) { + r->header = maf_read_header(li); + r->run_length_encode_bases = false; + } else { + r->header = taf_read_header_2(li, &r->run_length_encode_bases); + } + return r; +} + +Tag *block_reader_take_header(BlockReader *r) { + Tag *tag = r->header; + r->header = NULL; + return tag; +} + +bool block_reader_is_maf(const BlockReader *r) { + return r->is_maf; +} + +bool block_reader_run_length_encoded(const BlockReader *r) { + return r->run_length_encode_bases; +} + +Alignment *block_reader_next(BlockReader *r, Alignment *prev_block) { + Alignment *a; + if (r->is_maf) { + a = maf_read_block(r->li); + if (a != NULL && prev_block != NULL) { + alignment_link_adjacent(prev_block, a, 1); + } + } else { + a = taf_read_block(prev_block, r->run_length_encode_bases, r->li); + } + return a; +} + +void block_reader_destruct(BlockReader *r) { + if (r == NULL) return; + tag_destruct(r->header); // tag_destruct accepts NULL + free(r); +} diff --git a/taffy/inc/block_reader.h b/taffy/inc/block_reader.h new file mode 100644 index 0000000..3ae6ab3 --- /dev/null +++ b/taffy/inc/block_reader.h @@ -0,0 +1,53 @@ +#ifndef TAF_BLOCK_READER_H_ +#define TAF_BLOCK_READER_H_ + +/* + * Format-agnostic block reader. Sniffs MAF vs TAF on construction, reads the + * appropriate header, and dispatches subsequent block reads to the right + * underlying reader. For MAF input, also handles cross-block linkage via + * alignment_link_adjacent so that downstream tools see the same coordinate + * continuity they get from a TAF input. + * + * Intended to let CLI tools that previously hardcoded TAF (norm, sort, + * coverage, annotate, add-gap-bases, stats, ...) accept either format with + * minimal per-tool churn. + */ + +#include "taf.h" + +typedef struct _BlockReader BlockReader; + +/* + * Open a reader on an existing LI handle. Sniffs the input format from the + * first header line, reads the header, and stashes any flags (e.g. RLE) that + * tools may need to consult. Caller retains ownership of li. + * + * Returns NULL on unrecognized format (with an error printed to stderr). + */ +BlockReader *block_reader_open(LI *li); + +/* + * Take ownership of the parsed header tag. Subsequent calls return NULL. + * The caller is responsible for tag_destruct'ing it (or passing it to a + * header writer which will). After this call the BlockReader still owns + * its other state and must still be destructed. + */ +Tag *block_reader_take_header(BlockReader *r); + +bool block_reader_is_maf(const BlockReader *r); +bool block_reader_run_length_encoded(const BlockReader *r); + +/* + * Read the next block. For MAF inputs, prev_block is used to link adjacent + * rows (matching the behavior of taffy view's MAF read path). Pass NULL for + * the first block. Returns NULL at EOF. + * + * Lifetime note: the returned alignment is owned by the caller and must be + * destructed (alignment_destruct) when no longer needed. The reader does not + * retain a reference to it. + */ +Alignment *block_reader_next(BlockReader *r, Alignment *prev_block); + +void block_reader_destruct(BlockReader *r); + +#endif diff --git a/tests/allTests.c b/tests/allTests.c index 4ea0384..b11c128 100644 --- a/tests/allTests.c +++ b/tests/allTests.c @@ -15,6 +15,8 @@ CuSuite* view_test_suite(void); CuSuite* sort_test_suite(void); CuSuite* coverage_test_suite(void); CuSuite* wiggle_test_suite(void); +CuSuite* block_reader_test_suite(void); +CuSuite* stats_test_suite(void); static int allTests(void) { CuString *output = CuStringNew(); @@ -27,6 +29,8 @@ static int allTests(void) { CuSuiteAddSuite(suite, sort_test_suite()); CuSuiteAddSuite(suite, coverage_test_suite()); CuSuiteAddSuite(suite, wiggle_test_suite()); + CuSuiteAddSuite(suite, block_reader_test_suite()); + CuSuiteAddSuite(suite, stats_test_suite()); CuSuiteRun(suite); CuSuiteSummary(suite, output); diff --git a/tests/block_reader_test.c b/tests/block_reader_test.c new file mode 100644 index 0000000..8a0c935 --- /dev/null +++ b/tests/block_reader_test.c @@ -0,0 +1,125 @@ +#include "CuTest.h" +#include "block_reader.h" +#include "sonLib.h" + +// Walk a file with BlockReader and return (block_count, total_columns, first_ref_name). +static void walk_with_block_reader(const char *path, int *block_count, int64_t *total_cols, char **first_ref) { + FILE *fh = fopen(path, "r"); + LI *li = LI_construct(fh); + BlockReader *r = block_reader_open(li); + Tag *tag = block_reader_take_header(r); + tag_destruct(tag); + + *block_count = 0; + *total_cols = 0; + *first_ref = NULL; + Alignment *prev = NULL; + Alignment *a; + while ((a = block_reader_next(r, prev)) != NULL) { + if (*block_count == 0) { + *first_ref = stString_copy(a->row->sequence_name); + } + *total_cols += a->column_number; + (*block_count)++; + if (prev != NULL) alignment_destruct(prev, 1); + prev = a; + } + if (prev != NULL) alignment_destruct(prev, 1); + block_reader_destruct(r); + LI_destruct(li); + fclose(fh); +} + +static void test_block_reader_maf_input(CuTest *tc) { + int n; int64_t cols; char *ref; + walk_with_block_reader("./tests/evolverMammals.maf", &n, &cols, &ref); + CuAssertTrue(tc, n > 0); + CuAssertTrue(tc, cols > 0); + CuAssertStrEquals(tc, "Anc0.Anc0refChr0", ref); + free(ref); +} + +static void test_block_reader_taf_input(CuTest *tc) { + // Build a TAF on the side from the same MAF and walk it + int rc = st_system("./bin/taffy view -i ./tests/evolverMammals.maf -o ./tests/block_reader_test.taf"); + CuAssertIntEquals(tc, 0, rc); + + int n; int64_t cols; char *ref; + walk_with_block_reader("./tests/block_reader_test.taf", &n, &cols, &ref); + CuAssertTrue(tc, n > 0); + CuAssertTrue(tc, cols > 0); + CuAssertStrEquals(tc, "Anc0.Anc0refChr0", ref); + free(ref); + + st_system("rm -f ./tests/block_reader_test.taf"); +} + +static void test_block_reader_maf_and_taf_match(CuTest *tc) { + // BlockReader walking a MAF and walking the equivalent TAF should yield the + // same sequence of (block_count, total_columns) -- this is the contract + // that lets downstream tools accept either format transparently. + int rc = st_system("./bin/taffy view -i ./tests/evolverMammals.maf -o ./tests/block_reader_test.taf"); + CuAssertIntEquals(tc, 0, rc); + + int n_maf, n_taf; int64_t cols_maf, cols_taf; char *ref_maf, *ref_taf; + walk_with_block_reader("./tests/evolverMammals.maf", &n_maf, &cols_maf, &ref_maf); + walk_with_block_reader("./tests/block_reader_test.taf", &n_taf, &cols_taf, &ref_taf); + + CuAssertIntEquals(tc, n_maf, n_taf); + CuAssertIntEquals(tc, cols_maf, cols_taf); + CuAssertStrEquals(tc, ref_maf, ref_taf); + free(ref_maf); free(ref_taf); + + st_system("rm -f ./tests/block_reader_test.taf"); +} + +static void test_block_reader_unrecognized_format(CuTest *tc) { + // An input that isn't MAF or TAF should make block_reader_open return NULL. + char *junk_path = "./tests/block_reader_junk.txt"; + FILE *junk = fopen(junk_path, "w"); + fprintf(junk, "this is not a maf or taf file\n"); + fclose(junk); + + FILE *fh = fopen(junk_path, "r"); + LI *li = LI_construct(fh); + BlockReader *r = block_reader_open(li); + CuAssertPtrEquals(tc, NULL, r); + LI_destruct(li); + fclose(fh); + st_system("rm -f %s", junk_path); +} + +static void test_block_reader_header_take(CuTest *tc) { + // taf_read_header_2 picks up the run_length_encode_bases flag; verify the + // reader exposes it correctly when the input is RLE-encoded TAF. + int rc = st_system("./bin/taffy view -i ./tests/evolverMammals.maf -u -o ./tests/block_reader_test_rle.taf"); + CuAssertIntEquals(tc, 0, rc); + + FILE *fh = fopen("./tests/block_reader_test_rle.taf", "r"); + LI *li = LI_construct(fh); + BlockReader *r = block_reader_open(li); + CuAssertPtrNotNull(tc, r); + CuAssertTrue(tc, !block_reader_is_maf(r)); + CuAssertTrue(tc, block_reader_run_length_encoded(r)); + + Tag *tag = block_reader_take_header(r); + CuAssertPtrNotNull(tc, tag); + // After take, a second take returns NULL + CuAssertPtrEquals(tc, NULL, block_reader_take_header(r)); + tag_destruct(tag); + + block_reader_destruct(r); + LI_destruct(li); + fclose(fh); + st_system("rm -f ./tests/block_reader_test_rle.taf"); +} + +CuSuite *block_reader_test_suite(void) { + CuSuite *suite = CuSuiteNew(); + SUITE_ADD_TEST(suite, test_block_reader_maf_input); + SUITE_ADD_TEST(suite, test_block_reader_taf_input); + SUITE_ADD_TEST(suite, test_block_reader_maf_and_taf_match); + SUITE_ADD_TEST(suite, test_block_reader_unrecognized_format); + SUITE_ADD_TEST(suite, test_block_reader_header_take); + return suite; +} diff --git a/tests/coverage_test.c b/tests/coverage_test.c index ae20bae..7465f0f 100644 --- a/tests/coverage_test.c +++ b/tests/coverage_test.c @@ -37,8 +37,25 @@ static void test_coverage(CuTest *testCase) { } } +// Verifies that taffy coverage -i x.maf produces identical output to +// taffy view -i x.maf | taffy coverage. This is the per-tool dual-input +// regression test for the BlockReader migration. +static void test_coverage_maf_input(CuTest *testCase) { + char *example_file = "./tests/coverage_test.maf"; + char *piped_out = "./tests/coverage_test.piped.tsv"; + char *direct_out = "./tests/coverage_test.direct.tsv"; + int i = st_system("./bin/taffy view -i %s | ./bin/taffy coverage -g cat.a > %s", example_file, piped_out); + CuAssertIntEquals(testCase, 0, i); + int j = st_system("./bin/taffy coverage -i %s -g cat.a > %s", example_file, direct_out); + CuAssertIntEquals(testCase, 0, j); + int diff_ret = st_system("diff %s %s", piped_out, direct_out); + CuAssertIntEquals(testCase, 0, diff_ret); + st_system("rm -f %s %s", piped_out, direct_out); +} + CuSuite* coverage_test_suite(void) { CuSuite* suite = CuSuiteNew(); SUITE_ADD_TEST(suite, test_coverage); + SUITE_ADD_TEST(suite, test_coverage_maf_input); return suite; } diff --git a/tests/normalize_test.c b/tests/normalize_test.c index f3d4a9c..271d1fa 100644 --- a/tests/normalize_test.c +++ b/tests/normalize_test.c @@ -207,6 +207,41 @@ static void test_norm_pipeline(CuTest *testCase) { st_system("rm %s", output_file); } +// Verifies that taffy add-gap-bases -i x.maf produces output identical to +// taffy view -i x.maf | taffy add-gap-bases. add-gap-bases relies on adjacent +// blocks being linked (alignment_add_gap_strings reads l_row/r_row coordinates), +// so this is the most sensitive of the dual-input migration tests. +static void test_add_gap_bases_maf_input(CuTest *testCase) { + char *example_file = "./tests/evolverMammals.maf"; + char *piped_out = "./tests/agb.piped.taf"; + char *direct_out = "./tests/agb.direct.taf"; + int i = st_system("./bin/taffy view -i %s | ./bin/taffy add-gap-bases ./tests/seqs/* > %s", + example_file, piped_out); + CuAssertIntEquals(testCase, 0, i); + int j = st_system("./bin/taffy add-gap-bases -i %s -o %s ./tests/seqs/*", + example_file, direct_out); + CuAssertIntEquals(testCase, 0, j); + int diff_ret = st_system("diff %s %s", piped_out, direct_out); + CuAssertIntEquals(testCase, 0, diff_ret); + st_system("rm -f %s %s", piped_out, direct_out); +} + +// Verifies that taffy norm -i x.maf produces output identical to +// taffy view -i x.maf | taffy norm. This is the per-tool dual-input +// regression test for the BlockReader migration. +static void test_norm_maf_input(CuTest *testCase) { + char *example_file = "./tests/evolverMammals.maf"; + char *piped_out = "./tests/norm.piped.taf"; + char *direct_out = "./tests/norm.direct.taf"; + int i = st_system("./bin/taffy view -i %s | ./bin/taffy norm > %s", example_file, piped_out); + CuAssertIntEquals(testCase, 0, i); + int j = st_system("./bin/taffy norm -i %s -o %s", example_file, direct_out); + CuAssertIntEquals(testCase, 0, j); + int diff_ret = st_system("diff %s %s", piped_out, direct_out); + CuAssertIntEquals(testCase, 0, diff_ret); + st_system("rm -f %s %s", piped_out, direct_out); +} + CuSuite* normalize_test_suite(void) { CuSuite* suite = CuSuiteNew(); SUITE_ADD_TEST(suite, test_normalize); @@ -214,5 +249,7 @@ CuSuite* normalize_test_suite(void) { SUITE_ADD_TEST(suite, test_maf_norm_to_maf); SUITE_ADD_TEST(suite, test_dupe_filter); SUITE_ADD_TEST(suite, test_norm_pipeline); + SUITE_ADD_TEST(suite, test_norm_maf_input); + SUITE_ADD_TEST(suite, test_add_gap_bases_maf_input); return suite; } diff --git a/tests/sort_test.c b/tests/sort_test.c index ee1ce5e..80a305c 100644 --- a/tests/sort_test.c +++ b/tests/sort_test.c @@ -79,6 +79,25 @@ static void test_sort_filter_pad_and_dup_filter(CuTest *testCase) { } } +// Verifies that taffy sort -i x.maf produces output identical to +// taffy view -i x.maf | taffy sort. Per-tool dual-input regression +// for the BlockReader migration. +static void test_sort_maf_input(CuTest *testCase) { + char *example_file = "./tests/evolverMammals.maf.mini"; + char *sort_file = "./tests/sort_file.txt"; + char *piped_out = "./tests/sort.piped.taf"; + char *direct_out = "./tests/sort.direct.taf"; + int i = st_system("./bin/taffy view -i %s | ./bin/taffy sort -n %s --dontIgnoreFirstRow > %s", + example_file, sort_file, piped_out); + CuAssertIntEquals(testCase, 0, i); + int j = st_system("./bin/taffy sort -i %s -n %s --dontIgnoreFirstRow -o %s", + example_file, sort_file, direct_out); + CuAssertIntEquals(testCase, 0, j); + int diff_ret = st_system("diff %s %s", piped_out, direct_out); + CuAssertIntEquals(testCase, 0, diff_ret); + st_system("rm -f %s %s", piped_out, direct_out); +} + CuSuite* sort_test_suite(void) { CuSuite* suite = CuSuiteNew(); SUITE_ADD_TEST(suite, test_sort); @@ -86,5 +105,6 @@ CuSuite* sort_test_suite(void) { SUITE_ADD_TEST(suite, test_filter); SUITE_ADD_TEST(suite, test_filter_ignore_first_row); SUITE_ADD_TEST(suite, test_sort_filter_pad_and_dup_filter); + SUITE_ADD_TEST(suite, test_sort_maf_input); return suite; } diff --git a/tests/stats_test.c b/tests/stats_test.c new file mode 100644 index 0000000..81c0def --- /dev/null +++ b/tests/stats_test.c @@ -0,0 +1,35 @@ +#include "CuTest.h" +#include "taf.h" +#include "sonLib.h" + +// Verifies that taffy stats accepts MAF input directly and produces output identical to +// the equivalent `taffy view -i x.maf | taffy stats` invocation. This is the per-tool +// dual-input regression test for the BlockReader migration. Covers the three stats modes: +// -a (alignment stats) +// -b (sequence intervals) -- this previously errored out on MAF input +static void test_stats_maf_input(CuTest *tc) { + char *maf = "./tests/evolverMammals.maf"; + + // -a alignment stats + int rc; + rc = st_system("./bin/taffy stats -a -i %s > ./tests/stats.direct.a.txt", maf); + CuAssertIntEquals(tc, 0, rc); + rc = st_system("./bin/taffy view -i %s | ./bin/taffy stats -a > ./tests/stats.piped.a.txt", maf); + CuAssertIntEquals(tc, 0, rc); + CuAssertIntEquals(tc, 0, st_system("diff ./tests/stats.direct.a.txt ./tests/stats.piped.a.txt")); + st_system("rm -f ./tests/stats.direct.a.txt ./tests/stats.piped.a.txt"); + + // -b sequence intervals (previously rejected on MAF input) + rc = st_system("./bin/taffy stats -b -i %s > ./tests/stats.direct.b.txt", maf); + CuAssertIntEquals(tc, 0, rc); + rc = st_system("./bin/taffy view -i %s | ./bin/taffy stats -b > ./tests/stats.piped.b.txt", maf); + CuAssertIntEquals(tc, 0, rc); + CuAssertIntEquals(tc, 0, st_system("diff ./tests/stats.direct.b.txt ./tests/stats.piped.b.txt")); + st_system("rm -f ./tests/stats.direct.b.txt ./tests/stats.piped.b.txt"); +} + +CuSuite *stats_test_suite(void) { + CuSuite *suite = CuSuiteNew(); + SUITE_ADD_TEST(suite, test_stats_maf_input); + return suite; +} diff --git a/tests/wiggle_test.c b/tests/wiggle_test.c index 6a8ad9b..19820d1 100644 --- a/tests/wiggle_test.c +++ b/tests/wiggle_test.c @@ -48,10 +48,30 @@ static void test_annotate(CuTest *testCase) { } } +// Verifies that taffy annotate -i x.maf produces output identical to +// taffy view -i x.maf | taffy annotate. Output is always TAF (column tags +// have no MAF representation); only the input side gained MAF support. +static void test_annotate_maf_input(CuTest *testCase) { + char *example_file = "./tests/evolverMammals.maf.mini"; + char *example_wig = "./tests/evolverMammals.wig.mini"; + char *piped_out = "./tests/annotate_test.piped.taf"; + char *direct_out = "./tests/annotate_test.direct.taf"; + int i = st_system("./bin/taffy view -i %s | ./bin/taffy annotate -w %s -t test_label -r 'Anc0.' > %s", + example_file, example_wig, piped_out); + CuAssertIntEquals(testCase, 0, i); + int j = st_system("./bin/taffy annotate -i %s -w %s -t test_label -r 'Anc0.' -o %s", + example_file, example_wig, direct_out); + CuAssertIntEquals(testCase, 0, j); + int diff_ret = st_system("diff %s %s", piped_out, direct_out); + CuAssertIntEquals(testCase, 0, diff_ret); + st_system("rm -f %s %s", piped_out, direct_out); +} + CuSuite* wiggle_test_suite(void) { CuSuite* suite = CuSuiteNew(); SUITE_ADD_TEST(suite, test_wiggle); SUITE_ADD_TEST(suite, test_large_wiggle); SUITE_ADD_TEST(suite, test_annotate); + SUITE_ADD_TEST(suite, test_annotate_maf_input); return suite; }