Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ ${sonLibDir}/cuTest.a : sonLib

stTafDependencies = ${sonLibDir}/sonLib.a ${sonLibDir}/cuTest.a ${LIBDIR}/libabpoa.a

${LIBDIR}/libstTaf.a : ${libTests} ${libHeaders} ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o ${libHeaders} ${stTafDependencies}
${AR} rc libstTaf.a ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o
${LIBDIR}/libstTaf.a : ${libTests} ${libHeaders} ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o ${srcDir}/block_reader.o ${libHeaders} ${stTafDependencies}
${AR} rc libstTaf.a ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o ${srcDir}/block_reader.o
mv libstTaf.a ${LIBDIR}/

${srcDir}/alignment_block.o : ${srcDir}/alignment_block.c ${libHeaders}
Expand Down Expand Up @@ -67,6 +67,9 @@ ${srcDir}/tai.o : ${srcDir}/tai.c ${libHeaders}
# Compile prefix_sort.c into prefix_sort.o. The object is rebuilt whenever the
# source file or anything listed in ${libHeaders} changes.
${srcDir}/prefix_sort.o : ${srcDir}/prefix_sort.c ${libHeaders}
${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/prefix_sort.o -c ${srcDir}/prefix_sort.c

# Compile block_reader.c into block_reader.o. The object is rebuilt whenever the
# source file or anything listed in ${libHeaders} changes, and it is both a
# prerequisite and an archive member of ${LIBDIR}/libstTaf.a.
# NOTE(review): ${LDFLAGS} is passed even though -c stops before the link step;
# presumably harmless, but confirm it carries no flags that matter at compile time.
${srcDir}/block_reader.o : ${srcDir}/block_reader.c ${libHeaders}
${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/block_reader.o -c ${srcDir}/block_reader.c

# Compile wiggle.c into wiggle.o. The object is rebuilt whenever the source file
# or anything listed in ${libHeaders} changes.
${srcDir}/wiggle.o : ${srcDir}/wiggle.c ${libHeaders}
${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/wiggle.o -c ${srcDir}/wiggle.c

Expand Down
20 changes: 16 additions & 4 deletions taf_add_gap_bases.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
*/
extern "C" {
#include "taf.h"
#include "block_reader.h"
#include "sonLib.h"
}
#include "bioioC.h"
Expand All @@ -20,7 +21,7 @@ static int64_t maximum_gap_string_length = 50;
static void usage() {
fprintf(stderr, "taffy add_gap_bases SEQ_FILExN [options]\n");
fprintf(stderr, "Add interstitial gap strings to taf file\n");
fprintf(stderr, "-i --inputFile : Input taf file to normalize. If not specified reads from stdin\n");
fprintf(stderr, "-i --inputFile : Input TAF or MAF file. If not specified reads from stdin\n");
fprintf(stderr, "-o --outputFile : Output taf file. If not specified outputs to stdout\n");
fprintf(stderr, "-a --halFile : HAL file for extracting gap sequence (MAF must be created with hal2maf *without* --onlySequenceNames)\n");
fprintf(stderr, "-m --maximumGapStringLength : The maximum size of a gap string to add, be default: %" PRIi64 "\n",
Expand Down Expand Up @@ -143,13 +144,23 @@ int taf_add_gap_bases_main(int argc, char *argv[]) {
LW *output = LW_construct(outputFile == NULL ? stdout : fopen(outputFile, "w"), use_compression);
LI *li = LI_construct(input);

// Pass the header line to determine parameters and write the updated taf header
Tag *tag = taf_read_header_2(li, &run_length_encode_bases);
// Open a format-agnostic reader. For MAF input the reader auto-links adjacent
// blocks so alignment_add_gap_strings sees the inter-block coordinate continuity
// it needs. Output is always TAF (interstitial gap strings are a TAF-only feature).
BlockReader *reader = block_reader_open(li);
if (reader == NULL) {
LW_destruct(output, outputFile != NULL);
LI_destruct(li);
if (inputFile != NULL) fclose(input);
return 1;
}
run_length_encode_bases = block_reader_run_length_encoded(reader);
Tag *tag = block_reader_take_header(reader);
taf_write_header(tag, output);
tag_destruct(tag);

Alignment *alignment, *p_alignment = NULL;
while((alignment = taf_read_block(p_alignment, run_length_encode_bases, li)) != NULL) {
while ((alignment = block_reader_next(reader, p_alignment)) != NULL) {
// Add in the gap strings if there is a previous block
if(p_alignment != NULL) {
alignment_add_gap_strings(p_alignment, alignment, fastas, hal_handle, hal_species, maximum_gap_string_length);
Expand All @@ -172,6 +183,7 @@ int taf_add_gap_bases_main(int argc, char *argv[]) {
// Cleanup
//////////////////////////////////////////////

block_reader_destruct(reader);
LI_destruct(li);
if(inputFile != NULL) {
fclose(input);
Expand Down
26 changes: 14 additions & 12 deletions taf_annotate.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "taf.h"
#include "tai.h"
#include "block_reader.h"
#include "sonLib.h"
#include <getopt.h>
#include <time.h>
Expand All @@ -15,7 +16,7 @@ static int64_t repeat_coordinates_every_n_columns = 10000;
static void usage(void) {
fprintf(stderr, "taffy annotate [options]\n");
fprintf(stderr, "Annotate the columns of a taf file using wiggle file\n");
fprintf(stderr, "-i --inputFile : Input TAF file. If not specified reads from stdin\n");
fprintf(stderr, "-i --inputFile : Input TAF or MAF file. If not specified reads from stdin. Output is always TAF (column tags have no MAF representation)\n");
fprintf(stderr, "-w --wiggle [FILE_NAME] : REQUIRED The input wiggle file\n");
fprintf(stderr, "-t --tagName [STRING] : REQUIRED: The name of the tag to annotate for the given wiggle\n");
fprintf(stderr, "-s --repeatCoordinatesEveryNColumns : Repeat coordinates of each sequence at least every n columns. By default: %" PRIi64 "\n", repeat_coordinates_every_n_columns);
Expand Down Expand Up @@ -143,23 +144,23 @@ int taf_annotate_main(int argc, char *argv[]) {
// Open the inputs and outputs and parse the labels to annotate
//////////////////////////////////////////////

// load the input
// load the input (TAF or MAF -- BlockReader sniffs and dispatches)
FILE *taf_fh = taf_file == NULL ? stdin : fopen(taf_file, "r");
if (taf_fh == NULL) {
fprintf(stderr, "Unable to open input TAF file: %s\n", taf_file);
fprintf(stderr, "Unable to open input TAF/MAF file: %s\n", taf_file);
return 1;
}
LI *li = LI_construct(taf_fh);

// Check that the input is a TAF file
if (check_input_format(LI_peek_at_next_line(li)) != 0) {
fprintf(stderr, "Input not supported: requires #taf header\n");
BlockReader *reader = block_reader_open(li);
if (reader == NULL) {
LI_destruct(li);
if (taf_file != NULL) fclose(taf_fh);
return 1;
}

// Check if run_length_encode_bases is set and read header
bool run_length_encode_bases = 0;
Tag *tag = taf_read_header_2(li, &run_length_encode_bases);
// Output is TAF regardless of input format. For MAF input we won't run-length encode the
// body (annotate has no flag to opt in); for TAF input we preserve the input's RLE setting.
bool run_length_encode_bases = block_reader_run_length_encoded(reader);
Tag *tag = block_reader_take_header(reader);

// Load the wiggle file, making coordinates 0 based
stHash *labels = wig_parse(wig_file, ref_prefix, 1);
Expand All @@ -183,7 +184,7 @@ int taf_annotate_main(int argc, char *argv[]) {
Alignment *alignment = NULL;
Alignment *p_alignment = NULL;
// Keep reading blocks while available
while((alignment = taf_read_block(p_alignment, run_length_encode_bases, li)) != NULL) {
while ((alignment = block_reader_next(reader, p_alignment)) != NULL) {
label_alignment(alignment, labels, tag_name); // Make any changes to the alignment for output

// Write back the labelled taf
Expand All @@ -204,6 +205,7 @@ int taf_annotate_main(int argc, char *argv[]) {
// Cleanup
//////////////////////////////////////////////

block_reader_destruct(reader);
LI_destruct(li);
if(taf_file != NULL) {
fclose(taf_fh);
Expand Down
24 changes: 14 additions & 10 deletions taf_coverage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

extern "C" {
#include "taf.h"
#include "block_reader.h"
#include "sonLib.h"
}
#include <getopt.h>
Expand Down Expand Up @@ -57,7 +58,7 @@ static void print_coverage_tsv(const ContigCoverageMap& contig_cov_map, const se
static void usage() {
fprintf(stderr, "taffy coverage [options]\n");
fprintf(stderr, "Compute very basic pairwise coverage stats as fraction and bp for a TAF file\n");
fprintf(stderr, "-i --inputFile : Input taf file to normalize. If not specified reads from stdin\n");
fprintf(stderr, "-i --inputFile : Input TAF or MAF file. If not specified reads from stdin\n");
fprintf(stderr, "-r --reference : Name of reference genome. If note specified used first row in block\n");
fprintf(stderr, "-g --genomeNames : List of genome names (quoted, space-separated), ex from \"$(halStats --genomes aln.hal)\". This can help contig name parsing which otherwise uses everything up to first . as genome name\n");
fprintf(stderr, "-a, --gapThreshold : Breakdown rows using given gap threshold, to restrict aligned bp to exclude gaps>threshold. Multiple allowed. \n");
Expand Down Expand Up @@ -161,29 +162,32 @@ int taf_coverage_main(int argc, char *argv[]) {
stList_destruct(tokens);
}

// Open TAF
// Open input (MAF or TAF -- BlockReader sniffs and dispatches)
FILE *input = inputFile == NULL ? stdin : fopen(inputFile, "r");
LI *li = LI_construct(input);
BlockReader *reader = block_reader_open(li);
if (reader == NULL) {
LI_destruct(li);
if (inputFile != NULL) fclose(input);
return 1;
}
tag_destruct(block_reader_take_header(reader));

// Parse the header
bool run_length_encode_bases;
Tag *tag = taf_read_header_2(li, &run_length_encode_bases);
tag_destruct(tag);

Alignment *alignment, *p_alignment = NULL;
while((alignment = taf_read_block(p_alignment, run_length_encode_bases, li)) != NULL) {
while ((alignment = block_reader_next(reader, p_alignment)) != NULL) {
// update the coverage
update_block_coverage(alignment, p_alignment, reference, genome_names_hash, contig_coverage_map);

// Clean up the previous alignment
if(p_alignment != NULL) {
if (p_alignment != NULL) {
alignment_destruct(p_alignment, 1);
}
p_alignment = alignment; // Update the previous alignment
}
if(p_alignment != NULL) { // Clean up the final alignment
if (p_alignment != NULL) { // Clean up the final alignment
alignment_destruct(p_alignment, 1);
}
block_reader_destruct(reader);

// add gaps from last covered base to ends of contigs
add_final_gap(contig_coverage_map);
Expand Down
28 changes: 20 additions & 8 deletions taf_norm.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
*/

#include "taf.h"
#include "block_reader.h"
#include "sonLib.h"
#include <getopt.h>
#include <time.h>
Expand All @@ -21,8 +22,8 @@ static void usage(void) {
fprintf(stderr, "Note, taffy norm will resort the rows alpha-numerically according to sequence name, "
"as is necessary to successfully merge all mergeable rows. Is the resorting is undesired, pipe the"
"result to taffy sort to resort.\n");
fprintf(stderr, "-i --inputFile : Input taf file to normalize. If not specified reads from stdin\n");
fprintf(stderr, "-o --outputFile : Output taf file. If not specified outputs to stdout\n");
fprintf(stderr, "-i --inputFile : Input TAF or MAF file to normalize. If not specified reads from stdin\n");
fprintf(stderr, "-o --outputFile : Output taf file (or maf if -k is given). If not specified outputs to stdout\n");
fprintf(stderr, "-l --logLevel : Set the log level\n");
fprintf(stderr, "-k --maf : Print maf output instead of taf\n");
fprintf(stderr, "-m --maximumBlockLengthToMerge : Only merge together any two adjacent blocks if one or both is less than this many bases long, by default: %" PRIi64 "\n", maximum_block_length_to_merge);
Expand All @@ -37,13 +38,12 @@ static void usage(void) {
fprintf(stderr, "-h --help : Print this help message\n");
}

static Alignment *get_next_taf_block(LI *li, bool run_length_encode_bases) {
static Alignment *get_next_block(BlockReader *reader) {
static Alignment *alignments[3];
static int64_t alignment_index=0;
assert(alignment_index >= 0);
while(alignment_index < 3) {
alignments[alignment_index] = taf_read_block(alignment_index == 0 ? NULL : alignments[alignment_index-1],
run_length_encode_bases, li); // Read a block
alignments[alignment_index] = block_reader_next(reader, alignment_index == 0 ? NULL : alignments[alignment_index-1]);
if(alignments[alignment_index] == NULL) { // The read block is empty
break;
}
Expand Down Expand Up @@ -325,16 +325,26 @@ int taf_norm_main(int argc, char *argv[]) {
LW *output = LW_construct(outputFile == NULL ? stdout : fopen(outputFile, "w"), use_compression);
LI *li = LI_construct(input);

// Pass the header line to determine parameters and write the updated taf header
Tag *tag = taf_read_header_2(li, &run_length_encode_bases);
// Open a format-agnostic reader; for MAF input the reader transparently links adjacent
// blocks via alignment_link_adjacent so downstream merging logic sees the same coordinate
// continuity it gets from a TAF input.
BlockReader *reader = block_reader_open(li);
if (reader == NULL) {
LW_destruct(output, outputFile != NULL);
LI_destruct(li);
if (inputFile != NULL) fclose(input);
return 1;
}
run_length_encode_bases = block_reader_run_length_encoded(reader);
Tag *tag = block_reader_take_header(reader);
if(output_maf && run_length_encode_bases) { // Remove this tag from the maf output as not relevant
tag = tag_remove(tag, "run_length_encode_bases");
}
output_maf ? maf_write_header(tag, output) : taf_write_header(tag, output);
tag_destruct(tag);

Alignment *alignment, *p_alignment = NULL, *p_p_alignment = NULL;
while((alignment = get_next_taf_block(li, run_length_encode_bases)) != NULL) {
while((alignment = get_next_block(reader)) != NULL) {
// First resort the rows to be alphabetical and then realign with any previous block. This ensures
// we will not have any mergeable rows unlinked. Note:
// We do not allow row substitutions when linking two blocks to merge (see last parameter of function call),
Expand Down Expand Up @@ -396,6 +406,8 @@ int taf_norm_main(int argc, char *argv[]) {
// Cleanup
//////////////////////////////////////////////

block_reader_destruct(reader);
LI_destruct(li);
if(inputFile != NULL) {
fclose(input);
}
Expand Down
19 changes: 14 additions & 5 deletions taf_sort.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "taf.h"
#include "tai.h"
#include "block_reader.h"
#include "sonLib.h"
#include <getopt.h>
#include <time.h>
Expand Down Expand Up @@ -193,17 +194,24 @@ int taf_sort_main(int argc, char *argv[]) {
stList *prefixes_to_sort_by = load_sort_file(sort_file);
stList *prefixes_to_dup_filter = load_sort_file(dup_filter_file);

// Parse the header
bool run_length_encode_bases;
Tag *tag = taf_read_header_2(li, &run_length_encode_bases);
// Open a format-agnostic reader (TAF or MAF input)
BlockReader *reader = block_reader_open(li);
if (reader == NULL) {
LW_destruct(output, output_file != NULL);
LI_destruct(li);
if (input_file != NULL) fclose(input);
return 1;
}
bool run_length_encode_bases = block_reader_run_length_encoded(reader);
Tag *tag = block_reader_take_header(reader);

// Write the header
// For now, output is always TAF (a MAF output mode could be added later -- see pass-2 plan).
taf_write_header(tag, output);
tag_destruct(tag);

// Write the alignment blocks
Alignment *alignment, *p_alignment = NULL, *pp_alignment = NULL;
while((alignment = taf_read_block(p_alignment, run_length_encode_bases, li)) != NULL) {
while ((alignment = block_reader_next(reader, p_alignment)) != NULL) {
process_alignment_block(pp_alignment, p_alignment, prefixes_to_filter_by, prefixes_to_pad,
prefixes_to_sort_by, prefixes_to_dup_filter, run_length_encode_bases, ignore_first_row, output);
pp_alignment = p_alignment;
Expand All @@ -220,6 +228,7 @@ int taf_sort_main(int argc, char *argv[]) {
//////////////////////////////////////////////


block_reader_destruct(reader);
LI_destruct(li);
if(input_file != NULL) {
fclose(input);
Expand Down
Loading