Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ ${sonLibDir}/cuTest.a : sonLib

stTafDependencies = ${sonLibDir}/sonLib.a ${sonLibDir}/cuTest.a ${LIBDIR}/libabpoa.a

${LIBDIR}/libstTaf.a : ${libTests} ${libHeaders} ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o ${libHeaders} ${stTafDependencies}
${AR} rc libstTaf.a ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o
${LIBDIR}/libstTaf.a : ${libTests} ${libHeaders} ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o ${srcDir}/remote_io.o ${libHeaders} ${stTafDependencies}
${AR} rc libstTaf.a ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o ${srcDir}/remote_io.o
mv libstTaf.a ${LIBDIR}/

${srcDir}/alignment_block.o : ${srcDir}/alignment_block.c ${libHeaders}
Expand Down Expand Up @@ -70,6 +70,9 @@ ${srcDir}/prefix_sort.o : ${srcDir}/prefix_sort.c ${libHeaders}
${srcDir}/wiggle.o : ${srcDir}/wiggle.c ${libHeaders}
${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/wiggle.o -c ${srcDir}/wiggle.c

# Compile the URL / remote-index helpers (remote_io.c) into an object file.
# Rebuilt whenever the source or any library header changes. The flags follow
# the same pattern as the sibling object rules (e.g. wiggle.o); -c means no
# link step happens here even though LDFLAGS is passed.
${srcDir}/remote_io.o : ${srcDir}/remote_io.c ${libHeaders}
${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/remote_io.o -c ${srcDir}/remote_io.c

${BINDIR}/stTafTests : ${libTests} ${LIBDIR}/libstTaf.a ${stTafDependencies}
${CC} ${CFLAGS} ${LDFLAGS} -o ${BINDIR}/stTafTests ${libTests} ${LIBDIR}/libstTaf.a ${LDLIBS}

Expand Down
26 changes: 25 additions & 1 deletion docs/c_cli_lib_install.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,28 @@ To test the installation do:
make test

This will run the unit tests. You should see that all tests pass okay. You will
then want to add the taf/bin directory to your path.
then want to add the taf/bin directory to your path.

## htslib dependency

Taffy uses htslib for bgzip support and HTTP/S3 input. At build time it
detects htslib with `pkg-config --exists htslib`. If `pkg-config` doesn't
find an htslib install, the build still succeeds, but bgzipped input and URL
input are disabled.

You can also point at a non-pkg-config htslib install by setting
`HTSLIB_CFLAGS` and `HTSLIB_LIBS` in the environment before `make`.

### URL/HTTPS/S3 input requires htslib with libcurl

`taffy view -r SEQ:s-e -i https://...` (or `s3://...`, etc.) routes through
htslib's libcurl-backed `hFILE` layer. Most distro/conda htslib packages
have this enabled. If you built htslib from source, you may need to rerun
configure with libcurl support:

cd htslib
autoreconf -i # only if there's no ./configure yet
./configure --enable-libcurl
make

If your htslib lacks libcurl, taffy will print a clear error suggesting the
above when a URL input is given.
30 changes: 22 additions & 8 deletions taf_view.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "taf.h"
#include "tai.h"
#include "remote_io.h"
#include "sonLib.h"
#include <getopt.h>
#include <time.h>
Expand Down Expand Up @@ -213,9 +214,19 @@ int taf_view_main(int argc, char *argv[]) {
return 1;
}

FILE *input = inputFile == NULL ? stdin : fopen(inputFile, "r");
if (input == NULL) {
fprintf(stderr, "Unable to open input file: %s\n", inputFile);
/* For URL inputs (only meaningful with -r region queries) we'll skip the
* local fopen; LI is constructed via bgzf_open which goes through htslib's
* URL-aware backend. */
bool input_is_url = inputFile != NULL && is_url(inputFile);
FILE *input = NULL;
if (!input_is_url) {
input = inputFile == NULL ? stdin : fopen(inputFile, "r");
if (input == NULL) {
fprintf(stderr, "Unable to open input file: %s\n", inputFile);
return 1;
}
} else if (region == NULL) {
fprintf(stderr, "URL inputs are only supported with -r region queries\n");
return 1;
}

Expand All @@ -241,7 +252,10 @@ int taf_view_main(int argc, char *argv[]) {
}

LW *output = LW_construct(output_fh, use_compression);
LI *li = LI_construct(input);
LI *li = input_is_url ? LI_construct_from_path(inputFile) : LI_construct(input);
if (li == NULL) {
return 1;
}

// sniff the format
int input_format = check_input_format(LI_peek_at_next_line(li));
Expand Down Expand Up @@ -309,9 +323,9 @@ int taf_view_main(int argc, char *argv[]) {

st_logInfo("Region: contig=%s start=%" PRIi64 " length=%" PRIi64 "\n", region_seq, region_start, region_length);

char *tai_fn = tai_path(inputFile);
FILE *tai_fh = fopen(tai_fn, "r");
char *tai_fn = input_is_url ? tai_path_for(inputFile) : tai_path(inputFile);
FILE *tai_fh = open_tai_for_reading(tai_fn);

if (tai_fh == NULL) {
fprintf(stderr, "Index %s not found. Please run taffy index first\n", tai_fn);
return 1;
Expand Down Expand Up @@ -429,7 +443,7 @@ int taf_view_main(int argc, char *argv[]) {
//////////////////////////////////////////////

LI_destruct(li);
if(inputFile != NULL) {
if (inputFile != NULL && !input_is_url) {
fclose(input);
}
LW_destruct(output, outputFile != NULL);
Expand Down
21 changes: 19 additions & 2 deletions taffy/_taffy_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@

LI *LI_construct(FILE *fh);

/*
* Construct an LI directly from a local path or a URL (http://, https://,
* s3://, etc.) via htslib's URL-aware bgzf_open. Returns NULL on failure.
*/
LI *LI_construct_from_path(const char *path);

void LI_destruct(LI *li);

char *LI_peek_at_next_line(LI *li);
Expand Down Expand Up @@ -248,6 +254,15 @@
* Free a tai iterator
*/
void tai_iterator_destruct(TaiIt *tai_it);

/* Returns true if the path looks like a URL (contains "://"). */
bool is_url(const char *path);

/* Open an input source for the .tai index, whether local file or URL.
* For a local path: behaves like fopen(path, "r").
* For a URL: streams the entire response into a temp FILE* via htslib's
* hopen/hread. Returns NULL on failure. */
FILE *open_tai_for_reading(const char *path);
""")

# set_source() gives the name of the python extension module to
Expand All @@ -260,9 +275,10 @@
#include <stdlib.h>
#include "htslib/bgzf.h"
#include "htslib/kstring.h"
#include "taf.h"
#include "line_iterator.h"
#include "taf.h"
#include "line_iterator.h"
#include "tai.h"
#include "remote_io.h"
""",
include_dirs=["taffy/submodules/sonLib/externalTools/cutest",
"taffy/submodules/sonLib/C/inc",
Expand All @@ -287,6 +303,7 @@
"taffy/impl/ond.c",
"taffy/impl/taf.c",
"taffy/impl/tai.c",
"taffy/impl/remote_io.c",
],
extra_compile_args=["-DUSE_HTSLIB"],
libraries=["hts"],
Expand Down
36 changes: 36 additions & 0 deletions taffy/impl/line_iterator.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,42 @@ LI *LI_construct(FILE *fh) {
return li;
}

#ifdef USE_HTSLIB
/*
 * Build a line iterator straight from a path or URL by handing it to
 * bgzf_open(). On success the first line is pre-read into li->line and
 * li->pos / li->prev_pos both hold the offset bgzf_tell() reported before
 * that read. Returns NULL (after printing to stderr) if bgzf_open() fails.
 */
LI *LI_construct_from_path(const char *path) {
    BGZF *handle = bgzf_open(path, "r");
    if (handle == NULL) {
        fprintf(stderr, "Unable to open input %s (htslib bgzf_open failed)\n", path);
        /* A "://" in the path means the caller asked for a remote file, so
         * point at the most likely cause: htslib without libcurl. */
        if (strstr(path, "://") != NULL) {
            fprintf(stderr, " URL inputs require htslib built with libcurl support.\n"
                            " If you built htslib yourself, rerun ./configure --enable-libcurl and rebuild.\n");
        }
        return NULL;
    }

    LI *li = st_calloc(1, sizeof(LI));
    li->bgzf = handle;

    /* Compression code 2 gets an index-build initialised up front
     * (presumably the BGZF format code -- confirm against htslib's bgzf.h).
     * A failure there is treated as a programming error. */
    if (bgzf_compression(li->bgzf) == 2 && bgzf_index_build_init(li->bgzf) != 0) {
        assert(false);
    }

    /* Record the starting offset before consuming the first line. */
    li->prev_pos = bgzf_tell(li->bgzf);
    li->pos = li->prev_pos;

    /* NOTE(review): the bgzf_getline() return value is not inspected here. */
    kstring_t first_line = KS_INITIALIZE;
    bgzf_getline(li->bgzf, '\n', &first_line);
    li->line = ks_release(&first_line);
    return li;
}
#else
/*
 * Without htslib there is no URL-aware backend: open the path with fopen()
 * and delegate to LI_construct(). Returns NULL (after printing to stderr) if
 * the file cannot be opened.
 */
LI *LI_construct_from_path(const char *path) {
    FILE *fh = fopen(path, "r");
    if (fh != NULL) {
        return LI_construct(fh);
    }
    fprintf(stderr, "Unable to open input %s\n", path);
    return NULL;
}
#endif

void LI_destruct(LI *li) {
#ifdef USE_HTSLIB
bgzf_close(li->bgzf);
Expand Down
90 changes: 90 additions & 0 deletions taffy/impl/remote_io.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#include "remote_io.h"
#include "sonLib.h"
#include "htslib/hfile.h"
#include <string.h>
#include <stdlib.h>
#include <errno.h>

/* Report whether `path` looks like a URL, i.e. contains the "://" scheme
 * separator anywhere in the string. A NULL path is never a URL. */
bool is_url(const char *path) {
    return path != NULL && strstr(path, "://") != NULL;
}

/* Return a newly allocated copy of `input` with ".tai" appended. The string
 * comes from st_calloc() and is owned by the caller. `input` must not be
 * NULL. */
char *tai_path_for(const char *input) {
    assert(input != NULL);
    static const char suffix[] = ".tai";
    size_t input_len = strlen(input);
    /* sizeof suffix already counts the terminating NUL. */
    char *tai = (char *) st_calloc(input_len + sizeof suffix, sizeof(char));
    memcpy(tai, input, input_len);
    memcpy(tai + input_len, suffix, sizeof suffix);
    return tai;
}

/* Download everything behind `url` into a heap buffer using htslib's
 * hopen()/hread().
 *
 * On success returns true, stores the malloc'd buffer in *out_buf (the caller
 * frees it) and the number of bytes read in *out_len. The buffer is not
 * NUL-terminated.
 *
 * On failure returns false and leaves *out_buf / *out_len untouched. Open,
 * read and close failures print a message to stderr; allocation failures
 * return false without printing anything. */
static bool slurp_url(const char *url, char **out_buf, size_t *out_len) {
    hFILE *remote = hopen(url, "r");
    if (remote == NULL) {
        fprintf(stderr, "Unable to open URL: %s (%s)\n", url, strerror(errno));
        fprintf(stderr, " URL inputs require htslib built with libcurl support.\n"
                        " If you built htslib yourself, rerun ./configure --enable-libcurl and rebuild.\n");
        return false;
    }

    /* Start at 64 KiB and double whenever the buffer fills up. */
    size_t capacity = 64 * 1024;
    size_t used = 0;
    char *data = (char *) malloc(capacity);
    if (data == NULL) {
        hclose_abruptly(remote);
        return false;
    }

    ssize_t got;
    do {
        if (used == capacity) {
            capacity *= 2;
            char *grown = (char *) realloc(data, capacity);
            if (grown == NULL) {
                free(data);
                hclose_abruptly(remote);
                return false;
            }
            data = grown;
        }
        got = hread(remote, data + used, capacity - used);
        if (got < 0) {
            fprintf(stderr, "Read error fetching URL: %s\n", url);
            free(data);
            hclose_abruptly(remote);
            return false;
        }
        used += (size_t) got;
    } while (got != 0); /* a zero-length read marks end of stream */

    if (hclose(remote) != 0) {
        fprintf(stderr, "Close error after fetching URL: %s\n", url);
        free(data);
        return false;
    }

    *out_buf = data;
    *out_len = used;
    return true;
}

/* Open a .tai index for reading, from either a local path or a URL.
 *
 * Local path: exactly fopen(path, "r"); NULL is returned silently on failure.
 * URL:        the whole response is fetched into memory, copied into an
 *             anonymous tmpfile() and returned positioned at offset 0.
 *
 * Returns NULL on failure. URL failures that reach this function's own checks
 * print a diagnostic to stderr. The caller closes the returned stream with
 * fclose(); a tmpfile() is removed automatically at that point. */
FILE *open_tai_for_reading(const char *path) {
    if (!is_url(path)) {
        return fopen(path, "r");
    }
    char *buf = NULL;
    size_t len = 0;
    if (!slurp_url(path, &buf, &len)) {
        return NULL;
    }
    /* Spool to an anonymous tmpfile rather than fmemopen, because downstream
     * LI_construct calls fileno()+bgzf_dopen which needs a real file descriptor.
     * tmpfile() is auto-deleted when closed, so no cleanup required by the caller. */
    FILE *fh = tmpfile();
    if (fh == NULL) {
        fprintf(stderr, "tmpfile failed for buffered .tai from %s\n", path);
        free(buf);
        return NULL;
    }
    /* fwrite() only fills the stdio buffer, so a full disk or I/O error may
     * not surface until the data is flushed. rewind() would flush but discard
     * the error, so flush and seek explicitly and check both. This ensures a
     * truncated index is never returned as if it were complete. */
    bool spooled = fwrite(buf, 1, len, fh) == len
                && fflush(fh) == 0
                && fseek(fh, 0L, SEEK_SET) == 0;
    free(buf);
    if (!spooled) {
        fprintf(stderr, "Short write spooling .tai to tmpfile from %s\n", path);
        fclose(fh);
        return NULL;
    }
    return fh;
}
8 changes: 8 additions & 0 deletions taffy/inc/line_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ typedef struct _LI {

LI *LI_construct(FILE *fh);

/*
* Construct an LI directly from a local path or a URL, going through htslib's
* bgzf_open (which routes URLs through hFILE+libcurl). Use this when reading
* remote files via HTTP/HTTPS/S3/GCS; otherwise LI_construct(FILE*) is fine.
* Returns NULL on failure (with an error printed to stderr).
*/
LI *LI_construct_from_path(const char *path);

void LI_destruct(LI *li);

/*
Expand Down
36 changes: 36 additions & 0 deletions taffy/inc/remote_io.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#ifndef TAF_REMOTE_IO_H_
#define TAF_REMOTE_IO_H_

/*
 * Helpers for reading TAF/MAF + .tai inputs from URLs (HTTP/HTTPS/S3/GCS/etc.)
 * via htslib's hFILE backend. The data file itself is opened through
 * bgzf_open (see line_iterator.c -- LI_construct_from_path), which accepts
 * both local paths and URLs. The .tai is small, so it is fetched in full
 * into memory and then spooled to an anonymous temporary file that the rest
 * of the code reads as an ordinary FILE*.
 */

#include <stdio.h>
#include <stdbool.h>

/* Heuristic URL detection: any non-NULL string containing "://" is treated as
 * a URL (http, https, s3, gs, ftp, ...). Whether the scheme can actually be
 * opened depends on what htslib's hopen supports. NULL is never a URL. */
bool is_url(const char *path);

/* Build the .tai sibling path/URL for a given input by appending ".tai".
 * No assumption is made about local-vs-remote. `input` must not be NULL.
 * The caller frees the returned string.
 * NOTE(review): described by the author as the same construction as
 * tai_path() -- confirm against tai.h. */
char *tai_path_for(const char *input);

/* Open an input source for the .tai index, whether local file or URL.
 *
 * For a local path: behaves like fopen(path, "r") and prints nothing on
 *                   failure.
 * For a URL: reads the entire response into memory via htslib's hopen/hread,
 *            writes it to an anonymous tmpfile() and returns that stream
 *            rewound to the start. A real temporary file is used instead of
 *            fmemopen because downstream code needs a file descriptor. The
 *            temporary file is deleted automatically when the stream is
 *            closed.
 *
 * Returns NULL on failure. For URLs, open/read/close/spooling failures print
 * an error to stderr.
 */
FILE *open_tai_for_reading(const char *path);

#endif
Loading