diff --git a/Makefile b/Makefile index a275405..9d0725a 100644 --- a/Makefile +++ b/Makefile @@ -33,8 +33,8 @@ ${sonLibDir}/cuTest.a : sonLib stTafDependencies = ${sonLibDir}/sonLib.a ${sonLibDir}/cuTest.a ${LIBDIR}/libabpoa.a -${LIBDIR}/libstTaf.a : ${libTests} ${libHeaders} ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o ${libHeaders} ${stTafDependencies} - ${AR} rc libstTaf.a ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o +${LIBDIR}/libstTaf.a : ${libTests} ${libHeaders} ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o ${srcDir}/remote_io.o ${libHeaders} ${stTafDependencies} + ${AR} rc libstTaf.a ${srcDir}/alignment_block.o ${srcDir}/line_iterator.o ${srcDir}/maf.o ${srcDir}/paf.o ${srcDir}/ond.o ${srcDir}/taf.o ${srcDir}/add_gap_bases.o ${srcDir}/merge_adjacent_alignments.o ${srcDir}/prefix_sort.o ${srcDir}/wiggle.o ${srcDir}/tai.o ${srcDir}/remote_io.o mv libstTaf.a ${LIBDIR}/ ${srcDir}/alignment_block.o : ${srcDir}/alignment_block.c ${libHeaders} @@ -70,6 +70,9 @@ ${srcDir}/prefix_sort.o : ${srcDir}/prefix_sort.c ${libHeaders} ${srcDir}/wiggle.o : ${srcDir}/wiggle.c ${libHeaders} ${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/wiggle.o -c ${srcDir}/wiggle.c +${srcDir}/remote_io.o : ${srcDir}/remote_io.c ${libHeaders} + ${CC} ${CFLAGS} ${LDFLAGS} -o ${srcDir}/remote_io.o -c ${srcDir}/remote_io.c + ${BINDIR}/stTafTests : ${libTests} ${LIBDIR}/libstTaf.a ${stTafDependencies} ${CC} ${CFLAGS} ${LDFLAGS} 
-o ${BINDIR}/stTafTests ${libTests} ${LIBDIR}/libstTaf.a ${LDLIBS} diff --git a/docs/c_cli_lib_install.md b/docs/c_cli_lib_install.md index 851c788..a2a7e4c 100644 --- a/docs/c_cli_lib_install.md +++ b/docs/c_cli_lib_install.md @@ -10,4 +10,28 @@ To test the installation do: make test This will run the unit tests. You should see that all tests pass okay. You will -then want to add the taf/bin directory to your path. \ No newline at end of file +then want to add the taf/bin directory to your path. + +## htslib dependency + +Taffy uses htslib (via `pkg-config --exists htslib` at build time) for bgzip +support and HTTP/S3 input. If `pkg-config` doesn't find an htslib install, +the build still succeeds but bgzipped input and URL input are disabled. + +You can also point at a non-pkg-config htslib install by setting +`HTSLIB_CFLAGS` and `HTSLIB_LIBS` in the environment before `make`. + +### URL/HTTPS/S3 input requires htslib with libcurl + +`taffy view -r SEQ:s-e -i https://...` (or `s3://...`, etc.) routes through +htslib's libcurl-backed `hFILE` layer. Most distro/conda htslib packages +have this enabled. If you built htslib from source, you may need to rerun +configure with libcurl support: + + cd htslib + autoreconf -i # only if there's no ./configure yet + ./configure --enable-libcurl + make + +If your htslib lacks libcurl, taffy will print a clear error suggesting the +above when a URL input is given. diff --git a/taf_view.c b/taf_view.c index 8f4e9d4..39ddf40 100644 --- a/taf_view.c +++ b/taf_view.c @@ -6,6 +6,7 @@ #include "taf.h" #include "tai.h" +#include "remote_io.h" #include "sonLib.h" #include #include @@ -213,9 +214,19 @@ int taf_view_main(int argc, char *argv[]) { return 1; } - FILE *input = inputFile == NULL ? 
stdin : fopen(inputFile, "r"); - if (input == NULL) { - fprintf(stderr, "Unable to open input file: %s\n", inputFile); + /* For URL inputs (only meaningful with -r region queries) we'll skip the + * local fopen; LI is constructed via bgzf_open which goes through htslib's + * URL-aware backend. */ + bool input_is_url = inputFile != NULL && is_url(inputFile); + FILE *input = NULL; + if (!input_is_url) { + input = inputFile == NULL ? stdin : fopen(inputFile, "r"); + if (input == NULL) { + fprintf(stderr, "Unable to open input file: %s\n", inputFile); + return 1; + } + } else if (region == NULL) { + fprintf(stderr, "URL inputs are only supported with -r region queries\n"); return 1; } @@ -241,7 +252,10 @@ int taf_view_main(int argc, char *argv[]) { } LW *output = LW_construct(output_fh, use_compression); - LI *li = LI_construct(input); + LI *li = input_is_url ? LI_construct_from_path(inputFile) : LI_construct(input); + if (li == NULL) { + return 1; + } // sniff the format int input_format = check_input_format(LI_peek_at_next_line(li)); @@ -309,9 +323,9 @@ int taf_view_main(int argc, char *argv[]) { st_logInfo("Region: contig=%s start=%" PRIi64 " length=%" PRIi64 "\n", region_seq, region_start, region_length); - char *tai_fn = tai_path(inputFile); - FILE *tai_fh = fopen(tai_fn, "r"); - + char *tai_fn = input_is_url ? tai_path_for(inputFile) : tai_path(inputFile); + FILE *tai_fh = open_tai_for_reading(tai_fn); + if (tai_fh == NULL) { fprintf(stderr, "Index %s not found. 
Please run taffy index first\n", tai_fn); return 1; @@ -429,7 +443,7 @@ int taf_view_main(int argc, char *argv[]) { ////////////////////////////////////////////// LI_destruct(li); - if(inputFile != NULL) { + if (inputFile != NULL && !input_is_url) { fclose(input); } LW_destruct(output, outputFile != NULL); diff --git a/taffy/_taffy_build.py b/taffy/_taffy_build.py index b995a80..f8aac77 100644 --- a/taffy/_taffy_build.py +++ b/taffy/_taffy_build.py @@ -32,6 +32,12 @@ LI *LI_construct(FILE *fh); + /* + * Construct an LI directly from a local path or a URL (http://, https://, + * s3://, etc.) via htslib's URL-aware bgzf_open. Returns NULL on failure. + */ + LI *LI_construct_from_path(const char *path); + void LI_destruct(LI *li); char *LI_peek_at_next_line(LI *li); @@ -248,6 +254,15 @@ * Free a tai iterator */ void tai_iterator_destruct(TaiIt *tai_it); + + /* Returns true if the path looks like a URL (contains "://"). */ + bool is_url(const char *path); + + /* Open an input source for the .tai index, whether local file or URL. + * For a local path: behaves like fopen(path, "r"). + * For a URL: streams the entire response into a temp FILE* via htslib's + * hopen/hread. Returns NULL on failure. 
*/ + FILE *open_tai_for_reading(const char *path); """) # set_source() gives the name of the python extension module to @@ -260,9 +275,10 @@ #include #include "htslib/bgzf.h" #include "htslib/kstring.h" - #include "taf.h" - #include "line_iterator.h" + #include "taf.h" + #include "line_iterator.h" #include "tai.h" + #include "remote_io.h" """, include_dirs=["taffy/submodules/sonLib/externalTools/cutest", "taffy/submodules/sonLib/C/inc", @@ -287,6 +303,7 @@ "taffy/impl/ond.c", "taffy/impl/taf.c", "taffy/impl/tai.c", + "taffy/impl/remote_io.c", ], extra_compile_args=["-DUSE_HTSLIB"], libraries=["hts"], diff --git a/taffy/impl/line_iterator.c b/taffy/impl/line_iterator.c index f618e8a..a3b6fea 100644 --- a/taffy/impl/line_iterator.c +++ b/taffy/impl/line_iterator.c @@ -30,6 +30,42 @@ LI *LI_construct(FILE *fh) { return li; } +#ifdef USE_HTSLIB +LI *LI_construct_from_path(const char *path) { + BGZF *bgzf = bgzf_open(path, "r"); + if (bgzf == NULL) { + fprintf(stderr, "Unable to open input %s (htslib bgzf_open failed)\n", path); + if (strstr(path, "://") != NULL) { + fprintf(stderr, " URL inputs require htslib built with libcurl support.\n" + " If you built htslib yourself, rerun ./configure --enable-libcurl and rebuild.\n"); + } + return NULL; + } + LI *li = st_calloc(1, sizeof(LI)); + li->bgzf = bgzf; + if (bgzf_compression(li->bgzf) == 2) { + if (bgzf_index_build_init(li->bgzf) != 0) { + assert(false); + } + } + kstring_t ks = KS_INITIALIZE; + li->prev_pos = bgzf_tell(li->bgzf); + li->pos = li->prev_pos; + bgzf_getline(li->bgzf, '\n', &ks); + li->line = ks_release(&ks); + return li; +} +#else +LI *LI_construct_from_path(const char *path) { + FILE *fh = fopen(path, "r"); + if (fh == NULL) { + fprintf(stderr, "Unable to open input %s\n", path); + return NULL; + } + return LI_construct(fh); +} +#endif + void LI_destruct(LI *li) { #ifdef USE_HTSLIB bgzf_close(li->bgzf); diff --git a/taffy/impl/remote_io.c b/taffy/impl/remote_io.c new file mode 100644 index 
0000000..a39e6b4 --- /dev/null +++ b/taffy/impl/remote_io.c @@ -0,0 +1,90 @@ +#include "remote_io.h" +#include "sonLib.h" +#include "htslib/hfile.h" +#include +#include +#include + +bool is_url(const char *path) { + if (path == NULL) return false; + return strstr(path, "://") != NULL; +} + +char *tai_path_for(const char *input) { + assert(input != NULL); + size_t n = strlen(input); + char *ret = (char *) st_calloc(n + 5, sizeof(char)); + sprintf(ret, "%s.tai", input); + return ret; +} + +/* Slurp an entire URL into a malloc'd buffer. Returns the buffer (caller frees) + * via *out_buf and sets *out_len. Returns true on success, false on failure + * (with an error printed to stderr). */ +static bool slurp_url(const char *url, char **out_buf, size_t *out_len) { + hFILE *hf = hopen(url, "r"); + if (hf == NULL) { + fprintf(stderr, "Unable to open URL: %s (%s)\n", url, strerror(errno)); + fprintf(stderr, " URL inputs require htslib built with libcurl support.\n" + " If you built htslib yourself, rerun ./configure --enable-libcurl and rebuild.\n"); + return false; + } + size_t cap = 64 * 1024; + size_t len = 0; + char *buf = (char *) malloc(cap); + if (buf == NULL) { hclose_abruptly(hf); return false; } + for (;;) { + if (len == cap) { + cap *= 2; + char *nb = (char *) realloc(buf, cap); + if (nb == NULL) { free(buf); hclose_abruptly(hf); return false; } + buf = nb; + } + ssize_t n = hread(hf, buf + len, cap - len); + if (n < 0) { + fprintf(stderr, "Read error fetching URL: %s\n", url); + free(buf); + hclose_abruptly(hf); + return false; + } + if (n == 0) break; + len += (size_t) n; + } + if (hclose(hf) != 0) { + fprintf(stderr, "Close error after fetching URL: %s\n", url); + free(buf); + return false; + } + *out_buf = buf; + *out_len = len; + return true; +} + +FILE *open_tai_for_reading(const char *path) { + if (!is_url(path)) { + return fopen(path, "r"); + } + char *buf = NULL; + size_t len = 0; + if (!slurp_url(path, &buf, &len)) { + return NULL; + } + /* Spool to an 
anonymous tmpfile rather than fmemopen, because downstream + * LI_construct calls fileno()+bgzf_dopen which needs a real file descriptor. + * tmpfile() is auto-deleted when closed, so no cleanup required by the caller. */ + FILE *fh = tmpfile(); + if (fh == NULL) { + fprintf(stderr, "tmpfile failed for buffered .tai from %s\n", path); + free(buf); + return NULL; + } + if (fwrite(buf, 1, len, fh) != len) { + fprintf(stderr, "Short write spooling .tai to tmpfile from %s\n", path); + fclose(fh); + free(buf); + return NULL; + } + free(buf); + rewind(fh); + return fh; +} diff --git a/taffy/inc/line_iterator.h b/taffy/inc/line_iterator.h index 33e436d..245db5e 100644 --- a/taffy/inc/line_iterator.h +++ b/taffy/inc/line_iterator.h @@ -28,6 +28,14 @@ typedef struct _LI { LI *LI_construct(FILE *fh); +/* + * Construct an LI directly from a local path or a URL, going through htslib's + * bgzf_open (which routes URLs through hFILE+libcurl). Use this when reading + * remote files via HTTP/HTTPS/S3/GCS; otherwise LI_construct(FILE*) is fine. + * Returns NULL on failure (with an error printed to stderr). + */ +LI *LI_construct_from_path(const char *path); + void LI_destruct(LI *li); /* diff --git a/taffy/inc/remote_io.h b/taffy/inc/remote_io.h new file mode 100644 index 0000000..2e91e9d --- /dev/null +++ b/taffy/inc/remote_io.h @@ -0,0 +1,36 @@ +#ifndef TAF_REMOTE_IO_H_ +#define TAF_REMOTE_IO_H_ + +/* + * Helpers for reading TAF/MAF + .tai inputs from URLs (HTTP/HTTPS/S3/GCS/etc.) + * via htslib's hFILE backend. The data file is read on demand through htslib's + * BGZF/HTTP-Range layer (see line_iterator.c -- LI_construct_from_path goes + * through bgzf_open which handles both local paths and URLs). The .tai is + * small and is fetched in full into an in-memory buffer that's then exposed + * to the rest of the code as a FILE*. + */ + +#include +#include + +/* Heuristic URL detection: anything containing "://" is treated as a URL. + * Matches http, https, s3, gs, ftp, etc. 
-- whatever htslib's hopen supports. */ +bool is_url(const char *path); + +/* Build the .tai sibling path/URL for a given input. Caller frees the returned + * string. This is the same construction as tai_path() but with no assumptions + * about local-vs-remote: it just appends ".tai". */ +char *tai_path_for(const char *input); + +/* Open an input source for the .tai index, whether local file or URL. + * + * For a local path: behaves like fopen(path, "r"). + * For a URL: streams the entire response into memory via htslib's hopen/hread, + * spools it to an anonymous tmpfile() (a real file descriptor, not fmemopen) + * and returns that FILE* rewound to the start; it is auto-deleted when closed. + * + * Returns NULL on failure (URL fetch/spool errors are reported on stderr; a failed local fopen is not). + */ +FILE *open_tai_for_reading(const char *path); + +#endif diff --git a/taffy/lib.py b/taffy/lib.py index 40ba723..38c9be3 100644 --- a/taffy/lib.py +++ b/taffy/lib.py @@ -35,21 +35,38 @@ def _dictionary_to_c_tags(tags): return first_c_tag +def _is_url(file_string_or_handle): + """ True if the argument is a string that looks like a URL (e.g. http://, s3://). """ + return isinstance(file_string_or_handle, str) and "://" in file_string_or_handle + + def _check_file_exists(file_string_or_handle): - """ Used to check that expected file strings exist, creates FileNotFoundError if not""" - if isinstance(file_string_or_handle, str): # If is a file string + """ Used to check that expected file strings exist, creates FileNotFoundError if not. + URLs are skipped -- their reachability is checked at open time by htslib. """ + if isinstance(file_string_or_handle, str) and not _is_url(file_string_or_handle): p = Path(file_string_or_handle) if not p.is_file(): raise FileNotFoundError(f"The file {file_string_or_handle} doesn't exist") def _get_c_file_handle(file_string_or_handle, modifier_string="r"): - """ Gets the c file handle for file, which can be either a file string - or a file handle. If file handle you can set the modifier string. 
- Note the Python file handle is *way* slower + """ Gets the c file handle for file, which can be either a file string, + a URL string, or a file handle. If a file handle you can set the modifier string. + For URLs we go through open_tai_for_reading which slurps via htslib's hFILE + into a tmpfile -- this is the same path used by `taffy view -r` for the .tai. + Note the Python file handle is *way* slower than passing a path string. """ - return lib.fopen(_to_c_string(file_string_or_handle), _to_c_string(modifier_string)) if \ - isinstance(file_string_or_handle, str) else ffi.cast("FILE *", file_string_or_handle) + if isinstance(file_string_or_handle, str): + if _is_url(file_string_or_handle): + if modifier_string != "r": + raise ValueError("URL inputs are read-only") + fh = lib.open_tai_for_reading(_to_c_string(file_string_or_handle)) + if fh == ffi.NULL: + raise IOError(f"Unable to open URL {file_string_or_handle} (htslib hopen failed; " + f"is htslib built with libcurl?)") + return fh + return lib.fopen(_to_c_string(file_string_or_handle), _to_c_string(modifier_string)) + return ffi.cast("FILE *", file_string_or_handle) class Alignment: @@ -260,9 +277,18 @@ def __init__(self, file, taf_index=None, sequence_intervals=None, make_row_link self.make_row_links = make_row_links # Optionally store links between rows _check_file_exists(file) self.file = file - self.c_file_handle = _get_c_file_handle(file) self.file_string_not_handle = isinstance(file, str) # Will be true if the file is a string, not a file handle - self.c_li_handle = lib.LI_construct(self.c_file_handle) + # For URL inputs we route through LI_construct_from_path (htslib URL-aware bgzf_open) + # rather than fopen+LI_construct, since fopen can't open URLs. 
+ if _is_url(file): + self.c_file_handle = ffi.NULL + self.c_li_handle = lib.LI_construct_from_path(_to_c_string(file)) + if self.c_li_handle == ffi.NULL: + raise IOError(f"Unable to open URL {file} (htslib bgzf_open failed; " + f"is htslib built with libcurl?)") + else: + self.c_file_handle = _get_c_file_handle(file) + self.c_li_handle = lib.LI_construct(self.c_file_handle) i = lib.check_input_format(lib.LI_peek_at_next_line(self.c_li_handle)) if i not in (0, 1): raise RuntimeError("Input file is not a TAF or MAF file") @@ -380,7 +406,9 @@ def close(self): if self.taf_index: lib.tai_iterator_destruct(self._c_taf_index_it) lib.LI_destruct(self.c_li_handle) # Cleanup the allocated line iterator - if self.file_string_not_handle: # Close the underlying file handle + if self.file_string_not_handle and self.c_file_handle != ffi.NULL: + # URL-backed readers don't have a separate FILE* (LI_construct_from_path + # owns the BGZF directly, which LI_destruct above already cleaned up). lib.fclose(self.c_file_handle) def __enter__(self):