rrwick · dalofa · Mar 31, 2026 · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026
diff --git a/badread/error_model.py b/badread/error_model.py
@@ -29,7 +29,7 @@
 
 
 def make_error_model(args, output=sys.stderr, dot_interval=1000):
-    refs, _, _ = load_fasta(args.reference)
+    refs, _, _,_,_ = load_fasta(args.reference)
     reads = load_fastq(args.reads, output=output)
     alignments = load_alignments(args.alignment, args.max_alignments, output=output)
     if len(alignments) == 0:

diff --git a/badread/misc.py b/badread/misc.py
@@ -122,7 +122,7 @@ def load_fastq(filename, output=sys.stderr, dot_interval=1000):
 
 def load_fasta(filename):
     fasta_seqs = collections.OrderedDict()
-    depths, circular = {}, {}
+    depths, circular, hairpin_left, hairpin_right = {}, {}, {}, {}
     p = re.compile(r'depth=([\d.]+)')
     with get_open_func(filename)(filename, 'rt') as fasta_file:
         name = ''
@@ -145,11 +145,13 @@ def load_fasta(filename):
                 else:
                     depths[short_name] = 1.0
                 circular[short_name] = 'circular=true' in name.lower()
+                hairpin_left[short_name] = 'hairpin_left=true' in name.lower()
+                hairpin_right[short_name] = 'hairpin_right=true' in name.lower()
             else:
                 sequence.append(line)
         if name:
             fasta_seqs[name.split()[0]] = ''.join(sequence).upper()
-    return fasta_seqs, depths, circular
+    return fasta_seqs, depths, circular, hairpin_left, hairpin_right
 
 
 RANDOM_SEQ_DICT = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}

diff --git a/badread/plot_window_identity.py b/badread/plot_window_identity.py
@@ -26,7 +26,7 @@
 
 def plot_window_identity(args, output=sys.stdout):
     reads = load_fastq(args.reads, output=output)
-    refs, _, _ = load_fasta(args.reference)
+    refs, _, _,_,_ = load_fasta(args.reference)
     alignments = load_alignments(args.alignment, output=output)
 
     for a in alignments:

diff --git a/badread/qscore_model.py b/badread/qscore_model.py
@@ -76,7 +76,7 @@ def get_qscores(seq, frag, qscore_model):
 
 
 def make_qscore_model(args, output=sys.stderr, dot_interval=1000):
-    refs, _, _ = load_fasta(args.reference)
+    refs, _, _,_,_ = load_fasta(args.reference)
     reads = load_fastq(args.reads, output=output)
     alignments = load_alignments(args.alignment, args.max_alignments, output=output)
     if len(alignments) == 0:

diff --git a/badread/simulate.py b/badread/simulate.py
@@ -34,7 +34,7 @@ def simulate(args, output=sys.stderr):
     if args.seed is not None:
         random.seed(args.seed)
         np.random.seed(args.seed)
-    ref_seqs, ref_depths, ref_circular = load_reference(args.reference, output)
+    ref_seqs, ref_depths, ref_circular, left_hairpin, right_hairpin = load_reference(args.reference, output)
     rev_comp_ref_seqs = {name: reverse_complement(seq) for name, seq in ref_seqs.items()}
     frag_lengths = FragmentLengths(args.mean_frag_length, args.frag_length_stdev, output)
     adjust_depths(ref_seqs, ref_depths, ref_circular, frag_lengths, args)
@@ -62,8 +62,8 @@ def simulate(args, output=sys.stderr):
     print_progress(count, total_size, target_size, output)
     while total_size < target_size:
         fragment, info = build_fragment(frag_lengths, ref_seqs, rev_comp_ref_seqs, ref_contigs,
-                                        ref_contig_weights, ref_circular, args, start_adapt_rate,
-                                        start_adapt_amount, end_adapt_rate, end_adapt_amount)
+                                        ref_contig_weights, ref_circular, left_hairpin, right_hairpin,
+                                        args, start_adapt_rate, start_adapt_amount, end_adapt_rate, end_adapt_amount)
         target_identity = identities.get_identity()
         seq, quals, actual_identity, identity_by_qscores = \
             sequence_fragment(fragment, target_identity, error_model, qscore_model)
@@ -89,12 +89,12 @@ def simulate(args, output=sys.stderr):
 
 
 def build_fragment(frag_lengths, ref_seqs, rev_comp_ref_seqs, ref_contigs, ref_contig_weights,
-                   ref_circular, args, start_adapt_rate, start_adapt_amount, end_adapt_rate,
+                   ref_circular, left_hairpin, right_hairpin, args, start_adapt_rate, start_adapt_amount, end_adapt_rate,
                    end_adapt_amount):
     fragment = [get_start_adapter(start_adapt_rate, start_adapt_amount, args.start_adapter_seq)]
     info = []
     frag_seq, frag_info = get_fragment(frag_lengths, ref_seqs, rev_comp_ref_seqs,
-                                       ref_contigs, ref_contig_weights, ref_circular, args)
+                                       ref_contigs, ref_contig_weights, ref_circular, left_hairpin, right_hairpin, args)
     fragment.append(frag_seq)
     info.append(','.join(frag_info))
 
@@ -105,7 +105,7 @@ def build_fragment(frag_lengths, ref_seqs, rev_comp_ref_seqs, ref_contigs, ref_c
         if random_chance(settings.CHIMERA_START_ADAPTER_CHANCE):
             fragment.append(args.start_adapter_seq)
         frag_seq, frag_info = get_fragment(frag_lengths, ref_seqs, rev_comp_ref_seqs,
-                                           ref_contigs, ref_contig_weights, ref_circular, args)
+                                           ref_contigs, ref_contig_weights, ref_circular, left_hairpin, right_hairpin, args)
         fragment.append(frag_seq)
         info.append(','.join(frag_info))
     fragment.append(get_end_adapter(end_adapt_rate, end_adapt_amount, args.end_adapter_seq))
@@ -146,7 +146,7 @@ def get_target_size(ref_size, quantity):
 
 
 def get_fragment(frag_lengths, ref_seqs, rev_comp_ref_seqs, ref_contigs, ref_contig_weights,
-                 ref_circular, args):
+                 ref_circular, left_hairpin, right_hairpin, args):
     fragment_length = frag_lengths.get_fragment_length()
     fragment_type = get_fragment_type(args)
     if fragment_type == 'junk':
@@ -158,7 +158,7 @@ def get_fragment(frag_lengths, ref_seqs, rev_comp_ref_seqs, ref_contigs, ref_con
     # repeatedly until we get a result.
     for _ in range(1000):
         seq, info = get_real_fragment(fragment_length, ref_seqs, rev_comp_ref_seqs, ref_contigs,
-                                      ref_contig_weights, ref_circular)
+                                      ref_contig_weights, ref_circular, left_hairpin, right_hairpin)
         if seq != '':
             return seq, info
     sys.exit('Error: failed to generate any sequence fragments - are your read lengths '
@@ -181,23 +181,31 @@ def get_fragment_type(args):
 
 
 def get_real_fragment(fragment_length, ref_seqs, rev_comp_ref_seqs, ref_contigs,
-                      ref_contig_weights, ref_circular):
+                      ref_contig_weights, ref_circular, left_hairpin, right_hairpin):
 
     if len(ref_contigs) == 1:
         contig = ref_contigs[0]
     else:
         contig = random.choices(ref_contigs, weights=ref_contig_weights)[0]
+
     info = [contig]
     if random_chance(0.5):
         seq = ref_seqs[contig]
+        rv_seq = rev_comp_ref_seqs[contig] # save to use for hairpin
         info.append('+strand')
+        strand = '+'
     else:
         seq = rev_comp_ref_seqs[contig]
+        rv_seq = ref_seqs[contig] # save to use for hairpin
         info.append('-strand')
+        strand = '-'
+
+    # is there a hairpin at the relevant end of the contig?
+    hairpin_at_end= (right_hairpin[contig] if strand == '+' else left_hairpin[contig])
 
     # If the reference contig is linear and the fragment length is long enough, then we just
     # return the entire fragment, start to end.
-    if fragment_length >= len(seq) and not ref_circular[contig]:
+    if fragment_length >= len(seq) and not ref_circular[contig] and not hairpin_at_end:
         info.append('0-' + str(len(seq)))
         return seq, info
 
@@ -209,25 +217,44 @@ def get_real_fragment(fragment_length, ref_seqs, rev_comp_ref_seqs, ref_contigs,
     start_pos = random.randint(0, len(seq)-1)
     end_pos = start_pos + fragment_length
 
-    # The ending position might be past the end of the sequence. If the sequence is linear, we
-    # fix this now.
-    if not ref_circular[contig] and end_pos > len(seq):
-        end_pos = len(seq)
-
-    info.append(f'{start_pos}-{end_pos}')
-
     # For circular contigs, we may have to loop the read around the contig.
     if ref_circular[contig]:
+        info.append(f'{start_pos}-{end_pos}')
         if end_pos <= len(seq):
             return seq[start_pos:end_pos], info
         else:
             looped_end_pos = end_pos - len(seq)
             assert looped_end_pos > 0
+
         return seq[start_pos:] + seq[:looped_end_pos], info
 
-    # For linear contigs, no looping is needed.
-    else:
-        return seq[start_pos:end_pos], info
+    # The ending position might be past the end of the sequence. If the sequence is linear, we
+    # fix this now.
+
+    if not ref_circular[contig] and end_pos > len(seq):
+        # If the read would extend past the end of a linear contigs with
+        # a hairpin at the end, we allow it to extend through the hairpin
+        # into the other strand (reverse complement)
+        if hairpin_at_end:
+            fwd_seq = seq[start_pos:]
+            left_over_bases = fragment_length - len(fwd_seq)
+
+            # do not allow multiple hairpin loops
+            # as I have observed this in real data so far
+            if left_over_bases > len(rv_seq): 
+                return '', ''  # read would extend past the end of the other strand, so we fail to get the read
+
+            if left_over_bases > 0:
+                hp_seq = rv_seq[:left_over_bases]
+            else:
+                hp_seq = ''
+            info.append(f'{start_pos}-{len(seq)} (hairpin) 0-{left_over_bases}')
+            return fwd_seq + hp_seq, info
+
+        # If there is no hairpin terminate at contig end.
+        end_pos = len(seq)
+    info.append(f'{start_pos}-{end_pos}')
+    return seq[start_pos:end_pos], info
 
 
 def get_junk_fragment(fragment_length):
@@ -478,7 +505,7 @@ def print_progress(count, bp, target, output):
 def load_reference(reference, output):
     print('', file=output)
     print(f'Loading reference from {reference}', file=output)
-    ref_seqs, ref_depths, ref_circular = load_fasta(reference)
+    ref_seqs, ref_depths, ref_circular, left_hairpin, right_hairpin = load_fasta(reference)
     plural = '' if len(ref_seqs) == 1 else 's'
     print(f'  {len(ref_seqs):,} contig{plural}:', file=output)
     for contig in ref_seqs:
@@ -488,7 +515,7 @@ def load_reference(reference, output):
     if len(ref_seqs) > 1:
         total_size = sum(len(s) for s in ref_seqs.values())
         print(f'  total size: {total_size:,} bp', file=output)
-    return ref_seqs, ref_depths, ref_circular
+    return ref_seqs, ref_depths, ref_circular, left_hairpin, right_hairpin
 
 
 def print_intro(output):