Skip to content

Commit b7b1edf

Browse files
authored
Merge pull request #7 from Spill-Tea/uchar
Uchar
2 parents 9c75cdf + 4825960 commit b7b1edf

7 files changed

Lines changed: 478 additions & 108 deletions

File tree

src/designer_dna/_oligonucleotides.pyi

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,9 @@ def manacher(sequence: str, dna: bool = ...) -> str:
3737
dna (bool): Sequence is DNA, else RNA.
3838
3939
Returns:
40-
(str): longest palindromic substring within sequence.
40+
(str) Longest palindromic substring within a sequence.
4141
4242
Notes:
4343
* This is a cython/c++ implementation of the O(n) Manacher's algorithm.
44-
* This algorithm is typically slower than the O(nlogn) palindrome function for
45-
strings up to 2^23 characters (not benchmarked beyond this limit).
46-
* This function here is primarily here for demonstration purposes.
4744
4845
"""

src/designer_dna/_oligonucleotides.pyx

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,22 +30,36 @@
3030
# distutils: language = c++
3131

3232
"""Oligonucleotide functions with the help of C++."""
33-
from narray cimport NumericArray
34-
from designer_dna._oligos cimport v_complement
35-
from common cimport StringView, str_to_view
36-
from libc.stdlib cimport free
3733

34+
from libc.stdlib cimport free
3835

3936
cdef extern from "Python.h":
4037
str PyUnicode_Join(str, str)
4138

39+
from common cimport StringView, str_to_view
40+
from narray cimport NumericArray
41+
42+
from designer_dna._oligos cimport v_complement
43+
4244

4345
cdef inline void _compute(
44-
char* s,
45-
char* c,
46+
unsigned char* s,
47+
unsigned char* c,
4648
NumericArray[int]* arr,
4749
ssize_t n,
4850
):
51+
"""Primary computation behind manacher's algorithm.
52+
53+
Args:
54+
s (uchar*): nucleotide sequence
55+
c (uchar*): complement of nucleotide sequence
56+
arr (NumericArray[int]*): an array of integers
57+
n (ssize_t): length of input sequence, s.
58+
59+
Returns:
60+
(void) relevant data saved in place to NumericArray
61+
62+
"""
4963
cdef:
5064
ssize_t mirror, a, b, i, stemp, center = 0, radius = 0
5165
int temp, zero = 0
@@ -92,13 +106,10 @@ cpdef str manacher(str sequence, bint dna = True):
92106
dna (bool): Sequence is DNA, else RNA.
93107
94108
Returns:
95-
(str): longest palindromic substring within sequence.
109+
(str) Longest palindromic substring within a sequence.
96110
97111
Notes:
98112
* This is a cython/c++ implementation of the O(n) Manacher's algorithm.
99-
* This algorithm is typically slower than the O(nlogn) palindrome function for
100-
strings up to 2^23 characters (not benchmarked beyond this limit).
101-
* This function here is primarily here for demonstration purposes.
102113
103114
"""
104115
cdef:
@@ -117,11 +128,12 @@ cpdef str manacher(str sequence, bint dna = True):
117128
free(ref.ptr)
118129
free(com.ptr)
119130

120-
# Enumerate, capturing index (center) and value of max (radius)
131+
# Enumerate, capturing index (center) at value of max (radius)
121132
for i in range(1, ref.size - 1):
122133
if arr[0][i] > radius:
123134
radius = arr[0][i]
124135
center = i
125136
del arr
126137

127-
return k[center - radius + 1: center + radius: 2]
138+
# By nature, a palindrome is symmetrical around center (+/- radius)
139+
return sequence[(center - radius + 1) // 2 - 1: (center + radius) // 2]

src/designer_dna/_oligos.pxd

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,15 @@
3030
from common cimport StringView
3131

3232
cdef:
33-
void c_reverse(char*, Py_ssize_t)
33+
void c_reverse(unsigned char*, Py_ssize_t)
3434
void v_reverse(StringView*)
3535

36-
void c_complement(char*, Py_ssize_t, unsigned char*)
36+
void c_complement(unsigned char*, Py_ssize_t, bint)
3737
void v_complement(StringView*, bint)
3838

39-
void c_reverse_complement(char*, Py_ssize_t, unsigned char*)
39+
void c_reverse_complement(unsigned char*, Py_ssize_t, bint)
4040
void v_reverse_complement(StringView*, bint)
41+
42+
(Py_ssize_t, Py_ssize_t) c_palindrome(unsigned char*, Py_ssize_t, bint)
43+
int c_stretch(unsigned char*, Py_ssize_t)
44+
int c_nrepeats(unsigned char*, int, int)

src/designer_dna/_oligos.pyi

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,19 @@
2727
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2828
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2929

30-
"""Cythonized oligonucleotide functions."""
30+
from array import array
31+
from typing import Any
32+
33+
def m_reverse(sequence: array[int]) -> Any:
34+
"""Reverse a nucleotide sequence.
35+
36+
Args:
37+
sequence (uchar[]): Nucleotide sequence writeable memory view.
38+
39+
Returns:
40+
(void) Reverse a sequence in place.
41+
42+
"""
3143

3244
def reverse(sequence: str) -> str:
3345
"""Reverse a nucleotide sequence.
@@ -46,6 +58,18 @@ def reverse(sequence: str) -> str:
4658
4759
"""
4860

61+
def m_complement(sequence: array[int], dna: bool = ...) -> Any:
62+
"""Complement a nucleotide sequence.
63+
64+
Args:
65+
sequence (uchar[]): Nucleotide sequence writeable memory view.
66+
dna (bool): Sequence is DNA, else RNA.
67+
68+
Returns:
69+
(void) Complement nucleotide sequence in place.
70+
71+
"""
72+
4973
def complement(sequence: str, dna: bool = ...) -> str:
5074
"""Complement a nucleotide sequence.
5175
@@ -64,6 +88,18 @@ def complement(sequence: str, dna: bool = ...) -> str:
6488
6589
"""
6690

91+
def m_reverse_complement(sequence: array[int], dna: bool = ...) -> Any:
92+
"""Reverse complement a nucleotide sequence.
93+
94+
Args:
95+
sequence (uchar[]): Nucleotide sequence writeable memory view.
96+
dna (bool): Sequence is DNA, else RNA.
97+
98+
Returns:
99+
(void) Reverse complement nucleotide sequence in place.
100+
101+
"""
102+
67103
def reverse_complement(sequence: str, dna: bool = ...) -> str:
68104
"""Reverse complement a nucleotide sequence.
69105
@@ -90,31 +126,40 @@ def palindrome(sequence: str, dna: bool = ...) -> str:
90126
dna (bool): Sequence is DNA, else RNA.
91127
92128
Returns:
93-
(str): longest palindromic subsequence within sequence.
129+
(str) Longest palindromic substring within sequence.
94130
95131
Examples:
96132
.. code-block:: python
97133
98134
palindrome("ATAT") == "ATAT"
99135
palindrome("GATATG") == "ATAT"
100136
palindrome("ANT") == "ANT" # Handles degenerate bases
101-
palindrome("UGCA", False) == "UGCA" # Handles RNA sequences
102137
103138
Notes:
104-
* Algorithmic time complexity O(NlogN).
105139
* If a sequence contains two or more palindromic substrings of equal size, the
106140
first leftmost palindrome is prioritized.
107141
108142
"""
109143

144+
def m_stretch(sequence: array[int]) -> int:
145+
"""Return the maximum length of a single letter (nucleotide) repeat in a string.
146+
147+
Args:
148+
sequence (uchar[]): Nucleotide sequence writeable memory view.
149+
150+
Returns:
151+
(int) Length of maximum run of a single letter.
152+
153+
"""
154+
110155
def stretch(sequence: str) -> int:
111156
"""Return the maximum length of a single letter (nucleotide) repeat in a string.
112157
113158
Args:
114159
sequence (str): Nucleotide sequence string.
115160
116161
Returns:
117-
(int): Length of maximum run of a single letter.
162+
(int) Length of maximum run of a single letter.
118163
119164
Examples:
120165
.. code-block:: python
@@ -124,6 +169,22 @@ def stretch(sequence: str) -> int:
124169
125170
"""
126171

172+
def m_nrepeats(sequence: array[int], n: int) -> int:
173+
"""Calculate the maximum observed repeats of composite pattern size n characters.
174+
175+
Args:
176+
sequence (uchar[]): Nucleotide sequence writeable memory view.
177+
n (int): Size of k-mers (composite pattern) to observe.
178+
179+
Returns:
180+
(int) The longest tandem run of nucleotides comprised of a composite pattern
181+
of length n characters.
182+
183+
Raises:
184+
ZeroDivisionError: if value of n is 0.
185+
186+
"""
187+
127188
def nrepeats(sequence: str, n: int) -> int:
128189
"""Calculate the maximum observed repeats of composite pattern size n characters.
129190
@@ -136,7 +197,7 @@ def nrepeats(sequence: str, n: int) -> int:
136197
of length n characters.
137198
138199
Raises:
139-
ValueError: if value of n is less than 1.
200+
ZeroDivisionError: if value of n is 0.
140201
141202
Examples:
142203
.. code-block:: python

0 commit comments

Comments
 (0)