diff --git a/pyproject.toml b/pyproject.toml index 79603c1..195ff8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,8 +35,8 @@ where = ["src"] exclude = ["benchmarks", "build", "docs", "tests", "scripts"] [tool.setuptools.package-data] -"*" = ["py.typed", "*.pyi"] -"designer_dna.headers" = ["*.h", "*.pxd"] +"*" = ["py.typed", "*.pyi", "*.pxd"] +"designer_dna.headers" = ["*.h", "*.c", "*.hpp", "*.cpp"] [project.optional-dependencies] dev = ["designer_dna[doc,test,lint,type,commit]", "tox"] diff --git a/src/designer_dna/_oligos.pyi b/src/designer_dna/_oligos.pyi index 5013673..9d5f5e8 100644 --- a/src/designer_dna/_oligos.pyi +++ b/src/designer_dna/_oligos.pyi @@ -1,3 +1,32 @@ +# BSD 3-Clause License +# +# Copyright (c) 2025, Spill-Tea +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + # pylint: disable=W0613 """Cythonized oligonucleotide functions.""" @@ -70,10 +99,11 @@ def palindrome(sequence: str, dna: bool = ...) -> str: palindrome("ATAT") == "ATAT" palindrome("GATATG") == "ATAT" + palindrome("ANT") == "ANT" # Handles degenerate bases + palindrome("UGCA", False) == "UGCA" # Handles RNA sequences Notes: - * Uses a modified center expansion method (Manacher's algorithm) to identify the - longest substring that is palindromic. + * Algorithmic time complexity O(NlogN). * If a sequence contains two or more palindromic substrings of equal size, the first leftmost palindrome is prioritized. diff --git a/src/designer_dna/_oligos.pyx b/src/designer_dna/_oligos.pyx index 57d7c87..d222d2d 100644 --- a/src/designer_dna/_oligos.pyx +++ b/src/designer_dna/_oligos.pyx @@ -219,10 +219,11 @@ cpdef str palindrome(str sequence, bint dna = True): palindrome("ATAT") == "ATAT" palindrome("GATATG") == "ATAT" + palindrome("ANT") == "ANT" # Handles degenerate bases + palindrome("UGCA", False) == "UGCA" # Handles RNA sequences Notes: - * Uses a modified center expansion method (Manacher's algorithm) to identify the - longest substring that is palindromic. + * Algorithmic time complexity O(NlogN). * If a sequence contains two or more palindromic substrings of equal size, the first leftmost palindrome is prioritized. @@ -235,6 +236,7 @@ cpdef str palindrome(str sequence, bint dna = True): v_complement(com, dna) for i in range(seq.size - 1): + # Check even length palindromes first (more common for ATGC based sequences) left = i right = i + 1 _center(seq.ptr, com.ptr, &left, &right, seq.size) @@ -244,6 +246,19 @@ cpdef str palindrome(str sequence, bint dna = True): start = left end = right + # Only check odd length palindromes in case of (center) degenerate bases + if seq.ptr[i] != com.ptr[i]: + continue + + left = i - 1 + right = i + 1 + _center(seq.ptr, com.ptr, &left, &right, seq.size) + current = right - left + if current > length: + length = current + start = left + end = right + free(seq.ptr) free(com.ptr) diff --git a/src/designer_dna/headers/common.pxd b/src/designer_dna/headers/common.pxd index 9fb0ed9..6f4003c 100644 --- a/src/designer_dna/headers/common.pxd +++ b/src/designer_dna/headers/common.pxd @@ -33,7 +33,6 @@ from libc.string cimport memcpy from libc.stdlib cimport free, malloc cdef extern from "Python.h": - bint PyBytes_Check(object) Py_ssize_t PyUnicode_GET_LENGTH(object) bytes PyUnicode_AsUTF8String(object) Py_ssize_t PyBytes_GET_SIZE(object) @@ -42,11 +41,6 @@ cdef extern from "Python.h": bytes PyBytes_FromStringAndSize(char*, Py_ssize_t) -# ctypedef fused StrT: -# str -# bytes - - cdef struct StringView: char* ptr Py_ssize_t size @@ -54,6 +48,7 @@ cdef struct StringView: cdef inline StringView construct(bytes s, Py_ssize_t length, bint isbytes): + """Construct the StringView from a python bytes object.""" cdef: char* buffer = PyBytes_AS_STRING(s) StringView view @@ -97,20 +92,3 @@ cdef inline bytes to_bytes(StringView view): free(view.ptr) return obj - - -# TODO: Cannot coerce to a type that is not specialized -# cdef inline StringView handle_input(StrT received): -# """Primary interface to handle both string and bytes python objects.""" -# if PyBytes_Check(received): -# return bytes_to_view( received) - -# return str_to_view( received) - - -# cdef inline StrT convert_output(StringView view): -# """Primary interface to handle conversion output back to python objects.""" -# if view.origin: -# return to_bytes(view) - -# return to_str(view) diff --git a/src/designer_dna/oligos.py b/src/designer_dna/oligos.py index ee2e596..8cfb130 100644 --- a/src/designer_dna/oligos.py +++ b/src/designer_dna/oligos.py @@ -27,7 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -"""oligos.""" +"""Common utility functions to work with and analyze oligonucleotide sequences.""" from ._oligos import ( complement, @@ -161,12 +161,12 @@ def _center_expansion( left: int, right: int, length: int, -) -> str: +) -> tuple[int, int]: while left > -1 and right < length and s[left] == c[right] and s[right] == c[left]: left -= 1 right += 1 - return s[left + 1 : right] + return left + 1, right def palindrome_py(sequence: str, dna: bool = True) -> str: @@ -186,29 +186,42 @@ def palindrome_py(sequence: str, dna: bool = True) -> str: palindrome_py("GATATG") == "ATAT" Notes: - * Uses a modified center expansion method (Manacher's algorithm) to identify the - longest substring that is palindromic in O(n) time complexity. + * Algorithmic time complexity is O(N). * If a sequence contains two or more palindromic substrings of equal size, the first leftmost palindrome is prioritized. - * Ambiguous IUPAC nucleotide characters are not supported. Only ATGCU. """ seq_length: int = len(sequence) comp: str = complement_py(sequence, dna) - pal: str = "" + best_left: int = 0 + best_right: int = 0 + left: int = 0 + right: int = 0 length: int = 0 - if seq_length < 2: - return pal for i in range(seq_length - 1): - # Palindromic nucleotides are only even length, reducing search space in half - even: str = _center_expansion(sequence, comp, i, i + 1, seq_length) - current: int = len(even) + # If we only consider ATGC based sequences, then Palindromic nucleotides are + # only even length, reducing search space in half + left, right = _center_expansion(sequence, comp, i, i + 1, seq_length) + current: int = right - left if current > length: - pal = even length = current + best_left = left + best_right = right - return pal + # Handle Degenerate bases which equals it's complement (e.g. N, S, W) + # We can have odd length palindromes if a degenerate base is at the center + if sequence[i] != comp[i]: + continue + + left, right = _center_expansion(sequence, comp, i - 1, i + 1, seq_length) + current = right - left + if current > length: + length = current + best_left = left + best_right = right + + return sequence[best_left:best_right] def stretch_py(sequence: str) -> int: diff --git a/tests/unit/test_oligos.py b/tests/unit/test_oligos.py index bfe8651..a3a31f5 100644 --- a/tests/unit/test_oligos.py +++ b/tests/unit/test_oligos.py @@ -60,12 +60,9 @@ def test_reverse(seq: str, expected: str, function: Callable[[str], str]) -> Non ("", False, ""), ] -for k, v in oligos.BASEPAIRS_DNA.items(): - for j in (True, False): - if j: - complements.append((k, j, v)) - else: - complements.append((k, j, oligos.BASEPAIRS_RNA[k])) +for _k, _v in oligos.BASEPAIRS_DNA.items(): + for _j in (True, False): + complements.append((_k, _j, _v if _j else oligos.BASEPAIRS_RNA[_k])) @pytest.mark.parametrize( @@ -87,6 +84,16 @@ def test_reverse(seq: str, expected: str, function: Callable[[str], str]) -> Non ("AGTCNURYSWKMBVDH-.", False, "UCAGNAYRSWMKVBHD-."), ("agtcnuryswkmbvdh-.", True, "TCAGNAYRSWMKVBHD-."), ("agtcnuryswkmbvdh-.", False, "UCAGNAYRSWMKVBHD-."), + ( + "".join(oligos.BASEPAIRS_DNA.keys()), + True, + "".join(oligos.BASEPAIRS_DNA.values()), + ), + ( + "".join(oligos.BASEPAIRS_RNA.keys()), + False, + "".join(oligos.BASEPAIRS_RNA.values()), + ), ], ) def test_complement( @@ -136,6 +143,9 @@ def test_reverse_complement( [ ("", 0), ("A", 0), + ("AA", 1), + ("TAA", 1), + ("AAT", 1), ("ATGC", 0), ("AAAAACCCCCCGGGGGGG", 6), ], @@ -175,7 +185,13 @@ def test_nrepeats( assert result == expected, f"Unexpected stretch calculation: {result}" -@pytest.mark.parametrize("function", [oligos.palindrome, oligos.palindrome_py]) +@pytest.mark.parametrize( + "function", + [ + oligos.palindrome, + oligos.palindrome_py, + ], +) @pytest.mark.parametrize( ["seq", "dna", "expected"], [ @@ -195,6 +211,10 @@ def test_nrepeats( ("GAATTC", True, "GAATTC"), ("ATGAATTC", True, "GAATTC"), ("CTTAAG", True, "CTTAAG"), + ("ANT", True, "ANT"), + ("AANT", True, "ANT"), + ("AWSNSWT", True, "AWSNSWT"), + ("AWSSWT", True, "AWSSWT"), ], ) def test_palindromes(