From a14b674a9a0744912175ba292085159258fddf16 Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Wed, 25 Jun 2025 04:28:09 -0700 Subject: [PATCH 01/18] feat(oligos): Implement basic functions to work with oligonucleotides. --- src/designer_dna/__init__.py | 32 +++ src/designer_dna/_oligos.pyi | 120 +++++++++++ src/designer_dna/_oligos.pyx | 332 ++++++++++++++++++++++++++++++ src/designer_dna/headers/oligos.h | 76 +++++++ src/designer_dna/oligos.py | 286 +++++++++++++++++++++++++ tests/unit/test_oligos.py | 195 ++++++++++++++++++ 6 files changed, 1041 insertions(+) create mode 100644 src/designer_dna/__init__.py create mode 100644 src/designer_dna/_oligos.pyi create mode 100644 src/designer_dna/_oligos.pyx create mode 100644 src/designer_dna/headers/oligos.h create mode 100644 src/designer_dna/oligos.py create mode 100644 tests/unit/test_oligos.py diff --git a/src/designer_dna/__init__.py b/src/designer_dna/__init__.py new file mode 100644 index 0000000..9e43c03 --- /dev/null +++ b/src/designer_dna/__init__.py @@ -0,0 +1,32 @@ +# BSD 3-Clause License +# +# Copyright (c) 2025, Spill-Tea +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""DesignerDNA Project.""" + +__version__: str = "v0.0.1" diff --git a/src/designer_dna/_oligos.pyi b/src/designer_dna/_oligos.pyi new file mode 100644 index 0000000..fdf7448 --- /dev/null +++ b/src/designer_dna/_oligos.pyi @@ -0,0 +1,120 @@ +# pylint: disable=W0613 + +"""Cythonized oligonucleotide functions.""" + +def reverse(sequence: str) -> str: + """Reverse a nucleotide sequence. + + Args: + sequence (str): Nucleotide sequence string. + + Returns: + (str) Reverse a string. + + Examples: + .. code-block:: python + + reverse("ATATAT") == "TATATA" + reverse("AATATA") == "ATATAA" + + """ + +def complement(sequence: str, dna: bool = ...) -> str: + """Complement a nucleotide sequence. + + Args: + sequence (str): Nucleotide sequence string. + dna (bool): Sequence is DNA, else RNA. + + Returns: + (str) Complement of a nucleotide sequence string. + + Examples: + .. code-block:: python + + complement("ATGC", True) == "TACG" + complement("ATGC", False) == "UACG" + + """ + +def reverse_complement(sequence: str, dna: bool = ...) -> str: + """Reverse complement a nucleotide sequence. + + Args: + sequence (str): Nucleotide sequence string. + dna (bool): Sequence is DNA, else RNA. 
+ + Returns: + (str) Reverse complement of sequence string. + + Examples: + .. code-block:: python + + reverse_complement("ATGC", True) == "GCAT" + reverse_complement("ATGC", False) == "GCAU" + + """ + +def palindrome(sequence: str, dna: bool = ...) -> str: + """Find the longest palindromic substring within a nucleotide sequence. + + Args: + sequence (str): Nucleotide sequence string. + dna (bool): Sequence is DNA, else RNA. + + Returns: + (str): longest palindromic subsequence within sequence. + + Examples: + .. code-block:: python + + palindrome("ATAT") == "ATAT" + palindrome("GATATG") == "ATAT" + + Notes: + * Uses a modified center expansion method (O(n^2) worst case) to identify the + longest substring that is palindromic. + * If a sequence contains two or more palindromic substrings of equal size, the + first leftmost palindrome is prioritized. + + """ + +def stretch(sequence: str) -> int: + """Return the maximum length of a single letter (nucleotide) repeat in a string. + + Args: + sequence (str): Nucleotide sequence string. + + Returns: + (int): Length of maximum run of a single letter. + + Examples: + .. code-block:: python + + stretch("ATATAT") == 0 # True + stretch("AATATA") == 1 # True + + """ + +def nrepeats(sequence: str, n: int) -> int: + """Calculate the maximum observed repeats of composite pattern size n characters. + + Args: + sequence (str): Nucleotide sequence string. + n (int): stretch of k-mers to observe. + + Returns: + (int) The longest tandem run of nucleotides comprised of a composite pattern + of length n characters. + + Raises: + ValueError: if value of n is less than 1. + + Examples: + .. 
code-block:: python + + nrepeats("AAAA", 1) == 3 # True + nrepeats("AAAA", 2) == 1 # True + nrepeats("ACAACAACA", 3) == 2 # True + + """ diff --git a/src/designer_dna/_oligos.pyx b/src/designer_dna/_oligos.pyx new file mode 100644 index 0000000..480d88e --- /dev/null +++ b/src/designer_dna/_oligos.pyx @@ -0,0 +1,332 @@ +# cython: boundscheck=False, wraparound=False, nonecheck=False +"""Cythonized oligonucleotide functions.""" + +from libc.string cimport memcpy +from libc.stdlib cimport malloc, free + +cdef extern from "Python.h": + str PyUnicode_FromStringAndSize(char*, Py_ssize_t) + char* PyUnicode_UTF8(object) + Py_ssize_t PyUnicode_GET_LENGTH(object) + # bint PyBytes_Check(object) + # char* PyBytes_AS_STRING(object) + # Py_ssize_t PyBytes_GET_SIZE(object) + +cdef extern from "oligos.h": + const unsigned char DNA[0x100] + const unsigned char RNA[0x100] + +# ctypedef fused StrT: +# str +# bytes + + +cdef struct StringView: + char* ptr + Py_ssize_t size + + +cdef inline StringView to_view(str sequence): + """Construct StringView, using Cpython C-API to construct a c char string.""" + cdef: + Py_ssize_t length = PyUnicode_GET_LENGTH(sequence) + bytes temp = sequence.encode("utf8") + char* buffer = temp + StringView view + + view.ptr = malloc((length + 1) * sizeof(char)) + memcpy(view.ptr, buffer, length + 1) + view.ptr[length] = "\0" # c string terminator + view.size = length + + return view + + +cdef inline str to_str(StringView view): + """Convert StringView back into a python string object, safely releasing memory.""" + cdef: + str obj = PyUnicode_FromStringAndSize(view.ptr, view.size) + free(view.ptr) + + return obj + + +cdef void c_reverse(char* seq, Py_ssize_t length) noexcept: + """Reverse a C string in place. + + Args: + seq (char*): buffer sequence. + length (Py_ssize_t): length of seq. 
+ + """ + cdef Py_ssize_t start, end, x = length // 2 + + for start in range(x): + end = length - start - 1 + seq[start], seq[end] = seq[end], seq[start] + + +cpdef str reverse(str sequence): + """Reverse a nucleotide sequence. + + Args: + sequence (str): Nucleotide sequence string. + + Returns: + (str) Reverse a string. + + Examples: + .. code-block:: python + + reverse("ATATAT") == "TATATA" + reverse("AATATA") == "ATATAA" + + """ + return sequence[::-1] + + +cdef void c_complement(char* sequence, Py_ssize_t length, unsigned char[] table): + """Complement sequence C string in place. + + Args: + seq (char*): buffer sequence. + length (Py_ssize_t): length of seq. + table (char[]): translation table. + + """ + cdef: + Py_ssize_t j, end, idx = length // 2 + + for j in range(idx): + end = (length - 1) - j + sequence[j] = table[ sequence[j]] + sequence[end] = table[ sequence[end]] + + if length % 2: + sequence[idx] = table[ sequence[idx]] + + +cpdef str complement(str sequence, bint dna = True): + """Complement a nucleotide sequence. + + Args: + sequence (str): Nucleotide sequence string. + dna (bool): Sequence is DNA, else RNA. + + Returns: + (str) Complement of a nucleotide sequence string. + + Examples: + .. code-block:: python + + complement("ATGC", True) == "TACG" + complement("ATGC", False) == "UACG" + + """ + cdef StringView view = to_view(sequence) + + if dna: + c_complement(view.ptr, view.size, DNA) + else: + c_complement(view.ptr, view.size, RNA) + + return to_str(view) + + +cdef void c_reverse_complement( + char* sequence, + Py_ssize_t length, + unsigned char[] table +): + """Reverse complement sequence C string in place. + + Args: + sequence (char*): buffer pointer to nucleotide char sequence. + length (Py_ssize_t): length of seq. + table (char[]): translation table. 
+ + """ + cdef: + char* end_ptr = sequence + (length - 1) + + while end_ptr > sequence: + sequence[0], end_ptr[0] = ( + table[ end_ptr[0]], + table[ sequence[0]] + ) + end_ptr -= 1 + sequence += 1 + + if length % 2: + sequence[0] = table[ sequence[0]] + + +cpdef str reverse_complement(str sequence, bint dna = True): + """Reverse complement a nucleotide sequence. + + Args: + sequence (str): Nucleotide sequence string. + dna (bool): Sequence is DNA, else RNA. + + Returns: + (str) Reverse complement of sequence string. + + Examples: + .. code-block:: python + + reverse_complement("ATGC", True) == "GCAT" + reverse_complement("ATGC", False) == "GCAU" + + """ + cdef StringView view = to_view(sequence) + + if dna: + c_reverse_complement(view.ptr, view.size, DNA) + else: + c_reverse_complement(view.ptr, view.size, RNA) + + return to_str(view) + + +cdef bytes _expand_from_center( + bytes seq, + bytes comp, + Py_ssize_t left, + Py_ssize_t right, + Py_ssize_t length, +): + while ( + left > -1 + and right < length + and seq[left] == comp[right] + and seq[right] == comp[left] # required to detect dna to rna based complements + ): + left -= 1 + right += 1 + + return seq[left + 1 : right] + + +cpdef str palindrome(str sequence, bint dna = True): + """Find the longest palindromic substring within a nucleotide sequence. + + Args: + sequence (str): Nucleotide sequence string. + dna (bool): Sequence is DNA, else RNA. + + Returns: + (str): longest palindromic subsequence within sequence. + + Examples: + .. code-block:: python + + palindrome("ATAT") == "ATAT" + palindrome("GATATG") == "ATAT" + + Notes: + * Uses a modified center expansion method (O(n^2) worst case) to identify the + longest substring that is palindromic. + * If a sequence contains two or more palindromic substrings of equal size, the + first leftmost palindrome is prioritized. 
+ + """ + cdef: + bytes temp = sequence.encode("utf8") + bytes comp = complement(sequence, dna).encode("utf8") + bytes even, pal = b"" + Py_ssize_t i, current, seq_length = len(sequence), length = 0 + + if seq_length < 2: # noqa: PLR2004 + return "" + + for i in range(seq_length - 1): + # NOTE: Palindromic nucleotides are only even length, halving search space + even = _expand_from_center(temp, comp, i, i + 1, seq_length) + current = len(even) + if current > length: + pal = even + length = current + + return pal.decode("utf8") + + +cpdef int stretch(str sequence): + """Return the maximum length of a single letter (nucleotide) repeat in a string. + + Args: + sequence (str): Nucleotide sequence string. + + Returns: + (int): Length of maximum run of a single letter. + + Examples: + .. code-block:: python + + stretch("ATATAT") == 0 # True + stretch("AATATA") == 1 # True + + """ + cdef: + bytes temp = sequence.encode("utf8") + char* buffer = temp + int longest = 0, current = 0 + char c, prev = buffer[0] + + for c in buffer[1:]: + if c == prev: + current += 1 + if current > longest: + longest = current + else: + current = 0 + prev = c + + return longest + + +cpdef int nrepeats(str sequence, int n): + """Calculate the maximum observed repeats of composite pattern size n characters. + + Args: + sequence (str): Nucleotide sequence string. + n (int): stretch of k-mers to observe. + + Returns: + (int) The longest tandem run of nucleotides comprised of a composite pattern + of length n characters. + + Raises: + ValueError: if value of n is less than 1. + + Examples: + .. 
code-block:: python + + nrepeats("AAAA", 1) == 3 # True + nrepeats("AAAA", 2) == 1 # True + nrepeats("ACAACAACA", 3) == 2 # True + + """ + if n < 1: + raise ValueError("n must be greater than 0.") + if n == 1: + return stretch(sequence) + + cdef: + bytes phase, temp = sequence.encode("utf8") + char* buffer = temp + int i, j, k, max_val = 0 + list[char] previous = [buffer[i : n + i] for i in range(n)] + list[int] current = [0 for _ in range(n)] + + for j in range(n, len(sequence), n): + for k in range(n): + phase = buffer[j + k : j + k + n] + if phase == previous[k]: + current[k] += 1 + if current[k] > max_val: + max_val = current[k] + else: + current[k] = 0 + previous[k] = phase + + return max_val diff --git a/src/designer_dna/headers/oligos.h b/src/designer_dna/headers/oligos.h new file mode 100644 index 0000000..2350d64 --- /dev/null +++ b/src/designer_dna/headers/oligos.h @@ -0,0 +1,76 @@ + + +#ifndef __LDPY_LDHELPERS_H +#define __LDPY_LDHELPERS_H + +const unsigned char DNA[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 84, 86, 71, 72, 69, 70, 67, + 68, 73, 74, 77, 76, 75, 78, 79, + 80, 81, 89, 83, 65, 65, 66, 87, + 88, 82, 90, 91, 92, 93, 94, 95, + 96, 84, 86, 71, 72, 101, 102, 67, + 68, 105, 106, 77, 108, 75, 78, 111, + 112, 113, 89, 83, 65, 65, 66, 87, + 120, 82, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, + 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 
213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, +}; + +const unsigned char RNA[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 85, 86, 71, 72, 69, 70, 67, + 68, 73, 74, 77, 76, 75, 78, 79, + 80, 81, 89, 83, 65, 65, 66, 87, + 88, 82, 90, 91, 92, 93, 94, 95, + 96, 85, 86, 71, 72, 101, 102, 67, + 68, 105, 106, 77, 108, 75, 78, 111, + 112, 113, 89, 83, 65, 65, 66, 87, + 120, 82, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, + 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, +}; + +#endif diff --git a/src/designer_dna/oligos.py b/src/designer_dna/oligos.py new file mode 100644 index 0000000..ee2e596 --- /dev/null +++ b/src/designer_dna/oligos.py @@ -0,0 +1,286 @@ +# BSD 3-Clause License +# +# Copyright (c) 2025, Spill-Tea +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +"""oligos.""" + +from ._oligos import ( + complement, + nrepeats, + palindrome, + reverse, + reverse_complement, + stretch, +) + + +__all__ = [ + "complement", + "complement_py", + "nrepeats", + "nrepeats_py", + "palindrome", + "palindrome_py", + "reverse", + "reverse_complement", + "reverse_complement_py", + "reverse_py", + "stretch", + "stretch_py", +] + +BASEPAIRS_DNA: dict[str, str] = dict( + zip( + "AGTCNURYSWKMBVDH-.", + "TCAGNAYRSWMKVBHD-.", + strict=True, + ) +) +BASEPAIRS_DNA.update({k.lower(): v for k, v in BASEPAIRS_DNA.items()}) +BASEPAIRS_RNA: dict[str, str] = BASEPAIRS_DNA.copy() +BASEPAIRS_RNA.update({"A": "U", "a": "U"}) +DEFAULT_ENCODING: str = "utf_8" + + +def _make_translation( + mapping: dict[str, str], + encoding: str = DEFAULT_ENCODING, +) -> bytes: + """Construct a string translation table from a dictionary mapping. + + Args: + mapping (dict): a dictionary of complements. + encoding (str): Valid and supported encoding schema. + + Returns: + (bytes): translation table of the provided mapping, prepared as specified by the + encoding. + + """ + keys: bytes = "".join(mapping.keys()).encode(encoding) + values: bytes = "".join(mapping.values()).encode(encoding) + + return bytes.maketrans(keys, values) + + +DNA: bytes = _make_translation(BASEPAIRS_DNA, DEFAULT_ENCODING) +RNA: bytes = _make_translation(BASEPAIRS_RNA, DEFAULT_ENCODING) + + +def reverse_py(sequence: str) -> str: + """Reverse a nucleotide sequence. + + Args: + sequence (str): Nucleotide sequence string. + + Returns: + (str) Reverse a string. + + Examples: + .. code-block:: python + + reverse_py("ATATAT") == "TATATA" + reverse_py("AATATA") == "ATATAA" + + """ + return sequence[::-1] + + +def complement_py(sequence: str, dna: bool = True) -> str: + """Return the complement of a nucleotide sequence. + + Args: + sequence (str): Nucleotide sequence string. + dna (bool): If true, treat sequence as DNA, otherwise treat as RNA + + Returns: + (str): Complement of input sequence. 
+ + Examples: + .. code-block:: python + + complement_py("ATGC", True) == "TACG" + complement_py("ATGC", False) == "UACG" + + """ + return ( + sequence.encode(DEFAULT_ENCODING) + .translate(DNA if dna else RNA) + .decode(DEFAULT_ENCODING) + ) + + +def reverse_complement_py(sequence: str, dna: bool = True) -> str: + """Reverse complement a nucleotide sequence. + + Args: + sequence (str): Nucleotide sequence string. + dna (bool): sequence is dna, else rna. + + Returns: + (str) Reverse complement of sequence string. + + Examples: + .. code-block:: python + + reverse_complement_py("ATGC", True) == "GCAT" + reverse_complement_py("ATGC", False) == "GCAU" + + """ + return complement_py(reverse_py(sequence), dna) + + +def _center_expansion( + s: str, + c: str, + left: int, + right: int, + length: int, +) -> str: + while left > -1 and right < length and s[left] == c[right] and s[right] == c[left]: + left -= 1 + right += 1 + + return s[left + 1 : right] + + +def palindrome_py(sequence: str, dna: bool = True) -> str: + """Find the longest substring palindrome within a nucleotide sequence. + + Args: + sequence (str): Nucleotide sequence string. + dna (bool): If true, treat sequence as DNA, otherwise treat as RNA + + Returns: + (str): longest palindromic subsequence within sequence. + + Examples: + .. code-block:: python + + palindrome_py("ATAT") == "ATAT" + palindrome_py("GATATG") == "ATAT" + + Notes: + * Uses a modified center expansion method (O(n^2) worst case) to identify the + longest substring that is palindromic. + * If a sequence contains two or more palindromic substrings of equal size, the + first leftmost palindrome is prioritized. + * Ambiguous IUPAC nucleotide characters are not supported. Only ATGCU. 
+ + """ + seq_length: int = len(sequence) + comp: str = complement_py(sequence, dna) + pal: str = "" + length: int = 0 + if seq_length < 2: + return pal + + for i in range(seq_length - 1): + # Palindromic nucleotides are only even length, reducing search space in half + even: str = _center_expansion(sequence, comp, i, i + 1, seq_length) + current: int = len(even) + if current > length: + pal = even + length = current + + return pal + + +def stretch_py(sequence: str) -> int: + """Calculate the maximum stretch of a single character in a string. + + Args: + sequence (str): Nucleotide sequence string. + + Returns: + (int): maximum length observed within sequence of a repeated character. + + Examples: + .. code-block:: python + + stretch_py("AAAA") == 3 + stretch_py("AATT") == 1 + + """ + if not sequence: + return 0 + + longest: int = 0 + current: int = 0 + last: str = sequence[0] + char: str + + for char in sequence[1:]: + if char == last: + current += 1 + if current > longest: + longest = current + else: # reset + current = 0 + last = char + + return longest + + +def nrepeats_py(sequence: str, n: int) -> int: + """Calculate the longest substring of n repeating characters. + + Args: + sequence (str): Nucleotide string or Series of string + n (int): stretch of k-mer to observe + + Returns: + (int) The longest run of repeating n-length characters. + + Raises: + ValueError: when n < 1 + + Examples: + .. 
code-block:: python + + nrepeats_py("AAAA", 1) == 3 # True + nrepeats_py("AAAA", 2) == 1 # True + nrepeats_py("ACAACAACA", 3) == 2 # True + + """ + previous: list[str] = [sequence[i : n + i] for i in range(n)] + current: list[int] = [0] * n + max_val: int = 0 + + for j in range(n, len(sequence), n): + for k in range(n): + phase: str = sequence[j + k : j + k + n] + if phase == previous[k]: + current[k] += 1 + if current[k] > max_val: + max_val = current[k] + else: + current[k] = 0 + previous[k] = phase + + return max_val diff --git a/tests/unit/test_oligos.py b/tests/unit/test_oligos.py new file mode 100644 index 0000000..4e6f99c --- /dev/null +++ b/tests/unit/test_oligos.py @@ -0,0 +1,195 @@ +# BSD 3-Clause License +# +# Copyright (c) 2025, Spill-Tea +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""Unit test oligos module.""" + +from typing import Callable + +import pytest + +from designer_dna import oligos + + +@pytest.mark.parametrize("function", [oligos.reverse, oligos.reverse_py]) +@pytest.mark.parametrize( + ["seq", "expected"], + [ + ("", ""), + ("A", "A"), + ("AT", "TA"), + ("TAT", "TAT"), + ("GGC", "CGG"), + ("GATC", "CTAG"), + ], +) +def test_reverse(seq: str, expected: str, function: Callable[[str], str]) -> None: + """Test reversing a nucleotide sequence.""" + result: str = function(seq) + assert result == expected, "Unexpected reverse seq." 
+ + +# 0, and 1 length complements --> important for cython woes +complements = [ + ("", True, ""), + ("", False, ""), +] + +for k, v in oligos.BASEPAIRS_DNA.items(): + for j in (True, False): + if j: + complements.append((k, j, v)) + else: + complements.append((k, j, oligos.BASEPAIRS_RNA[k])) + + +@pytest.mark.parametrize( + "function", + [oligos.complement, oligos.complement_py], +) +@pytest.mark.parametrize( + ["seq", "dna", "expected"], + [ + *complements, + ("AA", True, "TT"), + ("AA", False, "UU"), + ("AT", True, "TA"), + ("TAT", True, "ATA"), + ("GGC", True, "CCG"), + ("GATC", True, "CTAG"), + ("GATC", False, "CUAG"), + ("AGTCNURYSWKMBVDH-.", True, "TCAGNAYRSWMKVBHD-."), + ("AGTCNURYSWKMBVDH-.", False, "UCAGNAYRSWMKVBHD-."), + ("agtcnuryswkmbvdh-.", True, "TCAGNAYRSWMKVBHD-."), + ("agtcnuryswkmbvdh-.", False, "UCAGNAYRSWMKVBHD-."), + ], +) +def test_complement( + seq: str, + dna: bool, + expected: str, + function: Callable[[str, bool], str], +) -> None: + """Test complement of a nucleotide sequence.""" + result: str = function(seq, dna) + assert result == expected, f"Unexpected complement seq: {result}" + + +@pytest.mark.parametrize( + "function", [oligos.reverse_complement, oligos.reverse_complement_py] +) +@pytest.mark.parametrize( + ["seq", "dna", "expected"], + [ + *complements, + ("AT", True, "AT"), + ("AT", False, "AU"), + ("TAT", True, "ATA"), + ("GGC", True, "GCC"), + ("GATC", True, "GATC"), + ("GATC", False, "GAUC"), + ("AGTCNURYSWKMBVDH-.", True, ".-DHBVKMWSRYANGACT"), + ("AGTCNURYSWKMBVDH-.", False, ".-DHBVKMWSRYANGACU"), + ("agtcnuryswkmbvdh-.", True, ".-DHBVKMWSRYANGACT"), + ("agtcnuryswkmbvdh-.", False, ".-DHBVKMWSRYANGACU"), + ], +) +def test_reverse_complement( + seq: str, + dna: bool, + expected: str, + function: Callable[[str, bool], str], +) -> None: + """Test reverse complement of a nucleotide sequence.""" + result: str = function(seq, dna) + assert result == expected, "Unexpected reverse complement." 
+ + +@pytest.mark.parametrize("function", [oligos.stretch, oligos.stretch_py]) +@pytest.mark.parametrize( + ["seq", "expected"], + [ + ("", 0), + ("A", 0), + ("ATGC", 0), + ("AAAAACCCCCCGGGGGGG", 6), + ], +) +def test_stretch(seq, expected: int, function: Callable[[str], int]) -> None: + """Test calculation of longest observed run of a single nucleotide within a seq.""" + result: int = function(seq) + assert result == expected, f"Unexpected stretch calculation: {result}" + + +@pytest.mark.parametrize("function", [oligos.nrepeats, oligos.nrepeats_py]) +@pytest.mark.parametrize( + ["seq", "n", "expected"], + [ + ("", 1, 0), + ("A", 1, 0), + ("ATGC", 1, 0), + ("AAAAACCCCCCGGGGGGG", 1, 6), + ("ACACAC", 2, 2), + ], +) +def test_nrepeats( + seq, + n: int, + expected: int, + function: Callable[[str, int], int], +) -> None: + """Test calculation of longest observed run of n characters within a seq.""" + result: int = function(seq, n) + assert result == expected, f"Unexpected stretch calculation: {result}" + + +@pytest.mark.parametrize("function", [oligos.palindrome, oligos.palindrome_py]) +@pytest.mark.parametrize( + ["seq", "dna", "expected"], + [ + ("", True, ""), + ("", False, ""), + ("A", True, ""), + ("A", False, ""), + ("AAAA", True, ""), + ("AAAA", False, ""), + ("ATATATATATAT", True, "ATATATATATAT"), + ("ATATATATATAT", False, ""), + ], +) +def test_palindromes( + seq: str, + dna: bool, + expected: str, + function: Callable[[str, int], int], +) -> None: + """Test detection of longest palindrome within a sequence.""" + result = function(seq, dna) + assert result == expected, f"Unexpected palindrome: {result}" + if result: + assert result == oligos.reverse_complement(result) From 0977f360105e6910862efd120427eb3dec62c352 Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Wed, 25 Jun 2025 04:30:23 -0700 Subject: [PATCH 02/18] feat(packaging): Correct packaging to implement cython dependencies. 
--- .gitignore | 1 + pyproject.toml | 44 ++++++++++++-------- setup.py | 83 ++++++++++++++++++++++++++++++++++++++ tests/unit/test_version.py | 42 ------------------- 4 files changed, 111 insertions(+), 59 deletions(-) create mode 100644 setup.py delete mode 100644 tests/unit/test_version.py diff --git a/.gitignore b/.gitignore index 103fb24..2d3d5ac 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,4 @@ # recursively re-ignore __pycache__ *.egg-info/ +docs/build diff --git a/pyproject.toml b/pyproject.toml index 1811b0f..2319a6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,39 +1,46 @@ [build-system] -requires = ["setuptools>=67.6.1"] +requires = ["setuptools>=67.6.1", "cython>=3.0.0"] build-backend = "setuptools.build_meta" [project] -name = "PyTemplate" +name = "designer_dna" authors = [{ name = "Jason C Del Rio", email = "spillthetea917@gmail.com" }] maintainers = [{ name = "Jason C Del Rio", email = "spillthetea917@gmail.com" }] description = "Project description here." 
license = { file = "LICENSE" } -requires-python = ">=3.7" -keywords = ["keyword1", "keyword2"] -classifiers = ["Programming Language :: Python :: 3"] +requires-python = ">=3.11" +keywords = ["DNA", "design", "ligation", "optimization"] +classifiers = [ + "Programming Language :: Python :: 3", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Bio-Informatics" +] dynamic = ["version", "readme", "dependencies"] [project.urls] -homepage = "https://github.com/Spill-Tea/PyTemplate" -issues = "https://github.com/Spill-Tea/PyTemplate/issues" +homepage = "https://github.com/Spill-Tea/DesignerDNA" +issues = "https://github.com/Spill-Tea/DesignerDNA/issues" [tool.setuptools.dynamic] -version = { attr = "PyTemplate.__version__" } +version = { attr = "designer_dna.__version__" } readme = { file = ["README.md"], content-type = "text/markdown" } dependencies = { file = ["requirements.txt"] } [tool.setuptools] -package-dir = { "" = "src" } +package-dir = { "" = "src"} [tool.setuptools.packages.find] where = ["src"] -exclude = ["benchmarks", "docs", "tests"] +exclude = ["benchmarks", "build", "docs", "tests", "scripts"] [tool.setuptools.package-data] "*" = ["py.typed", "*.pyi"] +"designer_dna.headers" = ["*.h"] [project.optional-dependencies] -dev = ["PyTemplate[doc,test,lint,type]", "tox", "pre-commit"] +dev = ["designer_dna[doc,test,lint,type,commit]", "tox"] +commit = ["pre-commit"] doc = ["sphinx", "sphinx-rtd-theme"] test = ["pytest", "coverage", "pytest-xdist"] lint = ["pylint", "ruff"] @@ -46,7 +53,7 @@ addopts = "-n auto -rA" [tool.coverage.run] parallel = true branch = true -source = ["PyTemplate"] +source = ["designer_dna"] disable_warnings = ["no-data-collected", "module-not-imported"] [tool.coverage.paths] @@ -61,14 +68,14 @@ skip_empty = true exclude_also = ["def __repr__", 'if __name__ == "__main__"'] [tool.mypy] -mypy_path = "PyTemplate" +mypy_path = "designer_dna" warn_unused_ignores = true 
allow_redefinition = false force_uppercase_builtins = true [tool.pylint.main] # extension-pkg-whitelist = [] -ignore = ["tests", "dist", "build"] +ignore = ["dist", "build"] fail-under = 9.0 jobs = 0 limit-inference-results = 100 @@ -86,8 +93,10 @@ module-naming-style = "any" [tool.pylint.format] max-line-length = 88 -# [tool.pylint."messages control"] -# disable = [] +[tool.pylint."messages control"] +disable = [ + "R1731" # consider-using-max-builtin +] [tool.ruff] line-length = 88 @@ -118,6 +127,7 @@ ignore = [ "D213", # multi-line-summary-second-line (D213) "PLR0913", # too-many-arguments (PLR0913) "C408", # unnecessary-collection-call (C408) + "PYI021", # docstring-in-stub (PYI021) ] [tool.ruff.lint.pydocstyle] @@ -128,7 +138,7 @@ lines-after-imports = 2 [tool.ruff.lint.per-file-ignores] "__init__.py" = [ - "E402", # Import Statement not at Top of File + "E402", # Import Statement not at top of file "F401", # Unused Imports ] "tests/*.py" = [ diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..9149af7 --- /dev/null +++ b/setup.py @@ -0,0 +1,83 @@ +# BSD 3-Clause License + +# Copyright (c) 2025, Spill-Tea + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""setup script to assist with compilation of extensions. + +References: + * https://cython.readthedocs.io/en/latest/src/userguide/source_files_and_compilation.html#compiler-directives + * https://github.com/cython/cython/issues/2995 + * https://stackoverflow.com/a/58116368/16771898 + * https://medium.com/@xpl/protecting-python-sources-using-cython-dcd940bb188e + * https://cython.readthedocs.io/en/latest/src/tutorial/parallelization.html#compilation + +""" + +import sys # noqa: I001 + +from setuptools import Extension, setup + +# NOTE: Import cython only after setuptools +from Cython.Compiler import Options +from Cython.Distutils import build_ext + + +# Primitive determination if package is being installed in editable mode (via pip) +if any(map(lambda x: x in sys.argv, ("editable_wheel", "-e", "--editable"))): + Options.annotate = True + +MAJOR_VERSION: str = str(sys.version_info[0]) +extensions: list[Extension] = [] +openmp: str = "/openmp" if sys.platform.startswith("win") else "-fopenmp" + + +# Oligonucleotides +extensions.append( + Extension( + "designer_dna._oligos", + ["src/designer_dna/_oligos.pyx"], + include_dirs=["src/designer_dna/headers"], + # extra_compile_args=[openmp], + # 
extra_link_args=[openmp], + ) +) + + +# Add cython directive to specify python version target +directives: dict = {"language_level": MAJOR_VERSION} +for ext in extensions: + if hasattr(ext, "cython_directives") and isinstance(ext.cython_directives, dict): + ext.cython_directives.update(directives) + else: + ext.cython_directives = directives + +# NOTE: Project metadata is captured from pyproject.toml +setup( + ext_modules=extensions, + cmdclass={"build_ext": build_ext}, +) diff --git a/tests/unit/test_version.py b/tests/unit/test_version.py deleted file mode 100644 index 7c7bf28..0000000 --- a/tests/unit/test_version.py +++ /dev/null @@ -1,42 +0,0 @@ -# BSD 3-Clause License -# -# Copyright (c) 2025, Spill-Tea -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -"""Example unit tests.""" - -from PyTemplate import __version__ - - -def test_version_type(): - """Test defined version is a string.""" - assert isinstance(__version__, str), "Expected string format version" - - -def test_version_value(): - """Test version string starts with the letter v.""" - assert __version__.lower().startswith("v"), "Expected version to begin with `v`" From 90cb749255ae7c2ef55e7d66fbddcfe50c6610fc Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Wed, 25 Jun 2025 04:30:46 -0700 Subject: [PATCH 03/18] feat(docs): Update readme. --- README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 55ef5c3..590e8b6 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,14 @@ -# PyTemplate +# DesignerDNA [![build status][buildstatus-image]][buildstatus-url] -[buildstatus-image]: https://github.com/Spill-Tea/PyTemplate/actions/workflows/python-app.yml/badge.svg?branch=main -[buildstatus-url]: https://github.com/Spill-Tea/PyTemplate/actions?query=branch%3Amain +[buildstatus-image]: https://github.com/Spill-Tea/DesignerDNA/actions/workflows/python-app.yml/badge.svg?branch=main +[buildstatus-url]: https://github.com/Spill-Tea/DesignerDNA/actions?query=branch%3Amain -Python Project Template. Be sure to create a template directly -from github. +DesignerDNA - Design DNA sequences with intent. 
## Table of Contents -- [PyTemplate](#pytemplate) +- [DesignerDNA](#designerdna) - [Installation](#installation) - [For Developers](#for-developers) - [License](#license) @@ -19,19 +18,20 @@ from github. Clone the repository and pip install. ```bash -git clone https://github.com/Spill-Tea/PyTemplate.git -cd PyTemplate +git clone https://github.com/Spill-Tea/DesignerDNA.git +cd DesignerDNA pip install . ``` Alternatively, you may install directly from github. ```bash -pip install git+https://github.com/Spill-Tea/PyTemplate@main +pip install git+https://github.com/Spill-Tea/DesignerDNA@main ``` ## For Developers -After cloning the repository, create a new virtual environment and run the following commands: +After cloning the repository, create a new virtual environment and run the following +commands: ```bash pip install -e ".[dev]" @@ -39,9 +39,9 @@ pre-commit install pre-commit run --all-files ``` -Running unit tests locally is straightforward with tox. Make sure -you have all python versions available required for your project -The `p` flag is not required, but it runs tox environments in parallel. +Running unit tests locally is straightforward with tox. Make sure every Python version +required by your project is installed. The `p` flag is not required, but it runs tox +environments in parallel. ```bash tox -p ``` From 5f5f344bc93563e05bae2f204ba2825b5bdb0f2e Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Wed, 25 Jun 2025 04:31:43 -0700 Subject: [PATCH 04/18] feat(tox): Update tox environments to limit python versions and correct doc environment. 
--- tox.ini | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tox.ini b/tox.ini index 2b170b3..e3b49d9 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,6 @@ [tox] requires = tox>=4 -envlist = type, lint, coverage, docs, py{38,39,310,311,312}-tests +envlist = type, lint, coverage, docs, py{311,312,313}-tests [testenv] description = Base Environment @@ -12,7 +12,7 @@ commands_pre = commands = coverage run --rcfile pyproject.toml -m pytest {posargs} -[testenv:py{38,39,310,311,312}-tests] +[testenv:py{311,312,313}-tests] description = Run Unit Tests commands_pre = {envpython} --version @@ -23,7 +23,7 @@ description = Report Code Coverage skip_install = true deps = coverage parallel_show_output = true -depends = py{38,39,310,311,312}-tests +depends = py{311,312,313}-tests commands = coverage combine --quiet --rcfile pyproject.toml coverage report --rcfile pyproject.toml {posargs} @@ -46,7 +46,7 @@ commands = changedir = docs extras = doc allowlist_externals = rm -commands = +commands = sphinx-build -W -b html -d {envtmpdir}/doctrees source {envtmpdir}/html commands_post = rm -rf {envtmpdir} From bb298ffa28395043e5597144056650ec9b580a17 Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Wed, 25 Jun 2025 04:33:43 -0700 Subject: [PATCH 05/18] feat(docs): Update api documentation, and implement custom code highlighting pygment style. 
--- docs/source/_ext/styles.py | 52 ++++++++++++++++++++++++++++++++ docs/source/api.rst | 5 --- docs/source/api/designer_dna.rst | 21 +++++++++++++ docs/source/api/modules.rst | 7 +++++ docs/source/conf.py | 5 +-- docs/source/index.rst | 22 +++----------- 6 files changed, 88 insertions(+), 24 deletions(-) create mode 100644 docs/source/_ext/styles.py delete mode 100644 docs/source/api.rst create mode 100644 docs/source/api/designer_dna.rst create mode 100644 docs/source/api/modules.rst diff --git a/docs/source/_ext/styles.py b/docs/source/_ext/styles.py new file mode 100644 index 0000000..38bf385 --- /dev/null +++ b/docs/source/_ext/styles.py @@ -0,0 +1,52 @@ +"""pygment styles.""" + +from typing import ClassVar + +from pygments.style import Style +from pygments.token import ( + Comment, + Error, + Keyword, + Name, + Number, + Operator, + Punctuation, + String, + _TokenType, +) + + +class VSCodeDarkPlus(Style): + """VSCode Dark+ Style.""" + + background_color: str = "#1E1E1E" + + styles: ClassVar[dict[_TokenType, str]] = { + Number: "#B6CEA9", + Operator: "#D4D4D4", + Operator.Word: "#C586C0", + Comment: "#6D9957", + Comment.Preproc: "#639BD4", + Keyword.Namespace: "#C287A0", + # Keyword.Reserved: "#C287A0", + Keyword.Reserved: "#639BD4", + Keyword.Type: "#61C8B0", + Keyword.Constant: "#4FC1FF", + # Keyword: "#639BD4", + Keyword: "#C586C0", + Name: "#7FD0FD", + Name.Class: "#61C8B0", + Name.Namespace: "#61C8B0", + Name.Function: "#DCDCAA", + # Name.Builtin: "#DCDCAA", + Name.Builtin: "#4EC9B0", + Name.Type: "#4EC9B0", + Name.Builtin.Pseudo: "#9CDCFE", + Name.Variable: "#9CDCFE", + Name.Variable.Class: "#61C8B0", + Name.Variable.Magic: "#DCDCAA", + Name.Exception: "#61C8B0", + Error: "#61C8B0", + String: "#C9937A", + Punctuation: "#F9C922", + } diff --git a/docs/source/api.rst b/docs/source/api.rst deleted file mode 100644 index b8bc495..0000000 --- a/docs/source/api.rst +++ /dev/null @@ -1,5 +0,0 @@ -API -=== - -.. 
autosummary:: - :toctree: generated diff --git a/docs/source/api/designer_dna.rst b/docs/source/api/designer_dna.rst new file mode 100644 index 0000000..8006cb6 --- /dev/null +++ b/docs/source/api/designer_dna.rst @@ -0,0 +1,21 @@ +designer\_dna package +===================== + +Submodules +---------- + +designer\_dna.oligos module +--------------------------- + +.. automodule:: designer_dna.oligos + :members: + :show-inheritance: + :undoc-members: + +Module contents +--------------- + +.. automodule:: designer_dna + :members: + :show-inheritance: + :undoc-members: diff --git a/docs/source/api/modules.rst b/docs/source/api/modules.rst new file mode 100644 index 0000000..f7691a3 --- /dev/null +++ b/docs/source/api/modules.rst @@ -0,0 +1,7 @@ +designer_dna +============ + +.. toctree:: + :maxdepth: 4 + + designer_dna diff --git a/docs/source/conf.py b/docs/source/conf.py index 1387eae..28aef52 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -10,9 +10,9 @@ sys.path.insert(0, os.path.abspath("../src/")) +sys.path.append(os.path.abspath("./_ext")) # Required for custom extensions - -project = "PyTemplate" +project = "DesignerDNA" copyright = "2025, Jason C Del Rio (Spill-Tea)" author = "Jason C Del Rio (Spill-Tea)" release = "v0.0.1" @@ -38,3 +38,4 @@ html_theme = "sphinx_rtd_theme" html_static_path = ["_static"] html_css_files = ["custom.css"] +pygments_style = "styles.VSCodeDarkPlus" diff --git a/docs/source/index.rst b/docs/source/index.rst index c3dfcb9..fb8a224 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,29 +1,17 @@ -.. PyTemplate documentation master file, created by - sphinx-quickstart on Thu Jun 12 22:11:46 2025. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. +DesignerDNA documentation +========================= -PyTemplate documentation -======================== - -Add your content using ``reStructuredText`` syntax. 
See the -`reStructuredText `_ -documentation for details. - -.. automodule:: PyTemplate.__init__ - :members: - :undoc-members: - :show-inheritance: +Design, fiddle, and optimize DNA sequences. .. toctree:: :maxdepth: 2 :caption: Contents: - api + api/modules Indices and tables ================== * :ref:`genindex` * :ref:`modindex` -* :ref:`search` \ No newline at end of file +* :ref:`search` From be5870e4f97edbfde8b566ee2869d329a246589c Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Wed, 25 Jun 2025 04:38:09 -0700 Subject: [PATCH 06/18] fix(license): add license to python files within docs folder. --- docs/source/_ext/styles.py | 31 ++++++++++++++++++++++++++++++- docs/source/conf.py | 34 +++++++++++++++++++++++++++++----- 2 files changed, 59 insertions(+), 6 deletions(-) diff --git a/docs/source/_ext/styles.py b/docs/source/_ext/styles.py index 38bf385..9aa5b76 100644 --- a/docs/source/_ext/styles.py +++ b/docs/source/_ext/styles.py @@ -1,4 +1,33 @@ -"""pygment styles.""" +# BSD 3-Clause License +# +# Copyright (c) 2025, Spill-Tea +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""Custom Pygment styles.""" from typing import ClassVar diff --git a/docs/source/conf.py b/docs/source/conf.py index 28aef52..5f4aa50 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,10 +1,34 @@ -# Configuration file for the Sphinx documentation builder. +# BSD 3-Clause License # -# For the full list of built-in configuration values, see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html +# Copyright (c) 2025, Spill-Tea +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""Sphinx configuration file.""" -# -- Project information ----------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information import os import sys From 3aee328191e0f79d9986281d4760ee76eb3c1530 Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Wed, 25 Jun 2025 04:44:15 -0700 Subject: [PATCH 07/18] fix(docs): Correct docstrings. --- src/designer_dna/_oligos.pyi | 6 +++--- src/designer_dna/_oligos.pyx | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/designer_dna/_oligos.pyi b/src/designer_dna/_oligos.pyi index fdf7448..17122c9 100644 --- a/src/designer_dna/_oligos.pyi +++ b/src/designer_dna/_oligos.pyi @@ -113,8 +113,8 @@ def nrepeats(sequence: str, n: int) -> int: Examples: .. 
code-block:: python - n_stretch("AAAA", 1) == 3 # True - n_stretch("AAAA", 2) == 1 # True - n_stretch("ACAACAACA", 3) == 2 # True + nrepeats("AAAA", 1) == 3 # True + nrepeats("AAAA", 2) == 1 # True + nrepeats("ACAACAACA", 3) == 2 # True """ diff --git a/src/designer_dna/_oligos.pyx b/src/designer_dna/_oligos.pyx index 480d88e..49be06c 100644 --- a/src/designer_dna/_oligos.pyx +++ b/src/designer_dna/_oligos.pyx @@ -301,9 +301,9 @@ cpdef int nrepeats(str sequence, int n): Examples: .. code-block:: python - n_stretch("AAAA", 1) == 3 # True - n_stretch("AAAA", 2) == 1 # True - n_stretch("ACAACAACA", 3) == 2 # True + nrepeats("AAAA", 1) == 3 # True + nrepeats("AAAA", 2) == 1 # True + nrepeats("ACAACAACA", 3) == 2 # True """ if n < 1: From 06ec781f3ab3c9ef7454a0975a86212938a8ad4d Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Wed, 25 Jun 2025 04:54:46 -0700 Subject: [PATCH 08/18] fix(_oligos): lint unnecessary imports. --- src/designer_dna/_oligos.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/designer_dna/_oligos.pyx b/src/designer_dna/_oligos.pyx index 49be06c..b446ea8 100644 --- a/src/designer_dna/_oligos.pyx +++ b/src/designer_dna/_oligos.pyx @@ -6,7 +6,6 @@ from libc.stdlib cimport malloc, free cdef extern from "Python.h": str PyUnicode_FromStringAndSize(char*, Py_ssize_t) - char* PyUnicode_UTF8(object) Py_ssize_t PyUnicode_GET_LENGTH(object) # bint PyBytes_Check(object) # char* PyBytes_AS_STRING(object) @@ -44,8 +43,7 @@ cdef inline StringView to_view(str sequence): cdef inline str to_str(StringView view): """Convert StringView back into a python string object, safely releasing memory.""" - cdef: - str obj = PyUnicode_FromStringAndSize(view.ptr, view.size) + cdef str obj = PyUnicode_FromStringAndSize(view.ptr, view.size) free(view.ptr) return obj From 42c3a70d76e2bbf9603bc5b4e59a8cba2a0ad57e Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Wed, 25 Jun 2025 05:15:34 -0700 Subject: [PATCH 09/18] fix(src): remove previous folder or src. 
--- src/PyTemplate/__init__.py | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 src/PyTemplate/__init__.py diff --git a/src/PyTemplate/__init__.py b/src/PyTemplate/__init__.py deleted file mode 100644 index 4a61609..0000000 --- a/src/PyTemplate/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -# BSD 3-Clause License -# -# Copyright (c) 2025, Spill-Tea -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -"""PyTemplate Project.""" - -__version__: str = "v0.0.1" From 8fc48ffcd78e18d5c423ed650fb3f1dba08f60f0 Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Thu, 26 Jun 2025 03:54:52 -0700 Subject: [PATCH 10/18] feat(headers.common): Implement, extract, and cleanup common utilities for converting between c char and python objects. --- src/designer_dna/headers/common.pxd | 116 ++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 src/designer_dna/headers/common.pxd diff --git a/src/designer_dna/headers/common.pxd b/src/designer_dna/headers/common.pxd new file mode 100644 index 0000000..ed3581e --- /dev/null +++ b/src/designer_dna/headers/common.pxd @@ -0,0 +1,116 @@ +# BSD 3-Clause License + +# Copyright (c) 2025, Spill-Tea + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""Common string handling utilities to shuttle between C and python.""" + +from libc.string cimport memcpy +from libc.stdlib cimport free, malloc + +cdef extern from "Python.h": + bint PyBytes_Check(object) + Py_ssize_t PyUnicode_GET_LENGTH(object) + bytes PyUnicode_AsUTF8String(object) + Py_ssize_t PyBytes_GET_SIZE(object) + char* PyBytes_AS_STRING(object) + str PyUnicode_DecodeUTF8Stateful(char*, Py_ssize_t, char*, Py_ssize_t*) + bytes PyBytes_FromStringAndSize(char*, Py_ssize_t) + + +# ctypedef fused StrT: +# str +# bytes + + +cdef struct StringView: + char* ptr + Py_ssize_t size + bint origin + + +cdef inline StringView construct(bytes s, Py_ssize_t length, bint isbytes): + cdef: + char* buffer = PyBytes_AS_STRING(s) + StringView view + + view.ptr = malloc((length + 1) * sizeof(char)) + memcpy(view.ptr, buffer, length + 1) + view.ptr[length] = "\0" # c string terminator + view.size = length + view.origin = isbytes + + return view + + +cdef inline StringView bytes_to_view(bytes b): + """Construct StringView from python bytes object""" + cdef Py_ssize_t length = PyBytes_GET_SIZE(b) + + return construct(b, length, True) + + +cdef inline StringView str_to_view(str s): + """Construct StringView from python string object.""" + cdef: + Py_ssize_t length = PyUnicode_GET_LENGTH(s) + bytes temp = PyUnicode_AsUTF8String(s) + + return construct(temp, length, False) + + +cdef inline str to_str(StringView view): + """Convert StringView back into a 
python string object, safely releasing memory.""" + cdef str obj = PyUnicode_DecodeUTF8Stateful(view.ptr, view.size, NULL, NULL) + free(view.ptr) + + return obj + + +cdef inline bytes to_bytes(StringView view): + """Convert StringView back into a python bytes object, safely releasing memory.""" + cdef bytes obj = PyBytes_FromStringAndSize(view.ptr, view.size) + free(view.ptr) + + return obj + + +# TODO: Cannot coerce to a type that is not specialized +# cdef inline StringView handle_input(StrT received): +# """Primary interface to handle both string and bytes python objects.""" +# if PyBytes_Check(received): +# return bytes_to_view( received) + +# return str_to_view( received) + + +# cdef inline StrT convert_output(StringView view): +# """Primary interface to handle conversion output back to python objects.""" +# if view.origin: +# return to_bytes(view) + +# return to_str(view) From 676f21b85be30d6a45dc6afd071150f643452a77 Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Thu, 26 Jun 2025 04:38:13 -0700 Subject: [PATCH 11/18] feat(_oligos): Improve stretch function, and to a minor degree, nrepeats function. Finish cimport of common utilities and extrapolate changes. --- src/designer_dna/_oligos.pyi | 4 +- src/designer_dna/_oligos.pyx | 132 +++++++++++++++++------------------ 2 files changed, 66 insertions(+), 70 deletions(-) diff --git a/src/designer_dna/_oligos.pyi b/src/designer_dna/_oligos.pyi index 17122c9..1689431 100644 --- a/src/designer_dna/_oligos.pyi +++ b/src/designer_dna/_oligos.pyi @@ -73,9 +73,9 @@ def palindrome(sequence: str, dna: bool = ...) -> str: Notes: * Uses a modified center expansion method (Manacher's algorithm) to identify the - longest substring that is palindromic. + longest substring that is palindromic. * If a sequence contains two or more palindromic substrings of equal size, the - first leftmost palindrome is prioritized. + first leftmost palindrome is prioritized. 
""" diff --git a/src/designer_dna/_oligos.pyx b/src/designer_dna/_oligos.pyx index b446ea8..ee13a00 100644 --- a/src/designer_dna/_oligos.pyx +++ b/src/designer_dna/_oligos.pyx @@ -1,52 +1,47 @@ -# cython: boundscheck=False, wraparound=False, nonecheck=False -"""Cythonized oligonucleotide functions.""" +# BSD 3-Clause License -from libc.string cimport memcpy -from libc.stdlib cimport malloc, free +# Copyright (c) 2025, Spill-Tea -cdef extern from "Python.h": - str PyUnicode_FromStringAndSize(char*, Py_ssize_t) - Py_ssize_t PyUnicode_GET_LENGTH(object) - # bint PyBytes_Check(object) - # char* PyBytes_AS_STRING(object) - # Py_ssize_t PyBytes_GET_SIZE(object) +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: -cdef extern from "oligos.h": - const unsigned char DNA[0x100] - const unsigned char RNA[0x100] - -# ctypedef fused StrT: -# str -# bytes +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. -cdef struct StringView: - char* ptr - Py_ssize_t size +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -cdef inline StringView to_view(str sequence): - """Construct StringView, using Cpython C-API to construct a c char string.""" - cdef: - Py_ssize_t length = PyUnicode_GET_LENGTH(sequence) - bytes temp = sequence.encode("utf8") - char* buffer = temp - StringView view - - view.ptr = malloc((length + 1) * sizeof(char)) - memcpy(view.ptr, buffer, length + 1) - view.ptr[length] = "\0" # c string terminator - view.size = length +# cython: boundscheck=False, wraparound=False, nonecheck=False +"""Cythonized oligonucleotide functions.""" - return view +from libc.stdlib cimport free +cdef extern from "Python.h": + Py_ssize_t PyUnicode_GET_LENGTH(object) + bytes PyUnicode_AsUTF8String(object) + Py_ssize_t PyBytes_GET_SIZE(object) -cdef inline str to_str(StringView view): - """Convert StringView back into a python string object, safely releasing memory.""" - cdef str obj = PyUnicode_FromStringAndSize(view.ptr, view.size) - free(view.ptr) +cimport common - return obj +cdef extern from "oligos.h": + const unsigned char DNA[0x100] + const unsigned char RNA[0x100] cdef void c_reverse(char* seq, Py_ssize_t length) noexcept: @@ -121,14 +116,14 @@ cpdef str complement(str sequence, bint dna = True): complement("ATGC", False) == "UACG" """ - cdef StringView view = to_view(sequence) + cdef common.StringView view = common.str_to_view(sequence) if dna: c_complement(view.ptr, view.size, DNA) else: c_complement(view.ptr, view.size, RNA) - return to_str(view) + return 
common.to_str(view) cdef void c_reverse_complement( @@ -176,14 +171,14 @@ cpdef str reverse_complement(str sequence, bint dna = True): reverse_complement("ATGC", False) == "GCAU" """ - cdef StringView view = to_view(sequence) + cdef common.StringView view = common.str_to_view(sequence) if dna: c_reverse_complement(view.ptr, view.size, DNA) else: c_reverse_complement(view.ptr, view.size, RNA) - return to_str(view) + return common.to_str(view) cdef bytes _expand_from_center( @@ -223,16 +218,17 @@ cpdef str palindrome(str sequence, bint dna = True): Notes: * Uses a modified center expansion method (Manacher's algorithm) to identify the - longest substring that is palindromic. + longest substring that is palindromic. * If a sequence contains two or more palindromic substrings of equal size, the - first leftmost palindrome is prioritized. + first leftmost palindrome is prioritized. """ cdef: - bytes temp = sequence.encode("utf8") - bytes comp = complement(sequence, dna).encode("utf8") + bytes temp = PyUnicode_AsUTF8String(sequence) + bytes comp = PyUnicode_AsUTF8String(complement(sequence, dna)) bytes even, pal = b"" - Py_ssize_t i, current, seq_length = len(sequence), length = 0 + Py_ssize_t seq_length = PyUnicode_GET_LENGTH(sequence) + Py_ssize_t i, current, length = 0 if seq_length < 2: # noqa: PLR2004 return "" @@ -240,7 +236,7 @@ cpdef str palindrome(str sequence, bint dna = True): for i in range(seq_length - 1): # NOTE: Palindromic nucleotides are only even length, halving search space even = _expand_from_center(temp, comp, i, i + 1, seq_length) - current = len(even) + current = PyBytes_GET_SIZE(even) if current > length: pal = even length = current @@ -265,19 +261,20 @@ cpdef int stretch(str sequence): """ cdef: - bytes temp = sequence.encode("utf8") - char* buffer = temp + common.StringView view = common.str_to_view(sequence) + Py_ssize_t j int longest = 0, current = 0 - char c, prev = buffer[0] + char prev = view.ptr[0] - for c in buffer[1:]: - if c == prev: + 
for j in range(1, view.size): + if view.ptr[j] == prev: current += 1 if current > longest: longest = current else: current = 0 - prev = c + prev = view.ptr[j] + free(view.ptr) return longest @@ -304,21 +301,19 @@ cpdef int nrepeats(str sequence, int n): nrepeats("ACAACAACA", 3) == 2 # True """ - if n < 1: - raise ValueError("n must be greater than 0.") - if n == 1: - return stretch(sequence) - cdef: - bytes phase, temp = sequence.encode("utf8") - char* buffer = temp - int i, j, k, max_val = 0 - list[char] previous = [buffer[i : n + i] for i in range(n)] - list[int] current = [0 for _ in range(n)] - - for j in range(n, len(sequence), n): - for k in range(n): - phase = buffer[j + k : j + k + n] + common.StringView view = common.str_to_view(sequence) + Py_ssize_t t = n + Py_ssize_t v = view.size // t + Py_ssize_t i, j, k + int max_val = 0 + list[char] previous = [view.ptr[i : t + i] for i in range(t)] + list[int] current = [0 for i in range(t)] + bytes phase + + for j in range(1, v): + for k in range(t): + phase = view.ptr[j * t + k : j * t + k + t] if phase == previous[k]: current[k] += 1 if current[k] > max_val: @@ -326,5 +321,6 @@ cpdef int nrepeats(str sequence, int n): else: current[k] = 0 previous[k] = phase + free(view.ptr) return max_val From 057a67e66ab61212e9f75b71e983962a4f95562a Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Thu, 26 Jun 2025 04:40:49 -0700 Subject: [PATCH 12/18] fix(pyproject): Include pxd files as part of package data for correct build during testing. 
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2319a6f..79603c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ exclude = ["benchmarks", "build", "docs", "tests", "scripts"] [tool.setuptools.package-data] "*" = ["py.typed", "*.pyi"] -"designer_dna.headers" = ["*.h"] +"designer_dna.headers" = ["*.h", "*.pxd"] [project.optional-dependencies] dev = ["designer_dna[doc,test,lint,type,commit]", "tox"] From c003debea37003a17f68ab22df7b331a56d711a1 Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Thu, 26 Jun 2025 11:04:05 -0700 Subject: [PATCH 13/18] feat(_oligos): Drastically improve speed of palindrome function by over an order of magnitude. Extract out portion of complement to act in place on StringView. --- src/designer_dna/_oligos.pyx | 72 ++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/src/designer_dna/_oligos.pyx b/src/designer_dna/_oligos.pyx index ee13a00..35c3a7d 100644 --- a/src/designer_dna/_oligos.pyx +++ b/src/designer_dna/_oligos.pyx @@ -99,6 +99,13 @@ cdef void c_complement(char* sequence, Py_ssize_t length, unsigned char[] table) sequence[idx] = table[ sequence[idx]] +cdef void v_complement(common.StringView view, bint dna): + if dna: + c_complement(view.ptr, view.size, DNA) + else: + c_complement(view.ptr, view.size, RNA) + + cpdef str complement(str sequence, bint dna = True): """Complement a nucleotide sequence.
@@ -117,11 +124,7 @@ cpdef str complement(str sequence, bint dna = True): """ cdef common.StringView view = common.str_to_view(sequence) - - if dna: - c_complement(view.ptr, view.size, DNA) - else: - c_complement(view.ptr, view.size, RNA) + v_complement(view, dna) return common.to_str(view) @@ -181,23 +184,19 @@ cpdef str reverse_complement(str sequence, bint dna = True): return common.to_str(view) -cdef bytes _expand_from_center( - bytes seq, - bytes comp, - Py_ssize_t left, - Py_ssize_t right, +cdef void _center( + char* seq, + char* comp, + Py_ssize_t* left, + Py_ssize_t* right, Py_ssize_t length, -): - while ( - left > -1 - and right < length - and seq[left] == comp[right] - and seq[right] == comp[left] # required to detect dna to rna based complements - ): - left -= 1 - right += 1 - - return seq[left + 1 : right] +) noexcept: + while (left[0] > -1 and right[0] < length): + if seq[left[0]] != comp[right[0]] or seq[right[0]] != comp[left[0]]: + break + left[0] -= 1 + right[0] += 1 + left[0] += 1 cpdef str palindrome(str sequence, bint dna = True): @@ -224,24 +223,25 @@ cpdef str palindrome(str sequence, bint dna = True): """ cdef: - bytes temp = PyUnicode_AsUTF8String(sequence) - bytes comp = PyUnicode_AsUTF8String(complement(sequence, dna)) - bytes even, pal = b"" - Py_ssize_t seq_length = PyUnicode_GET_LENGTH(sequence) - Py_ssize_t i, current, length = 0 - - if seq_length < 2: # noqa: PLR2004 - return "" - - for i in range(seq_length - 1): - # NOTE: Palindromic nucleotides are only even length, halving search space - even = _expand_from_center(temp, comp, i, i + 1, seq_length) - current = PyBytes_GET_SIZE(even) + common.StringView seq = common.str_to_view(sequence) + common.StringView com = common.str_to_view(sequence) + Py_ssize_t i, l, r, current, length = 0, cr = 0, cl = 0 + + v_complement(com, dna) + + for i in range(seq.size - 1): + l = i + r = i + 1 + _center(seq.ptr, com.ptr, &l, &r, seq.size) + current = r - l if current > length: - pal = even length = 
current + cr = r + cl = l + free(seq.ptr) + free(com.ptr) - return pal.decode("utf8") + return sequence[cl: cr] cpdef int stretch(str sequence): From bc3aaf542a5ecfb44780c5be7714dab979872511 Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Thu, 26 Jun 2025 11:14:14 -0700 Subject: [PATCH 14/18] chore(_oligos): change variable names of palindrome function to be more readable. --- src/designer_dna/_oligos.pyx | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/designer_dna/_oligos.pyx b/src/designer_dna/_oligos.pyx index 35c3a7d..12fa9be 100644 --- a/src/designer_dna/_oligos.pyx +++ b/src/designer_dna/_oligos.pyx @@ -225,23 +225,24 @@ cpdef str palindrome(str sequence, bint dna = True): cdef: common.StringView seq = common.str_to_view(sequence) common.StringView com = common.str_to_view(sequence) - Py_ssize_t i, l, r, current, length = 0, cr = 0, cl = 0 + Py_ssize_t i, left, right, current, length = 0, start = 0, end = 0 v_complement(com, dna) for i in range(seq.size - 1): - l = i - r = i + 1 - _center(seq.ptr, com.ptr, &l, &r, seq.size) - current = r - l + left = i + right = i + 1 + _center(seq.ptr, com.ptr, &left, &right, seq.size) + current = right - left if current > length: length = current - cr = r - cl = l + start = left + end = right + free(seq.ptr) free(com.ptr) - return sequence[cl: cr] + return sequence[start: end] cpdef int stretch(str sequence): From d46dfc17f161ff534fd9cbc31db49be917c4221b Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Thu, 26 Jun 2025 11:31:05 -0700 Subject: [PATCH 15/18] chore(tests): Include additional unit tests for palindrome function. 
--- tests/unit/test_oligos.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit/test_oligos.py b/tests/unit/test_oligos.py index 4e6f99c..3af0e31 100644 --- a/tests/unit/test_oligos.py +++ b/tests/unit/test_oligos.py @@ -180,6 +180,14 @@ def test_nrepeats( ("AAAA", False, ""), ("ATATATATATAT", True, "ATATATATATAT"), ("ATATATATATAT", False, ""), + ("TGGATCCA", True, "TGGATCCA"), + ("ATGGATCCA", True, "TGGATCCA"), + ("AATGGATCCA", True, "TGGATCCA"), + ("TGGATCCAT", True, "TGGATCCA"), + ("TGGATCCATT", True, "TGGATCCA"), + ("GAATTC", True, "GAATTC"), + ("ATGAATTC", True, "GAATTC"), + ("CTTAAG", True, "CTTAAG"), ], ) def test_palindromes( From 0cf2f8dde294340bb7a6db93e3706c19f7b50049 Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Thu, 26 Jun 2025 13:30:23 -0700 Subject: [PATCH 16/18] feat(_oligos): Improve nrepeats function speed by an order of magnitude. --- src/designer_dna/_oligos.pyi | 2 +- src/designer_dna/_oligos.pyx | 57 +++++++++++++++++++++++++----------- 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/src/designer_dna/_oligos.pyi b/src/designer_dna/_oligos.pyi index 1689431..5013673 100644 --- a/src/designer_dna/_oligos.pyi +++ b/src/designer_dna/_oligos.pyi @@ -101,7 +101,7 @@ def nrepeats(sequence: str, n: int) -> int: Args: sequence (str): Nucleotide sequence string. - n (int): stretch of k-mers to observe. + n (int): Size of k-mers (composite pattern) to observe. 
Returns: (int) The longest tandem run of nucleotides comprised of a composite pattern diff --git a/src/designer_dna/_oligos.pyx b/src/designer_dna/_oligos.pyx index 12fa9be..a7d7136 100644 --- a/src/designer_dna/_oligos.pyx +++ b/src/designer_dna/_oligos.pyx @@ -30,7 +30,7 @@ # cython: boundscheck=False, wraparound=False, nonecheck=False """Cythonized oligonucleotide functions.""" -from libc.stdlib cimport free +from libc.stdlib cimport free, malloc cdef extern from "Python.h": Py_ssize_t PyUnicode_GET_LENGTH(object) @@ -44,7 +44,7 @@ cdef extern from "oligos.h": const unsigned char RNA[0x100] -cdef void c_reverse(char* seq, Py_ssize_t length) noexcept: +cdef inline void c_reverse(char* seq, Py_ssize_t length) noexcept: """Reverse a C string in place. Args: @@ -280,12 +280,34 @@ cpdef int stretch(str sequence): return longest +cdef inline bint _compare(char* p, char* q, Py_ssize_t start, Py_ssize_t end): + cdef: + Py_ssize_t j, count = 0 + + for j in range(start, end): + if p[j] != q[count]: + return False + count += 1 + + return True + + +cdef inline void _assign(char* src, char* dest, Py_ssize_t start, Py_ssize_t end): + """Overcome assigning a substring slice to another char variable.""" + cdef: + Py_ssize_t j, count = 0 + + for j in range(start, end): + dest[count] = src[j] + count += 1 + + cpdef int nrepeats(str sequence, int n): """Calculate the maximum observed repeats of composite pattern size n characters. Args: sequence (str): Nucleotide sequence string. - n (int): stretch of k-mers to observe. + n (int): Size of k-mers (composite pattern) to observe. 
Returns: (int) The longest tandem run of nucleotides comprised of a composite pattern @@ -307,21 +329,22 @@ cpdef int nrepeats(str sequence, int n): Py_ssize_t t = n Py_ssize_t v = view.size // t Py_ssize_t i, j, k - int max_val = 0 - list[char] previous = [view.ptr[i : t + i] for i in range(t)] - list[int] current = [0 for i in range(t)] - bytes phase - - for j in range(1, v): - for k in range(t): - phase = view.ptr[j * t + k : j * t + k + t] - if phase == previous[k]: - current[k] += 1 - if current[k] > max_val: - max_val = current[k] + int current, max_val = 0 + char* previous = malloc((t + 1) * sizeof(char)) + + for k in range(t): + _assign(view.ptr, previous, k, t + k) + current = 0 + for j in range(1, v): + if _compare(view.ptr, previous, j * t + k, j * t + k + t): + current += 1 + if current > max_val: + max_val = current else: - current[k] = 0 - previous[k] = phase + current = 0 + _assign(view.ptr, previous, j * t + k, j * t + k + t) + free(view.ptr) + free(previous) return max_val From cf20b289ab234f0b833d5d3d0c91a050e83d20b2 Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Thu, 26 Jun 2025 13:51:04 -0700 Subject: [PATCH 17/18] chore(tests): Include additional unit tests for higher order n for nrepeats across multiple phases. 
--- src/designer_dna/_oligos.pyx | 2 ++ tests/unit/test_oligos.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/src/designer_dna/_oligos.pyx b/src/designer_dna/_oligos.pyx index a7d7136..4d0d412 100644 --- a/src/designer_dna/_oligos.pyx +++ b/src/designer_dna/_oligos.pyx @@ -100,6 +100,7 @@ cdef void c_complement(char* sequence, Py_ssize_t length, unsigned char[] table) cdef void v_complement(common.StringView view, bint dna): + """Handle complement on StringView directly, in place.""" if dna: c_complement(view.ptr, view.size, DNA) else: @@ -281,6 +282,7 @@ cpdef int stretch(str sequence): cdef inline bint _compare(char* p, char* q, Py_ssize_t start, Py_ssize_t end): + """Awkward slice comparison between two different size chars.""" cdef: Py_ssize_t j, count = 0 diff --git a/tests/unit/test_oligos.py b/tests/unit/test_oligos.py index 3af0e31..bfe8651 100644 --- a/tests/unit/test_oligos.py +++ b/tests/unit/test_oligos.py @@ -155,6 +155,13 @@ def test_stretch(seq, expected: int, function: Callable[[str], int]) -> None: ("ATGC", 1, 0), ("AAAAACCCCCCGGGGGGG", 1, 6), ("ACACAC", 2, 2), + ("ATC" * 4, 3, 3), + ("A" + "ATC" * 4, 3, 3), + ("AG" + "ATC" * 4, 3, 3), + ("AG" + "ATC" * 4 + "C", 3, 3), + ("AG" + "ATC" * 4 + "CG", 3, 3), + ("G" + "ATC" * 4 + "C", 3, 3), + ("G" + "ATC" * 4 + "CG", 3, 3), ], ) def test_nrepeats( From e39d6bab5c9ff0c50987e1e903e9adc6b4351fe7 Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Thu, 26 Jun 2025 13:57:41 -0700 Subject: [PATCH 18/18] chore(headers.oligos): Add license to file. 
--- src/designer_dna/headers/oligos.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/designer_dna/headers/oligos.h b/src/designer_dna/headers/oligos.h index 2350d64..89527e2 100644 --- a/src/designer_dna/headers/oligos.h +++ b/src/designer_dna/headers/oligos.h @@ -1,4 +1,31 @@ +// BSD 3-Clause License +// Copyright (c) 2025, Spill-Tea + +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: + +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. + +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. + +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef __LDPY_LDHELPERS_H #define __LDPY_LDHELPERS_H