diff --git a/jupyter/Codex Numpy.ipynb b/jupyter/Codex Numpy.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e7b53ce0818755424a0a2ebae0d0cb6dd4b01719 --- /dev/null +++ b/jupyter/Codex Numpy.ipynb @@ -0,0 +1,98 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os, sys, numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from ntai.codex import NPCodex" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "seq = 'acgturykmswbdhvn-ACGTURYKMSWBDHVN_'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "uracil = True\n", + "repeat = True\n", + "codex = NPCodex(include_uracil=uracil, include_repeat=repeat)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'acgturykmswbdhvn-ACGTURYKMSWBDHVN_'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "codex.decode(codex.encode(seq))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git 
a/jupyter/LabeledRanges.ipynb b/jupyter/LabeledRanges.ipynb index 797089e47e84f774475ad6269beb98741b625807..7082b559ed29c4b7520861a005994c243b29327d 100644 --- a/jupyter/LabeledRanges.ipynb +++ b/jupyter/LabeledRanges.ipynb @@ -7,7 +7,8 @@ "outputs": [], "source": [ "from ntai import Labeler\n", - "from ntai.ranges.labeled_ranges import LabeledRange, LabeledRanges" + "# from ntai.ranges.labeled_ranges import LabeledRange, LabeledRanges\n", + "from lrng import LabeledRange, LabeledRanges" ] }, { @@ -27,7 +28,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 3, @@ -50,7 +51,228 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 
0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 
1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1]])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seq=[i for i in range(300-100)]\n", + "enc = l.encode(seq, crngs, 100)\n", + "enc" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -258,14 +480,13 @@ " [0, 0, 1]]" ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "seq=[i for i in range(300-100)]\n", - "l.encode(seq, crngs, 100)" + "enc.tolist()" ] }, { @@ -303,6 +524,13 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, diff --git a/ntai/__init__.py b/ntai/__init__.py index 65802e6ba4798398ab95b29b7b57bcd7b7a374b6..dc996c1a5fa5e986409e5f07754b3f06b2066e2d 100644 --- a/ntai/__init__.py +++ b/ntai/__init__.py @@ -2,7 +2,7 @@ from .codex import Codex from .bedtools import bedtools from .fetch import fetch_files from .labeler import Labeler -from .ranges import LabeledRange, LabeledRanges +# from .ranges import LabeledRange, LabeledRanges name = 'ntai' version = '0.0.8' diff --git a/ntai/codex/__init__.py b/ntai/codex/__init__.py index c49ce9c3bd1d8d18fdcc112e870bd700852e3aaf..15d6ea8d0e317164da8085240b1eeaf32bd88884 100644 --- a/ntai/codex/__init__.py +++ b/ntai/codex/__init__.py @@ -1 +1,2 @@ from .codex import Codex +from .numpy 
# ---------------------------------------------------------------------------
# File: ntai/codex/numpy/__init__.py
# Re-exports the numpy-backed codex, its lookup tables and helper functions.
# ---------------------------------------------------------------------------
from .codex import Codex as NPCodex
from .codex import (
    NUMPY_FASTA_ENCODEX,
    NUMPY_FASTA_ENCODEX_MATRIX,
    NUMPY_FASTA_DECODEX,
    INCLUDE_URACIL,
    INCLUDE_REPEAT,
    ENCODING_a,
    ENCODING_c,
    ENCODING_g,
    ENCODING_t,
    ENCODING_u,
    ENCODING_repeat,
)

from .utils import (
    make_encodex_matrix,
    invert_encodex,
    find_row,
    lookup_channel_index,
    get_channel_indices,
)


# ---------------------------------------------------------------------------
# File: ntai/codex/numpy/codex.py
# ---------------------------------------------------------------------------
import numpy as np
from ntai.codex.numpy.defaults import (
    NUMPY_FASTA_ENCODEX,
    INCLUDE_URACIL,
    INCLUDE_REPEAT,
    ENCODING_a,
    ENCODING_c,
    ENCODING_g,
    ENCODING_t,
    ENCODING_u,
    ENCODING_repeat
)

from ntai.codex.numpy.utils import (
    make_encodex_matrix, invert_encodex,
    encode, decode
)

# Lookup tables for the default FASTA alphabet, built once at import time so
# that every default-configured Codex shares them.
NUMPY_FASTA_ENCODEX_MATRIX = make_encodex_matrix(
    NUMPY_FASTA_ENCODEX,
    ENCODING_a,
    ENCODING_c,
    ENCODING_g,
    ENCODING_t,
    ENCODING_u,
    ENCODING_repeat
)

NUMPY_FASTA_DECODEX = invert_encodex(NUMPY_FASTA_ENCODEX)


class Codex:
    '''
    Numpy-backed encoder / decoder between FASTA characters and per-character
    channel vectors. All of the work is delegated to
    `ntai.codex.numpy.utils.encode` and `ntai.codex.numpy.utils.decode`; this
    class only stores the configuration they need.
    '''

    def __init__(
        self,
        include_uracil:bool = INCLUDE_URACIL,
        include_repeat:bool = INCLUDE_REPEAT,
        encodex: dict = NUMPY_FASTA_ENCODEX,
        decodex: dict = NUMPY_FASTA_DECODEX,
        encodex_matrix: list = NUMPY_FASTA_ENCODEX_MATRIX
    ):
        '''
        Arguments:
            include_uracil (bool): whether or not uracil should be included in
                the embedding. By default `INCLUDE_URACIL`.

            include_repeat (bool): whether or not repeated masked regions
                should be included in the embedding. By default
                `INCLUDE_REPEAT`.

            encodex (dict): the dictionary converting fasta characters to
                integer row indices of `encodex_matrix`.

            decodex (dict): the dictionary converting integer row indices back
                to fasta characters. Derived from `encodex` when it is `None`,
                or when a custom `encodex` is given and this argument is left
                at its default.

            encodex_matrix (array-like): one row per entry of `encodex`
                holding that character's channel vector. Derived from
                `encodex` under the same conditions as `decodex`.
        '''
        self.include_uracil = include_uracil
        self.include_repeat = include_repeat
        self.encodex = encodex

        # The default decodex / matrix describe the default encodex only.
        # Pairing them with a caller-supplied encodex would silently encode
        # and decode with the wrong tables, so rebuild them in that case.
        custom_encodex = encodex is not NUMPY_FASTA_ENCODEX

        if decodex is None or (custom_encodex and decodex is NUMPY_FASTA_DECODEX):
            decodex = invert_encodex(encodex)
        self.decodex = decodex

        if encodex_matrix is None or (
            custom_encodex and encodex_matrix is NUMPY_FASTA_ENCODEX_MATRIX
        ):
            encodex_matrix = make_encodex_matrix(encodex=encodex)
        self.encodex_matrix = encodex_matrix

    def encode(self, sequence):
        '''
        Arguments:
            sequence (str): the fasta characters to encode.
        Returns:
            encoded (list): one channel vector (list of ints) per character,
                restricted to the channels enabled on this codex.
        '''
        return encode(
            sequence,
            encodex = self.encodex,
            encodex_matrix = self.encodex_matrix,
            include_uracil = self.include_uracil,
            include_repeat = self.include_repeat
        )

    def decode(self, encoded):
        '''
        Arguments:
            encoded (list): channel vectors as produced by `encode` with the
                same channel settings.
        Returns:
            sequence (str): the decoded fasta characters.
        '''
        return decode(
            encoded,
            encodex = self.encodex,
            encodex_matrix = self.encodex_matrix,
            include_uracil = self.include_uracil,
            include_repeat = self.include_repeat,
            decodex = self.decodex
        )


# ---------------------------------------------------------------------------
# File: ntai/codex/numpy/defaults.py
# ---------------------------------------------------------------------------
import numpy as np
# NOTE: ntai.codex.numpy.utils imports from this module, so this module must
# not import from utils (that would be a circular import).

# One-hot base encodings. Channel order is: a, c, g, t, u, repeat.
ENCODING_a      = [1,0,0,0,0,0]
ENCODING_c      = [0,1,0,0,0,0]
ENCODING_g      = [0,0,1,0,0,0]
ENCODING_t      = [0,0,0,1,0,0]
ENCODING_u      = [0,0,0,0,1,0]
ENCODING_repeat = [0,0,0,0,0,1]
INCLUDE_URACIL  = False
INCLUDE_REPEAT  = False

# Maps each supported fasta character to its row in the encoding matrix built
# by make_encodex_matrix. Rows 17-33 are the rows 0-16 with the repeat channel
# added.
NUMPY_FASTA_ENCODEX = {
    '-': 0,  # e is used to represent '-' as a python variable
    'a': 1,
    'c': 2,
    'g': 3,
    't': 4,
    'u': 5,
    'r': 6,
    'y': 7,
    'k': 8,
    'm': 9,
    's': 10,
    'w': 11,
    'b': 12,
    'd': 13,
    'h': 14,
    'v': 15,
    'n': 16,
    '_': 17,  # E is used to represent '_' as a python variable
    'A': 18,
    'C': 19,
    'G': 20,
    'T': 21,
    'U': 22,
    'R': 23,
    'Y': 24,
    'K': 25,
    'M': 26,
    'S': 27,
    'W': 28,
    'B': 29,
    'D': 30,
    'H': 31,
    'V': 32,
    'N': 33
}
# ---------------------------------------------------------------------------
# File: ntai/codex/numpy/utils.py
# ---------------------------------------------------------------------------
import numpy as np
from ntai.codex.numpy.defaults import (
    INCLUDE_URACIL,
    INCLUDE_REPEAT,
    ENCODING_a,
    ENCODING_c,
    ENCODING_g,
    ENCODING_t,
    ENCODING_u,
    ENCODING_repeat
)


def validate_base_encoding(name, encoding):
    '''
    Check that a base encoding is a one-hot vector with six channels.

    Arguments:
        name (str): label of the base, only used in the error message.
        encoding (array-like): the candidate encoding.
    Raises:
        ValueError: if `encoding` is not of shape (6, ) or is not one hot.
    '''
    encoding = np.asarray(encoding)
    # Shape is checked first so that an empty encoding is reported as a
    # ValueError rather than failing on an out-of-range index.
    if encoding.shape != (6, ):
        msg = 'base encoding for {} should be of shape (6, )!'.format(name)
        raise ValueError(msg)
    # A sum of one alone is not enough (e.g. [2, -1, 0, 0, 0, 0]); require
    # exactly five zeros and a single one.
    if sorted(encoding.tolist()) != [0, 0, 0, 0, 0, 1]:
        msg = 'base encoding for {} should be one hot!'.format(name)
        raise ValueError(msg)


def make_encodex_matrix(
    encodex:dict,
    a:list = ENCODING_a,
    c:list = ENCODING_c,
    g:list = ENCODING_g,
    t:list = ENCODING_t,
    u:list = ENCODING_u,
    repeat:list = ENCODING_repeat
) -> np.ndarray:
    '''
    Build the matrix holding the channel vector of every fasta character.

    Arguments:
        encodex (dict): maps fasta characters to integer rows of the canonical
            34-row table built below (the layout of `NUMPY_FASTA_ENCODEX`).
        a, c, g, t, u, repeat (list): one-hot encodings, of length six, for
            the four DNA bases, uracil and the repeat flag.
    Returns:
        encodex_matrix (np.ndarray): the canonical rows selected by
            `encodex.values()`, in that order.
    Raises:
        ValueError: if any base encoding is not a one-hot vector of length 6.
    '''
    _a = np.array(a)
    _c = np.array(c)
    _g = np.array(g)
    _t = np.array(t)
    _u = np.array(u)
    _r = np.array(repeat)
    bases = [('a', _a), ('c', _c), ('g', _g), ('t', _t), ('u', _u), ('r', _r)]
    for base_name, base_encoding in bases:
        validate_base_encoding(base_name, base_encoding)

    # Every ambiguity code is the element-wise sum of the bases it stands
    # for; wherever thymine is allowed uracil is allowed as well.
    e = np.zeros((6, ), dtype=np.int64) # '-'
    r = _a + _g
    y = _c + _t + _u
    k = _g + _t + _u
    m = _a + _c
    s = _c + _g
    w = _a + _t + _u
    b = _c + _g + _t + _u
    d = _a + _g + _t + _u
    h = _a + _c + _t + _u
    v = _a + _c + _g
    n = _a + _c + _g + _t + _u

    # Second half of the table: the same codes with the repeat channel set.
    E = e + _r # '_'
    A = _a + _r
    C = _c + _r
    G = _g + _r
    T = _t + _r
    U = _u + _r
    R = r + _r
    Y = y + _r
    K = k + _r
    M = m + _r
    S = s + _r
    W = w + _r
    B = b + _r
    D = d + _r
    H = h + _r
    V = v + _r
    N = n + _r

    encodex_matrix = np.array([
        # 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,
        e, _a,_c,_g,_t,_u, r, y, k, m, s, w, b, d, h, v, n,
        #17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33
        E, A, C, G, T, U, R, Y, K, M, S, W, B, D, H, V, N,
    ])
    return encodex_matrix[list(encodex.values())]


def invert_encodex(encodex:dict) -> dict:
    '''
    Arguments:
        encodex (dict): maps fasta characters to integers.
    Returns:
        decodex (dict): maps each integer back to its character. If several
            characters share an integer, the last one in `encodex` wins. An
            empty `encodex` gives an empty dict.
    '''
    return {index: char for char, index in encodex.items()}


def find_row(matrix:list, row:list) -> int:
    '''
    Arguments:
        matrix (array-like): two dimensional table to search.
        row (array-like): the row to look for.
    Returns:
        index (int): index of the first row of `matrix` equal to `row`.
    Raises:
        IndexError: if no row of `matrix` equals `row`.
    '''
    matches = np.where(np.all(matrix == row, axis=1))[0]
    if matches.size == 0:
        raise IndexError('row {} not found in matrix'.format(list(row)))
    return matches[0]


def _char_to_int(char:str, encodex:dict) -> int:
    '''Look up the integer for `char`; raises KeyError if it is unknown.'''
    return encodex[char]


def _sequence_to_indices(sequence:str, encodex:dict) -> list:
    '''Convert every character of `sequence` to its integer in `encodex`.'''
    return [_char_to_int(char, encodex) for char in sequence]


def lookup_channel_index(
    channel:str,
    encodex:dict,
    encodex_matrix:list
) -> int:
    '''
    Arguments:
        channel (str): the character whose channel is wanted, e.g. 'a'.
        encodex (dict): maps fasta characters to rows of `encodex_matrix`.
        encodex_matrix (array-like): channel vectors, one row per character.
    Returns:
        index (int): position of the largest entry in the row of `channel`,
            i.e. the column that is hot for a one-hot character.
    '''
    return np.argmax(encodex_matrix[_char_to_int(channel, encodex)])


def get_channel_indices(
    encodex: dict,
    encodex_matrix: list,
    include_uracil: bool = INCLUDE_URACIL,
    include_repeat: bool = INCLUDE_REPEAT
) -> list:
    '''
    Arguments:
        encodex (dict): maps fasta characters to rows of `encodex_matrix`.
        encodex_matrix (array-like): channel vectors, one row per character.
        include_uracil (bool): keep the uracil channel.
        include_repeat (bool): keep the repeat channel.
    Returns:
        indices (list): the columns of `encodex_matrix` to keep, ordered
            a, c, g, t, then uracil and repeat when requested.
    '''
    # drop uracil or repeat channel as needed
    channels = 'acgt'
    if include_uracil: channels += 'u'
    # '_' is the character whose only hot channel is the repeat channel
    if include_repeat: channels += '_'
    return [
        lookup_channel_index(channel, encodex, encodex_matrix)
        for channel in channels
    ]


def encode(
    sequence:str,
    encodex: dict,
    encodex_matrix: list = None,
    include_uracil: bool = INCLUDE_URACIL,
    include_repeat: bool = INCLUDE_REPEAT
):
    '''
    Arguments:
        sequence (str): the fasta characters to encode.
        encodex (dict): maps fasta characters to rows of `encodex_matrix`.
        encodex_matrix (array-like): channel vectors, one row per character.
            Built from `encodex` when None.
        include_uracil (bool): keep the uracil channel.
        include_repeat (bool): keep the repeat channel.
    Returns:
        encoded (list): one list of channel values per character; an empty
            list for an empty sequence.
    Raises:
        KeyError: if `sequence` holds a character missing from `encodex`.
    '''
    # convert from string to integers; the explicit integer dtype keeps an
    # empty sequence usable as an index (an empty array is float by default)
    indices = np.array(_sequence_to_indices(sequence, encodex), dtype=np.intp)
    if encodex_matrix is None:
        encodex_matrix = make_encodex_matrix(encodex=encodex)
    # accept plain nested lists as well as arrays
    encodex_matrix = np.asarray(encodex_matrix)
    # extract the rows from the encoding matrix in order as they appear in seq
    results = encodex_matrix[indices]
    channel_indices = get_channel_indices(
        encodex, encodex_matrix, include_uracil, include_repeat
    )
    # filter results
    return results[:, channel_indices].tolist()


def decode(
    encoded:list,
    encodex: dict,
    encodex_matrix: list = None,
    include_uracil: bool = INCLUDE_URACIL,
    include_repeat: bool = INCLUDE_REPEAT,
    decodex: dict = None
):
    '''
    Arguments:
        encoded (list): channel vectors as returned by `encode` with the same
            `include_uracil` / `include_repeat` settings.
        encodex (dict): maps fasta characters to rows of `encodex_matrix`.
        encodex_matrix (array-like): channel vectors, one row per character.
            Built from `encodex` when None.
        include_uracil (bool): whether the uracil channel is present.
        include_repeat (bool): whether the repeat channel is present.
        decodex (dict): maps row indices to fasta characters. Built from
            `encodex` when None.
    Returns:
        sequence (str): the decoded characters. Characters that only differ in
            a dropped channel share a row, in which case the first matching
            row is used, so the round trip is only exact when every channel
            that distinguishes the characters is kept.
    Raises:
        IndexError: if a vector of `encoded` matches no row.
    '''

    if encodex_matrix is None:
        encodex_matrix = make_encodex_matrix(encodex=encodex)
    if decodex is None:
        decodex = invert_encodex(encodex)
    # accept plain nested lists as well as arrays
    encodex_matrix = np.asarray(encodex_matrix)

    channel_indices = get_channel_indices(
        encodex, encodex_matrix, include_uracil, include_repeat
    )
    filtered_matrix = encodex_matrix[:, channel_indices]
    keys = [find_row(filtered_matrix, row) for row in encoded]
    return ''.join(decodex[key] for key in keys)
''' - _range = range(len(sequence)) - if self.processes == 1: - return [self.encode_index(offset+i, ranges) for i in _range] - else: - processes = self.processes_to_use(len(sequence)) - with Pool(processes=processes) as pool: - return pool.starmap(self.encode_index, [(offset+i, ranges) for i in _range]) + if isinstance(ranges, LabeledRanges): + ranges = ranges.as_list() + return label_range(offset, offset+len(sequence), ranges, self.label_order, self.use_other_class) def label(self, sequence:list, reference_labels:dict): ''' @@ -98,38 +68,7 @@ class Labeler: ''' chromosome, start, stop, name, score, strand, *_ = sequence reference_ranges = reference_labels[chromosome][strand] - - if self.processes == 1: - res = [] - for _range in reference_ranges: - if (self._keep_range(start, stop, _range)) is not None: - res.append(_range) - else: - processes = self.processes_to_use(len(sequence)) - with Pool(processes=processes) as pool: - res = pool.starmap(self._keep_range, [(start, stop, _range) for _range in reference_ranges]) - res = list(filter(lambda e: e is not None, res)) - return LabeledRanges(res) - - - def _keep_range(self, start, stop, _range): - ''' - Arguments: - start (int): the start of sequence under consideration - stop (int): the stop of sequence under consideration - _range (list / LabeledRange): a labeled range - Returns: - (None / Range): None is returned in _range is not contained inside - the range `[start, stop]`, else _range is returned - ''' - _class, range_start, range_stop = _range - if range_stop < start: return - if range_start > stop: return - if not ( - start <= range_start <= stop or \ - start <= range_stop <= stop or \ - range_start <= start <= range_stop or \ - range_start <= stop <= range_stop - ): - return - return _range + if isinstance(reference_ranges, LabeledRanges): + reference_ranges = reference_ranges.as_list() + result = relevant_labels(start, stop, reference_ranges, self.label_order) + return LabeledRanges(result) diff --git 
a/ntai/ranges/__init__.py b/ntai/ranges/__init__.py deleted file mode 100644 index c51df843fdb3b23dfb0e838495f732cca9102aee..0000000000000000000000000000000000000000 --- a/ntai/ranges/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .labeled_ranges import LabeledRange, LabeledRanges diff --git a/ntai/ranges/labeled_ranges.py b/ntai/ranges/labeled_ranges.py deleted file mode 100644 index d50e66328d4964a8d26e50489fa2ede2982573db..0000000000000000000000000000000000000000 --- a/ntai/ranges/labeled_ranges.py +++ /dev/null @@ -1,240 +0,0 @@ -from numbers import Number -from copy import copy, deepcopy - -class LabeledRange: - ''' - A helper class for keeping track of the start / stop of a given class in a - sequence - ''' - - def __init__(self, name:str, start:int, stop:int): - ''' - Arguments: - name (str): name of the class - start (int): the index at which the class starts - stop (int): the index at which the class stops - ''' - self.name = name - self.start = int(start) - self.stop = int(stop) - - - ''' - Various conversions from LabeledRange to pythonic types - ''' - def as_list(self): - return [self.name, self.start, self.stop] - def as_str_list(self): - return [str(e) for e in self.as_list()] - def as_tuple(self): - return tuple(self.as_list()) - def as_dict(self): - return dict(zip(['name', 'start', 'stop'], self.as_list())) - def as_txt(self, delim='\t', newline='\n', newline_q=True): - return delim.join(self.as_str_list()) + (newline if newline_q else '') - def as_csv(self, newline='\n', newline_q=True): - return self.as_txt(',', newline, newline_q) - def as_tsv(self, newline='\n', newline_q=True): - return self.as_txt('\t', newline, newline_q) - def __hash__(self): - return hash(self.as_tuple()) - def __repr__(self): - return '{}{}'.format(self.__class__.__name__, self.as_tuple()) - def __str__(self): - return self.__repr__() - def __len__(self): - return self.stop - self.start - def __iter__(self): - return (e for e in self.as_list()) - def __eq__(self, other): - 
if not isinstance(other, LabeledRange): - return False - return (self.name == other.name) and \ - (self.start == other.start) and \ - (self.stop == other.stop) - - def __ne__(self, other): - return not self.__eq__(other) - - def __contains__(self, other): - ''' - Arguments: - other (LabeledRange / int): If other is a LabeledRange, only true - if other is bounded by self. If other is a number, true if - self.start <= other <= self.stop - Returns: - results (bool) - ''' - if isinstance(other, Number): - return self.start <= other <= self.stop - if not isinstance(other, LabeledRange): - return False - if not other.same_q(self): - return False - return other.start in self and other.stop in self - - - def same_q(self, other): - '''Whether or not other is of the same class''' - if not isinstance(other, LabeledRange): - return False - return self.name == other.name - - def min(self, other): - return min([self.start, self.stop, other.start, other.stop]) - - def max(self, other): - return max([self.start, self.stop, other.start, other.stop]) - - def overlap_q(self, other): - if not self.same_q(other): - return False - return any([ - other.start in self, other.stop in self, - self.start in other, self.stop in other - ]) - - def __add__(self, other): - if not isinstance(other, LabeledRange): - raise ValueError('{} is not a LabeledRange'.format(other)) - if not self.overlap_q(other): - return LabeledRanges([deepcopy(self), deepcopy(other)]) - else: - return LabeledRange(self.name, self.min(other), self.max(other)) - - def __iadd__(self, other): - if self.overlap_q(other): - self.start = self.min(other) - self.stop = self.max(other) - return self - - -class LabeledRanges: - def __init__(self, ranges:list=[]): - self.ranges = ranges - - def classes(self): - return set([rng.name for rng in self]) - def as_list(self): - return [rng.as_list() for rng in self] - def as_tuple(self): - return tuple([rng.as_tuple() for rng in self]) - - - @property - def ranges(self): - return 
self._ranges - - @ranges.setter - def ranges(self, ranges): - rngs = [] - for rng in ranges: - if isinstance(rng, LabeledRange): - rngs.append(rng) - else: - rngs.append(LabeledRange(*rng)) - self._ranges = list(set(rngs)) - - @ranges.deleter - def ranges(self): - del self._ranges - - - def __iter__(self): - return (rng for rng in self.ranges) - - def __getitem__(self, key): - return self.ranges[key] - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = '{}('.format(self.__class__.__name__) - if len(self.ranges) == 0: - return s + ')' - else: - s += '\n' - for i, rng in enumerate(self.ranges): - s += '\t' + repr(rng) + '\n' - s += ')' - return s - - - - def __eq__(self, other): - if isinstance(other, LabeledRanges): - return all([rng in other for rng in self.ranges]) and \ - all([rng in self for rng in other.ranges]) - return False - - def __ne__(self, other): - return not self.__eq__(other) - - - def __contains__(self, other): - if isinstance(other, str): - return any([rng.name == other for rng in self]) - - if isinstance(other, LabeledRange): - return any([rng == other for rng in self]) - - if isinstance(other, LabeledRanges): - return all([self.__contains__(rng) for rng in other]) - - return False - - def overlap_q(self, other): - return any([rng.overlap_q(other) for rng in self.ranges]) - - def append(self, other): - - # Append a range - if isinstance(other, LabeledRange): - found_q = False - for rng in self: - if rng.overlap_q(other): - found_q = True - rng += other - if not found_q: - self.ranges.append(other) - - # Map each range to the above block - if isinstance(other, LabeledRanges): - for rng in other: - self.append(other) - - return self - - - def __give__(self, other): - if isinstance(other, LabeledRange): - self.append(other) - - if isinstance(other, LabeledRanges): - for rng in other: - self.append(rng) - - return self.simplify() - - def simplify(self): - for rng in self: - self.append(rng) - self.ranges = 
list(set(self.ranges)) - return self - - def __add__(self, other): - cp = deepcopy(self) - cp.__give__(other) - return cp - - def __iadd__(self, other): - self.__give__(other) - return self - - def __radd__(self, other): - cp = deepcopy(self) - if not isinstance(other, LabeledRange): - return cp - cp.__iadd__(other) - return cp