From 193021283172b8be785fcad03cdd380a845a8a24 Mon Sep 17 00:00:00 2001 From: Sumner Date: Tue, 2 Jul 2019 10:50:16 +0200 Subject: [PATCH 1/6] remove labeld_ranges from ntai and now use lrng package --- ntai/labeler/labeler.py | 3 +- ntai/ranges/__init__.py | 1 - ntai/ranges/labeled_ranges.py | 240 ---------------------------------- 3 files changed, 2 insertions(+), 242 deletions(-) delete mode 100644 ntai/ranges/__init__.py delete mode 100644 ntai/ranges/labeled_ranges.py diff --git a/ntai/labeler/labeler.py b/ntai/labeler/labeler.py index b1ee3ca..4445bbd 100644 --- a/ntai/labeler/labeler.py +++ b/ntai/labeler/labeler.py @@ -1,7 +1,8 @@ import os from multiprocessing import Pool from .defaults import (LABEL_ORDER, USE_OTHER_CLASS, OTHER_CLASS) -from ntai.ranges import LabeledRange, LabeledRanges +# from ntai.ranges import LabeledRange, LabeledRanges +from lrng import LabeledRange, LabeledRanges class Labeler: def __init__( self, diff --git a/ntai/ranges/__init__.py b/ntai/ranges/__init__.py deleted file mode 100644 index c51df84..0000000 --- a/ntai/ranges/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .labeled_ranges import LabeledRange, LabeledRanges diff --git a/ntai/ranges/labeled_ranges.py b/ntai/ranges/labeled_ranges.py deleted file mode 100644 index d50e663..0000000 --- a/ntai/ranges/labeled_ranges.py +++ /dev/null @@ -1,240 +0,0 @@ -from numbers import Number -from copy import copy, deepcopy - -class LabeledRange: - ''' - A helper class for keeping track of the start / stop of a given class in a - sequence - ''' - - def __init__(self, name:str, start:int, stop:int): - ''' - Arguments: - name (str): name of the class - start (int): the index at which the class starts - stop (int): the index at which the class stops - ''' - self.name = name - self.start = int(start) - self.stop = int(stop) - - - ''' - Various conversions from LabeledRange to pythonic types - ''' - def as_list(self): - return [self.name, self.start, self.stop] - def as_str_list(self): - return [str(e) for e in self.as_list()] - def as_tuple(self): - return tuple(self.as_list()) - def as_dict(self): - return dict(zip(['name', 'start', 'stop'], self.as_list())) - def as_txt(self, delim='\t', newline='\n', newline_q=True): - return delim.join(self.as_str_list()) + (newline if newline_q else '') - def as_csv(self, newline='\n', newline_q=True): - return self.as_txt(',', newline, newline_q) - def as_tsv(self, newline='\n', newline_q=True): - return self.as_txt('\t', newline, newline_q) - def __hash__(self): - return hash(self.as_tuple()) - def __repr__(self): - return '{}{}'.format(self.__class__.__name__, self.as_tuple()) - def __str__(self): - return self.__repr__() - def __len__(self): - return self.stop - self.start - def __iter__(self): - return (e for e in self.as_list()) - def __eq__(self, other): - if not isinstance(other, LabeledRange): - return False - return (self.name == other.name) and \ - (self.start == other.start) and \ - (self.stop == other.stop) - - def __ne__(self, other): - return not self.__eq__(other) - - def __contains__(self, other): - ''' - Arguments: - other (LabeledRange / int): If other is a LabeledRange, only true - if other is bounded by self. 
If other is a number, true if - self.start <= other <= self.stop - Returns: - results (bool) - ''' - if isinstance(other, Number): - return self.start <= other <= self.stop - if not isinstance(other, LabeledRange): - return False - if not other.same_q(self): - return False - return other.start in self and other.stop in self - - - def same_q(self, other): - '''Whether or not other is of the same class''' - if not isinstance(other, LabeledRange): - return False - return self.name == other.name - - def min(self, other): - return min([self.start, self.stop, other.start, other.stop]) - - def max(self, other): - return max([self.start, self.stop, other.start, other.stop]) - - def overlap_q(self, other): - if not self.same_q(other): - return False - return any([ - other.start in self, other.stop in self, - self.start in other, self.stop in other - ]) - - def __add__(self, other): - if not isinstance(other, LabeledRange): - raise ValueError('{} is not a LabeledRange'.format(other)) - if not self.overlap_q(other): - return LabeledRanges([deepcopy(self), deepcopy(other)]) - else: - return LabeledRange(self.name, self.min(other), self.max(other)) - - def __iadd__(self, other): - if self.overlap_q(other): - self.start = self.min(other) - self.stop = self.max(other) - return self - - -class LabeledRanges: - def __init__(self, ranges:list=[]): - self.ranges = ranges - - def classes(self): - return set([rng.name for rng in self]) - def as_list(self): - return [rng.as_list() for rng in self] - def as_tuple(self): - return tuple([rng.as_tuple() for rng in self]) - - - @property - def ranges(self): - return self._ranges - - @ranges.setter - def ranges(self, ranges): - rngs = [] - for rng in ranges: - if isinstance(rng, LabeledRange): - rngs.append(rng) - else: - rngs.append(LabeledRange(*rng)) - self._ranges = list(set(rngs)) - - @ranges.deleter - def ranges(self): - del self._ranges - - - def __iter__(self): - return (rng for rng in self.ranges) - - def __getitem__(self, key): - return self.ranges[key] - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = '{}('.format(self.__class__.__name__) - if len(self.ranges) == 0: - return s + ')' - else: - s += '\n' - for i, rng in enumerate(self.ranges): - s += '\t' + repr(rng) + '\n' - s += ')' - return s - - - - def __eq__(self, other): - if isinstance(other, LabeledRanges): - return all([rng in other for rng in self.ranges]) and \ - all([rng in self for rng in other.ranges]) - return False - - def __ne__(self, other): - return not self.__eq__(other) - - - def __contains__(self, other): - if isinstance(other, str): - return any([rng.name == other for rng in self]) - - if isinstance(other, LabeledRange): - return any([rng == other for rng in self]) - - if isinstance(other, LabeledRanges): - return all([self.__contains__(rng) for rng in other]) - - return False - - def overlap_q(self, other): - return any([rng.overlap_q(other) for rng in self.ranges]) - - def append(self, other): - - # Append a range - if isinstance(other, LabeledRange): - found_q = False - for rng in self: - if rng.overlap_q(other): - found_q = True - rng += other - if not found_q: - self.ranges.append(other) - - # Map each range to the above block - if isinstance(other, LabeledRanges): - for rng in other: - self.append(other) - - return self - - - def __give__(self, other): - if isinstance(other, LabeledRange): - self.append(other) - - if isinstance(other, LabeledRanges): - for rng in other: - self.append(rng) - - return self.simplify() - - def simplify(self): - for rng in 
self: - self.append(rng) - self.ranges = list(set(self.ranges)) - return self - - def __add__(self, other): - cp = deepcopy(self) - cp.__give__(other) - return cp - - def __iadd__(self, other): - self.__give__(other) - return self - - def __radd__(self, other): - cp = deepcopy(self) - if not isinstance(other, LabeledRange): - return cp - cp.__iadd__(other) - return cp -- GitLab From 287fab9642bc0bd8923c6f03e4e0146b81884997 Mon Sep 17 00:00:00 2001 From: Sumner Date: Tue, 2 Jul 2019 11:25:54 +0200 Subject: [PATCH 2/6] labeler now uses numba functions from lrng --- jupyter/LabeledRanges.ipynb | 240 +++++++++++++++++++++++++++++++++++- ntai/__init__.py | 2 +- ntai/labeler/labeler.py | 45 ++++--- 3 files changed, 261 insertions(+), 26 deletions(-) diff --git a/jupyter/LabeledRanges.ipynb b/jupyter/LabeledRanges.ipynb index 797089e..7082b55 100644 --- a/jupyter/LabeledRanges.ipynb +++ b/jupyter/LabeledRanges.ipynb @@ -7,7 +7,8 @@ "outputs": [], "source": [ "from ntai import Labeler\n", - "from ntai.ranges.labeled_ranges import LabeledRange, LabeledRanges" + "# from ntai.ranges.labeled_ranges import LabeledRange, LabeledRanges\n", + "from lrng import LabeledRange, LabeledRanges" ] }, { @@ -27,7 +28,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 3, @@ -50,7 +51,228 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [0, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 1, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 
0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [1, 0, 0],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1],\n", + " [0, 0, 1]])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seq=[i for i in range(300-100)]\n", + "enc = l.encode(seq, crngs, 100)\n", + "enc" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -258,14 +480,13 @@ " [0, 0, 1]]" ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "seq=[i for i in range(300-100)]\n", - "l.encode(seq, crngs, 100)" + "enc.tolist()" ] }, { @@ -303,6 +524,13 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, diff --git a/ntai/__init__.py b/ntai/__init__.py index 65802e6..dc996c1 100644 --- a/ntai/__init__.py +++ b/ntai/__init__.py @@ -2,7 +2,7 @@ from .codex import Codex from .bedtools import bedtools from .fetch import fetch_files from .labeler import Labeler -from .ranges import LabeledRange, LabeledRanges +# from .ranges import LabeledRange, LabeledRanges name = 'ntai' version = '0.0.8' diff --git a/ntai/labeler/labeler.py b/ntai/labeler/labeler.py index 4445bbd..ee994c7 100644 --- a/ntai/labeler/labeler.py +++ b/ntai/labeler/labeler.py @@ -3,6 +3,7 @@ from multiprocessing import Pool from .defaults import (LABEL_ORDER, USE_OTHER_CLASS, OTHER_CLASS) # from ntai.ranges import LabeledRange, LabeledRanges from lrng import LabeledRange, LabeledRanges +from lrng.numba import coalesce, label_range, relevant_labels class Labeler: def __init__( self, @@ -68,13 +69,16 @@ class Labeler: Returns: embedding (list): the embedded sequence. 
''' - _range = range(len(sequence)) - if self.processes == 1: - return [self.encode_index(offset+i, ranges) for i in _range] - else: - processes = self.processes_to_use(len(sequence)) - with Pool(processes=processes) as pool: - return pool.starmap(self.encode_index, [(offset+i, ranges) for i in _range]) + if isinstance(ranges, LabeledRanges): + ranges = ranges.as_list() + return label_range(offset, offset+len(sequence), ranges, self.label_order, self.use_other_class) + # _range = range(len(sequence)) + # if self.processes == 1: + # return [self.encode_index(offset+i, ranges) for i in _range] + # else: + # processes = self.processes_to_use(len(sequence)) + # with Pool(processes=processes) as pool: + # return pool.starmap(self.encode_index, [(offset+i, ranges) for i in _range]) def label(self, sequence:list, reference_labels:dict): ''' @@ -99,18 +103,21 @@ class Labeler: ''' chromosome, start, stop, name, score, strand, *_ = sequence reference_ranges = reference_labels[chromosome][strand] - - if self.processes == 1: - res = [] - for _range in reference_ranges: - if (self._keep_range(start, stop, _range)) is not None: - res.append(_range) - else: - processes = self.processes_to_use(len(sequence)) - with Pool(processes=processes) as pool: - res = pool.starmap(self._keep_range, [(start, stop, _range) for _range in reference_ranges]) - res = list(filter(lambda e: e is not None, res)) - return LabeledRanges(res) + if isinstance(reference_ranges, LabeledRanges): + reference_ranges = reference_ranges.as_list() + result = relevant_labels(start, stop, reference_ranges, self.label_order) + return LabeledRanges(result) + # if self.processes == 1: + # res = [] + # for _range in reference_ranges: + # if (self._keep_range(start, stop, _range)) is not None: + # res.append(_range) + # else: + # processes = self.processes_to_use(len(sequence)) + # with Pool(processes=processes) as pool: + # res = pool.starmap(self._keep_range, [(start, stop, _range) for _range in reference_ranges]) + # res = list(filter(lambda e: e is not None, res)) + # return LabeledRanges(res) def _keep_range(self, start, stop, _range): -- GitLab From 9b1a004474c50805c4c8eccb4038a27be48ff594 Mon Sep 17 00:00:00 2001 From: Sumner Date: Tue, 2 Jul 2019 13:11:40 +0200 Subject: [PATCH 3/6] removed code from Labeler that utilized numba functions from lrng --- ntai/labeler/labeler.py | 41 ----------------------------------------- 1 file changed, 41 deletions(-) diff --git a/ntai/labeler/labeler.py b/ntai/labeler/labeler.py index ee994c7..a53a2ad 100644 --- a/ntai/labeler/labeler.py +++ b/ntai/labeler/labeler.py @@ -72,13 +72,6 @@ class Labeler: if isinstance(ranges, LabeledRanges): ranges = ranges.as_list() return label_range(offset, offset+len(sequence), ranges, self.label_order, self.use_other_class) - # _range = range(len(sequence)) - # if self.processes == 1: - # return [self.encode_index(offset+i, ranges) for i in _range] - # else: - # processes = self.processes_to_use(len(sequence)) - # with Pool(processes=processes) as pool: - # return pool.starmap(self.encode_index, [(offset+i, ranges) for i in _range]) def label(self, sequence:list, reference_labels:dict): ''' @@ -107,37 +100,3 @@ class Labeler: reference_ranges = reference_ranges.as_list() result = relevant_labels(start, stop, reference_ranges, self.label_order) return LabeledRanges(result) - # if self.processes == 1: - # res = [] - # for _range in reference_ranges: - # if (self._keep_range(start, stop, _range)) is not None: - # res.append(_range) - # else: - # processes = 
self.processes_to_use(len(sequence)) - # with Pool(processes=processes) as pool: - # res = pool.starmap(self._keep_range, [(start, stop, _range) for _range in reference_ranges]) - # res = list(filter(lambda e: e is not None, res)) - # return LabeledRanges(res) - - - def _keep_range(self, start, stop, _range): - ''' - Arguments: - start (int): the start of sequence under consideration - stop (int): the stop of sequence under consideration - _range (list / LabeledRange): a labeled range - Returns: - (None / Range): None is returned in _range is not contained inside - the range `[start, stop]`, else _range is returned - ''' - _class, range_start, range_stop = _range - if range_stop < start: return - if range_start > stop: return - if not ( - start <= range_start <= stop or \ - start <= range_stop <= stop or \ - range_start <= start <= range_stop or \ - range_start <= stop <= range_stop - ): - return - return _range -- GitLab From 04beea09bbbe6637c5028638ff7f3de6378b0a90 Mon Sep 17 00:00:00 2001 From: Sumner Date: Tue, 2 Jul 2019 13:12:33 +0200 Subject: [PATCH 4/6] removed code from Labeler that utilized numba functions from lrng --- ntai/labeler/labeler.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/ntai/labeler/labeler.py b/ntai/labeler/labeler.py index a53a2ad..006f877 100644 --- a/ntai/labeler/labeler.py +++ b/ntai/labeler/labeler.py @@ -31,34 +31,6 @@ class Labeler: label_order += [other_class] self.label_order = label_order - def processes_to_use(self, n): - ''' - Only intialize at most self.processes, but less if less is needed - - Arguments: - n (int): number of things to process - Returns: - number of processes to use - ''' - return min(n, self.processes) - - def encode_index(self, index:int, ranges) -> list: - ''' - Arguments: - index (int): the index to be encoded: - ranges (LabeledRanges): the class ranges to reference when making - the embedding for the index. - Returns: - encoded (list): the index encoded. 
- ''' - encoded = [0 for label in self.label_order] - for _range in ranges: - if index in _range: - encoded[self.label_order.index(_range.name)] = 1 - if 1 not in encoded and self.use_other_class: - encoded[self.label_order.index(self.other_class)] = 1 - return encoded - def encode(self, sequence:str, ranges, offset:int=0) -> list: ''' Arguments: -- GitLab From ec76390306298863f8986346fb8ce15958a876c1 Mon Sep 17 00:00:00 2001 From: Sumner Date: Tue, 2 Jul 2019 15:45:49 +0200 Subject: [PATCH 5/6] numpy version for encoding fastas --- jupyter/Codex Numpy.ipynb | 205 ++++++++++++++++++++++++++++++++++++++ ntai/codex/numpy.py | 141 ++++++++++++++++++++++++++ 2 files changed, 346 insertions(+) create mode 100644 jupyter/Codex Numpy.ipynb create mode 100644 ntai/codex/numpy.py diff --git a/jupyter/Codex Numpy.ipynb b/jupyter/Codex Numpy.ipynb new file mode 100644 index 0000000..faa1d77 --- /dev/null +++ b/jupyter/Codex Numpy.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import os, sys, numpy as np\n", + "from ntai.codex.numpy import (\n", + " NUMPY_FASTA_CODEX, NUMPY_FASTA_CODEX_MATRIX, \n", + " encode, lookup_channel_index, _char_to_int, get_channel_indices\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "seq = 'actgrykmswbdhvn-ACTGRYKMSWBDHVN'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "uracil = False\n", + "repeat = False\n", + "enc = encode(seq, NUMPY_FASTA_CODEX, NUMPY_FASTA_CODEX_MATRIX, uracil, repeat)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "c = get_channel_indices(NUMPY_FASTA_CODEX, NUMPY_FASTA_CODEX_MATRIX, uracil, repeat)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "f = NUMPY_FASTA_CODEX_MATRIX[:, c]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1, 0, 0, 0]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "enc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0, 0],\n", + " [ 0, 1],\n", + " [ 0, 2],\n", + " [ 0, 3],\n", + " [ 1, 2],\n", + " [ 1, 3],\n", + " [ 2, 1],\n", + " [ 2, 3],\n", + " [ 3, 1],\n", + " [ 3, 2],\n", + " [ 4, 1],\n", + " [ 4, 2],\n", + " [ 4, 3],\n", + " [ 5, 0],\n", + " [ 5, 1],\n", + " [ 5, 3],\n", + " [ 6, 2],\n", + " [ 7, 1],\n", + " [ 8, 0],\n", + " [ 8, 2],\n", + " [ 8, 3],\n", + " [ 9, 3],\n", + " [10, 0],\n", + " [10, 1],\n", + " [10, 2],\n", + " [12, 0],\n", + " [12, 1],\n", + " [13, 0],\n", + " [13, 2],\n", + " [14, 0],\n", + " [14, 3],\n", + " [15, 0],\n", + " [16, 1],\n", + " [16, 2],\n", + " [16, 3],\n", + " [17, 0],\n", + " [17, 1],\n", + " [17, 2],\n", + " [17, 3],\n", + " [18, 2],\n", + " [18, 3],\n", + " [19, 1],\n", + " [19, 3],\n", + " [20, 1],\n", + " [20, 2],\n", + " [21, 1],\n", + " [21, 2],\n", + " [21, 3],\n", + " [22, 0],\n", + " [22, 1],\n", + " [22, 3],\n", + " [23, 2],\n", + " [24, 1],\n", + " [25, 0],\n", + " [25, 2],\n", + " [25, 3],\n", + " [26, 3],\n", + " [27, 0],\n", + " [27, 1],\n", + " [27, 2],\n", + " [29, 0],\n", + " [29, 1],\n", + " [30, 0],\n", + " [30, 2],\n", + " [31, 0],\n", + " [31, 3],\n", + " [32, 0],\n", + 
" [33, 1],\n", + " [33, 2],\n", + " [33, 3]])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.argwhere(f == enc[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ntai/codex/numpy.py b/ntai/codex/numpy.py new file mode 100644 index 0000000..a4aeedb --- /dev/null +++ b/ntai/codex/numpy.py @@ -0,0 +1,141 @@ +import numpy as np + +NUMPY_FASTA_CODEX = { + 'a': 0, + 'c': 1, + 'g': 2, + 't': 3, + 'u': 4, + 'r': 5, + 'y': 6, + 'k': 7, + 'm': 8, + 's': 9, + 'w': 10, + 'b': 11, + 'd': 12, + 'h': 13, + 'v': 14, + 'n': 15, + '-': 16, + 'A': 17, + 'C': 18, + 'G': 19, + 'T': 20, + 'U': 21, + 'R': 22, + 'Y': 23, + 'K': 24, + 'M': 25, + 'S': 26, + 'W': 27, + 'B': 28, + 'D': 29, + 'H': 30, + 'V': 31, + 'N': 32, + '_': 33 +} + +def make_codex_matrix( + a:list = [1,0,0,0,0,0], + c:list = [0,1,0,0,0,0], + g:list = [0,0,1,0,0,0], + t:list = [0,0,0,1,0,0], + u:list = [0,0,0,0,1,0], + repeat:list = [0,0,0,0,0,1], + codex:dict = NUMPY_FASTA_CODEX +): + _a = np.array(a) + _c = np.array(c) + _g = np.array(g) + _t = np.array(t) + _u = np.array(u) + _r = np.array(repeat) + encodings = [('a', _a),('c', _c),('g', _g,),('t', _t,),('u', _u,),('r', _r)] + for c, encoding in encodings: + if np.cumsum(encoding)[-1] != 1: + msg = 'base encoding for {} should be one hot!'.format(c) + raise ValueError(msg) + if encoding.size != 6: + msg = 'base encoding for {} should be of shape (6, )!'.format(c) + raise ValueError(msg) + r = _a + g + y = _c + _t + _u + k = _g + _t + _u + m = _a + _c + s = _c + _g + w = _a + _t + _u + b = _c + _g + _t + _u + d = _a + _g + _t + _u + h = _a + _c + _t + _u + v = _a + _c + _g + n = _a + _c + _g + _t + _u + e = np.zeros((6, ), dtype=np.int64) # '-' + A = _a + _r + C = _c + _r + G = _g + _r + T = _t + _r + U = _u + _r + R = r + _r + Y = y + _r + K = k + _r + M = m + _r + S = s + _r + W = w + _r + B = b + _r + D = d + _r + H = h + _r + V = v + _r + N = n + _r + E = e + _r + + + codex_matrix = np.array([ + _a,_c,_g,_t,_u, r, y, k, m, s, w, b, d, h, v, n, e, + A, C, G, T, U, R, Y, K, M, S, W, B, D, H, V, N, E + ]) + return codex_matrix[list(codex.values())] + +NUMPY_FASTA_CODEX_MATRIX = make_codex_matrix() + + +def _char_to_int(char:str, codex=NUMPY_FASTA_CODEX): + return codex[char] + +def _sequence_to_indices(sequence:str, codex=NUMPY_FASTA_CODEX): + fn = lambda c: _char_to_int(c, codex) + return list(map(fn, sequence)) + +def lookup_channel_index(channel, codex=NUMPY_FASTA_CODEX, codex_matrix=NUMPY_FASTA_CODEX_MATRIX): + return np.argmax(codex_matrix[_char_to_int(channel, codex)]) + +def get_channel_indices( + codex: dict = NUMPY_FASTA_CODEX, + codex_matrix: list = NUMPY_FASTA_CODEX_MATRIX, + include_uracil: bool = False, + include_repeat: 
bool = True +): + # drop uracil or repeat channel as needed + channels = 'acgt' + if include_uracil: channels += 'u' + if include_repeat: channels += '_' + fn = lambda c: lookup_channel_index(c, codex, codex_matrix) + return list(map(fn, channels)) + + + +def encode( + sequence:str, + codex: dict = NUMPY_FASTA_CODEX, + codex_matrix: list = NUMPY_FASTA_CODEX_MATRIX, + include_uracil: bool = False, + include_repeat: bool = True +): + # convert from string to integers + indices = np.array(_sequence_to_indices(sequence, codex)) + # extract the rows from the encoding matrix in order as they appear in seq + results = codex_matrix[indices] + channel_indices = get_channel_indices(codex, codex_matrix, include_uracil, include_repeat) + # filter results + return results[:, channel_indices].tolist() -- GitLab From 71b2008d491ed1c9cebd141387f9afc41e8f6996 Mon Sep 17 00:00:00 2001 From: Sumner Date: Wed, 3 Jul 2019 11:07:17 +0200 Subject: [PATCH 6/6] codex numpy-ified --- jupyter/Codex Numpy.ipynb | 139 ++++---------------------------- ntai/codex/__init__.py | 1 + ntai/codex/numpy.py | 141 -------------------------------- ntai/codex/numpy/__init__.py | 22 +++++ ntai/codex/numpy/codex.py | 83 +++++++++++++++++++ ntai/codex/numpy/defaults.py | 48 +++++++++++ ntai/codex/numpy/utils.py | 151 +++++++++++++++++++++++++++++++++++ 7 files changed, 321 insertions(+), 264 deletions(-) delete mode 100644 ntai/codex/numpy.py create mode 100644 ntai/codex/numpy/__init__.py create mode 100644 ntai/codex/numpy/codex.py create mode 100644 ntai/codex/numpy/defaults.py create mode 100644 ntai/codex/numpy/utils.py diff --git a/jupyter/Codex Numpy.ipynb b/jupyter/Codex Numpy.ipynb index faa1d77..e7b53ce 100644 --- a/jupyter/Codex Numpy.ipynb +++ b/jupyter/Codex Numpy.ipynb @@ -2,15 +2,11 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "import os, sys, numpy as np\n", - "from ntai.codex.numpy import (\n", - " NUMPY_FASTA_CODEX, NUMPY_FASTA_CODEX_MATRIX, \n", - " encode, lookup_channel_index, _char_to_int, get_channel_indices\n", - ")" + "import os, sys, numpy as np" ] }, { @@ -19,153 +15,50 @@ "metadata": {}, "outputs": [], "source": [ - "seq = 'actgrykmswbdhvn-ACTGRYKMSWBDHVN'" + "from ntai.codex import NPCodex" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "uracil = False\n", - "repeat = False\n", - "enc = encode(seq, NUMPY_FASTA_CODEX, NUMPY_FASTA_CODEX_MATRIX, uracil, repeat)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "c = get_channel_indices(NUMPY_FASTA_CODEX, NUMPY_FASTA_CODEX_MATRIX, uracil, repeat)" + "seq = 'acgturykmswbdhvn-ACGTURYKMSWBDHVN_'" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "f = NUMPY_FASTA_CODEX_MATRIX[:, c]" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 0, 0, 0]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "enc[0]" + "uracil = True\n", + "repeat = True\n", + "codex = NPCodex(include_uracil=uracil, include_repeat=repeat)" ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, + "execution_count": 5, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { "text/plain": [ - "array([[ 0, 0],\n", - " [ 0, 1],\n", - " [ 0, 
2],\n", - " [ 0, 3],\n", - " [ 1, 2],\n", - " [ 1, 3],\n", - " [ 2, 1],\n", - " [ 2, 3],\n", - " [ 3, 1],\n", - " [ 3, 2],\n", - " [ 4, 1],\n", - " [ 4, 2],\n", - " [ 4, 3],\n", - " [ 5, 0],\n", - " [ 5, 1],\n", - " [ 5, 3],\n", - " [ 6, 2],\n", - " [ 7, 1],\n", - " [ 8, 0],\n", - " [ 8, 2],\n", - " [ 8, 3],\n", - " [ 9, 3],\n", - " [10, 0],\n", - " [10, 1],\n", - " [10, 2],\n", - " [12, 0],\n", - " [12, 1],\n", - " [13, 0],\n", - " [13, 2],\n", - " [14, 0],\n", - " [14, 3],\n", - " [15, 0],\n", - " [16, 1],\n", - " [16, 2],\n", - " [16, 3],\n", - " [17, 0],\n", - " [17, 1],\n", - " [17, 2],\n", - " [17, 3],\n", - " [18, 2],\n", - " [18, 3],\n", - " [19, 1],\n", - " [19, 3],\n", - " [20, 1],\n", - " [20, 2],\n", - " [21, 1],\n", - " [21, 2],\n", - " [21, 3],\n", - " [22, 0],\n", - " [22, 1],\n", - " [22, 3],\n", - " [23, 2],\n", - " [24, 1],\n", - " [25, 0],\n", - " [25, 2],\n", - " [25, 3],\n", - " [26, 3],\n", - " [27, 0],\n", - " [27, 1],\n", - " [27, 2],\n", - " [29, 0],\n", - " [29, 1],\n", - " [30, 0],\n", - " [30, 2],\n", - " [31, 0],\n", - " [31, 3],\n", - " [32, 0],\n", - " [33, 1],\n", - " [33, 2],\n", - " [33, 3]])" + "'acgturykmswbdhvn-ACGTURYKMSWBDHVN_'" ] }, - "execution_count": 15, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "np.argwhere(f == enc[0])" + "codex.decode(codex.encode(seq))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/ntai/codex/__init__.py b/ntai/codex/__init__.py index c49ce9c..15d6ea8 100644 --- a/ntai/codex/__init__.py +++ b/ntai/codex/__init__.py @@ -1 +1,2 @@ from .codex import Codex +from .numpy import NPCodex diff --git a/ntai/codex/numpy.py b/ntai/codex/numpy.py deleted file mode 100644 index a4aeedb..0000000 --- a/ntai/codex/numpy.py +++ /dev/null @@ -1,141 +0,0 @@ -import numpy as np - -NUMPY_FASTA_CODEX = { - 'a': 0, - 'c': 1, - 'g': 2, - 't': 3, - 'u': 4, - 'r': 5, - 'y': 6, - 'k': 7, - 'm': 8, - 's': 9, - 'w': 10, - 'b': 11, - 'd': 12, - 'h': 13, - 'v': 14, - 'n': 15, - '-': 16, - 'A': 17, - 'C': 18, - 'G': 19, - 'T': 20, - 'U': 21, - 'R': 22, - 'Y': 23, - 'K': 24, - 'M': 25, - 'S': 26, - 'W': 27, - 'B': 28, - 'D': 29, - 'H': 30, - 'V': 31, - 'N': 32, - '_': 33 -} - -def make_codex_matrix( - a:list = [1,0,0,0,0,0], - c:list = [0,1,0,0,0,0], - g:list = [0,0,1,0,0,0], - t:list = [0,0,0,1,0,0], - u:list = [0,0,0,0,1,0], - repeat:list = [0,0,0,0,0,1], - codex:dict = NUMPY_FASTA_CODEX -): - _a = np.array(a) - _c = np.array(c) - _g = np.array(g) - _t = np.array(t) - _u = np.array(u) - _r = np.array(repeat) - encodings = [('a', _a),('c', _c),('g', _g,),('t', _t,),('u', _u,),('r', _r)] - for c, encoding in encodings: - if np.cumsum(encoding)[-1] != 1: - msg = 'base encoding for {} should be one hot!'.format(c) - raise ValueError(msg) - if encoding.size != 6: - msg = 'base encoding for {} should be of shape (6, )!'.format(c) - raise ValueError(msg) - r = _a + g - y = _c + _t + _u - k = _g + _t + _u - m = _a + _c - s = _c + _g - w = _a + _t + _u - b = _c + _g + _t + _u - d = _a + _g + _t + _u - h = _a + _c + _t + _u - v = _a + _c + _g - n = _a + _c + _g + _t + _u - e = np.zeros((6, ), dtype=np.int64) # '-' - A = _a + _r - C = _c + _r - G = _g + _r - T = _t + _r - U = _u + _r - R = r + _r - Y = y + _r - K = k + _r - M = m + _r - S = s + _r - W = w + _r - B = b + _r - D = d + _r - H = h + _r - V = v + _r - N = n + _r - E = e + _r - - - codex_matrix = np.array([ - _a,_c,_g,_t,_u, r, y, k, m, s, w, b, d, h, v, n, e, - 
A, C, G, T, U, R, Y, K, M, S, W, B, D, H, V, N, E - ]) - return codex_matrix[list(codex.values())] - -NUMPY_FASTA_CODEX_MATRIX = make_codex_matrix() - - -def _char_to_int(char:str, codex=NUMPY_FASTA_CODEX): - return codex[char] - -def _sequence_to_indices(sequence:str, codex=NUMPY_FASTA_CODEX): - fn = lambda c: _char_to_int(c, codex) - return list(map(fn, sequence)) - -def lookup_channel_index(channel, codex=NUMPY_FASTA_CODEX, codex_matrix=NUMPY_FASTA_CODEX_MATRIX): - return np.argmax(codex_matrix[_char_to_int(channel, codex)]) - -def get_channel_indices( - codex: dict = NUMPY_FASTA_CODEX, - codex_matrix: list = NUMPY_FASTA_CODEX_MATRIX, - include_uracil: bool = False, - include_repeat: bool = True -): - # drop uracil or repeat channel as needed - channels = 'acgt' - if include_uracil: channels += 'u' - if include_repeat: channels += '_' - fn = lambda c: lookup_channel_index(c, codex, codex_matrix) - return list(map(fn, channels)) - - - -def encode( - sequence:str, - codex: dict = NUMPY_FASTA_CODEX, - codex_matrix: list = NUMPY_FASTA_CODEX_MATRIX, - include_uracil: bool = False, - include_repeat: bool = True -): - # convert from string to integers - indices = np.array(_sequence_to_indices(sequence, codex)) - # extract the rows from the encoding matrix in order as they appear in seq - results = codex_matrix[indices] - channel_indices = get_channel_indices(codex, codex_matrix, include_uracil, include_repeat) - # filter results - return results[:, channel_indices].tolist() diff --git a/ntai/codex/numpy/__init__.py b/ntai/codex/numpy/__init__.py new file mode 100644 index 0000000..21c39fb --- /dev/null +++ b/ntai/codex/numpy/__init__.py @@ -0,0 +1,22 @@ +from .codex import Codex as NPCodex +from .codex import ( + NUMPY_FASTA_ENCODEX, + NUMPY_FASTA_ENCODEX_MATRIX, + NUMPY_FASTA_DECODEX, + INCLUDE_URACIL, + INCLUDE_REPEAT, + ENCODING_a, + ENCODING_c, + ENCODING_g, + ENCODING_t, + ENCODING_u, + ENCODING_repeat, +) + +from .utils import ( + make_encodex_matrix, + invert_encodex, + find_row, + lookup_channel_index, + get_channel_indices +) diff --git a/ntai/codex/numpy/codex.py b/ntai/codex/numpy/codex.py new file mode 100644 index 0000000..5d497c9 --- /dev/null +++ b/ntai/codex/numpy/codex.py @@ -0,0 +1,83 @@ +import numpy as np +from ntai.codex.numpy.defaults import ( + NUMPY_FASTA_ENCODEX, + INCLUDE_URACIL, + INCLUDE_REPEAT, + ENCODING_a, + ENCODING_c, + ENCODING_g, + ENCODING_t, + ENCODING_u, + ENCODING_repeat +) + +from ntai.codex.numpy.utils import ( + make_encodex_matrix, invert_encodex, + encode, decode +) + +NUMPY_FASTA_ENCODEX_MATRIX = make_encodex_matrix( + NUMPY_FASTA_ENCODEX, + ENCODING_a, + ENCODING_c, + ENCODING_g, + ENCODING_t, + ENCODING_u, + ENCODING_repeat +) + +NUMPY_FASTA_DECODEX = invert_encodex(NUMPY_FASTA_ENCODEX) + +class Codex: + + def __init__( + self, + include_uracil:bool = INCLUDE_URACIL, + include_repeat:bool = INCLUDE_REPEAT, + encodex: dict = NUMPY_FASTA_ENCODEX, + decodex: dict = NUMPY_FASTA_DECODEX, + encodex_matrix: list = NUMPY_FASTA_ENCODEX_MATRIX + ): + ''' + Arguments: + include_uracil (bool): whether or not uracil should be included in the + embedding. By default False. + + include_repeat (bool): wehtehr or not repeated masked regions should be + included in the embedding. By default False. + + encodex (dict): the dictionary converting fasta characters to + nucleotides. + + decodex (dict): the dictionary converting nucleotide characters to + fasta characters. 
+ ''' + self.include_uracil = include_uracil + self.include_repeat = include_repeat + self.encodex = encodex + if decodex is None: + decodex = invert_encodex(encodex) + self.decodex = decodex + if encodex_matrix is None: + encodex_matrix = make_encodex_matrix(encodex=encodex) + self.encodex_matrix = encodex_matrix + + + def encode(self, sequence): + return encode( + sequence, + encodex = self.encodex, + encodex_matrix = self.encodex_matrix, + include_uracil = self.include_uracil, + include_repeat = self.include_repeat + ) + + def decode(self, encoded): + return decode( + encoded, + encodex = self.encodex, + encodex_matrix = self.encodex_matrix, + include_uracil = self.include_uracil, + include_repeat = self.include_repeat, + decodex = self.decodex + ) diff --git a/ntai/codex/numpy/defaults.py b/ntai/codex/numpy/defaults.py new file mode 100644 index 0000000..90e0924 --- /dev/null +++ b/ntai/codex/numpy/defaults.py @@ -0,0 +1,48 @@ +import numpy as np +# from ntai.codex.numpy.utils import make_encodex_matrix, invert_codex + +ENCODING_a = [1,0,0,0,0,0] +ENCODING_c = [0,1,0,0,0,0] +ENCODING_g = [0,0,1,0,0,0] +ENCODING_t = [0,0,0,1,0,0] +ENCODING_u = [0,0,0,0,1,0] +ENCODING_repeat = [0,0,0,0,0,1] +INCLUDE_URACIL = False +INCLUDE_REPEAT = False + +NUMPY_FASTA_ENCODEX = { + '-': 0, # e is used to represent '-' as a python variable + 'a': 1, + 'c': 2, + 'g': 3, + 't': 4, + 'u': 5, + 'r': 6, + 'y': 7, + 'k': 8, + 'm': 9, + 's': 10, + 'w': 11, + 'b': 12, + 'd': 13, + 'h': 14, + 'v': 15, + 'n': 16, + '_': 17, # E is used to represent '_' as a python variable + 'A': 18, + 'C': 19, + 'G': 20, + 'T': 21, + 'U': 22, + 'R': 23, + 'Y': 24, + 'K': 25, + 'M': 26, + 'S': 27, + 'W': 28, + 'B': 29, + 'D': 30, + 'H': 31, + 'V': 32, + 'N': 33 +} diff --git a/ntai/codex/numpy/utils.py b/ntai/codex/numpy/utils.py new file mode 100644 index 0000000..ab6d0e2 --- /dev/null +++ b/ntai/codex/numpy/utils.py @@ -0,0 +1,151 @@ +import numpy as np +from ntai.codex.numpy.defaults import ( + INCLUDE_URACIL, + INCLUDE_REPEAT, + ENCODING_a, + ENCODING_c, + ENCODING_g, + ENCODING_t, + ENCODING_u, + ENCODING_repeat +) + +def validate_base_encoding(name, encoding): + if np.cumsum(encoding)[-1] != 1: + msg = 'base encoding for {} should be one hot!'.format(name) + raise ValueError(msg) + if encoding.size != 6: + msg = 'base encoding for {} should be of shape (6, )!'.format(name) + raise ValueError(msg) + +def make_encodex_matrix( + encodex:dict, + a:list = ENCODING_a, + c:list = ENCODING_c, + g:list = ENCODING_g, + t:list = ENCODING_t, + u:list = ENCODING_u, + repeat:list = ENCODING_repeat +) -> list: + _a = np.array(a) + _c = np.array(c) + _g = np.array(g) + _t = np.array(t) + _u = np.array(u) + _r = np.array(repeat) + encodings = [('a', _a),('c', _c),('g', _g,),('t', _t,),('u', _u,),('r', _r)] + for c, encoding in encodings: + validate_base_encoding(c, encoding) + + e = np.zeros((6, ), dtype=np.int64) # '-' + r = _a + g + y = _c + _t + _u + k = _g + _t + _u + m = _a + _c + s = _c + _g + w = _a + _t + _u + b = _c + _g + _t + _u + d = _a + _g + _t + _u + h = _a + _c + _t + _u + v = _a + _c + _g + n = _a + _c + _g + _t + _u + E = e + _r + A = _a + _r + C = _c + _r + G = _g + _r + T = _t + _r + U = _u + _r + R = r + _r + Y = y + _r + K = k + _r + M = m + _r + S = s + _r + W = w + _r + B = b + _r + D = d + _r + H = h + _r + V = v + _r + N = n + _r + + + encodex_matrix = np.array([ + # 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16, + e, _a,_c,_g,_t,_u, r, y, k, m, s, w, b, d, h, v, n, + 
#17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33 + E, A, C, G, T, U, R, Y, K, M, S, W, B, D, H, V, N, + ]) + return encodex_matrix[list(encodex.values())] + + +def invert_encodex(encodex:dict) -> dict: + k, v = zip(*encodex.items()) + return dict(zip(v, k)) + +def find_row(matrix:list, row:list) -> int: + return np.where(np.all(matrix == row, axis=1))[0][0] + +def _char_to_int(char:str, encodex:dict)->int: + return encodex[char] + +def _sequence_to_indices(sequence:str, encodex:dict)->list: + fn = lambda c: _char_to_int(c, encodex) + return list(map(fn, sequence)) + + +def lookup_channel_index( + channel:int, + encodex:dict, + encodex_matrix:list +) -> int: + return np.argmax(encodex_matrix[_char_to_int(channel, encodex)]) + + +def get_channel_indices( + encodex: dict, + encodex_matrix: list, + include_uracil: bool = INCLUDE_URACIL, + include_repeat: bool = INCLUDE_REPEAT +) -> list: + # drop uracil or repeat channel as needed + channels = 'acgt' + if include_uracil: channels += 'u' + if include_repeat: channels += '_' + fn = lambda c: lookup_channel_index(c, encodex, encodex_matrix) + return list(map(fn, channels)) + + +def encode( + sequence:str, + encodex: dict, + encodex_matrix: list = None, + include_uracil: bool = INCLUDE_URACIL, + include_repeat: bool = INCLUDE_REPEAT +): + # convert from string to integers + indices = np.array(_sequence_to_indices(sequence, encodex)) + if encodex_matrix is None: + encodex_matrix = make_encodex_matrix(encodex=encodex) + # extract the rows from the encoding matrix in order as they appear in seq + results = encodex_matrix[indices] + channel_indices = get_channel_indices(encodex, encodex_matrix, include_uracil, include_repeat) + # filter results + return results[:, channel_indices].tolist() + +def decode( + encoded:list, + encodex: dict, + encodex_matrix: list = None, + include_uracil: bool = INCLUDE_URACIL, + include_repeat: bool = INCLUDE_REPEAT, + decodex: dict = None +): + + if encodex_matrix is None: + encodex_matrix = make_encodex_matrix(encodex=encodex) + if decodex is None: + decodex = invert_encodex(encodex) + + channel_indices = get_channel_indices(encodex, encodex_matrix, include_uracil, include_repeat) + filtered_matrix = encodex_matrix[:, channel_indices] + keys = list(map(lambda r: find_row(filtered_matrix, r), encoded)) + return ''.join(list(map(lambda k: decodex[k], keys))) -- GitLab
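
For reference, the numpy-backed codex introduced in the last two patches can be exercised as an encode/decode round trip, mirroring the cells in jupyter/Codex Numpy.ipynb. A minimal standalone sketch, assuming the updated ntai package (with ntai.codex.NPCodex) is importable:

    from ntai.codex import NPCodex

    # Full FASTA alphabet handled by the codex, including the gap ('-') and
    # repeat-masked ('_') symbols, as used in the notebook.
    seq = 'acgturykmswbdhvn-ACGTURYKMSWBDHVN_'

    # Keep the uracil and repeat channels so every symbol maps to a distinct
    # channel vector and the round trip is lossless.
    codex = NPCodex(include_uracil=True, include_repeat=True)

    encoded = codex.encode(seq)          # list of per-base channel vectors
    assert codex.decode(encoded) == seq  # decode inverts encode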