Skip to content

Commit c6d0e9f

Browse files
authored
Merge pull request #96 from CompOmics/feature/progress-and-threads
Feature/progress and threads
2 parents 5e9890c + c9689d1 commit c6d0e9f

11 files changed

Lines changed: 654 additions & 289 deletions

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to
77
[Semantic Versioning](https://semver.org/spec/v2.0.0.html).
88

9-
## Unreleased
9+
## [4.0.0-alpha.1]
1010

1111
### Changed
1212

1313
- Simplified the public package API by splitting up the single class-based API into core functions (`predict`, `finetune`, `train`, etc.)
1414
- Switched deep learning framework from Tensorflow to PyTorch
1515
- Sped up predictions by removing the ensemble method, in which the outputs of three models with differing kernel sizes were averaged into one prediction
1616
- Separated calibration logic into a dedicated, reusable module with an sklearn-like API.
17+
- Improved computational efficiency of piece-wise linear calibration and set sensible default parameters
1718
- Added built-in transfer learning functionality, replacing the external `deeplcretrainer` package.
1819
- Cleaned up package, removing legacy and unused code and files, and improving modularity
1920
- Modernized CI workflows to use `uv`

MANIFEST.in

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1 @@
1-
include deeplc/models/*
21
include deeplc/package_data/**/*
3-
include deeplc/baseline_performance/*

deeplc/_features.py

Lines changed: 189 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Feature extraction for DeepLC."""
22

3+
# TODO: Consider ProForma fixed modifications (that are not applied yet) for feature extraction.
4+
35
from __future__ import annotations
46

57
import logging
@@ -26,6 +28,112 @@
2628
# fmt: on
2729

2830

31+
def encode_peptidoform(
    peptidoform: Peptidoform | str,
    add_ccs_features: bool = False,
    padding_length: int = 60,
    positions: set[int] | None = None,
    positions_pos: set[int] | None = None,
    positions_neg: set[int] | None = None,
    dict_aa: dict[str, int] | None = None,
    dict_index_pos: dict[str, int] | None = None,
    dict_index: dict[str, int] | None = None,
) -> dict[str, np.ndarray]:
    """
    Extract features from a single peptidoform.

    Parameters
    ----------
    peptidoform
        The peptidoform to encode, either as a Peptidoform object or a string.
    add_ccs_features
        Whether to include CCS features. Default is False.
    padding_length
        The maximum length of the sequence after padding. Default is 60.
    positions
        The positions to consider for feature extraction. Default is DEFAULT_POSITIONS.
    positions_pos
        The positive positions to consider for feature extraction. Default is
        DEFAULT_POSITIONS_POS.
    positions_neg
        The negative positions to consider for feature extraction. Default is
        DEFAULT_POSITIONS_NEG.
    dict_aa
        A dictionary mapping amino acids to indices. Default is DEFAULT_DICT_AA.
    dict_index_pos
        A dictionary mapping atoms to indices for the positional matrix. Default is
        DEFAULT_DICT_INDEX_POS.
    dict_index
        A dictionary mapping atoms to indices. Default is DEFAULT_DICT_INDEX.

    Returns
    -------
    dict[str, np.ndarray]
        A dictionary of Numpy arrays containing the extracted features:

        - ``"matrix"``: the standard matrix, with residue and terminal modification
          compositions applied.
        - ``"matrix_sum"``: sums over consecutive, non-overlapping pairs of rows of the
          standard matrix.
        - ``"matrix_global"``: column sums of the standard matrix, the sequence length,
          the optional CCS features, and the flattened positional matrix.
        - ``"matrix_hc"``: the one-hot matrix.

    Raises
    ------
    ValueError
        If ``add_ccs_features`` is True and the peptidoform has no precursor charge.

    """
    # Only fall back to the module defaults when an argument was omitted. A truthiness
    # check (``x or DEFAULT``) would silently discard an explicitly passed empty
    # set/dict.
    if positions is None:
        positions = DEFAULT_POSITIONS
    if positions_pos is None:
        positions_pos = DEFAULT_POSITIONS_POS
    if positions_neg is None:
        positions_neg = DEFAULT_POSITIONS_NEG
    if dict_aa is None:
        dict_aa = DEFAULT_DICT_AA
    if dict_index_pos is None:
        dict_index_pos = DEFAULT_DICT_INDEX_POS
    if dict_index is None:
        dict_index = DEFAULT_DICT_INDEX

    if isinstance(peptidoform, str):
        peptidoform = Peptidoform(peptidoform)
    seq = peptidoform.sequence
    charge = peptidoform.precursor_charge
    seq, seq_len = _truncate_sequence(seq, padding_length)

    std_matrix = _fill_standard_matrix(seq, padding_length, dict_index)
    # NOTE(review): ``parsed_sequence`` is passed untruncated here and to
    # ``_apply_modifications`` below, while ``seq`` is truncated; confirm the helpers
    # cope with peptides longer than ``padding_length``.
    onehot_matrix = _fill_onehot_matrix(peptidoform.parsed_sequence, padding_length, dict_aa)
    pos_matrix = _fill_pos_matrix(
        seq, seq_len, positions_pos, positions_neg, dict_index, dict_index_pos
    )
    # Both helpers update ``std_matrix`` and ``pos_matrix`` in place.
    _apply_modifications(
        std_matrix,
        pos_matrix,
        peptidoform.parsed_sequence,
        seq_len,
        dict_index,
        dict_index_pos,
        positions,
    )
    _apply_terminal_modifications(
        std_matrix,
        pos_matrix,
        peptidoform,
        seq_len,
        dict_index,
        dict_index_pos,
        positions,
    )

    # Scalar features appended after the per-atom column sums; collected first so the
    # array is extended once instead of once per feature.
    extra_features = [seq_len]
    if add_ccs_features:
        if not charge:
            raise ValueError(f"Peptidoform has no charge: {peptidoform}")
        extra_features.extend(
            [
                (seq.count("H")) / seq_len,
                (seq.count("F") + seq.count("W") + seq.count("Y")) / seq_len,
                (seq.count("D") + seq.count("E")) / seq_len,
                (seq.count("K") + seq.count("R")) / seq_len,
                charge,
            ]
        )
    matrix_all = np.append(np.sum(std_matrix, axis=0), extra_features)

    # Window-of-two rolling sum along the sequence axis, keeping every second window.
    matrix_sum = _compute_rolling_sum(std_matrix.T, n=2)[:, ::2].T

    matrix_global = np.concatenate([matrix_all, pos_matrix.flatten()])

    return {
        "matrix": std_matrix,
        "matrix_sum": matrix_sum,
        "matrix_global": matrix_global,
        "matrix_hc": onehot_matrix,
    }
135+
136+
29137
def _truncate_sequence(seq: str, max_length: int) -> tuple[str, int]:
30138
"""Truncate the sequence if it exceeds the max_length."""
31139
if len(seq) > max_length:
@@ -98,6 +206,40 @@ def _fill_pos_matrix(
98206
return pos_mat
99207

100208

209+
def _apply_composition_to_matrices(
    mat: np.ndarray,
    pos_mat: np.ndarray,
    composition: mass.Composition,
    i: int,
    seq_len: int,
    dict_index: dict[str, int],
    dict_index_pos: dict[str, int],
    positions: set[int],
) -> None:
    """
    Apply a composition delta to the standard and positional matrices.

    Both matrices are modified in place. Atoms that cannot be resolved, or that fall
    outside the matrices, are skipped with a warning instead of raising.

    Parameters
    ----------
    mat
        Standard matrix; row ``i`` receives the composition change.
    pos_mat
        Positional matrix; updated only when ``i`` or ``i - seq_len`` is in
        ``positions``.
    composition
        Mapping of atom label to count change.
    i
        Row index in ``mat`` to which the composition applies.
    seq_len
        Sequence length, used to derive the negative (from-the-end) position.
    dict_index
        Mapping of atom label to column in ``mat``.
    dict_index_pos
        Mapping of atom label to column in ``pos_mat``.
    positions
        Positions (positive or negative) that are tracked in ``pos_mat``.

    """
    # The positional row depends only on ``i``, so resolve it once up front.
    if i in positions:
        pos_row = i
    elif (i - seq_len) in positions:
        pos_row = i - seq_len
    else:
        pos_row = None

    for atom_comp, change in composition.items():
        # Resolve both columns *before* touching the matrices. Updating ``mat`` first
        # and only then discovering a missing key would re-apply the change in the
        # fallback path and count the atom twice.
        col = dict_index.get(atom_comp)
        pos_col = dict_index_pos.get(atom_comp) if pos_row is not None else None

        if col is None or (pos_row is not None and pos_col is None):
            warnings.warn(f"Replacing pattern for atom: {atom_comp}", stacklevel=2)
            # Retry with any bracketed part of the label stripped (e.g. "C[13]" -> "C").
            atom_comp_clean = sub(r"\[.*?\]", "", atom_comp)
            if col is None:
                col = dict_index.get(atom_comp_clean)
            if pos_row is not None and pos_col is None:
                pos_col = dict_index_pos.get(atom_comp_clean)

        if col is None:
            warnings.warn(f"Ignoring atom {atom_comp} at pos {i}", stacklevel=2)
            continue

        try:
            mat[i, col] += change
            if pos_row is not None:
                if pos_col is None:
                    # Known to the standard matrix only: keep that single update and
                    # skip the positional matrix.
                    warnings.warn(f"Ignoring atom {atom_comp} at pos {i}", stacklevel=2)
                else:
                    pos_mat[pos_row, pos_col] += change
        except IndexError:
            warnings.warn(f"Index error for atom {atom_comp} at pos {i}", stacklevel=2)
241+
242+
101243
def _apply_modifications(
102244
mat: np.ndarray,
103245
pos_mat: np.ndarray,
@@ -118,96 +260,58 @@ def _apply_modifications(
118260
f"Skipping modification without known composition: {token[1]}", stacklevel=2
119261
)
120262
continue
121-
for atom_comp, change in mod_comp.items():
263+
_apply_composition_to_matrices(
264+
mat,
265+
pos_mat,
266+
mod_comp,
267+
i,
268+
seq_len,
269+
dict_index,
270+
dict_index_pos,
271+
positions,
272+
)
273+
274+
275+
def _apply_terminal_modifications(
    mat: np.ndarray,
    pos_mat: np.ndarray,
    peptidoform: Peptidoform,
    seq_len: int,
    dict_index: dict[str, int],
    dict_index_pos: dict[str, int],
    positions: set[int],
) -> None:
    """
    Apply N- and C-terminal modification changes to the matrices.

    N-terminal modifications are attributed to the first residue (row 0) and
    C-terminal modifications to the last residue (row ``seq_len - 1``). Both matrices
    are modified in place. Modifications whose composition cannot be determined are
    skipped with a warning.

    Parameters
    ----------
    mat
        Standard matrix, updated in place.
    pos_mat
        Positional matrix, updated in place.
    peptidoform
        Peptidoform whose ``properties`` hold the ``"n_term"`` and ``"c_term"``
        modification lists.
    seq_len
        Length of the (possibly truncated) sequence.
    dict_index
        Mapping of atom label to column in ``mat``.
    dict_index_pos
        Mapping of atom label to column in ``pos_mat``.
    positions
        Positions (positive or negative) that are tracked in ``pos_mat``.

    """
    terminal_mods = (
        (0, peptidoform.properties.get("n_term")),  # N-terminus at position 0
        (seq_len - 1, peptidoform.properties.get("c_term")),  # C-terminus at last position
    )
    for i, mods in terminal_mods:
        if not mods:
            continue
        for tag in mods:
            try:
                mod_comp = tag.composition
            except Exception:
                # Deliberate best-effort: the exception types raised while resolving a
                # tag are not visible from this module, so any failure means "unknown".
                mod_comp = None
            # A tag can also resolve without error yet carry no composition; treat
            # that like a failed lookup rather than crashing on ``None.items()``.
            if mod_comp is None:
                warnings.warn(
                    f"Skipping terminal modification without known composition: {tag}",
                    stacklevel=2,
                )
                continue
            _apply_composition_to_matrices(
                mat,
                pos_mat,
                mod_comp,
                i,
                seq_len,
                dict_index,
                dict_index_pos,
                positions,
            )
142311

143312

144313
def _compute_rolling_sum(matrix: np.ndarray, n: int = 2) -> np.ndarray:
    """
    Compute a rolling sum over the matrix.

    The sum is taken over windows of ``n`` consecutive columns (axis 1), using a
    cumulative sum computed in float32.

    Parameters
    ----------
    matrix
        Two-dimensional input array.
    n
        Window size; must be at least 1. Default is 2.

    Returns
    -------
    np.ndarray
        Float32 array with one column per full window, i.e. ``n - 1`` fewer columns
        than ``matrix`` (empty when ``n`` exceeds the number of columns).

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.

    """
    # n == 0 would otherwise fail with an opaque broadcasting error, and a negative n
    # would silently return meaningless values.
    if n < 1:
        raise ValueError(f"Window size n must be at least 1, got {n}")
    ret = np.cumsum(matrix, axis=1, dtype=np.float32)
    # Subtracting the cumulative sum from n columns earlier leaves the window sum.
    ret[:, n:] = ret[:, n:] - ret[:, :-n]
    # The first n - 1 columns do not cover a full window.
    return ret[:, n - 1 :]
149-
150-
151-
def encode_peptidoform(
152-
peptidoform: Peptidoform | str,
153-
add_ccs_features: bool = False,
154-
padding_length: int = 60,
155-
positions: set[int] | None = None,
156-
positions_pos: set[int] | None = None,
157-
positions_neg: set[int] | None = None,
158-
dict_aa: dict[str, int] | None = None,
159-
dict_index_pos: dict[str, int] | None = None,
160-
dict_index: dict[str, int] | None = None,
161-
) -> dict[str, np.ndarray]:
162-
"""Extract features from a single peptidoform."""
163-
positions = positions or DEFAULT_POSITIONS
164-
positions_pos = positions_pos or DEFAULT_POSITIONS_POS
165-
positions_neg = positions_neg or DEFAULT_POSITIONS_NEG
166-
dict_aa = dict_aa or DEFAULT_DICT_AA
167-
dict_index_pos = dict_index_pos or DEFAULT_DICT_INDEX_POS
168-
dict_index = dict_index or DEFAULT_DICT_INDEX
169-
170-
if isinstance(peptidoform, str):
171-
peptidoform = Peptidoform(peptidoform)
172-
seq = peptidoform.sequence
173-
charge = peptidoform.precursor_charge
174-
seq, seq_len = _truncate_sequence(seq, padding_length)
175-
176-
std_matrix = _fill_standard_matrix(seq, padding_length, dict_index)
177-
onehot_matrix = _fill_onehot_matrix(peptidoform.parsed_sequence, padding_length, dict_aa)
178-
pos_matrix = _fill_pos_matrix(
179-
seq, seq_len, positions_pos, positions_neg, dict_index, dict_index_pos
180-
)
181-
_apply_modifications(
182-
std_matrix,
183-
pos_matrix,
184-
peptidoform.parsed_sequence,
185-
seq_len,
186-
dict_index,
187-
dict_index_pos,
188-
positions,
189-
)
190-
191-
matrix_all = np.sum(std_matrix, axis=0)
192-
matrix_all = np.append(matrix_all, seq_len)
193-
if add_ccs_features:
194-
if not charge:
195-
raise ValueError(f"Peptidoform has no charge: {peptidoform}")
196-
matrix_all = np.append(matrix_all, (seq.count("H")) / seq_len)
197-
matrix_all = np.append(
198-
matrix_all, (seq.count("F") + seq.count("W") + seq.count("Y")) / seq_len
199-
)
200-
matrix_all = np.append(matrix_all, (seq.count("D") + seq.count("E")) / seq_len)
201-
matrix_all = np.append(matrix_all, (seq.count("K") + seq.count("R")) / seq_len)
202-
matrix_all = np.append(matrix_all, charge)
203-
204-
matrix_sum = _compute_rolling_sum(std_matrix.T, n=2)[:, ::2].T
205-
206-
matrix_global = np.concatenate([matrix_all, pos_matrix.flatten()])
207-
208-
return {
209-
"matrix": std_matrix,
210-
"matrix_sum": matrix_sum,
211-
"matrix_global": matrix_global,
212-
"matrix_hc": onehot_matrix,
213-
}

0 commit comments

Comments
 (0)