11"""Feature extraction for DeepLC."""
22
3+ # TODO: Consider ProForma fixed modifications (that are not applied yet) for feature extraction.
4+
35from __future__ import annotations
46
57import logging
2628# fmt: on
2729
2830
def encode_peptidoform(
    peptidoform: Peptidoform | str,
    add_ccs_features: bool = False,
    padding_length: int = 60,
    positions: set[int] | None = None,
    positions_pos: set[int] | None = None,
    positions_neg: set[int] | None = None,
    dict_aa: dict[str, int] | None = None,
    dict_index_pos: dict[str, int] | None = None,
    dict_index: dict[str, int] | None = None,
) -> dict[str, np.ndarray]:
    """
    Encode one peptidoform into the feature matrices.

    Parameters
    ----------
    peptidoform
        Peptidoform object, or a string that is passed to ``Peptidoform(...)``.
    add_ccs_features
        If True, append residue-fraction features (H; F/W/Y; D/E; K/R) and the
        precursor charge to the global feature vector. Default is False.
    padding_length
        Length handed to the sequence truncation and matrix filling helpers.
        Default is 60.
    positions
        Positions for which modification changes are also written to the
        positional matrix. Falls back to DEFAULT_POSITIONS when falsy.
    positions_pos
        Positive positions for the positional matrix. Falls back to
        DEFAULT_POSITIONS_POS when falsy.
    positions_neg
        Negative positions for the positional matrix. Falls back to
        DEFAULT_POSITIONS_NEG when falsy.
    dict_aa
        Amino acid to index mapping for the one-hot matrix. Falls back to
        DEFAULT_DICT_AA when falsy.
    dict_index_pos
        Atom to column mapping for the positional matrix. Falls back to
        DEFAULT_DICT_INDEX_POS when falsy.
    dict_index
        Atom to column mapping for the standard matrix. Falls back to
        DEFAULT_DICT_INDEX when falsy.

    Returns
    -------
    dict[str, np.ndarray]
        Arrays stored under the keys ``"matrix"``, ``"matrix_sum"``,
        ``"matrix_global"`` and ``"matrix_hc"``.

    Raises
    ------
    ValueError
        If ``add_ccs_features`` is True and the peptidoform has no precursor charge.

    """
    # Note: ``or`` means an explicitly passed empty container also selects the default.
    positions = positions or DEFAULT_POSITIONS
    positions_pos = positions_pos or DEFAULT_POSITIONS_POS
    positions_neg = positions_neg or DEFAULT_POSITIONS_NEG
    dict_aa = dict_aa or DEFAULT_DICT_AA
    dict_index_pos = dict_index_pos or DEFAULT_DICT_INDEX_POS
    dict_index = dict_index or DEFAULT_DICT_INDEX

    if isinstance(peptidoform, str):
        peptidoform = Peptidoform(peptidoform)
    charge = peptidoform.precursor_charge
    parsed_sequence = peptidoform.parsed_sequence
    seq, seq_len = _truncate_sequence(peptidoform.sequence, padding_length)

    # Unmodified matrices first; modification deltas are added in place afterwards.
    std_matrix = _fill_standard_matrix(seq, padding_length, dict_index)
    onehot_matrix = _fill_onehot_matrix(parsed_sequence, padding_length, dict_aa)
    pos_matrix = _fill_pos_matrix(
        seq, seq_len, positions_pos, positions_neg, dict_index, dict_index_pos
    )

    shared_mod_args = (seq_len, dict_index, dict_index_pos, positions)
    _apply_modifications(std_matrix, pos_matrix, parsed_sequence, *shared_mod_args)
    _apply_terminal_modifications(std_matrix, pos_matrix, peptidoform, *shared_mod_args)

    # Scalars appended after the per-atom column sums, in their original order.
    appended_scalars = [seq_len]
    if add_ccs_features:
        if not charge:
            raise ValueError(f"Peptidoform has no charge: {peptidoform}")

        def residue_fraction(residues: str) -> float:
            return sum(seq.count(residue) for residue in residues) / seq_len

        appended_scalars += [
            residue_fraction("H"),
            residue_fraction("FWY"),
            residue_fraction("DE"),
            residue_fraction("KR"),
            charge,
        ]
    matrix_all = np.append(np.sum(std_matrix, axis=0), appended_scalars)

    # Window-2 rolling sum along the sequence axis, keeping every second window.
    matrix_sum = _compute_rolling_sum(std_matrix.T, n=2)[:, ::2].T

    matrix_global = np.concatenate([matrix_all, pos_matrix.flatten()])

    return {
        "matrix": std_matrix,
        "matrix_sum": matrix_sum,
        "matrix_global": matrix_global,
        "matrix_hc": onehot_matrix,
    }
135+
136+
29137def _truncate_sequence (seq : str , max_length : int ) -> tuple [str , int ]:
30138 """Truncate the sequence if it exceeds the max_length."""
31139 if len (seq ) > max_length :
@@ -98,6 +206,40 @@ def _fill_pos_matrix(
98206 return pos_mat
99207
100208
209+ def _apply_composition_to_matrices (
210+ mat : np .ndarray ,
211+ pos_mat : np .ndarray ,
212+ composition : mass .Composition ,
213+ i : int ,
214+ seq_len : int ,
215+ dict_index : dict [str , int ],
216+ dict_index_pos : dict [str , int ],
217+ positions : set [int ],
218+ ) -> None :
219+ """Apply a composition delta to the standard and positional matrices."""
220+ for atom_comp , change in composition .items ():
221+ try :
222+ mat [i , dict_index [atom_comp ]] += change
223+ if i in positions :
224+ pos_mat [i , dict_index_pos [atom_comp ]] += change
225+ elif (i - seq_len ) in positions :
226+ pos_mat [i - seq_len , dict_index_pos [atom_comp ]] += change
227+ except KeyError :
228+ try :
229+ warnings .warn (f"Replacing pattern for atom: { atom_comp } " , stacklevel = 2 )
230+ atom_comp_clean = sub (r"\[.*?\]" , "" , atom_comp )
231+ mat [i , dict_index [atom_comp_clean ]] += change
232+ if i in positions :
233+ pos_mat [i , dict_index_pos [atom_comp_clean ]] += change
234+ elif (i - seq_len ) in positions :
235+ pos_mat [i - seq_len , dict_index_pos [atom_comp_clean ]] += change
236+ except KeyError :
237+ warnings .warn (f"Ignoring atom { atom_comp } at pos { i } " , stacklevel = 2 )
238+ continue
239+ except IndexError :
240+ warnings .warn (f"Index error for atom { atom_comp } at pos { i } " , stacklevel = 2 )
241+
242+
101243def _apply_modifications (
102244 mat : np .ndarray ,
103245 pos_mat : np .ndarray ,
@@ -118,96 +260,58 @@ def _apply_modifications(
118260 f"Skipping modification without known composition: { token [1 ]} " , stacklevel = 2
119261 )
120262 continue
121- for atom_comp , change in mod_comp .items ():
263+ _apply_composition_to_matrices (
264+ mat ,
265+ pos_mat ,
266+ mod_comp ,
267+ i ,
268+ seq_len ,
269+ dict_index ,
270+ dict_index_pos ,
271+ positions ,
272+ )
273+
274+
275+ def _apply_terminal_modifications (
276+ mat : np .ndarray ,
277+ pos_mat : np .ndarray ,
278+ peptidoform : Peptidoform ,
279+ seq_len : int ,
280+ dict_index : dict [str , int ],
281+ dict_index_pos : dict [str , int ],
282+ positions : set [int ],
283+ ) -> None :
284+ """Apply N- and C-terminal modification changes to the matrices."""
285+ terminal_mods = [
286+ (0 , peptidoform .properties .get ("n_term" )), # N-terminus at position 0
287+ (seq_len - 1 , peptidoform .properties .get ("c_term" )), # C-terminus at last position
288+ ]
289+ for i , mods in terminal_mods :
290+ if not mods :
291+ continue
292+ for tag in mods :
122293 try :
123- mat [i , dict_index [atom_comp ]] += change
124- if i in positions :
125- pos_mat [i , dict_index_pos [atom_comp ]] += change
126- elif (i - seq_len ) in positions :
127- pos_mat [i - seq_len , dict_index_pos [atom_comp ]] += change
128- except KeyError :
129- try :
130- warnings .warn (f"Replacing pattern for atom: { atom_comp } " , stacklevel = 2 )
131- atom_comp_clean = sub (r"\[.*?\]" , "" , atom_comp )
132- mat [i , dict_index [atom_comp_clean ]] += change
133- if i in positions :
134- pos_mat [i , dict_index_pos [atom_comp_clean ]] += change
135- elif (i - seq_len ) in positions :
136- pos_mat [i - seq_len , dict_index_pos [atom_comp_clean ]] += change
137- except KeyError :
138- warnings .warn (f"Ignoring atom { atom_comp } at pos { i } " , stacklevel = 2 )
139- continue
140- except IndexError :
141- warnings .warn (f"Index error for atom { atom_comp } at pos { i } " , stacklevel = 2 )
294+ mod_comp = tag .composition
295+ except Exception :
296+ warnings .warn (
297+ f"Skipping terminal modification without known composition: { tag } " ,
298+ stacklevel = 2 ,
299+ )
300+ continue
301+ _apply_composition_to_matrices (
302+ mat ,
303+ pos_mat ,
304+ mod_comp ,
305+ i ,
306+ seq_len ,
307+ dict_index ,
308+ dict_index_pos ,
309+ positions ,
310+ )
142311
143312
144313def _compute_rolling_sum (matrix : np .ndarray , n : int = 2 ) -> np .ndarray :
145314 """Compute a rolling sum over the matrix."""
146315 ret = np .cumsum (matrix , axis = 1 , dtype = np .float32 )
147316 ret [:, n :] = ret [:, n :] - ret [:, :- n ]
148317 return ret [:, n - 1 :]
149-
150-
151- def encode_peptidoform (
152- peptidoform : Peptidoform | str ,
153- add_ccs_features : bool = False ,
154- padding_length : int = 60 ,
155- positions : set [int ] | None = None ,
156- positions_pos : set [int ] | None = None ,
157- positions_neg : set [int ] | None = None ,
158- dict_aa : dict [str , int ] | None = None ,
159- dict_index_pos : dict [str , int ] | None = None ,
160- dict_index : dict [str , int ] | None = None ,
161- ) -> dict [str , np .ndarray ]:
162- """Extract features from a single peptidoform."""
163- positions = positions or DEFAULT_POSITIONS
164- positions_pos = positions_pos or DEFAULT_POSITIONS_POS
165- positions_neg = positions_neg or DEFAULT_POSITIONS_NEG
166- dict_aa = dict_aa or DEFAULT_DICT_AA
167- dict_index_pos = dict_index_pos or DEFAULT_DICT_INDEX_POS
168- dict_index = dict_index or DEFAULT_DICT_INDEX
169-
170- if isinstance (peptidoform , str ):
171- peptidoform = Peptidoform (peptidoform )
172- seq = peptidoform .sequence
173- charge = peptidoform .precursor_charge
174- seq , seq_len = _truncate_sequence (seq , padding_length )
175-
176- std_matrix = _fill_standard_matrix (seq , padding_length , dict_index )
177- onehot_matrix = _fill_onehot_matrix (peptidoform .parsed_sequence , padding_length , dict_aa )
178- pos_matrix = _fill_pos_matrix (
179- seq , seq_len , positions_pos , positions_neg , dict_index , dict_index_pos
180- )
181- _apply_modifications (
182- std_matrix ,
183- pos_matrix ,
184- peptidoform .parsed_sequence ,
185- seq_len ,
186- dict_index ,
187- dict_index_pos ,
188- positions ,
189- )
190-
191- matrix_all = np .sum (std_matrix , axis = 0 )
192- matrix_all = np .append (matrix_all , seq_len )
193- if add_ccs_features :
194- if not charge :
195- raise ValueError (f"Peptidoform has no charge: { peptidoform } " )
196- matrix_all = np .append (matrix_all , (seq .count ("H" )) / seq_len )
197- matrix_all = np .append (
198- matrix_all , (seq .count ("F" ) + seq .count ("W" ) + seq .count ("Y" )) / seq_len
199- )
200- matrix_all = np .append (matrix_all , (seq .count ("D" ) + seq .count ("E" )) / seq_len )
201- matrix_all = np .append (matrix_all , (seq .count ("K" ) + seq .count ("R" )) / seq_len )
202- matrix_all = np .append (matrix_all , charge )
203-
204- matrix_sum = _compute_rolling_sum (std_matrix .T , n = 2 )[:, ::2 ].T
205-
206- matrix_global = np .concatenate ([matrix_all , pos_matrix .flatten ()])
207-
208- return {
209- "matrix" : std_matrix ,
210- "matrix_sum" : matrix_sum ,
211- "matrix_global" : matrix_global ,
212- "matrix_hc" : onehot_matrix ,
213- }
0 commit comments