CompOmics
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 2 additions & 1 deletion
diff --git a/MANIFEST.in b/MANIFEST.in
Lines changed: 0 additions & 2 deletions
diff --git a/deeplc/_features.py b/deeplc/_features.py
Lines changed: 189 additions & 85 deletions
@@ -6,14 +6,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to
 [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## Unreleased
+## [4.0.0-alpha.1]
 
 ### Changed
 
 - Simplified the public package API by splitting up the single class-based API into core functions (`predict`, `finetune`, `train`, etc.)
 - Switched deep learning framework from Tensorflow to PyTorch
 - Sped up predictions by removing the ensemble method, in which the output from three models with differing kernel sizes was averaged into one prediction
 - Separated calibration logic to dedicated reusable module with sklearn-like API.
+- Improved computational efficiency of piece-wise linear calibration and set sensible default parameters
 - Added built-in transfer learning functionality, replacing the external `deeplcretrainer` package
 - Cleaned up package, removing legacy and unused code and files, and improving modularity
 - Modernized CI workflows to use `uv`
 
@@ -1,3 +1 @@
-include deeplc/models/*
 include deeplc/package_data/**/*
-include deeplc/baseline_performance/*
@@ -1,5 +1,7 @@
 """Feature extraction for DeepLC."""
 
+# TODO: Consider ProForma fixed modifications (that are not applied yet) for feature extraction.
+
 from __future__ import annotations
 
 import logging
@@ -26,6 +28,112 @@
 # fmt: on
 
 
+def encode_peptidoform(
+    peptidoform: Peptidoform | str,
+    add_ccs_features: bool = False,
+    padding_length: int = 60,
+    positions: set[int] | None = None,
+    positions_pos: set[int] | None = None,
+    positions_neg: set[int] | None = None,
+    dict_aa: dict[str, int] | None = None,
+    dict_index_pos: dict[str, int] | None = None,
+    dict_index: dict[str, int] | None = None,
+) -> dict[str, np.ndarray]:
+    """
+    Encode one peptidoform into a dictionary of feature arrays.
+
+    Parameters
+    ----------
+    peptidoform
+        Peptidoform to encode. A string is first converted with ``Peptidoform(...)``.
+    add_ccs_features
+        If True, append four residue-fraction features (H; F/W/Y; D/E; K/R, each divided
+        by the sequence length) and the precursor charge to the global feature vector.
+    padding_length
+        Maximum sequence length; longer sequences are truncated by ``_truncate_sequence``.
+        Also passed to the standard and one-hot matrix-filling helpers.
+    positions
+        Positions for which modification changes are also written to the positional
+        matrix. Falsy values select ``DEFAULT_POSITIONS``.
+    positions_pos
+        Positive positions for the positional matrix. Falsy values select
+        ``DEFAULT_POSITIONS_POS``.
+    positions_neg
+        Negative positions for the positional matrix. Falsy values select
+        ``DEFAULT_POSITIONS_NEG``.
+    dict_aa
+        Amino acid to index mapping for the one-hot matrix. Falsy values select
+        ``DEFAULT_DICT_AA``.
+    dict_index_pos
+        Atom to index mapping for the positional matrix. Falsy values select
+        ``DEFAULT_DICT_INDEX_POS``.
+    dict_index
+        Atom to index mapping for the standard matrix. Falsy values select
+        ``DEFAULT_DICT_INDEX``.
+
+    Returns
+    -------
+    dict[str, np.ndarray]
+        Arrays under the keys ``"matrix"``, ``"matrix_sum"``, ``"matrix_global"`` and
+        ``"matrix_hc"``.
+
+    Raises
+    ------
+    ValueError
+        If ``add_ccs_features`` is True and the peptidoform has no precursor charge.
+
+    """
+    # Any falsy argument (None, but also an empty set or dict) selects the module default.
+    if not positions:
+        positions = DEFAULT_POSITIONS
+    if not positions_pos:
+        positions_pos = DEFAULT_POSITIONS_POS
+    if not positions_neg:
+        positions_neg = DEFAULT_POSITIONS_NEG
+    if not dict_aa:
+        dict_aa = DEFAULT_DICT_AA
+    if not dict_index_pos:
+        dict_index_pos = DEFAULT_DICT_INDEX_POS
+    if not dict_index:
+        dict_index = DEFAULT_DICT_INDEX
+
+    if isinstance(peptidoform, str):
+        peptidoform = Peptidoform(peptidoform)
+    full_sequence = peptidoform.sequence
+    charge = peptidoform.precursor_charge
+    seq, seq_len = _truncate_sequence(full_sequence, padding_length)
+
+    std_matrix = _fill_standard_matrix(seq, padding_length, dict_index)
+    onehot_matrix = _fill_onehot_matrix(peptidoform.parsed_sequence, padding_length, dict_aa)
+    pos_matrix = _fill_pos_matrix(
+        seq, seq_len, positions_pos, positions_neg, dict_index, dict_index_pos
+    )
+
+    # Both modification passes update std_matrix and pos_matrix in place and take the same
+    # trailing arguments.
+    shared_args = (seq_len, dict_index, dict_index_pos, positions)
+    _apply_modifications(std_matrix, pos_matrix, peptidoform.parsed_sequence, *shared_args)
+    _apply_terminal_modifications(std_matrix, pos_matrix, peptidoform, *shared_args)
+
+    # Global vector: per-atom column totals followed by the (truncated) sequence length.
+    matrix_all = np.append(np.sum(std_matrix, axis=0), seq_len)
+    if add_ccs_features:
+        if not charge:
+            raise ValueError(f"Peptidoform has no charge: {peptidoform}")
+        # Fraction of residues belonging to each group, in this fixed order.
+        for residue_group in ("H", "FWY", "DE", "KR"):
+            group_count = sum(seq.count(residue) for residue in residue_group)
+            matrix_all = np.append(matrix_all, group_count / seq_len)
+        matrix_all = np.append(matrix_all, charge)
+
+    return {
+        "matrix": std_matrix,
+        # Sums over pairs of neighbouring rows, keeping every second pair.
+        "matrix_sum": _compute_rolling_sum(std_matrix.T, n=2)[:, ::2].T,
+        "matrix_global": np.concatenate([matrix_all, pos_matrix.flatten()]),
+        "matrix_hc": onehot_matrix,
+    }
+
+
 def _truncate_sequence(seq: str, max_length: int) -> tuple[str, int]:
     """Truncate the sequence if it exceeds the max_length."""
     if len(seq) > max_length:
@@ -98,6 +206,40 @@ def _fill_pos_matrix(
     return pos_mat
 
 
+def _apply_composition_to_matrices(
+    mat: np.ndarray,
+    pos_mat: np.ndarray,
+    composition: mass.Composition,
+    i: int,
+    seq_len: int,
+    dict_index: dict[str, int],
+    dict_index_pos: dict[str, int],
+    positions: set[int],
+) -> None:
+    """
+    Apply a composition delta to the standard and positional matrices.
+
+    Both matrices are updated in place. Each atom is added exactly once to row ``i`` of
+    ``mat``. It is also added to ``pos_mat`` when ``i``, or ``i - seq_len`` (the index
+    counted from the end of the sequence), is one of ``positions``.
+
+    Parameters
+    ----------
+    mat
+        Standard matrix, indexed as ``[residue, atom]``.
+    pos_mat
+        Positional matrix, indexed as ``[position, atom]``.
+    composition
+        Mapping of atom name to count change.
+    i
+        Index of the residue the composition applies to.
+    seq_len
+        Length of the (truncated) sequence.
+    dict_index
+        Atom to column mapping for ``mat``.
+    dict_index_pos
+        Atom to column mapping for ``pos_mat``.
+    positions
+        Positions (positive or negative) that are tracked in ``pos_mat``.
+
+    Warns
+    -----
+    UserWarning
+        When an atom name needs its bracketed part stripped to be recognised, when an atom
+        is unknown and therefore ignored, or when an index falls outside a matrix.
+
+    """
+    # The positional row depends only on the residue index, so resolve it once.
+    if i in positions:
+        pos_row = i
+    elif (i - seq_len) in positions:
+        pos_row = i - seq_len
+    else:
+        pos_row = None
+
+    for atom_comp, change in composition.items():
+        atom = atom_comp
+        if atom not in dict_index:
+            # Retry with any bracketed part of the atom name removed.
+            warnings.warn(f"Replacing pattern for atom: {atom_comp}", stacklevel=2)
+            atom = sub(r"\[.*?\]", "", atom_comp)
+            if atom not in dict_index:
+                warnings.warn(f"Ignoring atom {atom_comp} at pos {i}", stacklevel=2)
+                continue
+        try:
+            mat[i, dict_index[atom]] += change
+            if pos_row is None:
+                continue
+            # An atom missing from the positional mapping must not trigger a second
+            # update of ``mat``; only the positional update is skipped.
+            pos_atom = atom if atom in dict_index_pos else sub(r"\[.*?\]", "", atom)
+            if pos_atom in dict_index_pos:
+                pos_mat[pos_row, dict_index_pos[pos_atom]] += change
+            else:
+                warnings.warn(f"Ignoring atom {atom_comp} at pos {i}", stacklevel=2)
+        except IndexError:
+            warnings.warn(f"Index error for atom {atom_comp} at pos {i}", stacklevel=2)
+
+
 def _apply_modifications(
     mat: np.ndarray,
     pos_mat: np.ndarray,
@@ -118,96 +260,58 @@ def _apply_modifications(
                 f"Skipping modification without known composition: {token[1]}", stacklevel=2
             )
             continue
-        for atom_comp, change in mod_comp.items():
+        _apply_composition_to_matrices(
+            mat,
+            pos_mat,
+            mod_comp,
+            i,
+            seq_len,
+            dict_index,
+            dict_index_pos,
+            positions,
+        )
+
+
+def _apply_terminal_modifications(
+    mat: np.ndarray,
+    pos_mat: np.ndarray,
+    peptidoform: Peptidoform,
+    seq_len: int,
+    dict_index: dict[str, int],
+    dict_index_pos: dict[str, int],
+    positions: set[int],
+) -> None:
+    """
+    Apply N- and C-terminal modification changes to the matrices.
+
+    Terminal modifications are read from ``peptidoform.properties`` under the keys
+    ``"n_term"`` and ``"c_term"``. N-terminal changes are applied at residue index 0 and
+    C-terminal changes at residue index ``seq_len - 1``. Each tag's ``composition`` is
+    handed to ``_apply_composition_to_matrices``, which updates ``mat`` and ``pos_mat`` in
+    place. Tags whose composition cannot be obtained are skipped with a warning.
+
+    Parameters
+    ----------
+    mat
+        Standard matrix to update.
+    pos_mat
+        Positional matrix to update.
+    peptidoform
+        Peptidoform whose terminal modifications are applied.
+    seq_len
+        Length of the (truncated) sequence; determines the C-terminal residue index.
+    dict_index
+        Atom to column mapping for ``mat``.
+    dict_index_pos
+        Atom to column mapping for ``pos_mat``.
+    positions
+        Positions that are tracked in ``pos_mat``.
+
+    """
+    terminal_mods = [
+        (0, peptidoform.properties.get("n_term")),  # N-terminus at position 0
+        (seq_len - 1, peptidoform.properties.get("c_term")),  # C-terminus at last position
+    ]
+    for i, mods in terminal_mods:
+        if not mods:
+            continue
+        for tag in mods:
             try:
-                mat[i, dict_index[atom_comp]] += change
-                if i in positions:
-                    pos_mat[i, dict_index_pos[atom_comp]] += change
-                elif (i - seq_len) in positions:
-                    pos_mat[i - seq_len, dict_index_pos[atom_comp]] += change
-            except KeyError:
-                try:
-                    warnings.warn(f"Replacing pattern for atom: {atom_comp}", stacklevel=2)
-                    atom_comp_clean = sub(r"\[.*?\]", "", atom_comp)
-                    mat[i, dict_index[atom_comp_clean]] += change
-                    if i in positions:
-                        pos_mat[i, dict_index_pos[atom_comp_clean]] += change
-                    elif (i - seq_len) in positions:
-                        pos_mat[i - seq_len, dict_index_pos[atom_comp_clean]] += change
-                except KeyError:
-                    warnings.warn(f"Ignoring atom {atom_comp} at pos {i}", stacklevel=2)
-                    continue
-            except IndexError:
-                warnings.warn(f"Index error for atom {atom_comp} at pos {i}", stacklevel=2)
+                mod_comp = tag.composition
+            except Exception:
+                # NOTE(review): deliberately broad; any failure to obtain a composition is
+                # downgraded to a warning and the tag is skipped. Consider narrowing to the
+                # exception types the tag classes actually raise.
+                warnings.warn(
+                    f"Skipping terminal modification without known composition: {tag}",
+                    stacklevel=2,
+                )
+                continue
+            _apply_composition_to_matrices(
+                mat,
+                pos_mat,
+                mod_comp,
+                i,
+                seq_len,
+                dict_index,
+                dict_index_pos,
+                positions,
+            )
 
 
 def _compute_rolling_sum(matrix: np.ndarray, n: int = 2) -> np.ndarray:
     """
     Return the sums over every window of ``n`` consecutive columns of ``matrix``.
 
     The input is indexed as two-dimensional. The result is float32 and has ``n - 1``
     fewer columns than the input.
     """
     running_total = np.cumsum(matrix, axis=1, dtype=np.float32)
     # Subtracting the total that ended ``n`` columns earlier leaves the last ``n`` columns.
     windowed_tail = running_total[:, n:] - running_total[:, :-n]
     running_total[:, n:] = windowed_tail
     # The first ``n - 1`` columns cover incomplete windows and are dropped.
     return running_total[:, n - 1 :]
-
-
-def encode_peptidoform(
-    peptidoform: Peptidoform | str,
-    add_ccs_features: bool = False,
-    padding_length: int = 60,
-    positions: set[int] | None = None,
-    positions_pos: set[int] | None = None,
-    positions_neg: set[int] | None = None,
-    dict_aa: dict[str, int] | None = None,
-    dict_index_pos: dict[str, int] | None = None,
-    dict_index: dict[str, int] | None = None,
-) -> dict[str, np.ndarray]:
-    """Extract features from a single peptidoform."""
-    positions = positions or DEFAULT_POSITIONS
-    positions_pos = positions_pos or DEFAULT_POSITIONS_POS
-    positions_neg = positions_neg or DEFAULT_POSITIONS_NEG
-    dict_aa = dict_aa or DEFAULT_DICT_AA
-    dict_index_pos = dict_index_pos or DEFAULT_DICT_INDEX_POS
-    dict_index = dict_index or DEFAULT_DICT_INDEX
-
-    if isinstance(peptidoform, str):
-        peptidoform = Peptidoform(peptidoform)
-    seq = peptidoform.sequence
-    charge = peptidoform.precursor_charge
-    seq, seq_len = _truncate_sequence(seq, padding_length)
-
-    std_matrix = _fill_standard_matrix(seq, padding_length, dict_index)
-    onehot_matrix = _fill_onehot_matrix(peptidoform.parsed_sequence, padding_length, dict_aa)
-    pos_matrix = _fill_pos_matrix(
-        seq, seq_len, positions_pos, positions_neg, dict_index, dict_index_pos
-    )
-    _apply_modifications(
-        std_matrix,
-        pos_matrix,
-        peptidoform.parsed_sequence,
-        seq_len,
-        dict_index,
-        dict_index_pos,
-        positions,
-    )
-
-    matrix_all = np.sum(std_matrix, axis=0)
-    matrix_all = np.append(matrix_all, seq_len)
-    if add_ccs_features:
-        if not charge:
-            raise ValueError(f"Peptidoform has no charge: {peptidoform}")
-        matrix_all = np.append(matrix_all, (seq.count("H")) / seq_len)
-        matrix_all = np.append(
-            matrix_all, (seq.count("F") + seq.count("W") + seq.count("Y")) / seq_len
-        )
-        matrix_all = np.append(matrix_all, (seq.count("D") + seq.count("E")) / seq_len)
-        matrix_all = np.append(matrix_all, (seq.count("K") + seq.count("R")) / seq_len)
-        matrix_all = np.append(matrix_all, charge)
-
-    matrix_sum = _compute_rolling_sum(std_matrix.T, n=2)[:, ::2].T
-
-    matrix_global = np.concatenate([matrix_all, pos_matrix.flatten()])
-
-    return {
-        "matrix": std_matrix,
-        "matrix_sum": matrix_sum,
-        "matrix_global": matrix_global,
-        "matrix_hc": onehot_matrix,
-    }
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1 @@`
`1`		`-include deeplc/models/*`
`2`	`1`	`include deeplc/package_data/**/*`
`3`		`-include deeplc/baseline_performance/*`