Computational efficiency

Spectrum processing in spectrum_utils has been optimized for computational efficiency using NumPy and Numba to be able to process thousands of spectra per second.

As shown below, spectrum_utils (version 0.4.0) is faster than alternative libraries, such as pymzML (version 2.5.2) and pyOpenMS (version 2.7.0), when performing typical spectrum processing tasks, including the following steps:

  • The m/z range is set to 100–1400 m/z.

  • The precursor peak is removed.

  • Low-intensity noise peaks are removed.

  • Peak intensities are scaled by their square root.

import time

import matplotlib.pyplot as plt
import numpy as np
import pyopenms
import pyteomics.mgf
import seaborn as sns
import spectrum_utils.spectrum as sus
from pymzml.spec import Spectrum


min_peaks = 10
min_mz, max_mz = 100, 1400
fragment_tol_mass, fragment_tol_mode = 0.02, "Da"
min_intensity = 0.05
max_num_peaks = 150


def time_spectrum_utils(mgf_filename):
    runtimes = []
    for spec_dict in pyteomics.mgf.read(mgf_filename):
        # Omit invalid spectra.
        if (
            len(spec_dict["m/z array"]) < min_peaks
            or "charge" not in spec_dict["params"]
        ):
            continue

        spectrum = sus.MsmsSpectrum(
            spec_dict["params"]["title"],
            spec_dict["params"]["pepmass"][0],
            spec_dict["params"]["charge"][0],
            spec_dict["m/z array"],
            spec_dict["intensity array"],
            float(spec_dict["params"]["rtinseconds"]),
        )._inner

        start_time = time.time()

        spectrum.set_mz_range(min_mz, max_mz)
        spectrum.remove_precursor_peak(fragment_tol_mass, fragment_tol_mode)
        spectrum.filter_intensity(min_intensity, max_num_peaks)
        spectrum.scale_intensity("root", 1)

        runtimes.append(time.time() - start_time)

    return runtimes


def time_pymzml(mgf_filename):
    runtimes = []
    for spec_dict in pyteomics.mgf.read(mgf_filename):
        # Omit invalid spectra.
        if (
            len(spec_dict["m/z array"]) < min_peaks
            or "charge" not in spec_dict["params"]
        ):
            continue

        spec = Spectrum()
        spec.set_peaks(
            [*zip(spec_dict["m/z array"], spec_dict["intensity array"])], "raw"
        )

        start_time = time.time()

        spec.reduce("raw", (min_mz, max_mz))
        spec.remove_precursor_peak()
        spec.remove_noise(noise_level=min_intensity)
        spec /= np.amax(spec.i)
        spec.i = np.sqrt(spec.i)

        runtimes.append(time.time() - start_time)

    return runtimes


def time_pyopenms(mgf_filename):
    experiment = pyopenms.MSExperiment()
    pyopenms.MascotGenericFile().load(mgf_filename, experiment)

    runtimes = []
    for spectrum in experiment:
        # Omit invalid spectra.
        if (
            len(spectrum.get_peaks()[0]) < min_peaks
            or spectrum.getPrecursors()[0].getCharge() == 0
        ):
            continue

        start_time = time.time()

        # Set the m/z range.
        filtered_mz, filtered_intensity = [], []
        for mz, intensity in zip(*spectrum.get_peaks()):
            if min_mz <= mz <= max_mz:
                filtered_mz.append(mz)
                filtered_intensity.append(intensity)
            spectrum.set_peaks((filtered_mz, filtered_intensity))
        # Remove the precursor peak.
        parent_peak_mower = pyopenms.ParentPeakMower()
        parent_peak_mower_params = parent_peak_mower.getDefaults()
        parent_peak_mower_params.setValue(
            b"window_size", fragment_tol_mass, b""
        )
        parent_peak_mower.setParameters(parent_peak_mower_params)
        parent_peak_mower.filterSpectrum(spectrum)
        # Filter by base peak intensity percentage.
        pyopenms.Normalizer().filterSpectrum(spectrum)
        threshold_mower = pyopenms.ThresholdMower()
        threshold_mower_params = threshold_mower.getDefaults()
        threshold_mower_params.setValue(b"threshold", min_intensity, b"")
        threshold_mower.setParameters(threshold_mower_params)
        threshold_mower.filterSpectrum(spectrum)
        # Restrict to the most intense peaks.
        n_largest = pyopenms.NLargest()
        n_largest_params = n_largest.getDefaults()
        n_largest_params.setValue(b"n", max_num_peaks, b"")
        n_largest.setParameters(n_largest_params)
        n_largest.filterSpectrum(spectrum)
        # Scale the peak intensities by their square root and normalize.
        pyopenms.SqrtMower().filterSpectrum(spectrum)
        pyopenms.Normalizer().filterSpectrum(spectrum)

        runtimes.append(time.time() - start_time)

    return runtimes


mgf_filename = "iPRG2012.mgf"
runtimes_spectrum_utils = time_spectrum_utils(mgf_filename)
runtimes_pyopenms = time_pyopenms(mgf_filename)
runtimes_pymzml = time_pymzml(mgf_filename)

fig, ax = plt.subplots()
sns.boxplot(
    data=[runtimes_spectrum_utils, runtimes_pymzml, runtimes_pyopenms],
    flierprops={"markersize": 2},
    ax=ax,
)
ax.set_yscale("log")
ax.xaxis.set_ticklabels(("spectrum_utils", "pymzML", "pyOpenMS"))
ax.set_ylabel("Processing time per spectrum (s)")
sns.despine()
plt.savefig("runtime.png", bbox_inches="tight", dpi=300, transparent=True)
plt.close()

JIT compilation

Note that the significant outlier for spectrum_utils is caused by Numba’s JIT compilation of the first method call, allowing subsequent calls to be made very efficiently.

If the user knows in advance that only a single method call needs to be made, Numba’s JIT compilation can be disabled to avoid this overhead by setting the NUMBA_DISABLE_JIT environment variable to 1. See the Numba documentation for more information.