Computational efficiency
Spectrum processing in spectrum_utils has been optimized for computational efficiency using NumPy and Numba to be able to process thousands of spectra per second.
As shown below, spectrum_utils (version 0.4.0) is faster than alternative libraries, such as pymzML (version 2.5.2) and pyOpenMS (version 2.7.0), when performing typical spectrum processing tasks, including the following steps:
The m/z range is set to 100–1400 m/z.
The precursor peak is removed.
Low-intensity noise peaks are removed.
Peak intensities are scaled by their square root.
import time
import matplotlib.pyplot as plt
import numpy as np
import pyopenms
import pyteomics.mgf
import seaborn as sns
import spectrum_utils.spectrum as sus
from pymzml.spec import Spectrum
min_peaks = 10
min_mz, max_mz = 100, 1400
fragment_tol_mass, fragment_tol_mode = 0.02, "Da"
min_intensity = 0.05
max_num_peaks = 150
def time_spectrum_utils(mgf_filename):
runtimes = []
for spec_dict in pyteomics.mgf.read(mgf_filename):
# Omit invalid spectra.
if (
len(spec_dict["m/z array"]) < min_peaks
or "charge" not in spec_dict["params"]
):
continue
spectrum = sus.MsmsSpectrum(
spec_dict["params"]["title"],
spec_dict["params"]["pepmass"][0],
spec_dict["params"]["charge"][0],
spec_dict["m/z array"],
spec_dict["intensity array"],
float(spec_dict["params"]["rtinseconds"]),
)._inner
start_time = time.time()
spectrum.set_mz_range(min_mz, max_mz)
spectrum.remove_precursor_peak(fragment_tol_mass, fragment_tol_mode)
spectrum.filter_intensity(min_intensity, max_num_peaks)
spectrum.scale_intensity("root", 1)
runtimes.append(time.time() - start_time)
return runtimes
def time_pymzml(mgf_filename):
runtimes = []
for spec_dict in pyteomics.mgf.read(mgf_filename):
# Omit invalid spectra.
if (
len(spec_dict["m/z array"]) < min_peaks
or "charge" not in spec_dict["params"]
):
continue
spec = Spectrum()
spec.set_peaks(
[*zip(spec_dict["m/z array"], spec_dict["intensity array"])], "raw"
)
start_time = time.time()
spec.reduce("raw", (min_mz, max_mz))
spec.remove_precursor_peak()
spec.remove_noise(noise_level=min_intensity)
spec /= np.amax(spec.i)
spec.i = np.sqrt(spec.i)
runtimes.append(time.time() - start_time)
return runtimes
def time_pyopenms(mgf_filename):
experiment = pyopenms.MSExperiment()
pyopenms.MascotGenericFile().load(mgf_filename, experiment)
runtimes = []
for spectrum in experiment:
# Omit invalid spectra.
if (
len(spectrum.get_peaks()[0]) < min_peaks
or spectrum.getPrecursors()[0].getCharge() == 0
):
continue
start_time = time.time()
# Set the m/z range.
filtered_mz, filtered_intensity = [], []
for mz, intensity in zip(*spectrum.get_peaks()):
if min_mz <= mz <= max_mz:
filtered_mz.append(mz)
filtered_intensity.append(intensity)
spectrum.set_peaks((filtered_mz, filtered_intensity))
# Remove the precursor peak.
parent_peak_mower = pyopenms.ParentPeakMower()
parent_peak_mower_params = parent_peak_mower.getDefaults()
parent_peak_mower_params.setValue(
b"window_size", fragment_tol_mass, b""
)
parent_peak_mower.setParameters(parent_peak_mower_params)
parent_peak_mower.filterSpectrum(spectrum)
# Filter by base peak intensity percentage.
pyopenms.Normalizer().filterSpectrum(spectrum)
threshold_mower = pyopenms.ThresholdMower()
threshold_mower_params = threshold_mower.getDefaults()
threshold_mower_params.setValue(b"threshold", min_intensity, b"")
threshold_mower.setParameters(threshold_mower_params)
threshold_mower.filterSpectrum(spectrum)
# Restrict to the most intense peaks.
n_largest = pyopenms.NLargest()
n_largest_params = n_largest.getDefaults()
n_largest_params.setValue(b"n", max_num_peaks, b"")
n_largest.setParameters(n_largest_params)
n_largest.filterSpectrum(spectrum)
# Scale the peak intensities by their square root and normalize.
pyopenms.SqrtMower().filterSpectrum(spectrum)
pyopenms.Normalizer().filterSpectrum(spectrum)
runtimes.append(time.time() - start_time)
return runtimes
mgf_filename = "iPRG2012.mgf"
runtimes_spectrum_utils = time_spectrum_utils(mgf_filename)
runtimes_pyopenms = time_pyopenms(mgf_filename)
runtimes_pymzml = time_pymzml(mgf_filename)
fig, ax = plt.subplots()
sns.boxplot(
data=[runtimes_spectrum_utils, runtimes_pymzml, runtimes_pyopenms],
flierprops={"markersize": 2},
ax=ax,
)
ax.set_yscale("log")
ax.xaxis.set_ticklabels(("spectrum_utils", "pymzML", "pyOpenMS"))
ax.set_ylabel("Processing time per spectrum (s)")
sns.despine()
plt.savefig("runtime.png", bbox_inches="tight", dpi=300, transparent=True)
plt.close()
JIT compilation
Note that the significant outlier for spectrum_utils is caused by Numba’s JIT compilation of the first method call, allowing subsequent calls to be made very efficiently.
If the user knows in advance that only a single method call needs to be made, Numba’s JIT compilation can be disabled to avoid this overhead by setting the NUMBA_DISABLE_JIT
environment variable to 1
.
See the Numba documentation for more information.