From 55e83f63ec921f538942e0278c3ad0a25fed8286 Mon Sep 17 00:00:00 2001 From: singjc Date: Mon, 27 Apr 2026 10:10:34 -0400 Subject: [PATCH 1/3] Add validation checks for empty datasets in BaseWriter methods Co-authored-by: Copilot --- pyprophet/io/_base.py | 77 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index 5ddd50d..1a1747a 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -737,6 +737,16 @@ def export_quant_matrix(self, data: pd.DataFrame) -> pd.DataFrame: """ cfg = self.config + # Check if data is empty + if data.empty: + raise ValueError( + "No identification results passed the filtering criteria. " + "The filtered dataset is empty. Please check your filter settings: " + f"max_rs_peakgroup_qvalue={cfg.max_rs_peakgroup_qvalue}, " + f"max_global_peptide_qvalue={cfg.max_global_peptide_qvalue}, " + f"max_global_protein_qvalue={cfg.max_global_protein_qvalue}" + ) + sep = "," if cfg.out_type == "csv" else "\t" level = self.level normalization = self.config.normalization @@ -777,6 +787,13 @@ def _summarize_precursor_level( # Select top ranking peak group only idx = data.groupby(["run_id", "transition_group_id"])["m_score"].idxmin() data = data.loc[idx] + + if data.empty: + raise ValueError( + "No data available for precursor-level summarization. " + "This typically occurs when no peak groups pass the q-value thresholds." + ) + logger.info("Summarizing to precursor level.") # Create matrix matrix = data.pivot_table( @@ -801,6 +818,13 @@ def _summarize_peptide_level( # First get top peak group per precursor idx = data.groupby(["run_id", "transition_group_id"])["m_score"].idxmin() data = data.loc[idx] + + if data.empty: + raise ValueError( + "No data available after filtering for peptide-level summarization. " + "This typically occurs when no peak groups pass the q-value thresholds." + ) + logger.info("Summarizing to peptide level.") # Get top precursors for each peptide if consistent_top: @@ -830,12 +854,29 @@ def _summarize_peptide_level( .reset_index(drop=True) ) + if data.empty: + raise ValueError( + "No data available after selecting top precursors for peptide-level summarization. " + "Check that top_n is not too large for your dataset." + ) + # Summarize by peptide (mean of top precursors) - peptide_matrix = ( - data.groupby(["Sequence", "FullPeptideName", "filename"])["Intensity"] - .mean() - .unstack() - ).reset_index() + # Group and aggregate + grouped = data.groupby(["Sequence", "FullPeptideName", "filename"])["Intensity"].mean() + + # Unstack carefully to avoid index conflicts + try: + peptide_matrix = grouped.unstack(fill_value=None) + except ValueError as e: + if "cannot insert" in str(e): + raise ValueError( + "Failed to create quantification matrix due to duplicate column names. " + "This may indicate malformed data or inconsistent peptide annotations." + ) from e + raise + + # Reset index to convert index columns to regular columns + peptide_matrix = peptide_matrix.reset_index() return peptide_matrix def _summarize_protein_level( @@ -864,6 +905,13 @@ def _summarize_protein_level( right_on="FullPeptideName", how="left", ) + + if protein_matrix.empty: + raise ValueError( + "No protein data available after mapping peptides to proteins. " + "Check that protein annotations are present in the data." + ) + protein_matrix = protein_matrix.explode("ProteinName") if consistent_top: @@ -888,6 +936,12 @@ def _summarize_protein_level( protein_matrix = ( protein_matrix.groupby("ProteinName").mean(numeric_only=True).reset_index() ) + + if protein_matrix.empty: + raise ValueError( + "No data available after protein-level summarization. " + "This may indicate all proteins were filtered out." + ) return protein_matrix @@ -917,6 +971,13 @@ def _summarize_gene_level( right_on="FullPeptideName", how="left", ) + + if gene_matrix.empty: + raise ValueError( + "No gene data available after mapping peptides to genes. " + "Check that gene annotations are present in the data." + ) + gene_matrix = gene_matrix.explode("Gene") if consistent_top: @@ -939,6 +1000,12 @@ def _summarize_gene_level( # Summarize by gene (mean of top peptides) gene_matrix = gene_matrix.groupby("Gene").mean(numeric_only=True).reset_index() + + if gene_matrix.empty: + raise ValueError( + "No data available after gene-level summarization. " + "This may indicate all genes were filtered out." + ) return gene_matrix From 0454065f6b2a12e56ba8867d6f01195eadb7e804 Mon Sep 17 00:00:00 2001 From: singjc Date: Mon, 27 Apr 2026 10:47:31 -0400 Subject: [PATCH 2/3] Improve error handling and clarity in BaseWriter methods for quantification matrix creation Co-authored-by: Copilot --- pyprophet/io/_base.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index 1a1747a..aeef55f 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -864,19 +864,20 @@ def _summarize_peptide_level( # Group and aggregate grouped = data.groupby(["Sequence", "FullPeptideName", "filename"])["Intensity"].mean() - # Unstack carefully to avoid index conflicts + # Unstack and reset index carefully to avoid column name conflicts try: - peptide_matrix = grouped.unstack(fill_value=None) + peptide_matrix = grouped.unstack() + # Reset index to convert index columns to regular columns + peptide_matrix = peptide_matrix.reset_index() except ValueError as e: if "cannot insert" in str(e): raise ValueError( - "Failed to create quantification matrix due to duplicate column names. " - "This may indicate malformed data or inconsistent peptide annotations." + "Failed to create quantification matrix because a run filename " + "conflicts with reserved column names ('Sequence' or 'FullPeptideName'). " + "Please rename the conflicting input file or adjust the data before summarization." ) from e raise - # Reset index to convert index columns to regular columns - peptide_matrix = peptide_matrix.reset_index() return peptide_matrix def _summarize_protein_level( @@ -906,7 +907,7 @@ def _summarize_protein_level( how="left", ) - if protein_matrix.empty: + if protein_map.empty or protein_matrix["ProteinName"].isna().all(): raise ValueError( "No protein data available after mapping peptides to proteins. " "Check that protein annotations are present in the data." @@ -972,7 +973,7 @@ def _summarize_gene_level( how="left", ) - if gene_matrix.empty: + if gene_map.empty or gene_matrix["Gene"].isna().all(): raise ValueError( "No gene data available after mapping peptides to genes. " "Check that gene annotations are present in the data." From f4ade3a095a813518dd5750c328f748261d995c8 Mon Sep 17 00:00:00 2001 From: singjc Date: Mon, 27 Apr 2026 11:09:52 -0400 Subject: [PATCH 3/3] Fix unstacking in BaseWriter to avoid column name conflicts by setting fill_value to None Co-authored-by: Copilot --- pyprophet/io/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index aeef55f..c3c1e7a 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -866,7 +866,7 @@ def _summarize_peptide_level( # Unstack and reset index carefully to avoid column name conflicts try: - peptide_matrix = grouped.unstack() + peptide_matrix = grouped.unstack(fill_value=None) # Reset index to convert index columns to regular columns peptide_matrix = peptide_matrix.reset_index() except ValueError as e: