From 55e83f63ec921f538942e0278c3ad0a25fed8286 Mon Sep 17 00:00:00 2001
From: singjc <justincsing@gmail.com>
Date: Mon, 27 Apr 2026 10:10:34 -0400
Subject: [PATCH 1/3] Add validation checks for empty datasets in BaseWriter
 methods

Co-authored-by: Copilot <copilot@github.com>
---
 pyprophet/io/_base.py | 77 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 72 insertions(+), 5 deletions(-)

diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py
index 5ddd50d..1a1747a 100644
--- a/pyprophet/io/_base.py
+++ b/pyprophet/io/_base.py
@@ -737,6 +737,16 @@ def export_quant_matrix(self, data: pd.DataFrame) -> pd.DataFrame:
         """
         cfg = self.config
 
+        # Check if data is empty
+        if data.empty:
+            raise ValueError(
+                "No identification results passed the filtering criteria. "
+                "The filtered dataset is empty. Please check your filter settings: "
+                f"max_rs_peakgroup_qvalue={cfg.max_rs_peakgroup_qvalue}, "
+                f"max_global_peptide_qvalue={cfg.max_global_peptide_qvalue}, "
+                f"max_global_protein_qvalue={cfg.max_global_protein_qvalue}"
+            )
+
         sep = "," if cfg.out_type == "csv" else "\t"
         level = self.level
         normalization = self.config.normalization
@@ -777,6 +787,13 @@ def _summarize_precursor_level(
         # Select top ranking peak group only
         idx = data.groupby(["run_id", "transition_group_id"])["m_score"].idxmin()
         data = data.loc[idx]
+        
+        if data.empty:
+            raise ValueError(
+                "No data available for precursor-level summarization. "
+                "This typically occurs when no peak groups pass the q-value thresholds."
+            )
+        
         logger.info("Summarizing to precursor level.")
         # Create matrix
         matrix = data.pivot_table(
@@ -801,6 +818,13 @@ def _summarize_peptide_level(
         # First get top peak group per precursor
         idx = data.groupby(["run_id", "transition_group_id"])["m_score"].idxmin()
         data = data.loc[idx]
+        
+        if data.empty:
+            raise ValueError(
+                "No data available after filtering for peptide-level summarization. "
+                "This typically occurs when no peak groups pass the q-value thresholds."
+            )
+        
         logger.info("Summarizing to peptide level.")
         # Get top precursors for each peptide
         if consistent_top:
@@ -830,12 +854,29 @@ def _summarize_peptide_level(
                 .reset_index(drop=True)
             )
 
+        if data.empty:
+            raise ValueError(
+                "No data available after selecting top precursors for peptide-level summarization. "
+                "Check that top_n is not too large for your dataset."
+            )
+
         # Summarize by peptide (mean of top precursors)
-        peptide_matrix = (
-            data.groupby(["Sequence", "FullPeptideName", "filename"])["Intensity"]
-            .mean()
-            .unstack()
-        ).reset_index()
+        # Group and aggregate
+        grouped = data.groupby(["Sequence", "FullPeptideName", "filename"])["Intensity"].mean()
+        
+        # Unstack carefully to avoid index conflicts
+        try:
+            peptide_matrix = grouped.unstack(fill_value=None)
+        except ValueError as e:
+            if "cannot insert" in str(e):
+                raise ValueError(
+                    "Failed to create quantification matrix due to duplicate column names. "
+                    "This may indicate malformed data or inconsistent peptide annotations."
+                ) from e
+            raise
+        
+        # Reset index to convert index columns to regular columns
+        peptide_matrix = peptide_matrix.reset_index()
         return peptide_matrix
 
     def _summarize_protein_level(
@@ -864,6 +905,13 @@ def _summarize_protein_level(
             right_on="FullPeptideName",
             how="left",
         )
+        
+        if protein_matrix.empty:
+            raise ValueError(
+                "No protein data available after mapping peptides to proteins. "
+                "Check that protein annotations are present in the data."
+            )
+        
         protein_matrix = protein_matrix.explode("ProteinName")
 
         if consistent_top:
@@ -888,6 +936,12 @@ def _summarize_protein_level(
         protein_matrix = (
             protein_matrix.groupby("ProteinName").mean(numeric_only=True).reset_index()
         )
+        
+        if protein_matrix.empty:
+            raise ValueError(
+                "No data available after protein-level summarization. "
+                "This may indicate all proteins were filtered out."
+            )
 
         return protein_matrix
 
@@ -917,6 +971,13 @@ def _summarize_gene_level(
             right_on="FullPeptideName",
             how="left",
         )
+        
+        if gene_matrix.empty:
+            raise ValueError(
+                "No gene data available after mapping peptides to genes. "
+                "Check that gene annotations are present in the data."
+            )
+        
         gene_matrix = gene_matrix.explode("Gene")
 
         if consistent_top:
@@ -939,6 +1000,12 @@ def _summarize_gene_level(
 
         # Summarize by gene (mean of top peptides)
         gene_matrix = gene_matrix.groupby("Gene").mean(numeric_only=True).reset_index()
+        
+        if gene_matrix.empty:
+            raise ValueError(
+                "No data available after gene-level summarization. "
+                "This may indicate all genes were filtered out."
+            )
 
         return gene_matrix
 

From 0454065f6b2a12e56ba8867d6f01195eadb7e804 Mon Sep 17 00:00:00 2001
From: singjc <justincsing@gmail.com>
Date: Mon, 27 Apr 2026 10:47:31 -0400
Subject: [PATCH 2/3] Improve error handling and clarity in BaseWriter methods
 for quantification matrix creation

Co-authored-by: Copilot <copilot@github.com>
---
 pyprophet/io/_base.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py
index 1a1747a..aeef55f 100644
--- a/pyprophet/io/_base.py
+++ b/pyprophet/io/_base.py
@@ -864,19 +864,20 @@ def _summarize_peptide_level(
         # Group and aggregate
         grouped = data.groupby(["Sequence", "FullPeptideName", "filename"])["Intensity"].mean()
         
-        # Unstack carefully to avoid index conflicts
+        # Unstack and reset index carefully to avoid column name conflicts
         try:
-            peptide_matrix = grouped.unstack(fill_value=None)
+            peptide_matrix = grouped.unstack()
+            # Reset index to convert index columns to regular columns
+            peptide_matrix = peptide_matrix.reset_index()
         except ValueError as e:
             if "cannot insert" in str(e):
                 raise ValueError(
-                    "Failed to create quantification matrix due to duplicate column names. "
-                    "This may indicate malformed data or inconsistent peptide annotations."
+                    "Failed to create quantification matrix because a run filename "
+                    "conflicts with reserved column names ('Sequence' or 'FullPeptideName'). "
+                    "Please rename the conflicting input file or adjust the data before summarization."
                 ) from e
             raise
         
-        # Reset index to convert index columns to regular columns
-        peptide_matrix = peptide_matrix.reset_index()
         return peptide_matrix
 
     def _summarize_protein_level(
@@ -906,7 +907,7 @@ def _summarize_protein_level(
             how="left",
         )
         
-        if protein_matrix.empty:
+        if protein_map.empty or protein_matrix["ProteinName"].isna().all():
             raise ValueError(
                 "No protein data available after mapping peptides to proteins. "
                 "Check that protein annotations are present in the data."
@@ -972,7 +973,7 @@ def _summarize_gene_level(
             how="left",
         )
         
-        if gene_matrix.empty:
+        if gene_map.empty or gene_matrix["Gene"].isna().all():
             raise ValueError(
                 "No gene data available after mapping peptides to genes. "
                 "Check that gene annotations are present in the data."

From f4ade3a095a813518dd5750c328f748261d995c8 Mon Sep 17 00:00:00 2001
From: singjc <justincsing@gmail.com>
Date: Mon, 27 Apr 2026 11:09:52 -0400
Subject: [PATCH 3/3] Restore explicit fill_value=None when unstacking in
 BaseWriter (matches the pandas default; no behavior change)

Co-authored-by: Copilot <copilot@github.com>
---
 pyprophet/io/_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py
index aeef55f..c3c1e7a 100644
--- a/pyprophet/io/_base.py
+++ b/pyprophet/io/_base.py
@@ -866,7 +866,7 @@ def _summarize_peptide_level(
         
         # Unstack and reset index carefully to avoid column name conflicts
         try:
-            peptide_matrix = grouped.unstack()
+            peptide_matrix = grouped.unstack(fill_value=None)
             # Reset index to convert index columns to regular columns
             peptide_matrix = peptide_matrix.reset_index()
         except ValueError as e: