From 5fae41097e69094cbf02bf41a452ceddce213734 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 28 Feb 2026 17:14:07 +0000
Subject: [PATCH 1/2] Initial plan
From 5d354d5106031d9b46530095b3a7700abbfbf001 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 28 Feb 2026 17:23:36 +0000
Subject: [PATCH 2/2] Sync dotnet CsvFileReader with Python multiline-field fix
(#2248)
Co-authored-by: sharpninja <16146732+sharpninja@users.noreply.github.com>
---
dotnet/src/GraphRag.Input/CsvFileReader.cs | 82 +++++++++-----
.../Input/CsvFileReaderTests.cs | 105 ++++++++++++++++++
2 files changed, 162 insertions(+), 25 deletions(-)
create mode 100644 dotnet/tests/GraphRag.Tests.Unit/Input/CsvFileReaderTests.cs
diff --git a/dotnet/src/GraphRag.Input/CsvFileReader.cs b/dotnet/src/GraphRag.Input/CsvFileReader.cs
index 176584ba2..502cf1913 100644
--- a/dotnet/src/GraphRag.Input/CsvFileReader.cs
+++ b/dotnet/src/GraphRag.Input/CsvFileReader.cs
@@ -1,16 +1,14 @@
// Copyright (c) 2025 Microsoft Corporation.
// Licensed under the MIT License
-using System.Globalization;
-
using GraphRag.Storage;
namespace GraphRag.Input;
/// <summary>
/// Reads CSV files from storage and produces instances.
-/// Uses a simple built-in CSV parser. For robust CSV parsing (quoted fields,
-/// escaping, multi-line values), use the GraphRag.Input.CsvHelper strategy library.
+/// Handles quoted fields, escaped double-quotes, and multi-line field values.
+/// For additional CSV parsing features, use the GraphRag.Input.CsvHelper strategy library.
/// </summary>
public class CsvFileReader : StructuredFileReader
{
@@ -47,33 +45,36 @@ public CsvFileReader(
return null;
}
- var rows = new List>();
- using var reader = new StringReader(content);
+ var allRows = ParseCsvContent(content);
+ if (allRows.Count == 0)
+ {
+ return [];
+ }
- // Read header line.
- var headerLine = await reader.ReadLineAsync(ct).ConfigureAwait(false);
- if (headerLine is null)
+ var headers = allRows[0];
+ if (Array.TrueForAll(headers, string.IsNullOrWhiteSpace))
{
- return rows;
+ return [];
}
- var headers = ParseCsvLine(headerLine);
+ var rows = new List>();
- // Read data lines.
- string? line;
- while ((line = await reader.ReadLineAsync(ct).ConfigureAwait(false)) is not null)
+ for (var i = 1; i < allRows.Count; i++)
{
ct.ThrowIfCancellationRequested();
- if (string.IsNullOrWhiteSpace(line))
+
+ var fields = allRows[i];
+
+ // Skip blank rows (a single empty field means an empty line).
+ if (fields.Length == 1 && string.IsNullOrWhiteSpace(fields[0]))
{
continue;
}
- var fields = ParseCsvLine(line);
var row = new Dictionary();
- for (var i = 0; i < headers.Length; i++)
+ for (var j = 0; j < headers.Length; j++)
{
- row[headers[i]] = i < fields.Length ? fields[i] : null;
+ row[headers[j]] = j < fields.Length ? fields[j] : null;
}
rows.Add(row);
@@ -82,20 +83,27 @@ public CsvFileReader(
return rows;
}
- private static string[] ParseCsvLine(string line)
+ /// <summary>
+ /// Parses CSV content into rows, correctly handling quoted fields that may contain
+ /// embedded newlines, commas, and escaped double-quotes ("").
+ /// </summary>
+ private static List ParseCsvContent(string content)
{
+ var rows = new List();
var fields = new List();
- var inQuotes = false;
var field = new System.Text.StringBuilder();
+ var inQuotes = false;
- for (var i = 0; i < line.Length; i++)
+ for (var i = 0; i < content.Length; i++)
{
- var c = line[i];
+ var c = content[i];
+
if (inQuotes)
{
if (c == '"')
{
- if (i + 1 < line.Length && line[i + 1] == '"')
+ // Escaped quote: "" inside a quoted field → single "
+ if (i + 1 < content.Length && content[i + 1] == '"')
{
field.Append('"');
i++;
@@ -107,6 +115,7 @@ private static string[] ParseCsvLine(string line)
}
else
{
+ // Append everything verbatim inside quotes, including \r and \n.
field.Append(c);
}
}
@@ -119,13 +128,36 @@ private static string[] ParseCsvLine(string line)
fields.Add(field.ToString());
field.Clear();
}
+ else if (c == '\r' || c == '\n')
+ {
+ // End of row — consume \r\n as a single line ending.
+ fields.Add(field.ToString());
+ field.Clear();
+ rows.Add([.. fields]);
+ fields.Clear();
+
+ if (c == '\r' && i + 1 < content.Length && content[i + 1] == '\n')
+ {
+ i++;
+ }
+ }
else
{
field.Append(c);
}
}
- fields.Add(field.ToString());
- return fields.ToArray();
+ // Flush trailing content that is not terminated by a newline.
+ // Only add a row if there is actual non-empty content remaining.
+ if (field.Length > 0 || fields.Count > 0)
+ {
+ fields.Add(field.ToString());
+ if (!fields.TrueForAll(string.IsNullOrEmpty))
+ {
+ rows.Add([.. fields]);
+ }
+ }
+
+ return rows;
}
}
diff --git a/dotnet/tests/GraphRag.Tests.Unit/Input/CsvFileReaderTests.cs b/dotnet/tests/GraphRag.Tests.Unit/Input/CsvFileReaderTests.cs
new file mode 100644
index 000000000..fe2aec9bf
--- /dev/null
+++ b/dotnet/tests/GraphRag.Tests.Unit/Input/CsvFileReaderTests.cs
@@ -0,0 +1,105 @@
+// Copyright (c) 2025 Microsoft Corporation.
+// Licensed under the MIT License
+
+using FluentAssertions;
+using GraphRag.Input;
+using GraphRag.Storage;
+
+namespace GraphRag.Tests.Unit.Input;
+
+/// <summary>
+/// Unit tests for <see cref="CsvFileReader"/>.
+/// </summary>
+public class CsvFileReaderTests
+{
+ [Fact]
+ public async Task ReadFilesAsync_BasicCsv_ReturnsDocuments()
+ {
+ var storage = new MemoryStorage();
+ await storage.SetAsync("data.csv", "id,title,text\n1,Hello,Hi how are you today?\n2,World,Fine thanks\n"); // header row plus two unquoted data rows
+
+ var reader = new CsvFileReader(storage, idColumn: "id", titleColumn: "title", textColumn: "text");
+
+ var docs = await reader.ReadFilesAsync();
+
+ docs.Should().HaveCount(2);
+ docs[0].Id.Should().Be("1");
+ docs[0].Title.Should().Be("Hello");
+ docs[0].Text.Should().Be("Hi how are you today?");
+ docs[1].Id.Should().Be("2");
+ }
+
+ [Fact]
+ public async Task ReadFilesAsync_MultilineQuotedField_PreservesInternalNewlines()
+ {
+ // Equivalent to Python test: test_csv_loader_preserves_multiline_fields
+ var csvContent = "title,text\r\n\"Post 1\",\"Line one.\nLine two.\nLine three.\"\r\n\"Post 2\",\"Single line.\"\r\n"; // rows end in \r\n; the first quoted text field contains bare \n
+
+ var storage = new MemoryStorage();
+ await storage.SetAsync("input.csv", csvContent);
+
+ var reader = new CsvFileReader(storage, titleColumn: "title", textColumn: "text");
+
+ var docs = await reader.ReadFilesAsync();
+
+ docs.Should().HaveCount(2);
+ docs[0].Title.Should().Be("Post 1");
+ docs[0].Text.Should().Be("Line one.\nLine two.\nLine three."); // newlines inside the quotes must not split the row
+ docs[1].Title.Should().Be("Post 2");
+ docs[1].Text.Should().Be("Single line.");
+ }
+
+ [Fact]
+ public async Task ReadFilesAsync_QuotedFieldWithComma_FieldNotSplit()
+ {
+ var storage = new MemoryStorage();
+ await storage.SetAsync("data.csv", "id,text\n1,\"hello, world\"\n"); // the comma sits inside a quoted field
+
+ var reader = new CsvFileReader(storage, idColumn: "id", textColumn: "text");
+
+ var docs = await reader.ReadFilesAsync();
+
+ docs.Should().HaveCount(1);
+ docs[0].Text.Should().Be("hello, world");
+ }
+
+ [Fact]
+ public async Task ReadFilesAsync_EscapedDoubleQuote_DecodedCorrectly()
+ {
+ var storage = new MemoryStorage();
+ await storage.SetAsync("data.csv", "id,text\n1,\"say \"\"hello\"\"\"\n"); // raw CSV field is: "say ""hello"""
+
+ var reader = new CsvFileReader(storage, idColumn: "id", textColumn: "text");
+
+ var docs = await reader.ReadFilesAsync();
+
+ docs.Should().HaveCount(1);
+ docs[0].Text.Should().Be("say \"hello\""); // each doubled quote decodes to one literal quote
+ }
+
+ [Fact]
+ public async Task ReadFilesAsync_EmptyContent_ReturnsEmpty()
+ {
+ var storage = new MemoryStorage();
+ await storage.SetAsync("data.csv", string.Empty); // the file exists but has no header or rows
+
+ var reader = new CsvFileReader(storage);
+
+ var docs = await reader.ReadFilesAsync();
+
+ docs.Should().BeEmpty();
+ }
+
+ [Fact]
+ public async Task ReadFilesAsync_NoMatchingFiles_ReturnsEmpty()
+ {
+ var storage = new MemoryStorage();
+ await storage.SetAsync("data.json", "{}"); // only a non-CSV file is stored; presumably the reader's file pattern skips it (confirm)
+
+ var reader = new CsvFileReader(storage);
+
+ var docs = await reader.ReadFilesAsync();
+
+ docs.Should().BeEmpty();
+ }
+}