From 5fae41097e69094cbf02bf41a452ceddce213734 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 28 Feb 2026 17:14:07 +0000 Subject: [PATCH 1/2] Initial plan From 5d354d5106031d9b46530095b3a7700abbfbf001 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 28 Feb 2026 17:23:36 +0000 Subject: [PATCH 2/2] Sync dotnet CsvFileReader with Python multiline-field fix (#2248) Co-authored-by: sharpninja <16146732+sharpninja@users.noreply.github.com> --- dotnet/src/GraphRag.Input/CsvFileReader.cs | 82 +++++++++----- .../Input/CsvFileReaderTests.cs | 105 ++++++++++++++++++ 2 files changed, 162 insertions(+), 25 deletions(-) create mode 100644 dotnet/tests/GraphRag.Tests.Unit/Input/CsvFileReaderTests.cs diff --git a/dotnet/src/GraphRag.Input/CsvFileReader.cs b/dotnet/src/GraphRag.Input/CsvFileReader.cs index 176584ba2..502cf1913 100644 --- a/dotnet/src/GraphRag.Input/CsvFileReader.cs +++ b/dotnet/src/GraphRag.Input/CsvFileReader.cs @@ -1,16 +1,14 @@ // Copyright (c) 2025 Microsoft Corporation. // Licensed under the MIT License -using System.Globalization; - using GraphRag.Storage; namespace GraphRag.Input; /// /// Reads CSV files from storage and produces instances. -/// Uses a simple built-in CSV parser. For robust CSV parsing (quoted fields, -/// escaping, multi-line values), use the GraphRag.Input.CsvHelper strategy library. +/// Handles quoted fields, escaped double-quotes, and multi-line field values. +/// For additional CSV parsing features, use the GraphRag.Input.CsvHelper strategy library. /// public class CsvFileReader : StructuredFileReader { @@ -47,33 +45,36 @@ public CsvFileReader( return null; } - var rows = new List>(); - using var reader = new StringReader(content); + var allRows = ParseCsvContent(content); + if (allRows.Count == 0) + { + return []; + } - // Read header line. - var headerLine = await reader.ReadLineAsync(ct).ConfigureAwait(false); - if (headerLine is null) + var headers = allRows[0]; + if (Array.TrueForAll(headers, string.IsNullOrWhiteSpace)) { - return rows; + return []; } - var headers = ParseCsvLine(headerLine); + var rows = new List>(); - // Read data lines. - string? line; - while ((line = await reader.ReadLineAsync(ct).ConfigureAwait(false)) is not null) + for (var i = 1; i < allRows.Count; i++) { ct.ThrowIfCancellationRequested(); - if (string.IsNullOrWhiteSpace(line)) + + var fields = allRows[i]; + + // Skip blank rows (a single empty field means an empty line). + if (fields.Length == 1 && string.IsNullOrWhiteSpace(fields[0])) { continue; } - var fields = ParseCsvLine(line); var row = new Dictionary(); - for (var i = 0; i < headers.Length; i++) + for (var j = 0; j < headers.Length; j++) { - row[headers[i]] = i < fields.Length ? fields[i] : null; + row[headers[j]] = j < fields.Length ? fields[j] : null; } rows.Add(row); @@ -82,20 +83,27 @@ public CsvFileReader( return rows; } - private static string[] ParseCsvLine(string line) + /// + /// Parses CSV content into rows, correctly handling quoted fields that may contain + /// embedded newlines, commas, and escaped double-quotes (""). + /// + private static List ParseCsvContent(string content) { + var rows = new List(); var fields = new List(); - var inQuotes = false; var field = new System.Text.StringBuilder(); + var inQuotes = false; - for (var i = 0; i < line.Length; i++) + for (var i = 0; i < content.Length; i++) { - var c = line[i]; + var c = content[i]; + if (inQuotes) { if (c == '"') { - if (i + 1 < line.Length && line[i + 1] == '"') + // Escaped quote: "" inside a quoted field → single " + if (i + 1 < content.Length && content[i + 1] == '"') { field.Append('"'); i++; @@ -107,6 +115,7 @@ private static string[] ParseCsvLine(string line) } else { + // Append everything verbatim inside quotes, including \r and \n. field.Append(c); } } @@ -119,13 +128,36 @@ private static string[] ParseCsvLine(string line) fields.Add(field.ToString()); field.Clear(); } + else if (c == '\r' || c == '\n') + { + // End of row — consume \r\n as a single line ending. + fields.Add(field.ToString()); + field.Clear(); + rows.Add([.. fields]); + fields.Clear(); + + if (c == '\r' && i + 1 < content.Length && content[i + 1] == '\n') + { + i++; + } + } else { field.Append(c); } } - fields.Add(field.ToString()); - return fields.ToArray(); + // Flush trailing content that is not terminated by a newline. + // Only add a row if there is actual non-empty content remaining. + if (field.Length > 0 || fields.Count > 0) + { + fields.Add(field.ToString()); + if (!fields.TrueForAll(string.IsNullOrEmpty)) + { + rows.Add([.. fields]); + } + } + + return rows; } } diff --git a/dotnet/tests/GraphRag.Tests.Unit/Input/CsvFileReaderTests.cs b/dotnet/tests/GraphRag.Tests.Unit/Input/CsvFileReaderTests.cs new file mode 100644 index 000000000..fe2aec9bf --- /dev/null +++ b/dotnet/tests/GraphRag.Tests.Unit/Input/CsvFileReaderTests.cs @@ -0,0 +1,105 @@ +// Copyright (c) 2025 Microsoft Corporation. +// Licensed under the MIT License + +using FluentAssertions; +using GraphRag.Input; +using GraphRag.Storage; + +namespace GraphRag.Tests.Unit.Input; + +/// +/// Unit tests for . +/// +public class CsvFileReaderTests +{ + [Fact] + public async Task ReadFilesAsync_BasicCsv_ReturnsDocuments() + { + var storage = new MemoryStorage(); + await storage.SetAsync("data.csv", "id,title,text\n1,Hello,Hi how are you today?\n2,World,Fine thanks\n"); + + var reader = new CsvFileReader(storage, idColumn: "id", titleColumn: "title", textColumn: "text"); + + var docs = await reader.ReadFilesAsync(); + + docs.Should().HaveCount(2); + docs[0].Id.Should().Be("1"); + docs[0].Title.Should().Be("Hello"); + docs[0].Text.Should().Be("Hi how are you today?"); + docs[1].Id.Should().Be("2"); + } + + [Fact] + public async Task ReadFilesAsync_MultilineQuotedField_PreservesInternalNewlines() + { + // Equivalent to Python test: test_csv_loader_preserves_multiline_fields + var csvContent = "title,text\r\n\"Post 1\",\"Line one.\nLine two.\nLine three.\"\r\n\"Post 2\",\"Single line.\"\r\n"; + + var storage = new MemoryStorage(); + await storage.SetAsync("input.csv", csvContent); + + var reader = new CsvFileReader(storage, titleColumn: "title", textColumn: "text"); + + var docs = await reader.ReadFilesAsync(); + + docs.Should().HaveCount(2); + docs[0].Title.Should().Be("Post 1"); + docs[0].Text.Should().Be("Line one.\nLine two.\nLine three."); + docs[1].Title.Should().Be("Post 2"); + docs[1].Text.Should().Be("Single line."); + } + + [Fact] + public async Task ReadFilesAsync_QuotedFieldWithComma_FieldNotSplit() + { + var storage = new MemoryStorage(); + await storage.SetAsync("data.csv", "id,text\n1,\"hello, world\"\n"); + + var reader = new CsvFileReader(storage, idColumn: "id", textColumn: "text"); + + var docs = await reader.ReadFilesAsync(); + + docs.Should().HaveCount(1); + docs[0].Text.Should().Be("hello, world"); + } + + [Fact] + public async Task ReadFilesAsync_EscapedDoubleQuote_DecodedCorrectly() + { + var storage = new MemoryStorage(); + await storage.SetAsync("data.csv", "id,text\n1,\"say \"\"hello\"\"\"\n"); + + var reader = new CsvFileReader(storage, idColumn: "id", textColumn: "text"); + + var docs = await reader.ReadFilesAsync(); + + docs.Should().HaveCount(1); + docs[0].Text.Should().Be("say \"hello\""); + } + + [Fact] + public async Task ReadFilesAsync_EmptyContent_ReturnsEmpty() + { + var storage = new MemoryStorage(); + await storage.SetAsync("data.csv", string.Empty); + + var reader = new CsvFileReader(storage); + + var docs = await reader.ReadFilesAsync(); + + docs.Should().BeEmpty(); + } + + [Fact] + public async Task ReadFilesAsync_NoMatchingFiles_ReturnsEmpty() + { + var storage = new MemoryStorage(); + await storage.SetAsync("data.json", "{}"); + + var reader = new CsvFileReader(storage); + + var docs = await reader.ReadFilesAsync(); + + docs.Should().BeEmpty(); + } +}