Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 57 additions & 25 deletions dotnet/src/GraphRag.Input/CsvFileReader.cs
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
// Copyright (c) 2025 Microsoft Corporation.
// Licensed under the MIT License

using System.Globalization;

using GraphRag.Storage;

namespace GraphRag.Input;

/// <summary>
/// Reads CSV files from storage and produces <see cref="TextDocument"/> instances.
/// Uses a simple built-in CSV parser. For robust CSV parsing (quoted fields,
/// escaping, multi-line values), use the GraphRag.Input.CsvHelper strategy library.
/// Handles quoted fields, escaped double-quotes, and multi-line field values.
/// For additional CSV parsing features, use the GraphRag.Input.CsvHelper strategy library.
/// </summary>
public class CsvFileReader : StructuredFileReader
{
Expand Down Expand Up @@ -47,33 +45,36 @@ public CsvFileReader(
return null;
}

var rows = new List<Dictionary<string, object?>>();
using var reader = new StringReader(content);
var allRows = ParseCsvContent(content);
if (allRows.Count == 0)
{
return [];
}

// Read header line.
var headerLine = await reader.ReadLineAsync(ct).ConfigureAwait(false);
if (headerLine is null)
var headers = allRows[0];
if (Array.TrueForAll(headers, string.IsNullOrWhiteSpace))
{
return rows;
return [];
}

var headers = ParseCsvLine(headerLine);
var rows = new List<Dictionary<string, object?>>();

// Read data lines.
string? line;
while ((line = await reader.ReadLineAsync(ct).ConfigureAwait(false)) is not null)
for (var i = 1; i < allRows.Count; i++)
{
ct.ThrowIfCancellationRequested();
if (string.IsNullOrWhiteSpace(line))

var fields = allRows[i];

// Skip blank rows (a single empty field means an empty line).
if (fields.Length == 1 && string.IsNullOrWhiteSpace(fields[0]))
{
continue;
}

var fields = ParseCsvLine(line);
var row = new Dictionary<string, object?>();
for (var i = 0; i < headers.Length; i++)
for (var j = 0; j < headers.Length; j++)
{
row[headers[i]] = i < fields.Length ? fields[i] : null;
row[headers[j]] = j < fields.Length ? fields[j] : null;
}

rows.Add(row);
Expand All @@ -82,20 +83,27 @@ public CsvFileReader(
return rows;
}

private static string[] ParseCsvLine(string line)
/// <summary>
/// Parses CSV content into rows, correctly handling quoted fields that may contain
/// embedded newlines, commas, and escaped double-quotes (<c>""</c>).
/// </summary>
private static List<string[]> ParseCsvContent(string content)
{
var rows = new List<string[]>();
var fields = new List<string>();
var inQuotes = false;
var field = new System.Text.StringBuilder();
var inQuotes = false;

for (var i = 0; i < line.Length; i++)
for (var i = 0; i < content.Length; i++)
{
var c = line[i];
var c = content[i];

if (inQuotes)
{
if (c == '"')
{
if (i + 1 < line.Length && line[i + 1] == '"')
// Escaped quote: "" inside a quoted field → single "
if (i + 1 < content.Length && content[i + 1] == '"')
{
field.Append('"');
i++;
Expand All @@ -107,6 +115,7 @@ private static string[] ParseCsvLine(string line)
}
else
{
// Append everything verbatim inside quotes, including \r and \n.
field.Append(c);
}
}
Expand All @@ -119,13 +128,36 @@ private static string[] ParseCsvLine(string line)
fields.Add(field.ToString());
field.Clear();
}
else if (c == '\r' || c == '\n')
{
// End of row — consume \r\n as a single line ending.
fields.Add(field.ToString());
field.Clear();
rows.Add([.. fields]);
fields.Clear();

if (c == '\r' && i + 1 < content.Length && content[i + 1] == '\n')
{
i++;
}
}
else
{
field.Append(c);
}
}

fields.Add(field.ToString());
return fields.ToArray();
// Flush trailing content that is not terminated by a newline.
// Only add a row if there is actual non-empty content remaining.
if (field.Length > 0 || fields.Count > 0)
{
fields.Add(field.ToString());
if (!fields.TrueForAll(string.IsNullOrEmpty))
{
rows.Add([.. fields]);
}
}

return rows;
}
}
105 changes: 105 additions & 0 deletions dotnet/tests/GraphRag.Tests.Unit/Input/CsvFileReaderTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// Copyright (c) 2025 Microsoft Corporation.
// Licensed under the MIT License

using FluentAssertions;
using GraphRag.Input;
using GraphRag.Storage;

namespace GraphRag.Tests.Unit.Input;

/// <summary>
/// Exercises <see cref="CsvFileReader"/> against an in-memory storage backend:
/// plain rows, quoted fields (embedded newlines, commas, doubled quotes), and
/// inputs that are expected to yield no documents.
/// </summary>
public class CsvFileReaderTests
{
    /// <summary>
    /// A plain three-column CSV with two data rows produces two documents whose
    /// id, title and text come from the configured columns.
    /// </summary>
    [Fact]
    public async Task ReadFilesAsync_BasicCsv_ReturnsDocuments()
    {
        var store = await StorageWithAsync(
            "data.csv",
            "id,title,text\n1,Hello,Hi how are you today?\n2,World,Fine thanks\n");
        var sut = new CsvFileReader(store, idColumn: "id", titleColumn: "title", textColumn: "text");

        var result = await sut.ReadFilesAsync();

        result.Should().HaveCount(2);

        var first = result[0];
        first.Id.Should().Be("1");
        first.Title.Should().Be("Hello");
        first.Text.Should().Be("Hi how are you today?");

        result[1].Id.Should().Be("2");
    }

    /// <summary>
    /// A quoted field spanning several physical lines stays a single field and
    /// keeps its internal <c>\n</c> characters, even when rows end with <c>\r\n</c>.
    /// Mirrors the Python test <c>test_csv_loader_preserves_multiline_fields</c>.
    /// </summary>
    [Fact]
    public async Task ReadFilesAsync_MultilineQuotedField_PreservesInternalNewlines()
    {
        const string Payload =
            "title,text\r\n\"Post 1\",\"Line one.\nLine two.\nLine three.\"\r\n\"Post 2\",\"Single line.\"\r\n";
        var store = await StorageWithAsync("input.csv", Payload);
        var sut = new CsvFileReader(store, titleColumn: "title", textColumn: "text");

        var result = await sut.ReadFilesAsync();

        result.Should().HaveCount(2);

        var multiLine = result[0];
        multiLine.Title.Should().Be("Post 1");
        multiLine.Text.Should().Be("Line one.\nLine two.\nLine three.");

        var singleLine = result[1];
        singleLine.Title.Should().Be("Post 2");
        singleLine.Text.Should().Be("Single line.");
    }

    /// <summary>
    /// A comma inside a quoted field is part of the value, not a field separator.
    /// </summary>
    [Fact]
    public async Task ReadFilesAsync_QuotedFieldWithComma_FieldNotSplit()
    {
        var store = await StorageWithAsync("data.csv", "id,text\n1,\"hello, world\"\n");
        var sut = new CsvFileReader(store, idColumn: "id", textColumn: "text");

        var result = await sut.ReadFilesAsync();

        result.Should().HaveCount(1);
        result[0].Text.Should().Be("hello, world");
    }

    /// <summary>
    /// A doubled quote (<c>""</c>) inside a quoted field decodes to one literal quote.
    /// </summary>
    [Fact]
    public async Task ReadFilesAsync_EscapedDoubleQuote_DecodedCorrectly()
    {
        var store = await StorageWithAsync("data.csv", "id,text\n1,\"say \"\"hello\"\"\"\n");
        var sut = new CsvFileReader(store, idColumn: "id", textColumn: "text");

        var result = await sut.ReadFilesAsync();

        result.Should().HaveCount(1);
        result[0].Text.Should().Be("say \"hello\"");
    }

    /// <summary>
    /// A CSV file whose content is the empty string yields no documents.
    /// </summary>
    [Fact]
    public async Task ReadFilesAsync_EmptyContent_ReturnsEmpty()
    {
        var store = await StorageWithAsync("data.csv", string.Empty);
        var sut = new CsvFileReader(store);

        var result = await sut.ReadFilesAsync();

        result.Should().BeEmpty();
    }

    /// <summary>
    /// When storage holds only a <c>.json</c> entry, the CSV reader yields no documents.
    /// </summary>
    [Fact]
    public async Task ReadFilesAsync_NoMatchingFiles_ReturnsEmpty()
    {
        var store = await StorageWithAsync("data.json", "{}");
        var sut = new CsvFileReader(store);

        var result = await sut.ReadFilesAsync();

        result.Should().BeEmpty();
    }

    /// <summary>
    /// Creates a fresh <see cref="MemoryStorage"/> holding a single entry.
    /// </summary>
    /// <param name="key">Storage key (file name) to write.</param>
    /// <param name="content">Text stored under <paramref name="key"/>.</param>
    /// <returns>The storage instance containing the entry.</returns>
    private static async Task<MemoryStorage> StorageWithAsync(string key, string content)
    {
        var store = new MemoryStorage();
        await store.SetAsync(key, content);
        return store;
    }
}
Loading