From a95b8e4ead30db83e3ca1cbbfd3eee9b0a77a7bf Mon Sep 17 00:00:00 2001 From: "Victor M. Varela" Date: Fri, 22 May 2026 16:38:44 +0200 Subject: [PATCH 1/4] feat: autodetect DATE/DATETIME fields in CSV type inference (#142) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extend ColumnType enum with DATE, DATE_EU, DATE_US, DATETIME, DATETIME_EU, DATETIME_US sub-variants; all map to TEXT affinity in DDL - Add displayName() method: DATE*/DATETIME* display as DATE/DATETIME in --columns, --validate, and --sample output instead of internal tag name - Add isDate() / isDateTime() detectors (length-gated; no overlap) - Add SlashOrder enum + accumSlashOrder() for DD/MM vs MM/DD disambiguation per column: d1>12→EU, d2>12→US, both≤12→abstain, contradictory→TEXT - Rewrite inferTypes() with 11 tracking arrays; DATETIME>DATE>INTEGER>REAL>TEXT priority; mixed ISO+slash format or mixed date+datetime → TEXT fallback - Add normalizeDateToIso() / normalizeDateTimeToIso() helpers that reformat EU/US/dash/T-separator values into YYYY-MM-DD / YYYY-MM-DD HH:MM:SS - Update insertRowTyped() with 6 new ColumnType cases; stack-buffer bind uses sqliteTransient() (SQLITE_TRANSIENT sentinel via @setRuntimeSafety(false)) - Add sqliteTransient() fn in sqlite.zig (replaces unrepresentable const) - Add loader unit test binary to build.zig (unit-test step) - Add 15 date/datetime integration tests (140-154) covering ISO, EU dash, EU slash, US slash, ambiguous, --columns, --validate, ORDER BY, --no-type-inference - All 154 integration tests + CSV/XML/loader unit tests pass; ziglint clean --- build.zig | 178 +++++++++++- src/loader.zig | 595 +++++++++++++++++++++++++++++++++++++++-- src/modes/columns.zig | 2 +- src/modes/sample.zig | 2 +- src/modes/validate.zig | 2 +- src/sqlite.zig | 60 ++++- 6 files changed, 816 insertions(+), 23 deletions(-) diff --git a/build.zig b/build.zig index 4aa353e..571469e 100644 --- a/build.zig +++ b/build.zig @@ -1438,7 +1438,161 
@@ pub fn build(b: *std.Build) void { test_json_path_format_mismatch.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_json_path_format_mismatch.step); - // Unit tests for the RFC 4180 CSV parser (src/csv.zig) + // ─── Date / datetime type inference integration tests ──────────────────── + + // Integration test 140: ISO date column is stored and queryable as YYYY-MM-DD text + const test_date_iso = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'id,dob\n1,2024-01-15\n2,1999-12-31\n' \ + \\ | ./zig-out/bin/sql-pipe "SELECT dob FROM t WHERE id=1") + \\[ "$result" = "2024-01-15" ] + }); + test_date_iso.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_date_iso.step); + + // Integration test 141: ISO date column supports SQLite date() function + const test_date_iso_func = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'id,dob\n1,2024-01-15\n' \ + \\ | ./zig-out/bin/sql-pipe "SELECT date(dob) FROM t") + \\[ "$result" = "2024-01-15" ] + }); + test_date_iso_func.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_date_iso_func.step); + + // Integration test 142: EU-dash date (DD-MM-YYYY) normalized to ISO on insert + const test_date_eu_dash = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'id,dob\n1,15-01-2024\n2,31-12-1999\n' \ + \\ | ./zig-out/bin/sql-pipe "SELECT dob FROM t ORDER BY dob") + \\expected=$(printf '1999-12-31\n2024-01-15') + \\[ "$result" = "$expected" ] + }); + test_date_eu_dash.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_date_eu_dash.step); + + // Integration test 143: EU-slash date (DD/MM/YYYY) detected when d1 > 12 + const test_date_eu_slash = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'id,dob\n1,15/01/2024\n2,31/12/1999\n' \ + \\ | ./zig-out/bin/sql-pipe "SELECT dob FROM t ORDER BY dob") + \\expected=$(printf '1999-12-31\n2024-01-15') + \\[ "$result" = "$expected" ] + }); + test_date_eu_slash.step.dependOn(b.getInstallStep()); + 
test_step.dependOn(&test_date_eu_slash.step); + + // Integration test 144: US-slash date (MM/DD/YYYY) detected when d2 > 12 + const test_date_us_slash = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'id,dob\n1,01/15/2024\n2,12/31/1999\n' \ + \\ | ./zig-out/bin/sql-pipe "SELECT dob FROM t ORDER BY dob") + \\expected=$(printf '1999-12-31\n2024-01-15') + \\[ "$result" = "$expected" ] + }); + test_date_us_slash.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_date_us_slash.step); + + // Integration test 145: ambiguous slash date (both ≤ 12) → TEXT, no normalization + const test_date_slash_ambiguous = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'id,dob\n1,05/06/2024\n' \ + \\ | ./zig-out/bin/sql-pipe "SELECT dob FROM t") + \\[ "$result" = "05/06/2024" ] + }); + test_date_slash_ambiguous.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_date_slash_ambiguous.step); + + // Integration test 146: ISO datetime (space separator) stored as ISO and queryable + const test_datetime_iso_space = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'id,ts\n1,2024-01-15 10:30:00\n' \ + \\ | ./zig-out/bin/sql-pipe "SELECT ts FROM t") + \\[ "$result" = "2024-01-15 10:30:00" ] + }); + test_datetime_iso_space.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_datetime_iso_space.step); + + // Integration test 147: ISO datetime T-separator normalized to space on insert + const test_datetime_iso_t = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'id,ts\n1,2024-01-15T10:30:00\n' \ + \\ | ./zig-out/bin/sql-pipe "SELECT ts FROM t") + \\[ "$result" = "2024-01-15 10:30:00" ] + }); + test_datetime_iso_t.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_datetime_iso_t.step); + + // Integration test 148: EU-slash datetime (DD/MM/YYYY HH:MM) normalized to ISO + const test_datetime_eu_slash = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'id,ts\n1,15/01/2024 10:30\n' \ + \\ | 
./zig-out/bin/sql-pipe "SELECT ts FROM t") + \\[ "$result" = "2024-01-15 10:30:00" ] + }); + test_datetime_eu_slash.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_datetime_eu_slash.step); + + // Integration test 149: US-slash datetime (MM/DD/YYYY HH:MM) normalized to ISO + const test_datetime_us_slash = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'id,ts\n1,01/15/2024 10:30\n' \ + \\ | ./zig-out/bin/sql-pipe "SELECT ts FROM t") + \\[ "$result" = "2024-01-15 10:30:00" ] + }); + test_datetime_us_slash.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_datetime_us_slash.step); + + // Integration test 150: --columns --verbose shows DATE for date column + const test_columns_date_type = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'id,dob\n1,2024-01-15\n' \ + \\ | ./zig-out/bin/sql-pipe --columns --verbose) + \\echo "$result" | grep -q "dob DATE" + }); + test_columns_date_type.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_columns_date_type.step); + + // Integration test 151: --columns --verbose shows DATETIME for datetime column + const test_columns_datetime_type = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'id,ts\n1,2024-01-15 10:30:00\n' \ + \\ | ./zig-out/bin/sql-pipe --columns --verbose) + \\echo "$result" | grep -q "ts DATETIME" + }); + test_columns_datetime_type.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_columns_datetime_type.step); + + // Integration test 152: --validate shows DATE in schema summary + const test_validate_date_type = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'id,dob\n1,2024-01-15\n2,1999-12-31\n' \ + \\ | ./zig-out/bin/sql-pipe --validate) + \\echo "$result" | grep -q "dob DATE" + }); + test_validate_date_type.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_validate_date_type.step); + + // Integration test 153: date column supports ORDER BY (ISO sort = chronological) + const test_date_order_by = 
b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'name,dob\nBob,15-01-1990\nAlice,20-03-1985\nCarol,01-07-1992\n' \ + \\ | ./zig-out/bin/sql-pipe "SELECT name FROM t ORDER BY dob") + \\expected=$(printf 'Alice\nBob\nCarol') + \\[ "$result" = "$expected" ] + }); + test_date_order_by.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_date_order_by.step); + + // Integration test 154: --no-type-inference keeps date as TEXT (no normalization) + const test_date_no_type_inference = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'id,dob\n1,15/01/2024\n' \ + \\ | ./zig-out/bin/sql-pipe --no-type-inference "SELECT dob FROM t") + \\[ "$result" = "15/01/2024" ] + }); + test_date_no_type_inference.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_date_no_type_inference.step); const unit_tests = b.addTest(.{ .root_module = b.createModule(.{ .root_source_file = b.path("src/csv.zig"), @@ -1472,4 +1626,26 @@ pub fn build(b: *std.Build) void { const run_xml_unit_tests = b.addRunArtifact(xml_unit_tests); test_step.dependOn(&run_xml_unit_tests.step); unit_test_step.dependOn(&run_xml_unit_tests.step); + + // Unit tests for the CSV loader (src/loader.zig) — isDate, isDateTime, inferTypes, normalize helpers + const loader_unit_tests = b.addTest(.{ + .root_module = b.createModule(.{ + .root_source_file = b.path("src/loader.zig"), + .target = target, + .optimize = optimize, + .link_libc = true, + }), + }); + loader_unit_tests.root_module.addImport("c", translate_c.createModule()); + if (bundle_sqlite) { + loader_unit_tests.root_module.addIncludePath(b.path("lib")); + loader_unit_tests.root_module.addCSourceFile(.{ + .file = b.path("lib/sqlite3.c"), + .flags = &.{"-DSQLITE_OMIT_LOAD_EXTENSION=1"}, + }); + } else { + loader_unit_tests.root_module.linkSystemLibrary("sqlite3", .{}); + } + const run_loader_unit_tests = b.addRunArtifact(loader_unit_tests); + unit_test_step.dependOn(&run_loader_unit_tests.step); } diff --git a/src/loader.zig 
b/src/loader.zig index 7eac40f..0852104 100644 --- a/src/loader.zig +++ b/src/loader.zig @@ -45,15 +45,122 @@ pub fn isReal(val: []const u8) bool { return true; } +/// isDate(val) → bool +/// Pre: val is a valid UTF-8 slice +/// Post: result = val matches one of the supported date formats: +/// YYYY-MM-DD (ISO 8601) +/// DD-MM-YYYY (European dash) +/// DD/MM/YYYY (European slash — D1 may be > 12) +/// MM/DD/YYYY (American slash — D2 may be > 12) +/// Slash formats are accepted regardless of which component is day vs month; +/// the caller resolves ambiguity at the column level (see inferTypes). +/// Basic range checks: month 01-12, day 01-31. +pub fn isDate(val: []const u8) bool { + if (val.len != 10) return false; + if (val[4] == '-' and val[7] == '-') { + // YYYY-MM-DD + for ([_]usize{ 0, 1, 2, 3, 5, 6, 8, 9 }) |i| if (!std.ascii.isDigit(val[i])) return false; + const month: u8 = (val[5] - '0') * 10 + (val[6] - '0'); + const day: u8 = (val[8] - '0') * 10 + (val[9] - '0'); + return month >= 1 and month <= 12 and day >= 1 and day <= 31; + } + if (val[2] == '-' and val[5] == '-') { + // DD-MM-YYYY (European dash) + for ([_]usize{ 0, 1, 3, 4, 6, 7, 8, 9 }) |i| if (!std.ascii.isDigit(val[i])) return false; + const day: u8 = (val[0] - '0') * 10 + (val[1] - '0'); + const month: u8 = (val[3] - '0') * 10 + (val[4] - '0'); + return month >= 1 and month <= 12 and day >= 1 and day <= 31; + } + if (val[2] == '/' and val[5] == '/') { + // DD/MM/YYYY or MM/DD/YYYY (slash format — ambiguity resolved at column level) + for ([_]usize{ 0, 1, 3, 4, 6, 7, 8, 9 }) |i| if (!std.ascii.isDigit(val[i])) return false; + const d1: u8 = (val[0] - '0') * 10 + (val[1] - '0'); + const d2: u8 = (val[3] - '0') * 10 + (val[4] - '0'); + if (d1 == 0 or d1 > 31) return false; + if (d2 == 0 or d2 > 31) return false; + return d1 <= 12 or d2 <= 12; // at least one must be a valid month + } + return false; +} + +/// isDateTime(val) → bool +/// Pre: val is a valid UTF-8 slice +/// Post: result = val 
matches one of the supported datetime formats: +/// YYYY-MM-DD HH:MM:SS (ISO 8601 with space separator, 19 chars) +/// YYYY-MM-DDTHH:MM:SS (ISO 8601 with T separator, 19 chars) +/// DD/MM/YYYY HH:MM (European slash, 16 chars) +/// MM/DD/YYYY HH:MM (American slash, 16 chars) +/// Slash formats: D1/D2 ambiguity resolved at column level (see inferTypes). +/// Range checks: month 01-12, day 01-31, hour 00-23, min 00-59, sec 00-59. +pub fn isDateTime(val: []const u8) bool { + if (val.len == 19) { + // YYYY-MM-DD[T ]HH:MM:SS + if (val[4] != '-' or val[7] != '-') return false; + if (val[10] != ' ' and val[10] != 'T') return false; + if (val[13] != ':' or val[16] != ':') return false; + for ([_]usize{ 0, 1, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18 }) |i| + if (!std.ascii.isDigit(val[i])) return false; + const month: u8 = (val[5] - '0') * 10 + (val[6] - '0'); + const day: u8 = (val[8] - '0') * 10 + (val[9] - '0'); + const hour: u8 = (val[11] - '0') * 10 + (val[12] - '0'); + const min: u8 = (val[14] - '0') * 10 + (val[15] - '0'); + const sec: u8 = (val[17] - '0') * 10 + (val[18] - '0'); + return month >= 1 and month <= 12 and day >= 1 and day <= 31 and + hour <= 23 and min <= 59 and sec <= 59; + } + if (val.len == 16) { + // DD/MM/YYYY HH:MM or MM/DD/YYYY HH:MM + if (val[2] != '/' or val[5] != '/') return false; + if (val[10] != ' ' or val[13] != ':') return false; + for ([_]usize{ 0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15 }) |i| + if (!std.ascii.isDigit(val[i])) return false; + const d1: u8 = (val[0] - '0') * 10 + (val[1] - '0'); + const d2: u8 = (val[3] - '0') * 10 + (val[4] - '0'); + if (d1 == 0 or d1 > 31) return false; + if (d2 == 0 or d2 > 31) return false; + if (d1 > 12 and d2 > 12) return false; // no valid month + const hour: u8 = (val[11] - '0') * 10 + (val[12] - '0'); + const min: u8 = (val[14] - '0') * 10 + (val[15] - '0'); + return hour <= 23 and min <= 59; + } + return false; +} + +/// Slash-format day/month order for a date or datetime column. 
+/// Accumulated per-column during type inference to disambiguate DD/MM vs MM/DD. +const SlashOrder = enum { unknown, eu, us, contradictory }; + +/// accumSlashOrder(current, vote) → SlashOrder +/// Merge a new vote into the running slash-order state for a column. +/// `.unknown` votes (ambiguous value, both components ≤ 12) are ignored. +fn accumSlashOrder(current: SlashOrder, vote: SlashOrder) SlashOrder { + if (vote == .unknown) return current; + return switch (current) { + .unknown => vote, + .eu => if (vote == .eu) .eu else .contradictory, + .us => if (vote == .us) .us else .contradictory, + .contradictory => .contradictory, + }; +} + /// inferTypes(buffer, num_cols, allocator) → []ColumnType /// Pre: buffer is a slice of rows (each row is a slice of field strings) /// num_cols > 0; allocator is valid -/// Post: result.len = num_cols -/// result[j] = INTEGER ⟺ all non-empty values in column j are integers -/// result[j] = REAL ⟺ all non-empty values are numeric but at least one -/// is not a plain integer -/// result[j] = TEXT ⟺ at least one non-empty value is non-numeric, -/// OR no non-empty values exist +/// Post: result.len = num_cols; result[j] is the most specific type that +/// accommodates all non-empty values in column j: +/// DATETIME / DATETIME_EU / DATETIME_US — all values are datetime strings +/// DATE / DATE_EU / DATE_US — all values are date strings (no datetime) +/// INTEGER — all values are plain integers +/// REAL — all values are numeric (at least one is non-integer) +/// TEXT — any non-numeric/non-date value, or no data, +/// or ambiguous/contradictory slash format +/// +/// Slash-format disambiguation: for DD/MM/YYYY and MM/DD/YYYY, values with +/// D1 > 12 vote EU; values with D2 > 12 vote US; both ≤ 12 abstain. +/// Contradictory votes or all-abstain → TEXT. +/// +/// Datetime/date priority: isDateTime is checked before isDate; columns with +/// mixed datetime+date values or mixed ISO+slash formats fall back to TEXT. 
pub fn inferTypes( allocator: std.mem.Allocator, buffer: []const [][]u8, @@ -62,6 +169,7 @@ pub fn inferTypes( const types = try allocator.alloc(ColumnType, num_cols); errdefer allocator.free(types); + // Numeric tracking (existing) const can_be_integer = try allocator.alloc(bool, num_cols); defer allocator.free(can_be_integer); const can_be_real = try allocator.alloc(bool, num_cols); @@ -69,24 +177,87 @@ pub fn inferTypes( const has_data = try allocator.alloc(bool, num_cols); defer allocator.free(has_data); - // Initialise: optimistically assume every column can be INTEGER + // Datetime tracking + const can_be_datetime = try allocator.alloc(bool, num_cols); + defer allocator.free(can_be_datetime); + const dt_has_iso = try allocator.alloc(bool, num_cols); // 19-char ISO datetime values seen + defer allocator.free(dt_has_iso); + const dt_has_slash = try allocator.alloc(bool, num_cols); // 16-char slash datetime values seen + defer allocator.free(dt_has_slash); + const slash_order_dt = try allocator.alloc(SlashOrder, num_cols); + defer allocator.free(slash_order_dt); + + // Date tracking + const can_be_date = try allocator.alloc(bool, num_cols); + defer allocator.free(can_be_date); + const d_has_nonslash = try allocator.alloc(bool, num_cols); // YYYY-MM-DD or DD-MM-YYYY seen + defer allocator.free(d_has_nonslash); + const d_has_slash = try allocator.alloc(bool, num_cols); // D1/D2/YYYY slash values seen + defer allocator.free(d_has_slash); + const slash_order_d = try allocator.alloc(SlashOrder, num_cols); + defer allocator.free(slash_order_d); + for (0..num_cols) |j| { can_be_integer[j] = true; can_be_real[j] = true; has_data[j] = false; + can_be_datetime[j] = true; + dt_has_iso[j] = false; + dt_has_slash[j] = false; + slash_order_dt[j] = .unknown; + can_be_date[j] = true; + d_has_nonslash[j] = false; + d_has_slash[j] = false; + slash_order_d[j] = .unknown; } - // Loop invariant I: for each j in 0..num_cols, - // can_be_integer[j] = true ⟺ all non-empty values in 
column j seen so far are integers - // can_be_real[j] = true ⟺ all non-empty values in column j seen so far are numeric - // has_data[j] = true ⟺ at least one non-empty value has been seen in column j + // Loop invariant I: for each j in 0..num_cols and each value seen so far, + // can_be_datetime[j] = true ⟺ all non-empty values pass isDateTime + // can_be_date[j] = true ⟺ all non-empty values pass isDate and not isDateTime + // can_be_integer[j] = true ⟺ all non-empty values are integers + // can_be_real[j] = true ⟺ all non-empty values are numeric + // has_data[j] = true ⟺ at least one non-empty value has been seen // Bounding function: buffer.len - row_idx for (buffer) |row| { for (row, 0..) |val, j| { if (j >= num_cols) break; - if (val.len == 0) continue; // NULL/empty → skip, does not affect inference + if (val.len == 0) continue; has_data[j] = true; - if (!can_be_real[j]) continue; // already TEXT, no need to re-check + + // ── Datetime check (highest priority) ──────────────────────────── + if (can_be_datetime[j]) { + if (!isDateTime(val)) { + can_be_datetime[j] = false; + } else if (val.len == 16) { + // Slash datetime: accumulate D1/D2 order vote + dt_has_slash[j] = true; + const d1: u8 = (val[0] - '0') * 10 + (val[1] - '0'); + const d2: u8 = (val[3] - '0') * 10 + (val[4] - '0'); + const vote: SlashOrder = if (d1 > 12) .eu else if (d2 > 12) .us else .unknown; + slash_order_dt[j] = accumSlashOrder(slash_order_dt[j], vote); + } else { + dt_has_iso[j] = true; + } + } + + // ── Date check (isDate is length-10 only; no overlap with isDateTime) ── + if (can_be_date[j]) { + if (!isDate(val)) { + can_be_date[j] = false; + } else if (val[2] == '/') { + // Slash date: accumulate D1/D2 order vote + d_has_slash[j] = true; + const d1: u8 = (val[0] - '0') * 10 + (val[1] - '0'); + const d2: u8 = (val[3] - '0') * 10 + (val[4] - '0'); + const vote: SlashOrder = if (d1 > 12) .eu else if (d2 > 12) .us else .unknown; + slash_order_d[j] = accumSlashOrder(slash_order_d[j], 
vote); + } else { + d_has_nonslash[j] = true; // YYYY-MM-DD or DD-MM-YYYY + } + } + + // ── Numeric check ──────────────────────────────────────────────── + if (!can_be_real[j]) continue; if (!isReal(val)) { can_be_real[j] = false; can_be_integer[j] = false; @@ -96,12 +267,37 @@ pub fn inferTypes( } } - // Determine final type per column - // Post: types[j] reflects can_be_integer[j] / can_be_real[j] / has_data[j] + // Determine final type per column (DATETIME > DATE > INTEGER > REAL > TEXT) for (0..num_cols) |j| { - if (has_data[j] and can_be_integer[j]) { + if (!has_data[j]) { + types[j] = .TEXT; + } else if (can_be_datetime[j]) { + if (dt_has_iso[j] and dt_has_slash[j]) { + types[j] = .TEXT; // mixed ISO + slash datetime formats + } else if (dt_has_slash[j]) { + types[j] = switch (slash_order_dt[j]) { + .eu => .DATETIME_EU, + .us => .DATETIME_US, + else => .TEXT, // unknown (all ambiguous) or contradictory + }; + } else { + types[j] = .DATETIME; // pure ISO datetime + } + } else if (can_be_date[j]) { + if (d_has_nonslash[j] and d_has_slash[j]) { + types[j] = .TEXT; // mixed ISO/dash + slash date formats + } else if (d_has_slash[j]) { + types[j] = switch (slash_order_d[j]) { + .eu => .DATE_EU, + .us => .DATE_US, + else => .TEXT, // unknown (all ambiguous) or contradictory + }; + } else { + types[j] = .DATE; // YYYY-MM-DD or DD-MM-YYYY (detected at bind time) + } + } else if (can_be_integer[j]) { types[j] = .INTEGER; - } else if (has_data[j] and can_be_real[j]) { + } else if (can_be_real[j]) { types[j] = .REAL; } else { types[j] = .TEXT; @@ -228,6 +424,21 @@ pub fn insertRowTyped( if (c.sqlite3_bind_text(stmt, col_idx, val.ptr, @intCast(val.len), sqlite_static) != c.SQLITE_OK) return error.BindFailed; }, + .DATE, .DATE_EU, .DATE_US => { + // Normalize to ISO 8601 YYYY-MM-DD in a stack buffer. + // Use SQLITE_TRANSIENT so SQLite copies the value before the buffer + // is reclaimed (buf goes out of scope at the end of this prong, before sqlite3_step runs).
+ var buf: [10]u8 = undefined; + const iso = normalizeDateToIso(col_type, val, &buf); + if (c.sqlite3_bind_text(stmt, col_idx, iso.ptr, @intCast(iso.len), sqlite_mod.sqliteTransient()) != c.SQLITE_OK) + return error.BindFailed; + }, + .DATETIME, .DATETIME_EU, .DATETIME_US => { + var buf: [19]u8 = undefined; + const iso = normalizeDateTimeToIso(col_type, val, &buf); + if (c.sqlite3_bind_text(stmt, col_idx, iso.ptr, @intCast(iso.len), sqlite_mod.sqliteTransient()) != c.SQLITE_OK) + return error.BindFailed; + }, } col_idx += 1; } @@ -242,6 +453,97 @@ pub fn insertRowTyped( if (c.sqlite3_step(stmt) != c.SQLITE_DONE) return error.StepFailed; } +/// normalizeDateToIso(col_type, val, buf) → []const u8 +/// Pre: col_type ∈ {DATE, DATE_EU, DATE_US} +/// val was accepted by isDate during type inference; val.len == 10 +/// buf.len >= 10 +/// Post: result is buf[0..10] formatted as "YYYY-MM-DD" (ISO 8601) +/// DATE with val[4]=='-': YYYY-MM-DD passthrough +/// DATE with val[2]=='-': DD-MM-YYYY → YYYY-MM-DD +/// DATE_EU: DD/MM/YYYY → YYYY-MM-DD +/// DATE_US: MM/DD/YYYY → YYYY-MM-DD +fn normalizeDateToIso(col_type: ColumnType, val: []const u8, buf: *[10]u8) []const u8 { + switch (col_type) { + .DATE => { + if (val[4] == '-') { + // YYYY-MM-DD — already ISO, copy as-is + @memcpy(buf, val[0..10]); + } else { + // DD-MM-YYYY → YYYY-MM-DD + buf[0] = val[6]; buf[1] = val[7]; buf[2] = val[8]; buf[3] = val[9]; + buf[4] = '-'; + buf[5] = val[3]; buf[6] = val[4]; + buf[7] = '-'; + buf[8] = val[0]; buf[9] = val[1]; + } + }, + .DATE_EU => { + // DD/MM/YYYY → YYYY-MM-DD + buf[0] = val[6]; buf[1] = val[7]; buf[2] = val[8]; buf[3] = val[9]; + buf[4] = '-'; + buf[5] = val[3]; buf[6] = val[4]; + buf[7] = '-'; + buf[8] = val[0]; buf[9] = val[1]; + }, + .DATE_US => { + // MM/DD/YYYY → YYYY-MM-DD + buf[0] = val[6]; buf[1] = val[7]; buf[2] = val[8]; buf[3] = val[9]; + buf[4] = '-'; + buf[5] = val[0]; buf[6] = val[1]; + buf[7] = '-'; + buf[8] = val[3]; buf[9] = val[4]; + }, + else => unreachable, + 
} + return buf[0..10]; +} + +/// normalizeDateTimeToIso(col_type, val, buf) → []const u8 +/// Pre: col_type ∈ {DATETIME, DATETIME_EU, DATETIME_US} +/// val was accepted by isDateTime during type inference +/// buf.len >= 19 +/// Post: result is buf[0..19] formatted as "YYYY-MM-DD HH:MM:SS" (ISO 8601 with space) +/// DATETIME: T-separator normalized to space; space-separator passed through +/// DATETIME_EU: DD/MM/YYYY HH:MM → YYYY-MM-DD HH:MM:00 +/// DATETIME_US: MM/DD/YYYY HH:MM → YYYY-MM-DD HH:MM:00 +fn normalizeDateTimeToIso(col_type: ColumnType, val: []const u8, buf: *[19]u8) []const u8 { + switch (col_type) { + .DATETIME => { + // YYYY-MM-DD[T ]HH:MM:SS → YYYY-MM-DD HH:MM:SS + @memcpy(buf, val[0..19]); + buf[10] = ' '; // normalize T → space (no-op when already space) + }, + .DATETIME_EU => { + // DD/MM/YYYY HH:MM → YYYY-MM-DD HH:MM:00 + buf[0] = val[6]; buf[1] = val[7]; buf[2] = val[8]; buf[3] = val[9]; + buf[4] = '-'; + buf[5] = val[3]; buf[6] = val[4]; + buf[7] = '-'; + buf[8] = val[0]; buf[9] = val[1]; + buf[10] = ' '; + buf[11] = val[11]; buf[12] = val[12]; + buf[13] = ':'; + buf[14] = val[14]; buf[15] = val[15]; + buf[16] = ':'; buf[17] = '0'; buf[18] = '0'; + }, + .DATETIME_US => { + // MM/DD/YYYY HH:MM → YYYY-MM-DD HH:MM:00 + buf[0] = val[6]; buf[1] = val[7]; buf[2] = val[8]; buf[3] = val[9]; + buf[4] = '-'; + buf[5] = val[0]; buf[6] = val[1]; + buf[7] = '-'; + buf[8] = val[3]; buf[9] = val[4]; + buf[10] = ' '; + buf[11] = val[11]; buf[12] = val[12]; + buf[13] = ':'; + buf[14] = val[14]; buf[15] = val[15]; + buf[16] = ':'; buf[17] = '0'; buf[18] = '0'; + }, + else => unreachable, + } + return buf[0..19]; +} + /// fmtThousands(buf, n) → []const u8 /// Pre: buf.len >= 26 (accommodates any usize value with thousands separators) /// Post: n is formatted as a decimal string with ',' separating each group of @@ -428,3 +730,262 @@ pub fn loadCsvInput( return rows_inserted; } + +// ─── Unit tests ─────────────────────────────────────── + +test "isDate: valid 
ISO dates" { + try std.testing.expect(isDate("2024-01-15")); + try std.testing.expect(isDate("1999-12-31")); + try std.testing.expect(isDate("2000-02-29")); // real leap day; isDate only range-checks month and day +} + +test "isDate: valid EU dash dates" { + try std.testing.expect(isDate("15-01-2024")); + try std.testing.expect(isDate("31-12-1999")); +} + +test "isDate: valid slash dates" { + try std.testing.expect(isDate("15/01/2024")); // EU slash (d1=15 > 12) + try std.testing.expect(isDate("01/15/2024")); // US slash (d2=15 > 12) + try std.testing.expect(isDate("05/06/2024")); // ambiguous (both ≤ 12) +} + +test "isDate: invalid inputs" { + try std.testing.expect(!isDate("")); + try std.testing.expect(!isDate("2024-1-15")); // single-digit month + try std.testing.expect(!isDate("not-a-date")); + try std.testing.expect(!isDate("2024-00-15")); // month 0 + try std.testing.expect(!isDate("2024-13-01")); // month 13 + try std.testing.expect(!isDate("2024-01-00")); // day 0 + try std.testing.expect(!isDate("2024-01-32")); // day 32 + try std.testing.expect(!isDate("20240115")); // no separators + try std.testing.expect(!isDate("2024/01/15")); // YYYY/MM/DD not supported + try std.testing.expect(!isDate("13/13/2024")); // both > 12, no valid month +} + +test "isDate does not match datetimes" { + // datetime values are 16 or 19 chars; isDate is length-gated to 10 + try std.testing.expect(!isDate("2024-01-15 10:30:00")); + try std.testing.expect(!isDate("2024-01-15T10:30:00")); + try std.testing.expect(!isDate("15/01/2024 10:30")); +} + +test "isDateTime: valid ISO datetimes" { + try std.testing.expect(isDateTime("2024-01-15 10:30:00")); + try std.testing.expect(isDateTime("2024-01-15T10:30:00")); + try std.testing.expect(isDateTime("1999-12-31 23:59:59")); + try std.testing.expect(isDateTime("2000-01-01 00:00:00")); +} + +test "isDateTime: valid slash datetimes" { + try std.testing.expect(isDateTime("15/01/2024 10:30")); // EU (d1=15 > 12) + try
std.testing.expect(isDateTime("01/15/2024 10:30")); // US (d2=15 > 12) + try std.testing.expect(isDateTime("05/06/2024 08:00")); // ambiguous +} + +test "isDateTime: invalid inputs" { + try std.testing.expect(!isDateTime("")); + try std.testing.expect(!isDateTime("2024-01-15 25:00:00")); // hour 25 + try std.testing.expect(!isDateTime("2024-01-15 10:60:00")); // min 60 + try std.testing.expect(!isDateTime("2024-01-15 10:30:60")); // sec 60 + try std.testing.expect(!isDateTime("2024-13-15 10:30:00")); // month 13 + try std.testing.expect(!isDateTime("13/13/2024 10:30")); // both > 12, no valid month + try std.testing.expect(!isDateTime("2024-01-15")); // date only, length 10 + try std.testing.expect(!isDateTime("not-a-datetime")); +} + +test "accumSlashOrder: merging votes" { + try std.testing.expectEqual(SlashOrder.eu, accumSlashOrder(.unknown, .eu)); + try std.testing.expectEqual(SlashOrder.us, accumSlashOrder(.unknown, .us)); + try std.testing.expectEqual(SlashOrder.eu, accumSlashOrder(.eu, .eu)); + try std.testing.expectEqual(SlashOrder.us, accumSlashOrder(.us, .us)); + try std.testing.expectEqual(SlashOrder.unknown, accumSlashOrder(.unknown, .unknown)); + try std.testing.expectEqual(SlashOrder.eu, accumSlashOrder(.eu, .unknown)); + try std.testing.expectEqual(SlashOrder.us, accumSlashOrder(.us, .unknown)); + try std.testing.expectEqual(SlashOrder.contradictory, accumSlashOrder(.eu, .us)); + try std.testing.expectEqual(SlashOrder.contradictory, accumSlashOrder(.us, .eu)); + try std.testing.expectEqual(SlashOrder.contradictory, accumSlashOrder(.contradictory, .eu)); +} + +test "normalizeDateToIso: ISO passthrough" { + var buf: [10]u8 = undefined; + const result = normalizeDateToIso(.DATE, "2024-01-15", &buf); + try std.testing.expectEqualStrings("2024-01-15", result); +} + +test "normalizeDateToIso: EU dash to ISO" { + var buf: [10]u8 = undefined; + const result = normalizeDateToIso(.DATE, "15-01-2024", &buf); + try std.testing.expectEqualStrings("2024-01-15", 
result); +} + +test "normalizeDateToIso: EU slash to ISO" { + var buf: [10]u8 = undefined; + const result = normalizeDateToIso(.DATE_EU, "15/01/2024", &buf); + try std.testing.expectEqualStrings("2024-01-15", result); +} + +test "normalizeDateToIso: US slash to ISO" { + var buf: [10]u8 = undefined; + const result = normalizeDateToIso(.DATE_US, "01/15/2024", &buf); + try std.testing.expectEqualStrings("2024-01-15", result); +} + +test "normalizeDateTimeToIso: ISO T-separator normalized to space" { + var buf: [19]u8 = undefined; + const result = normalizeDateTimeToIso(.DATETIME, "2024-01-15T10:30:00", &buf); + try std.testing.expectEqualStrings("2024-01-15 10:30:00", result); +} + +test "normalizeDateTimeToIso: ISO space-separator passthrough" { + var buf: [19]u8 = undefined; + const result = normalizeDateTimeToIso(.DATETIME, "2024-01-15 10:30:00", &buf); + try std.testing.expectEqualStrings("2024-01-15 10:30:00", result); +} + +test "normalizeDateTimeToIso: EU slash to ISO" { + var buf: [19]u8 = undefined; + const result = normalizeDateTimeToIso(.DATETIME_EU, "15/01/2024 10:30", &buf); + try std.testing.expectEqualStrings("2024-01-15 10:30:00", result); +} + +test "normalizeDateTimeToIso: US slash to ISO" { + var buf: [19]u8 = undefined; + const result = normalizeDateTimeToIso(.DATETIME_US, "01/15/2024 10:30", &buf); + try std.testing.expectEqualStrings("2024-01-15 10:30:00", result); +} + +test "inferTypes: empty buffer → all TEXT" { + const allocator = std.testing.allocator; + const buffer: []const [][]u8 = &.{}; + const types = try inferTypes(allocator, buffer, 3); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.TEXT, types[0]); + try std.testing.expectEqual(ColumnType.TEXT, types[1]); + try std.testing.expectEqual(ColumnType.TEXT, types[2]); +} + +test "inferTypes: detects INTEGER" { + const allocator = std.testing.allocator; + var f1: [2][]u8 = .{ @constCast("42"), @constCast("hello") }; + var f2: [2][]u8 = .{ @constCast("-7"), 
@constCast("world") }; + const rows: []const [][]u8 = &.{ &f1, &f2 }; + const types = try inferTypes(allocator, rows, 2); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.INTEGER, types[0]); + try std.testing.expectEqual(ColumnType.TEXT, types[1]); +} + +test "inferTypes: detects REAL" { + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("3.14")}; + const rows: []const [][]u8 = &.{&f1}; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.REAL, types[0]); +} + +test "inferTypes: detects DATE (ISO YYYY-MM-DD)" { + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("2024-01-15")}; + var f2: [1][]u8 = .{@constCast("1999-12-31")}; + const rows: []const [][]u8 = &.{ &f1, &f2 }; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.DATE, types[0]); +} + +test "inferTypes: detects DATE_EU (slash with d1 > 12)" { + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("15/01/2024")}; + const rows: []const [][]u8 = &.{&f1}; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.DATE_EU, types[0]); +} + +test "inferTypes: detects DATE_US (slash with d2 > 12)" { + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("01/15/2024")}; + const rows: []const [][]u8 = &.{&f1}; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.DATE_US, types[0]); +} + +test "inferTypes: ambiguous slash date → TEXT" { + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("05/06/2024")}; + const rows: []const [][]u8 = &.{&f1}; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.TEXT, types[0]); +} + +test "inferTypes: 
contradictory slash votes → TEXT" { + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("15/01/2024")}; // EU vote (d1=15 > 12) + var f2: [1][]u8 = .{@constCast("01/15/2024")}; // US vote (d2=15 > 12) + const rows: []const [][]u8 = &.{ &f1, &f2 }; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.TEXT, types[0]); +} + +test "inferTypes: detects DATETIME (ISO space)" { + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("2024-01-15 10:30:00")}; + const rows: []const [][]u8 = &.{&f1}; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.DATETIME, types[0]); +} + +test "inferTypes: detects DATETIME (ISO T-separator)" { + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("2024-01-15T10:30:00")}; + const rows: []const [][]u8 = &.{&f1}; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.DATETIME, types[0]); +} + +test "inferTypes: DATETIME beats DATE — datetime column stays DATETIME" { + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("2024-01-15T10:30:00")}; + var f2: [1][]u8 = .{@constCast("1999-12-31 23:59:59")}; + const rows: []const [][]u8 = &.{ &f1, &f2 }; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.DATETIME, types[0]); +} + +test "inferTypes: mixed date and datetime → TEXT" { + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("2024-01-15")}; + var f2: [1][]u8 = .{@constCast("2024-01-16 10:30:00")}; + const rows: []const [][]u8 = &.{ &f1, &f2 }; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.TEXT, types[0]); +} + +test "inferTypes: mixed ISO and slash datetime → TEXT" { + 
const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("2024-01-15 10:30:00")}; // ISO datetime + var f2: [1][]u8 = .{@constCast("15/01/2024 10:30")}; // EU slash datetime + const rows: []const [][]u8 = &.{ &f1, &f2 }; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.TEXT, types[0]); +} + +test "inferTypes: empty values ignored in type inference" { + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("")}; // empty → skip + var f2: [1][]u8 = .{@constCast("2024-01-15")}; + const rows: []const [][]u8 = &.{ &f1, &f2 }; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.DATE, types[0]); +} diff --git a/src/modes/columns.zig b/src/modes/columns.zig index d0717ea..e959f60 100644 --- a/src/modes/columns.zig +++ b/src/modes/columns.zig @@ -71,7 +71,7 @@ pub fn runColumns( fatal("out of memory during type inference", stderr_writer, .csv_error, .{}); defer allocator.free(types); for (cols, types) |col, t| { - stdout_writer.print("{s} {s}\n", .{ col, @tagName(t) }) catch |err| { + stdout_writer.print("{s} {s}\n", .{ col, t.displayName() }) catch |err| { std.log.err("failed to write output: {}", .{err}); }; } diff --git a/src/modes/sample.zig b/src/modes/sample.zig index 52d47f4..e6b6ceb 100644 --- a/src/modes/sample.zig +++ b/src/modes/sample.zig @@ -114,7 +114,7 @@ pub fn runSample( std.log.err("failed to write schema: {}", .{err}); }; } - stderr_writer.print("{s}\n", .{@tagName(t)}) catch |err| { + stderr_writer.print("{s}\n", .{t.displayName()}) catch |err| { std.log.err("failed to write schema: {}", .{err}); }; } diff --git a/src/modes/validate.zig b/src/modes/validate.zig index 5b723da..8b4cd62 100644 --- a/src/modes/validate.zig +++ b/src/modes/validate.zig @@ -129,7 +129,7 @@ pub fn runValidate( std.process.exit(@intFromEnum(ExitCode.usage)); }; } - stdout_writer.print("{s} {s}", .{ 
col, @tagName(t) }) catch |err| { + stdout_writer.print("{s} {s}", .{ col, t.displayName() }) catch |err| { std.log.err("failed to write output: {}", .{err}); std.process.exit(@intFromEnum(ExitCode.usage)); }; diff --git a/src/sqlite.zig b/src/sqlite.zig index 6d9e589..7485e10 100644 --- a/src/sqlite.zig +++ b/src/sqlite.zig @@ -9,8 +9,57 @@ pub const ExitCode = args_mod.ExitCode; /// SQLITE_STATIC: caller manages string lifetime; SQLite must not free it. pub const sqlite_static: c.sqlite3_destructor_type = null; -/// Inferred SQLite affinity for a CSV column. -pub const ColumnType = enum { TEXT, INTEGER, REAL }; +/// Returns the SQLITE_TRANSIENT sentinel (-1 cast to destructor type). +/// Defined as a function to force runtime evaluation: `const` locals are +/// comptime-known in Zig and would trigger a compile-time alignment error; +/// `var` makes `addr` runtime-unknown, so @ptrFromInt does a runtime check +/// that we suppress with @setRuntimeSafety(false). +/// SQLite checks this value by comparison, not by calling it, so the +/// misalignment is harmless at runtime. +pub fn sqliteTransient() c.sqlite3_destructor_type { + @setRuntimeSafety(false); + var addr: usize = @bitCast(@as(isize, -1)); + _ = &addr; // prevent constant-folding + return @ptrFromInt(addr); +} + +/// Inferred SQLite column type for a CSV column. +/// +/// DATE and DATETIME variants track the source format for normalization at +/// insert time. All DATE* variants map to TEXT affinity in DDL (SQLite stores +/// dates as ISO 8601 text). displayName() returns "DATE" or "DATETIME" for +/// user-facing output regardless of sub-variant. 
+/// +/// Sub-variant semantics: +/// DATE — YYYY-MM-DD or DD-MM-YYYY (format detected at bind time by separator position) +/// DATE_EU — DD/MM/YYYY (European slash; D1 > 12 was found during inference) +/// DATE_US — MM/DD/YYYY (American slash; D2 > 12 was found during inference) +/// DATETIME — YYYY-MM-DD HH:MM:SS or YYYY-MM-DDTHH:MM:SS (19-char ISO) +/// DATETIME_EU — DD/MM/YYYY HH:MM (16-char European slash) +/// DATETIME_US — MM/DD/YYYY HH:MM (16-char American slash) +pub const ColumnType = enum { + TEXT, + INTEGER, + REAL, + DATE, + DATE_EU, + DATE_US, + DATETIME, + DATETIME_EU, + DATETIME_US, + + /// User-facing name used in --validate, --columns --verbose, and --sample output. + /// All DATE* variants display as "DATE"; all DATETIME* as "DATETIME". + pub fn displayName(self: ColumnType) []const u8 { + return switch (self) { + .TEXT => "TEXT", + .INTEGER => "INTEGER", + .REAL => "REAL", + .DATE, .DATE_EU, .DATE_US => "DATE", + .DATETIME, .DATETIME_EU, .DATETIME_US => "DATETIME", + }; + } +}; /// fatal(fmt, writer, code, args) → noreturn /// @@ -161,6 +210,13 @@ pub fn createTable( .INTEGER => " INTEGER", .REAL => " REAL", .TEXT => " TEXT", + // DATE* and DATETIME* emit TEXT in DDL to guarantee TEXT affinity. + // Declaring "DATE" or "DATETIME" would give NUMERIC affinity, which + // would attempt numeric coercion of ISO 8601 strings. TEXT affinity + // preserves string semantics and lets all SQLite date functions work. + .DATE, .DATE_EU, .DATE_US, + .DATETIME, .DATETIME_EU, .DATETIME_US, + => " TEXT", }) catch fatal("out of memory", writer, .csv_error, .{}); } sql.appendSlice(allocator, ")") catch fatal("out of memory", writer, .csv_error, .{}); From 45e854a48fb64eaea3327ad83e7ac6c501825da0 Mon Sep 17 00:00:00 2001 From: "Victor M. 
Varela" Date: Fri, 22 May 2026 16:46:00 +0200 Subject: [PATCH 2/4] test: add missing inferTypes unit tests for EU-dash and slash datetime variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - EU-dash DD-MM-YYYY date → DATE - Mixed ISO + EU-dash dates → DATE (bind-time distinction) - Slash datetime with d1>12 → DATETIME_EU - Slash datetime with d2>12 → DATETIME_US --- src/loader.zig | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/loader.zig b/src/loader.zig index 0852104..a28bb18 100644 --- a/src/loader.zig +++ b/src/loader.zig @@ -989,3 +989,43 @@ test "inferTypes: empty values ignored in type inference" { defer allocator.free(types); try std.testing.expectEqual(ColumnType.DATE, types[0]); } + +test "inferTypes: detects DATE (EU-dash DD-MM-YYYY)" { + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("15-01-2024")}; + var f2: [1][]u8 = .{@constCast("31-12-1999")}; + const rows: []const [][]u8 = &.{ &f1, &f2 }; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.DATE, types[0]); +} + +test "inferTypes: mixed ISO and EU-dash dates → DATE" { + // Both YYYY-MM-DD and DD-MM-YYYY are non-slash; both infer to .DATE. + // bind-time detection distinguishes them via val[4]=='-'. 
+ const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("2024-01-15")}; // ISO + var f2: [1][]u8 = .{@constCast("15-01-2024")}; // EU dash + const rows: []const [][]u8 = &.{ &f1, &f2 }; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.DATE, types[0]); +} + +test "inferTypes: detects DATETIME_EU (slash datetime with d1 > 12)" { + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("15/01/2024 10:30")}; + const rows: []const [][]u8 = &.{&f1}; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.DATETIME_EU, types[0]); +} + +test "inferTypes: detects DATETIME_US (slash datetime with d2 > 12)" { + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("01/15/2024 10:30")}; + const rows: []const [][]u8 = &.{&f1}; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.DATETIME_US, types[0]); +} From cb3f7e6781fe1ce6b887b1b77f290388924a200e Mon Sep 17 00:00:00 2001 From: "Victor M. 
Varela" Date: Fri, 22 May 2026 18:42:11 +0200 Subject: [PATCH 3/4] docs: add DATE/DATETIME to type inference docs and real-world example - README: update type inference description to list DATE/DATETIME - README: add La Liga season-lengths real-world example (julianday arithmetic on auto-detected DATE column, COVID and World Cup anomalies) - README: update 'Date range filter' recipe and 'How it works' section - man page: add DATE/DATETIME to DESCRIPTION, --columns, and --sample entries --- README.md | 33 ++++++++++++++++++++++++++++++--- docs/sql-pipe.1.scd | 12 +++++++----- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 0e39f8c..8c77503 100644 --- a/README.md +++ b/README.md @@ -160,7 +160,7 @@ $ printf '[{"name":"Alice","score":95},{"name":"Bob","score":72}]' \ Alice,95 ``` -Columns are auto-detected as `INTEGER`, `REAL`, or `TEXT` based on the first 100 rows. Use `--no-type-inference` to force all columns to `TEXT`: +Columns are auto-detected as `INTEGER`, `REAL`, `DATE`, `DATETIME`, or `TEXT` based on the first 100 rows. Date and datetime values are normalized to ISO 8601 on insert, so SQLite date functions (`date()`, `strftime()`, `julianday()`) work immediately. Use `--no-type-inference` to force all columns to `TEXT`: ```sh $ cat orders.csv | sql-pipe 'SELECT COUNT(*), AVG(amount) FROM t WHERE status = "paid"' @@ -315,12 +315,14 @@ $ cat contacts.csv | sql-pipe 'SELECT DISTINCT email FROM t' $ cat users.csv | sql-pipe 'SELECT * FROM t WHERE email = "" OR email IS NULL' ``` -**Date range filter (dates stored as text):** +**Date range filter:** ```sh $ cat logs.csv | sql-pipe 'SELECT * FROM t WHERE ts >= "2024-01-01" AND ts < "2024-02-01"' ``` +Date columns are auto-detected and stored as ISO 8601 text, so comparison operators and `strftime()` / `julianday()` work without any preprocessing. 
+ **Compute a derived column:** ```sh @@ -464,9 +466,34 @@ $ curl -s "https://api.open-meteo.com/v1/forecast?latitude=40.4168&longitude=-3. 2026-05-07,19.6,10.7,2.1 ``` +**La Liga: season lengths reveal COVID and the World Cup** + +The same [engsoccerdata](https://github.com/jalapic/engsoccerdata) dataset has a +`Date` column in `YYYY-MM-DD` format. `sql-pipe` auto-detects it as `DATE` and +stores it as ISO 8601 text, so `julianday()` works directly — no preprocessing: + +```sh +$ curl -s https://raw.githubusercontent.com/jalapic/engsoccerdata/master/data-raw/spain.csv \ + | sql-pipe 'SELECT Season, + MIN(Date) AS start, + MAX(Date) AS end, + CAST(julianday(MAX(Date)) - julianday(MIN(Date)) AS INTEGER) AS days + FROM t WHERE tier=1 AND Season BETWEEN 2018 AND 2022 + GROUP BY Season ORDER BY Season' +2018,2018-08-17,2019-05-19,275 +2019,2019-08-16,2020-07-19,337 +2020,2020-09-12,2021-05-23,253 +2021,2021-08-13,2022-05-22,282 +2022,2022-08-12,2023-06-04,296 +``` + +The 2019–20 season spans 337 days: COVID suspended play in March 2020 and pushed +the final round to July. The 2022–23 season runs 296 days due to the November +World Cup break. A normal season is ~275 days. + ## How it works -Each run opens a fresh `:memory:` SQLite database. The header row drives a `CREATE TABLE t (...)` with all columns as `TEXT`. Rows are loaded in a single transaction via a prepared `INSERT` statement, then `sqlite3_exec` runs your query and prints rows one by one. +Each run opens a fresh `:memory:` SQLite database. The header row drives a `CREATE TABLE t (...)` with types inferred from the first 100 rows — `INTEGER`, `REAL`, `DATE`, `DATETIME`, or `TEXT`. Date variants use TEXT affinity so ISO 8601 string semantics are preserved and all SQLite date functions work correctly. Rows are loaded in a single transaction via a prepared `INSERT` statement, then `sqlite3_exec` runs your query and prints rows one by one. The database never touches disk and vanishes when the process exits. 
No state, no cleanup. diff --git a/docs/sql-pipe.1.scd b/docs/sql-pipe.1.scd index cca37bf..075671d 100644 --- a/docs/sql-pipe.1.scd +++ b/docs/sql-pipe.1.scd @@ -17,9 +17,11 @@ DESCRIPTION aggregations on CSV files without manual SQL database setup. All input columns are automatically loaded into the table with names derived from - the CSV header row. By default, column types (TEXT, INTEGER, REAL) are inferred - from the first 100 rows of data. Use *--no-type-inference* to disable this - behavior and treat all columns as TEXT. + the CSV header row. By default, column types (TEXT, INTEGER, REAL, DATE, DATETIME) + are inferred from the first 100 rows of data. DATE and DATETIME values are + normalized to ISO 8601 on insert, enabling SQLite date functions (*date()*, + *strftime()*, *julianday()*) to work directly on those columns. Use + *--no-type-inference* to disable this behavior and treat all columns as TEXT. By default, input fields are parsed as comma-separated values. Use *--delimiter* (or *-d*) to parse other delimiters (1–8 characters), or *--tsv* @@ -95,7 +97,7 @@ OPTIONS Read the input header, print each column name on its own line to standard output, and exit with code 0. Supported for CSV, TSV, JSON, NDJSON, and XML input. When combined with *-v* / *--verbose*, - also shows the inferred type (INTEGER, REAL, or TEXT) for each column + also shows the inferred type (INTEGER, REAL, DATE, DATETIME, or TEXT) for each column (CSV/TSV only; other formats always show TEXT), using the first 100 data rows for inference. Respects *--delimiter* and *--tsv*. Mutually exclusive with a query argument. @@ -104,7 +106,7 @@ OPTIONS Print a schema comment block to standard error and the first data rows to standard output as delimited text (default: 10 rows if no value is given). 
The schema block lists each column name and its inferred - SQLite type (INTEGER, REAL, or TEXT), each line prefixed with *#* so it + type (INTEGER, REAL, DATE, DATETIME, or TEXT), each line prefixed with *#* so it is ignored by downstream CSV parsers. Column header is always printed as the first output row (implies *--header*). Type inference reads up to 100 rows or N rows, whichever is larger, before emitting output. From 3be0be8cd06f406db922d23978a921953beed446 Mon Sep 17 00:00:00 2001 From: "Victor M. Varela" Date: Fri, 22 May 2026 19:00:01 +0200 Subject: [PATCH 4/4] fix: correct 2019 season length (338 days, not 337) and add missing unit test - README: fix julianday(2020-07-19)-julianday(2019-08-16) = 338, not 337 - loader.zig: add unit test for d_has_nonslash && d_has_slash -> TEXT path (ISO date + slash date in same column falls back to TEXT) --- README.md | 4 ++-- src/loader.zig | 11 +++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8c77503..a4d2ee3 100644 --- a/README.md +++ b/README.md @@ -481,13 +481,13 @@ $ curl -s https://raw.githubusercontent.com/jalapic/engsoccerdata/master/data-ra FROM t WHERE tier=1 AND Season BETWEEN 2018 AND 2022 GROUP BY Season ORDER BY Season' 2018,2018-08-17,2019-05-19,275 -2019,2019-08-16,2020-07-19,337 +2019,2019-08-16,2020-07-19,338 2020,2020-09-12,2021-05-23,253 2021,2021-08-13,2022-05-22,282 2022,2022-08-12,2023-06-04,296 ``` -The 2019–20 season spans 337 days: COVID suspended play in March 2020 and pushed +The 2019–20 season spans 338 days: COVID suspended play in March 2020 and pushed the final round to July. The 2022–23 season runs 296 days due to the November World Cup break. A normal season is ~275 days. 
diff --git a/src/loader.zig b/src/loader.zig index a28bb18..a3085a1 100644 --- a/src/loader.zig +++ b/src/loader.zig @@ -1029,3 +1029,14 @@ test "inferTypes: detects DATETIME_US (slash datetime with d2 > 12)" { defer allocator.free(types); try std.testing.expectEqual(ColumnType.DATETIME_US, types[0]); } + +test "inferTypes: mixed ISO date and slash date → TEXT (d_has_nonslash && d_has_slash)" { + // Covers the inferTypes fallback for a column mixing non-slash (ISO) and slash dates: d_has_nonslash[j] and d_has_slash[j] both set → TEXT + const allocator = std.testing.allocator; + var f1: [1][]u8 = .{@constCast("2024-01-15")}; // ISO → d_has_nonslash + var f2: [1][]u8 = .{@constCast("15/01/2024")}; // EU slash → d_has_slash + const rows: []const [][]u8 = &.{ &f1, &f2 }; + const types = try inferTypes(allocator, rows, 1); + defer allocator.free(types); + try std.testing.expectEqual(ColumnType.TEXT, types[0]); +}