From 1c8b850024710e65595d013b982605919851ae7e Mon Sep 17 00:00:00 2001 From: Anamika AggarwaL Date: Tue, 31 Mar 2026 10:01:47 +0530 Subject: [PATCH 1/2] Document and test safeColumns in ParquetReadOptions --- docs/cookbook.md | 19 ++++++++++-- src/DataFrame/IO/Parquet.hs | 1 + tests/Parquet.hs | 61 ++++++++++++++++++++++++++++++++++++- 3 files changed, 78 insertions(+), 3 deletions(-) diff --git a/docs/cookbook.md b/docs/cookbook.md index 22254f5..1706fc8 100644 --- a/docs/cookbook.md +++ b/docs/cookbook.md @@ -369,8 +369,9 @@ dataframe> :declareColumns df0 - `selectedColumns` - `predicate` - `rowRange` +- `safeColumns` -Options are applied in this order: predicate filtering, column projection, then row range. +Options are applied in this order: predicate filtering, column projection, row range, then safe column promotion. **Exercise 11: Parquet projection** @@ -414,7 +415,21 @@ dataframe| "./data/mtcars.parquet" When `selectedColumns` is set, columns referenced by `predicate` are automatically read as needed, then projected back to the requested output columns. -**Exercise 14: using the typed API** +**Exercise 14: Safe column promotion** + +Read the file while promoting every output column to an optional column. + +### Solution + +```haskell +dataframe> D.readParquetWithOpts +dataframe| (D.defaultParquetReadOptions{D.safeColumns = True}) +dataframe| "./data/mtcars.parquet" +``` + +Use `safeColumns` when downstream code wants a uniformly nullable schema, even when the Parquet file marks some columns as non-nullable. + +**Exercise 15: using the typed API** _This problem is called "Interviews" in Hackerrank. Samantha interviews many candidates from different colleges using coding challenges and contests. Write a query to print the contest_id, hacker_id, name, and the sums of total_submissions, total_accepted_submissions, total_views, and total_unique_views for each contest sorted by contest_id. Exclude the contest from the result if all four sums are 0. diff --git a/src/DataFrame/IO/Parquet.hs b/src/DataFrame/IO/Parquet.hs index eb0f42d..760ec81 100644 --- a/src/DataFrame/IO/Parquet.hs +++ b/src/DataFrame/IO/Parquet.hs @@ -64,6 +64,7 @@ These options are applied in this order: 1. predicate filtering 2. column projection 3. row range +4. safe column promotion Column selection for @selectedColumns@ uses leaf column names only. -} diff --git a/tests/Parquet.hs b/tests/Parquet.hs index ea09240..5bc4a95 100644 --- a/tests/Parquet.hs +++ b/tests/Parquet.hs @@ -4,6 +4,7 @@ module Parquet where import Assertions (assertExpectException) +import Control.Monad (forM_) import qualified DataFrame as D import qualified DataFrame.Functions as F import qualified DataFrame.IO.Parquet as DP @@ -29,7 +30,7 @@ import DataFrame.Internal.Binary ( word32ToLittleEndian, word64ToLittleEndian, ) -import DataFrame.Internal.Column (hasMissing) +import DataFrame.Internal.Column (hasElemType, hasMissing) import DataFrame.Internal.DataFrame (unsafeGetColumn) import GHC.IO (unsafePerformIO) import Test.HUnit @@ -41,6 +42,17 @@ testBothReadParquetPaths test = , test (DP._readParquetWithOpts (Just True) D.defaultParquetReadOptions) ] +assertColumnNullability :: String -> [(T.Text, Bool)] -> D.DataFrame -> Assertion +assertColumnNullability label expected df = + forM_ expected $ \(columnName, shouldBeNullable) -> + assertBool + ( label + <> ": expected " + <> T.unpack columnName + <> if shouldBeNullable then " to be nullable" else " to be non-nullable" + ) + (hasMissing (unsafeGetColumn columnName df) == shouldBeNullable) + allTypesPlain :: Test allTypesPlain = testBothReadParquetPaths $ \readParquet -> TestCase @@ -169,6 +181,51 @@ predicateUsesNonSelectedColumnWithOpts = ) ) +safeColumnsWithOpts :: Test +safeColumnsWithOpts = + TestCase $ do + defaultDf <- D.readParquet "./tests/data/alltypes_plain.parquet" + safeDf <- + D.readParquetWithOpts + (D.defaultParquetReadOptions{D.safeColumns = True}) + "./tests/data/alltypes_plain.parquet" + + assertEqual "safeColumnsWithOpts dimensions" (D.dimensions defaultDf) (D.dimensions safeDf) + assertColumnNullability + "default read" + [("id", False), ("bool_col", False)] + defaultDf + assertColumnNullability + "safeColumns read" + [("id", True), ("bool_col", True)] + safeDf + assertBool "safeColumns id type" (hasElemType @(Maybe Int32) (unsafeGetColumn "id" safeDf)) + assertBool "safeColumns bool_col type" (hasElemType @(Maybe Bool) (unsafeGetColumn "bool_col" safeDf)) + +safeColumnsWithSelectedColumns :: Test +safeColumnsWithSelectedColumns = + TestCase $ do + df <- + D.readParquetWithOpts + ( D.defaultParquetReadOptions + { D.selectedColumns = Just ["id", "bool_col"] + , D.safeColumns = True + } + ) + "./tests/data/alltypes_plain.parquet" + + assertEqual "safeColumnsWithSelectedColumns dimensions" (8, 2) (D.dimensions df) + assertColumnNullability + "safeColumns projected read" + [("id", True), ("bool_col", True)] + df + assertBool + "safeColumns projected id type" + (hasElemType @(Maybe Int32) (unsafeGetColumn "id" df)) + assertBool + "safeColumns projected bool_col type" + (hasElemType @(Maybe Bool) (unsafeGetColumn "bool_col" df)) + predicateWithOptsAcrossFiles :: Test predicateWithOptsAcrossFiles = TestCase @@ -1029,6 +1086,8 @@ tests = , rowRangeWithOpts , predicateWithOpts , predicateUsesNonSelectedColumnWithOpts + , safeColumnsWithOpts + , safeColumnsWithSelectedColumns , predicateWithOptsAcrossFiles , missingSelectedColumnWithOpts , mtCars From 9583757969ff04b5b71c22118768846f2f9b181e Mon Sep 17 00:00:00 2001 From: Anamika AggarwaL Date: Tue, 31 Mar 2026 10:16:56 +0530 Subject: [PATCH 2/2] Fix Fourmolu formatting --- src/DataFrame/Typed/Lazy.hs | 2 +- tests/Parquet.hs | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/DataFrame/Typed/Lazy.hs b/src/DataFrame/Typed/Lazy.hs index bf2e3fc..559ab6f 100644 --- a/src/DataFrame/Typed/Lazy.hs +++ b/src/DataFrame/Typed/Lazy.hs @@ -72,8 +72,8 @@ module DataFrame.Typed.Lazy ( SortOrder (..), ) where -import qualified Data.Text as T import Data.Proxy (Proxy (..)) +import qualified Data.Text as T import GHC.TypeLits (KnownSymbol, Symbol, symbolVal) import Prelude hiding (filter, take) diff --git a/tests/Parquet.hs b/tests/Parquet.hs index 5bc4a95..f6a1dd2 100644 --- a/tests/Parquet.hs +++ b/tests/Parquet.hs @@ -42,7 +42,8 @@ testBothReadParquetPaths test = , test (DP._readParquetWithOpts (Just True) D.defaultParquetReadOptions) ] -assertColumnNullability :: String -> [(T.Text, Bool)] -> D.DataFrame -> Assertion +assertColumnNullability :: + String -> [(T.Text, Bool)] -> D.DataFrame -> Assertion assertColumnNullability label expected df = forM_ expected $ \(columnName, shouldBeNullable) -> assertBool @@ -190,7 +191,10 @@ safeColumnsWithOpts = (D.defaultParquetReadOptions{D.safeColumns = True}) "./tests/data/alltypes_plain.parquet" - assertEqual "safeColumnsWithOpts dimensions" (D.dimensions defaultDf) (D.dimensions safeDf) + assertEqual + "safeColumnsWithOpts dimensions" + (D.dimensions defaultDf) + (D.dimensions safeDf) assertColumnNullability "default read" [("id", False), ("bool_col", False)] @@ -199,8 +203,12 @@ safeColumnsWithOpts = "safeColumns read" [("id", True), ("bool_col", True)] safeDf - assertBool "safeColumns id type" (hasElemType @(Maybe Int32) (unsafeGetColumn "id" safeDf)) - assertBool "safeColumns bool_col type" (hasElemType @(Maybe Bool) (unsafeGetColumn "bool_col" safeDf)) + assertBool + "safeColumns id type" + (hasElemType @(Maybe Int32) (unsafeGetColumn "id" safeDf)) + assertBool + "safeColumns bool_col type" + (hasElemType @(Maybe Bool) (unsafeGetColumn "bool_col" safeDf)) safeColumnsWithSelectedColumns :: Test safeColumnsWithSelectedColumns =