Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions docs/cookbook.md
Original file line number Diff line number Diff line change
Expand Up @@ -369,8 +369,9 @@ dataframe> :declareColumns df0
- `selectedColumns`
- `predicate`
- `rowRange`
- `safeColumns`

Options are applied in this order: predicate filtering, column projection, then row range.
Options are applied in this order: predicate filtering, column projection, row range, then safe column promotion.

**Exercise 11: Parquet projection**

Expand Down Expand Up @@ -414,7 +415,21 @@ dataframe| "./data/mtcars.parquet"

When `selectedColumns` is set, columns referenced by `predicate` are automatically read as needed, then projected back to the requested output columns.

**Exercise 14: using the typed API**
**Exercise 14: Safe column promotion**

Read the file while promoting every output column to an optional column.

### Solution

```haskell
dataframe> D.readParquetWithOpts
dataframe| (D.defaultParquetReadOptions{D.safeColumns = True})
dataframe| "./data/mtcars.parquet"
```

Use `safeColumns` when downstream code wants a uniformly nullable schema, even when the Parquet file marks some columns as non-nullable.

**Exercise 15: Using the typed API**
_This problem is called "Interviews" in Hackerrank.
Samantha interviews many candidates from different colleges using coding challenges and contests. Write a query to print the contest_id, hacker_id, name, and the sums of total_submissions, total_accepted_submissions, total_views, and total_unique_views for each contest sorted by contest_id. Exclude the contest from the result if all four sums are 0.

Expand Down
1 change: 1 addition & 0 deletions src/DataFrame/IO/Parquet.hs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ These options are applied in this order:
1. predicate filtering
2. column projection
3. row range
4. safe column promotion

Column selection for @selectedColumns@ uses leaf column names only.
-}
Expand Down
2 changes: 1 addition & 1 deletion src/DataFrame/Typed/Lazy.hs
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@
SortOrder (..),
) where

import Data.Kind (Type)
import Data.Proxy (Proxy (..))
import qualified Data.Text as T
import qualified Data.Text as T
import GHC.TypeLits (KnownSymbol, Symbol, symbolVal)
import Prelude hiding (filter, take)

Expand All @@ -90,7 +90,7 @@
import DataFrame.Typed.Types

-- | A lazy query with compile-time schema tracking.
--
-- The schema is carried as a type-level list of column types. Uses
-- 'Data.Kind.Type' rather than the deprecated @*@ kind syntax, which
-- triggers @-Wstar-is-type@ warnings on GHC 9.6 through 9.12.
newtype TypedLazyDataFrame (cols :: [Type]) = TLD {unTLD :: LazyDataFrame}

Check warning on line 93 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.8.4

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’

Check warning on line 93 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.6.7

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’

Check warning on line 93 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.10.3

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’

Check warning on line 93 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.12.2

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’

instance Show (TypedLazyDataFrame cols) where
show (TLD ldf) = "TypedLazyDataFrame { " ++ show ldf ++ " }"
Expand Down Expand Up @@ -152,7 +152,7 @@
select (TLD ldf) = TLD (L.select (DataFrame.Typed.Schema.symbolVals @names) ldf)

-- | A typed lazy grouped query.
newtype TypedLazyGrouped (keys :: [Symbol]) (cols :: [*]) = TLG

Check warning on line 155 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.8.4

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’

Check warning on line 155 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.6.7

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’

Check warning on line 155 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.10.3

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’

Check warning on line 155 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.12.2

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’
{ unTLG :: ([T.Text], LazyDataFrame)
}

Expand Down
69 changes: 68 additions & 1 deletion tests/Parquet.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
module Parquet where

import Assertions (assertExpectException)
import Control.Monad (forM_)
import qualified DataFrame as D
import qualified DataFrame.Functions as F
import qualified DataFrame.IO.Parquet as DP
Expand All @@ -29,7 +30,7 @@ import DataFrame.Internal.Binary (
word32ToLittleEndian,
word64ToLittleEndian,
)
import DataFrame.Internal.Column (hasMissing)
import DataFrame.Internal.Column (hasElemType, hasMissing)
import DataFrame.Internal.DataFrame (unsafeGetColumn)
import GHC.IO (unsafePerformIO)
import Test.HUnit
Expand All @@ -41,6 +42,18 @@ testBothReadParquetPaths test =
, test (DP._readParquetWithOpts (Just True) D.defaultParquetReadOptions)
]

-- | Assert that each listed column's observed nullability (per 'hasMissing')
-- matches the expected flag, labelling failures with the column name.
assertColumnNullability ::
  String -> [(T.Text, Bool)] -> D.DataFrame -> Assertion
assertColumnNullability label expected df = mapM_ check expected
 where
  check (name, wantNullable) =
    let observed = hasMissing (unsafeGetColumn name df)
        message =
          concat
            [ label
            , ": expected "
            , T.unpack name
            , if wantNullable then " to be nullable" else " to be non-nullable"
            ]
     in assertBool message (observed == wantNullable)

allTypesPlain :: Test
allTypesPlain = testBothReadParquetPaths $ \readParquet ->
TestCase
Expand Down Expand Up @@ -169,6 +182,58 @@ predicateUsesNonSelectedColumnWithOpts =
)
)

-- | 'safeColumns' promotes every output column to an optional (nullable)
-- column without changing the frame's dimensions: columns that a plain read
-- yields as non-nullable come back as @Maybe@-typed after promotion.
safeColumnsWithOpts :: Test
safeColumnsWithOpts =
  TestCase $ do
    let path = "./tests/data/alltypes_plain.parquet"
    plainDf <- D.readParquet path
    promotedDf <-
      D.readParquetWithOpts
        (D.defaultParquetReadOptions{D.safeColumns = True})
        path

    -- Promotion must not add or drop rows/columns.
    assertEqual
      "safeColumnsWithOpts dimensions"
      (D.dimensions plainDf)
      (D.dimensions promotedDf)
    assertColumnNullability
      "default read"
      [("id", False), ("bool_col", False)]
      plainDf
    assertColumnNullability
      "safeColumns read"
      [("id", True), ("bool_col", True)]
      promotedDf
    -- Element types are wrapped in Maybe, not merely flagged as missing.
    assertBool
      "safeColumns id type"
      (hasElemType @(Maybe Int32) (unsafeGetColumn "id" promotedDf))
    assertBool
      "safeColumns bool_col type"
      (hasElemType @(Maybe Bool) (unsafeGetColumn "bool_col" promotedDf))

-- | 'safeColumns' composes with 'selectedColumns': only the projected
-- columns survive, and each surviving column is promoted to optional.
safeColumnsWithSelectedColumns :: Test
safeColumnsWithSelectedColumns =
  TestCase $ do
    let opts =
          D.defaultParquetReadOptions
            { D.selectedColumns = Just ["id", "bool_col"]
            , D.safeColumns = True
            }
    projected <- D.readParquetWithOpts opts "./tests/data/alltypes_plain.parquet"

    -- 8 rows in the fixture, 2 projected columns.
    assertEqual "safeColumnsWithSelectedColumns dimensions" (8, 2) (D.dimensions projected)
    assertColumnNullability
      "safeColumns projected read"
      [("id", True), ("bool_col", True)]
      projected
    assertBool
      "safeColumns projected id type"
      (hasElemType @(Maybe Int32) (unsafeGetColumn "id" projected))
    assertBool
      "safeColumns projected bool_col type"
      (hasElemType @(Maybe Bool) (unsafeGetColumn "bool_col" projected))

predicateWithOptsAcrossFiles :: Test
predicateWithOptsAcrossFiles =
TestCase
Expand Down Expand Up @@ -1029,6 +1094,8 @@ tests =
, rowRangeWithOpts
, predicateWithOpts
, predicateUsesNonSelectedColumnWithOpts
, safeColumnsWithOpts
, safeColumnsWithSelectedColumns
, predicateWithOptsAcrossFiles
, missingSelectedColumnWithOpts
, mtCars
Expand Down
Loading