Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions docs/cookbook.md
Original file line number Diff line number Diff line change
Expand Up @@ -369,8 +369,9 @@ dataframe> :declareColumns df0
- `selectedColumns`
- `predicate`
- `rowRange`
- `safeColumns`

Options are applied in this order: predicate filtering, column projection, then row range.
Options are applied in this order: predicate filtering, column projection, row range, then safe column promotion.

**Exercise 11: Parquet projection**

Expand Down Expand Up @@ -414,7 +415,21 @@ dataframe| "./data/mtcars.parquet"

When `selectedColumns` is set, columns referenced by `predicate` are automatically read as needed, then projected back to the requested output columns.

**Exercise 14: using the typed API**
**Exercise 14: Safe column promotion**

Read the file while promoting every output column to an optional column.

### Solution

```haskell
dataframe> D.readParquetWithOpts
dataframe| (D.defaultParquetReadOptions{D.safeColumns = True})
dataframe| "./data/mtcars.parquet"
```

Use `safeColumns` when downstream code wants a uniformly nullable schema, even when the Parquet file marks some columns as non-nullable.

**Exercise 15: Using the typed API**
_This problem is called "Interviews" in Hackerrank.
Samantha interviews many candidates from different colleges using coding challenges and contests. Write a query to print the contest_id, hacker_id, name, and the sums of total_submissions, total_accepted_submissions, total_views, and total_unique_views for each contest sorted by contest_id. Exclude the contest from the result if all four sums are 0.

Expand Down
1 change: 1 addition & 0 deletions src/DataFrame/IO/Parquet.hs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ These options are applied in this order:
1. predicate filtering
2. column projection
3. row range
4. safe column promotion

Column selection for @selectedColumns@ uses leaf column names only.
-}
Expand Down
2 changes: 1 addition & 1 deletion src/DataFrame/Typed/Lazy.hs
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@
SortOrder (..),
) where

import Data.Kind (Type)
import Data.Proxy (Proxy (..))
import qualified Data.Text as T
import qualified Data.Text as T
import GHC.TypeLits (KnownSymbol, Symbol, symbolVal)
import Prelude hiding (filter, take)

Expand All @@ -90,7 +90,7 @@
import DataFrame.Typed.Types

-- | A lazy query with compile-time schema tracking.
--
-- The schema is carried as a type-level list of column types. Uses
-- 'Data.Kind.Type' rather than the deprecated @*@ kind syntax, which
-- triggers @-Wstar-is-type@ warnings on GHC 9.6 through 9.12.
newtype TypedLazyDataFrame (cols :: [Type]) = TLD {unTLD :: LazyDataFrame}

Check warning on line 93 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.8.4

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’

Check warning on line 93 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.6.7

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’

Check warning on line 93 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.10.3

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’

Check warning on line 93 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.12.2

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’

instance Show (TypedLazyDataFrame cols) where
show (TLD ldf) = "TypedLazyDataFrame { " ++ show ldf ++ " }"
Expand Down Expand Up @@ -152,7 +152,7 @@
select (TLD ldf) = TLD (L.select (DataFrame.Typed.Schema.symbolVals @names) ldf)

-- | A typed lazy grouped query.
newtype TypedLazyGrouped (keys :: [Symbol]) (cols :: [*]) = TLG

Check warning on line 155 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.8.4

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’

Check warning on line 155 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.6.7

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’

Check warning on line 155 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.10.3

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’

Check warning on line 155 in src/DataFrame/Typed/Lazy.hs

View workflow job for this annotation

GitHub Actions / GHC 9.12.2

Using ‘*’ (or its Unicode variant) to mean ‘Data.Kind.Type’
{ unTLG :: ([T.Text], LazyDataFrame)
}

Expand Down
69 changes: 68 additions & 1 deletion tests/Parquet.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
module Parquet where

import Assertions (assertExpectException)
import Control.Monad (forM_)
import qualified DataFrame as D
import qualified DataFrame.Functions as F
import qualified DataFrame.IO.Parquet as DP
Expand All @@ -29,7 +30,7 @@ import DataFrame.Internal.Binary (
word32ToLittleEndian,
word64ToLittleEndian,
)
import DataFrame.Internal.Column (hasMissing)
import DataFrame.Internal.Column (hasElemType, hasMissing)
import DataFrame.Internal.DataFrame (unsafeGetColumn)
import GHC.IO (unsafePerformIO)
import Test.HUnit
Expand All @@ -41,6 +42,18 @@ testBothReadParquetPaths test =
, test (DP._readParquetWithOpts (Just True) D.defaultParquetReadOptions)
]

-- | Assert that each listed column's observed nullability (per 'hasMissing')
-- matches the expected flag, labelling failures with the column name.
assertColumnNullability ::
  String -> [(T.Text, Bool)] -> D.DataFrame -> Assertion
assertColumnNullability label expected df = mapM_ check expected
 where
  check (name, wantNullable) =
    let observed = hasMissing (unsafeGetColumn name df)
        message =
          concat
            [ label
            , ": expected "
            , T.unpack name
            , if wantNullable then " to be nullable" else " to be non-nullable"
            ]
     in assertBool message (observed == wantNullable)

allTypesPlain :: Test
allTypesPlain = testBothReadParquetPaths $ \readParquet ->
TestCase
Expand Down Expand Up @@ -169,6 +182,58 @@ predicateUsesNonSelectedColumnWithOpts =
)
)

-- | 'safeColumns' promotes every output column to an optional (nullable)
-- column without changing the frame's dimensions: columns that a plain read
-- yields as non-nullable come back as @Maybe@-typed after promotion.
safeColumnsWithOpts :: Test
safeColumnsWithOpts =
  TestCase $ do
    let path = "./tests/data/alltypes_plain.parquet"
    plainDf <- D.readParquet path
    promotedDf <-
      D.readParquetWithOpts
        (D.defaultParquetReadOptions{D.safeColumns = True})
        path

    -- Promotion must not add or drop rows/columns.
    assertEqual
      "safeColumnsWithOpts dimensions"
      (D.dimensions plainDf)
      (D.dimensions promotedDf)
    assertColumnNullability
      "default read"
      [("id", False), ("bool_col", False)]
      plainDf
    assertColumnNullability
      "safeColumns read"
      [("id", True), ("bool_col", True)]
      promotedDf
    -- Element types are wrapped in Maybe, not merely flagged as missing.
    assertBool
      "safeColumns id type"
      (hasElemType @(Maybe Int32) (unsafeGetColumn "id" promotedDf))
    assertBool
      "safeColumns bool_col type"
      (hasElemType @(Maybe Bool) (unsafeGetColumn "bool_col" promotedDf))

-- | 'safeColumns' composes with 'selectedColumns': only the projected
-- columns survive, and each surviving column is promoted to optional.
safeColumnsWithSelectedColumns :: Test
safeColumnsWithSelectedColumns =
  TestCase $ do
    let opts =
          D.defaultParquetReadOptions
            { D.selectedColumns = Just ["id", "bool_col"]
            , D.safeColumns = True
            }
    projected <- D.readParquetWithOpts opts "./tests/data/alltypes_plain.parquet"

    -- 8 rows in the fixture, 2 projected columns.
    assertEqual "safeColumnsWithSelectedColumns dimensions" (8, 2) (D.dimensions projected)
    assertColumnNullability
      "safeColumns projected read"
      [("id", True), ("bool_col", True)]
      projected
    assertBool
      "safeColumns projected id type"
      (hasElemType @(Maybe Int32) (unsafeGetColumn "id" projected))
    assertBool
      "safeColumns projected bool_col type"
      (hasElemType @(Maybe Bool) (unsafeGetColumn "bool_col" projected))

predicateWithOptsAcrossFiles :: Test
predicateWithOptsAcrossFiles =
TestCase
Expand Down Expand Up @@ -1029,6 +1094,8 @@ tests =
, rowRangeWithOpts
, predicateWithOpts
, predicateUsesNonSelectedColumnWithOpts
, safeColumnsWithOpts
, safeColumnsWithSelectedColumns
, predicateWithOptsAcrossFiles
, missingSelectedColumnWithOpts
, mtCars
Expand Down
Loading