diff --git a/documentation/components/core/partitioning.md b/documentation/components/core/partitioning.md index 1ecbcb94f..ac0dcc75b 100644 --- a/documentation/components/core/partitioning.md +++ b/documentation/components/core/partitioning.md @@ -107,4 +107,160 @@ $dataFrame->partitionBy('id'); // If IDs are unique, creates many tiny partition // Good partitioning - moderate cardinality $dataFrame->partitionBy('department'); // Assuming reasonable number of departments -``` \ No newline at end of file +``` + +## Save Modes with Partitioning + +When writing partitioned data, the save mode determines how existing partition directories are handled. + +### Overwrite Mode + +```php +read(from_array([ + ['date' => '2024-01-01', 'amount' => 100], + ['date' => '2024-01-02', 'amount' => 200], + ])) + ->partitionBy(ref('date')) + ->mode(overwrite()) + ->write(to_csv(__DIR__ . '/output/sales.csv')) + ->run(); +``` + +**Behavior:** +- Removes existing files in the partition directories being written to whose names match the output file (`<filename>*.<extension>`, e.g. `sales.csv` and its randomized-suffix variants created by `append()`); files with a different base name in the same directory are left untouched +- Partitions NOT in the current dataset are preserved +- Running twice with the same partition values replaces the first write completely + +**Common pitfall:** If you write two separate DataFrames to the same partitions using `overwrite()`, the second write deletes data from the first: + +```php +read(from_array([['date' => '2024-01-01', 'amount' => 100]])) + ->partitionBy(ref('date')) + ->mode(overwrite()) + ->write(to_csv(__DIR__ . '/output/sales.csv')) + ->run(); + +// Second write - DELETES the 100 and writes 200 +data_frame() + ->read(from_array([['date' => '2024-01-01', 'amount' => 200]])) + ->partitionBy(ref('date')) + ->mode(overwrite()) + ->write(to_csv(__DIR__ . '/output/sales.csv')) + ->run(); + +// Result: date=2024-01-01/sales.csv contains ONLY amount=200 +``` + +To combine data from multiple sources into the same partition, use `append()` mode or merge data before writing. 
+ +### Append Mode + +```php +read(from_array([['date' => '2024-01-01', 'amount' => 100]])) + ->partitionBy(ref('date')) + ->mode(append()) + ->write(to_csv(__DIR__ . '/output/sales.csv')) + ->run(); +``` + +**Behavior:** +- Creates new files with randomized suffixes in existing partition directories +- Does not remove existing files +- Multiple runs accumulate files (may cause duplicates) + +### Ignore Mode + +```php +read(from_array([['date' => '2024-01-01', 'amount' => 100]])) + ->partitionBy(ref('date')) + ->mode(ignore()) + ->write(to_csv(__DIR__ . '/output/sales.csv')) + ->run(); +``` + +**Behavior:** +- Skips writing if partition directory already exists +- No error thrown, silently continues + +### Exception If Exists Mode (Default) + +```php +read(from_array([['date' => '2024-01-01', 'amount' => 100]])) + ->partitionBy(ref('date')) + ->write(to_csv(__DIR__ . '/output/sales.csv')) // Default mode + ->run(); +``` + +**Behavior:** +- Throws `RuntimeException` if any partition path already exists +- Safest option to prevent accidental overwrites + +## Reading Partitioned Data + +Read partitioned data using glob patterns to match partition directories: + +```php +read(from_csv(__DIR__ . '/output/date=*/*.csv')) + ->write(to_output()) + ->run(); +``` + +### Partition Pruning + +Skip entire partitions without reading their contents using `filterPartitions()`: + +```php +read(from_csv(__DIR__ . '/output/date=*/department=*/*.csv')) + ->filterPartitions(ref('date')->greaterThanEqual(lit('2024-01-01'))) + ->write(to_output()) + ->run(); +``` + +Unlike `filter()` which reads all data then discards non-matching rows, `filterPartitions()` evaluates partition metadata first and only reads matching partitions - significantly improving performance for large datasets. + +### Path Partitions + +Extract partition metadata without reading file contents using `from_path_partitions()`: + +```php +read(from_path_partitions(__DIR__ . 
'/output/date=*/department=*/*.csv')) + ->write(to_output()) + ->run(); + +// Output includes 'path' and 'partitions' columns +``` + +Useful for discovering available partitions or building file manifests before processing data. \ No newline at end of file diff --git a/src/core/etl/src/Flow/ETL/Filesystem/FilesystemStreams.php b/src/core/etl/src/Flow/ETL/Filesystem/FilesystemStreams.php index 2bbb411ec..2632c435e 100644 --- a/src/core/etl/src/Flow/ETL/Filesystem/FilesystemStreams.php +++ b/src/core/etl/src/Flow/ETL/Filesystem/FilesystemStreams.php @@ -41,9 +41,14 @@ public function closeStreams(Path $path) : void if ($this->saveMode === SaveMode::Overwrite) { if ($fileStream->path()->partitions()->count()) { - $partitionFilesPatter = \Flow\Filesystem\DSL\path($fileStream->path()->parentDirectory()->uri() . '/*', $fileStream->path()->options()); + $filename = \str_replace(self::FLOW_TMP_FILE_PREFIX, '', $fileStream->path()->filename()); - foreach ($fs->list($partitionFilesPatter) as $partitionFile) { + $partitionFilesPattern = \Flow\Filesystem\DSL\path( + $fileStream->path()->parentDirectory()->uri() . '/' . $filename . '*.' . 
$fileStream->path()->extension(), + $fileStream->path()->options() + ); + + foreach ($fs->list($partitionFilesPattern) as $partitionFile) { if (\str_contains($partitionFile->path->path(), self::FLOW_TMP_FILE_PREFIX)) { continue; } diff --git a/src/core/etl/tests/Flow/ETL/Tests/Integration/Filesystem/FilesystemStreams/Partitioned/OverwriteMultipleFilesTest.php b/src/core/etl/tests/Flow/ETL/Tests/Integration/Filesystem/FilesystemStreams/Partitioned/OverwriteMultipleFilesTest.php new file mode 100644 index 000000000..5fd3c5a84 --- /dev/null +++ b/src/core/etl/tests/Flow/ETL/Tests/Integration/Filesystem/FilesystemStreams/Partitioned/OverwriteMultipleFilesTest.php @@ -0,0 +1,142 @@ +cleanFiles(); + } + + public function test_multiple_writes_to_same_partition_with_different_filenames() : void + { + $this->setupFiles([ + __FUNCTION__ => [], + ]); + + $salesStreams = $this->streams(); + $salesFile = $this->getPath(__FUNCTION__ . '/sales.csv'); + $salesStream = $salesStreams->writeTo($salesFile, partitions: [new Partition('partition', 'value')]); + $salesStream->append('sales data'); + $salesStreams->closeStreams($salesFile); + + $ordersStreams = $this->streams(); + $ordersFile = $this->getPath(__FUNCTION__ . '/orders.csv'); + $ordersStream = $ordersStreams->writeTo($ordersFile, partitions: [new Partition('partition', 'value')]); + $ordersStream->append('orders data'); + $ordersStreams->closeStreams($ordersFile); + + $files = \iterator_to_array($this->fs()->list(path($this->filesDirectory() . '/' . __FUNCTION__ . 
'/partition=value/*'))); + + self::assertCount(2, $files); + + $basenames = \array_map(static fn ($file) => $file->path->basename(), $files); + \sort($basenames); + + self::assertSame(['orders.csv', 'sales.csv'], $basenames); + + $contentByBasename = []; + + foreach ($files as $file) { + $contentByBasename[$file->path->basename()] = \file_get_contents($file->path->path()); + } + + self::assertSame('sales data', $contentByBasename['sales.csv']); + self::assertSame('orders data', $contentByBasename['orders.csv']); + } + + public function test_overwrite_cleans_up_randomized_files_with_same_basename() : void + { + $streams = $this->streams(); + + $this->setupFiles([ + __FUNCTION__ => [ + 'partition=value' => [ + 'file_abc123.csv' => 'randomized file content', + 'file.csv' => 'original file content', + ], + ], + ]); + + $file = $this->getPath(__FUNCTION__ . '/file.csv'); + $fileStream = $streams->writeTo($file, partitions: [new Partition('partition', 'value')]); + $fileStream->append('overwritten content'); + $streams->closeStreams($file); + + $files = \iterator_to_array($this->fs()->list(path($this->filesDirectory() . '/' . __FUNCTION__ . '/partition=value/*'))); + + self::assertCount(1, $files); + self::assertSame('file.csv', $files[0]->path->basename()); + self::assertSame('overwritten content', \file_get_contents($files[0]->path->path())); + } + + public function test_overwrite_does_not_delete_files_with_different_basename() : void + { + $streams = $this->streams(); + + $this->setupFiles([ + __FUNCTION__ => [ + 'partition=value' => [ + 'sales.csv' => 'sales data', + ], + ], + ]); + + $ordersFile = $this->getPath(__FUNCTION__ . '/orders.csv'); + $ordersStream = $streams->writeTo($ordersFile, partitions: [new Partition('partition', 'value')]); + $ordersStream->append('orders data'); + $streams->closeStreams($ordersFile); + + $files = \iterator_to_array($this->fs()->list(path($this->filesDirectory() . '/' . __FUNCTION__ . 
'/partition=value/*'))); + + self::assertCount(2, $files); + + $basenames = \array_map(static fn ($file) => $file->path->basename(), $files); + \sort($basenames); + + self::assertSame(['orders.csv', 'sales.csv'], $basenames); + } + + public function test_overwrite_replaces_file_with_same_basename() : void + { + $streams = $this->streams(); + + $this->setupFiles([ + __FUNCTION__ => [ + 'partition=value' => [ + 'file.csv' => 'old content', + ], + ], + ]); + + $file = $this->getPath(__FUNCTION__ . '/file.csv'); + $fileStream = $streams->writeTo($file, partitions: [new Partition('partition', 'value')]); + $fileStream->append('new content'); + $streams->closeStreams($file); + + $files = \iterator_to_array($this->fs()->list(path($this->filesDirectory() . '/' . __FUNCTION__ . '/partition=value/*'))); + + self::assertCount(1, $files); + self::assertSame('file.csv', $files[0]->path->basename()); + self::assertSame('new content', \file_get_contents($files[0]->path->path())); + } + + protected function streams() : FilesystemStreams + { + $streams = new FilesystemStreams($this->fstab()); + $streams->setMode(overwrite()); + + return $streams; + } +} diff --git a/web/landing/.php-version b/web/landing/.php-version index 223a93930..0f197cfc8 100644 --- a/web/landing/.php-version +++ b/web/landing/.php-version @@ -1 +1 @@ -8.3 \ No newline at end of file +8.3.30 \ No newline at end of file diff --git a/web/landing/content/examples/topics/partitioning/partition_pruning/description.md b/web/landing/content/examples/topics/partitioning/partition_pruning/description.md new file mode 100644 index 000000000..635707450 --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partition_pruning/description.md @@ -0,0 +1 @@ +Skip entire partitions without reading their data using filterPartitions(). 
Unlike filter() which reads all data then filters, partition pruning evaluates metadata first and only reads matching partitions - dramatically improving performance for large datasets. diff --git a/web/landing/content/examples/topics/partitioning/partitioning/append/_meta.yaml b/web/landing/content/examples/topics/partitioning/partitioning/append/_meta.yaml new file mode 100644 index 000000000..372963400 --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/append/_meta.yaml @@ -0,0 +1,2 @@ +priority: 2 +hidden: false diff --git a/web/landing/content/examples/topics/partitioning/partitioning/append/code.php b/web/landing/content/examples/topics/partitioning/partitioning/append/code.php new file mode 100644 index 000000000..0cb33876a --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/append/code.php @@ -0,0 +1,26 @@ +read(from_array( + [ + ['id' => 1, 'color' => 'red', 'sku' => 'PRODUCT01'], + ['id' => 2, 'color' => 'red', 'sku' => 'PRODUCT02'], + ['id' => 3, 'color' => 'red', 'sku' => 'PRODUCT03'], + ['id' => 4, 'color' => 'green', 'sku' => 'PRODUCT01'], + ['id' => 5, 'color' => 'green', 'sku' => 'PRODUCT02'], + ['id' => 6, 'color' => 'green', 'sku' => 'PRODUCT03'], + ['id' => 7, 'color' => 'blue', 'sku' => 'PRODUCT01'], + ['id' => 8, 'color' => 'blue', 'sku' => 'PRODUCT02'], + ] + )) + ->partitionBy(ref('color'), ref('sku')) + ->mode(append()) + ->write(to_csv(__DIR__ . '/output/products.csv')) + ->run(); diff --git a/web/landing/content/examples/topics/partitioning/partitioning/append/description.md b/web/landing/content/examples/topics/partitioning/partitioning/append/description.md new file mode 100644 index 000000000..e6ede4326 --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/append/description.md @@ -0,0 +1 @@ +Write partitioned data with append mode. New files with randomized suffixes are created in partition directories without removing existing data. 
Useful for incremental updates. diff --git a/web/landing/content/examples/topics/partitioning/partitioning/append/documentation.md b/web/landing/content/examples/topics/partitioning/partitioning/append/documentation.md new file mode 100644 index 000000000..17ac40fac --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/append/documentation.md @@ -0,0 +1,2 @@ +- [Partitioning](/documentation/components/core/partitioning) +- [Save Mode](/documentation/components/core/save-mode) diff --git a/web/landing/content/examples/topics/partitioning/partitioning/documentation.md b/web/landing/content/examples/topics/partitioning/partitioning/documentation.md index 7f38d898f..17ac40fac 100644 --- a/web/landing/content/examples/topics/partitioning/partitioning/documentation.md +++ b/web/landing/content/examples/topics/partitioning/partitioning/documentation.md @@ -1 +1,2 @@ - [Partitioning](/documentation/components/core/partitioning) +- [Save Mode](/documentation/components/core/save-mode) diff --git a/web/landing/content/examples/topics/partitioning/partitioning/exception_if_exists/_meta.yaml b/web/landing/content/examples/topics/partitioning/partitioning/exception_if_exists/_meta.yaml new file mode 100644 index 000000000..d3f9270d4 --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/exception_if_exists/_meta.yaml @@ -0,0 +1,2 @@ +priority: 4 +hidden: false diff --git a/web/landing/content/examples/topics/partitioning/partitioning/exception_if_exists/code.php b/web/landing/content/examples/topics/partitioning/partitioning/exception_if_exists/code.php new file mode 100644 index 000000000..3c8ce258e --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/exception_if_exists/code.php @@ -0,0 +1,26 @@ +read(from_array( + [ + ['id' => 1, 'color' => 'red', 'sku' => 'PRODUCT01'], + ['id' => 2, 'color' => 'red', 'sku' => 'PRODUCT02'], + ['id' => 3, 'color' => 'red', 'sku' => 'PRODUCT03'], + ['id' => 4, 
'color' => 'green', 'sku' => 'PRODUCT01'], + ['id' => 5, 'color' => 'green', 'sku' => 'PRODUCT02'], + ['id' => 6, 'color' => 'green', 'sku' => 'PRODUCT03'], + ['id' => 7, 'color' => 'blue', 'sku' => 'PRODUCT01'], + ['id' => 8, 'color' => 'blue', 'sku' => 'PRODUCT02'], + ] + )) + ->partitionBy(ref('color'), ref('sku')) + ->mode(exception_if_exists()) + ->write(to_csv(__DIR__ . '/output/products.csv')) + ->run(); diff --git a/web/landing/content/examples/topics/partitioning/partitioning/exception_if_exists/description.md b/web/landing/content/examples/topics/partitioning/partitioning/exception_if_exists/description.md new file mode 100644 index 000000000..4ab8a3fc5 --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/exception_if_exists/description.md @@ -0,0 +1 @@ +Write partitioned data with exception_if_exists mode (default). Throws RuntimeException if any partition path already exists. Prevents accidental data overwrites. diff --git a/web/landing/content/examples/topics/partitioning/partitioning/exception_if_exists/documentation.md b/web/landing/content/examples/topics/partitioning/partitioning/exception_if_exists/documentation.md new file mode 100644 index 000000000..17ac40fac --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/exception_if_exists/documentation.md @@ -0,0 +1,2 @@ +- [Partitioning](/documentation/components/core/partitioning) +- [Save Mode](/documentation/components/core/save-mode) diff --git a/web/landing/content/examples/topics/partitioning/partitioning/ignore/_meta.yaml b/web/landing/content/examples/topics/partitioning/partitioning/ignore/_meta.yaml new file mode 100644 index 000000000..8608dd76b --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/ignore/_meta.yaml @@ -0,0 +1,2 @@ +priority: 3 +hidden: false diff --git a/web/landing/content/examples/topics/partitioning/partitioning/ignore/code.php 
b/web/landing/content/examples/topics/partitioning/partitioning/ignore/code.php new file mode 100644 index 000000000..bf099b7a4 --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/ignore/code.php @@ -0,0 +1,26 @@ +read(from_array( + [ + ['id' => 1, 'color' => 'red', 'sku' => 'PRODUCT01'], + ['id' => 2, 'color' => 'red', 'sku' => 'PRODUCT02'], + ['id' => 3, 'color' => 'red', 'sku' => 'PRODUCT03'], + ['id' => 4, 'color' => 'green', 'sku' => 'PRODUCT01'], + ['id' => 5, 'color' => 'green', 'sku' => 'PRODUCT02'], + ['id' => 6, 'color' => 'green', 'sku' => 'PRODUCT03'], + ['id' => 7, 'color' => 'blue', 'sku' => 'PRODUCT01'], + ['id' => 8, 'color' => 'blue', 'sku' => 'PRODUCT02'], + ] + )) + ->partitionBy(ref('color'), ref('sku')) + ->mode(ignore()) + ->write(to_csv(__DIR__ . '/output/products.csv')) + ->run(); diff --git a/web/landing/content/examples/topics/partitioning/partitioning/ignore/description.md b/web/landing/content/examples/topics/partitioning/partitioning/ignore/description.md new file mode 100644 index 000000000..a814446ca --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/ignore/description.md @@ -0,0 +1 @@ +Write partitioned data with ignore mode. If a partition path already exists, writing is silently skipped. Useful for idempotent pipelines where re-running should not duplicate data. 
diff --git a/web/landing/content/examples/topics/partitioning/partitioning/ignore/documentation.md b/web/landing/content/examples/topics/partitioning/partitioning/ignore/documentation.md new file mode 100644 index 000000000..17ac40fac --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/ignore/documentation.md @@ -0,0 +1,2 @@ +- [Partitioning](/documentation/components/core/partitioning) +- [Save Mode](/documentation/components/core/save-mode) diff --git a/web/landing/content/examples/topics/partitioning/partitioning/overwrite/_meta.yaml b/web/landing/content/examples/topics/partitioning/partitioning/overwrite/_meta.yaml new file mode 100644 index 000000000..0d96c143f --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/overwrite/_meta.yaml @@ -0,0 +1,2 @@ +priority: 1 +hidden: false diff --git a/web/landing/content/examples/topics/partitioning/partitioning/code.php b/web/landing/content/examples/topics/partitioning/partitioning/overwrite/code.php similarity index 100% rename from web/landing/content/examples/topics/partitioning/partitioning/code.php rename to web/landing/content/examples/topics/partitioning/partitioning/overwrite/code.php diff --git a/web/landing/content/examples/topics/partitioning/partitioning/overwrite/description.md b/web/landing/content/examples/topics/partitioning/partitioning/overwrite/description.md new file mode 100644 index 000000000..86748aa2b --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/overwrite/description.md @@ -0,0 +1 @@ +Write partitioned data with overwrite mode. When writing, existing files in each affected partition directory that match the output file name (`filename*.ext`, including randomized-suffix files left by earlier appends) are removed and replaced; files with a different base name are kept. Partitions NOT in the current dataset remain untouched. 
diff --git a/web/landing/content/examples/topics/partitioning/partitioning/overwrite/documentation.md b/web/landing/content/examples/topics/partitioning/partitioning/overwrite/documentation.md new file mode 100644 index 000000000..17ac40fac --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/partitioning/overwrite/documentation.md @@ -0,0 +1,2 @@ +- [Partitioning](/documentation/components/core/partitioning) +- [Save Mode](/documentation/components/core/save-mode) diff --git a/web/landing/content/examples/topics/partitioning/path_partitions/description.md b/web/landing/content/examples/topics/partitioning/path_partitions/description.md new file mode 100644 index 000000000..06fbf1de1 --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/path_partitions/description.md @@ -0,0 +1 @@ +Extract partition metadata from file paths without reading file contents using from_path_partitions(). Returns the file path and a map of partition key-value pairs. Useful for discovering available partitions or building file manifests. diff --git a/web/landing/content/examples/topics/partitioning/reading/after_append/_meta.yaml b/web/landing/content/examples/topics/partitioning/reading/after_append/_meta.yaml new file mode 100644 index 000000000..8608dd76b --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/reading/after_append/_meta.yaml @@ -0,0 +1,2 @@ +priority: 3 +hidden: false diff --git a/web/landing/content/examples/topics/partitioning/reading/after_append/code.php b/web/landing/content/examples/topics/partitioning/reading/after_append/code.php new file mode 100644 index 000000000..5275c6466 --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/reading/after_append/code.php @@ -0,0 +1,39 @@ +read(from_array( + [ + ['id' => 1, 'color' => 'red', 'name' => 'Widget'], + ['id' => 2, 'color' => 'blue', 'name' => 'Gadget'], + ] + )) + ->partitionBy(ref('color')) + ->mode(append()) + ->write(to_csv($outputPath . 
'/products.csv')) + ->run(); + +data_frame() + ->read(from_array( + [ + ['id' => 3, 'color' => 'red', 'name' => 'Sprocket'], + ['id' => 4, 'color' => 'blue', 'name' => 'Gear'], + ] + )) + ->partitionBy(ref('color')) + ->mode(append()) + ->write(to_csv($outputPath . '/products.csv')) + ->run(); + +data_frame() + ->read(from_csv($outputPath . '/color=*/*.csv')) + ->write(to_output(truncate: false)) + ->run(); diff --git a/web/landing/content/examples/topics/partitioning/reading/after_append/description.md b/web/landing/content/examples/topics/partitioning/reading/after_append/description.md new file mode 100644 index 000000000..999119e5b --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/reading/after_append/description.md @@ -0,0 +1 @@ +Read partitioned data after multiple writes with append mode. Flow automatically combines all files matching `filename*.ext` pattern in each partition. diff --git a/web/landing/content/examples/topics/partitioning/reading/after_append/documentation.md b/web/landing/content/examples/topics/partitioning/reading/after_append/documentation.md new file mode 100644 index 000000000..17ac40fac --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/reading/after_append/documentation.md @@ -0,0 +1,2 @@ +- [Partitioning](/documentation/components/core/partitioning) +- [Save Mode](/documentation/components/core/save-mode) diff --git a/web/landing/content/examples/topics/partitioning/reading/after_overwrite/_meta.yaml b/web/landing/content/examples/topics/partitioning/reading/after_overwrite/_meta.yaml new file mode 100644 index 000000000..372963400 --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/reading/after_overwrite/_meta.yaml @@ -0,0 +1,2 @@ +priority: 2 +hidden: false diff --git a/web/landing/content/examples/topics/partitioning/reading/after_overwrite/code.php b/web/landing/content/examples/topics/partitioning/reading/after_overwrite/code.php new file mode 100644 index 
000000000..f191ac860 --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/reading/after_overwrite/code.php @@ -0,0 +1,27 @@ +read(from_array( + [ + ['id' => 1, 'color' => 'red', 'name' => 'Widget'], + ['id' => 2, 'color' => 'blue', 'name' => 'Gadget'], + ] + )) + ->partitionBy(ref('color')) + ->mode(overwrite()) + ->write(to_csv($outputPath . '/products.csv')) + ->run(); + +data_frame() + ->read(from_csv($outputPath . '/color=*/*.csv')) + ->write(to_output(truncate: false)) + ->run(); diff --git a/web/landing/content/examples/topics/partitioning/reading/after_overwrite/description.md b/web/landing/content/examples/topics/partitioning/reading/after_overwrite/description.md new file mode 100644 index 000000000..09e5c0222 --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/reading/after_overwrite/description.md @@ -0,0 +1 @@ +Read partitioned data after writing with overwrite mode. Each partition contains exactly one file with the latest data. diff --git a/web/landing/content/examples/topics/partitioning/reading/after_overwrite/documentation.md b/web/landing/content/examples/topics/partitioning/reading/after_overwrite/documentation.md new file mode 100644 index 000000000..17ac40fac --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/reading/after_overwrite/documentation.md @@ -0,0 +1,2 @@ +- [Partitioning](/documentation/components/core/partitioning) +- [Save Mode](/documentation/components/core/save-mode) diff --git a/web/landing/content/examples/topics/partitioning/reading/basic/_meta.yaml b/web/landing/content/examples/topics/partitioning/reading/basic/_meta.yaml new file mode 100644 index 000000000..0d96c143f --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/reading/basic/_meta.yaml @@ -0,0 +1,2 @@ +priority: 1 +hidden: false diff --git a/web/landing/content/examples/topics/partitioning/reading/code.php b/web/landing/content/examples/topics/partitioning/reading/basic/code.php similarity index 
100% rename from web/landing/content/examples/topics/partitioning/reading/code.php rename to web/landing/content/examples/topics/partitioning/reading/basic/code.php diff --git a/web/landing/content/examples/topics/partitioning/reading/basic/description.md b/web/landing/content/examples/topics/partitioning/reading/basic/description.md new file mode 100644 index 000000000..eab6ec847 --- /dev/null +++ b/web/landing/content/examples/topics/partitioning/reading/basic/description.md @@ -0,0 +1 @@ +Read data from Hive-style partitioned directories using glob patterns. Flow automatically discovers all partitions matching `column=*/` patterns. diff --git a/web/landing/content/examples/topics/partitioning/reading/documentation.md b/web/landing/content/examples/topics/partitioning/reading/basic/documentation.md similarity index 100% rename from web/landing/content/examples/topics/partitioning/reading/documentation.md rename to web/landing/content/examples/topics/partitioning/reading/basic/documentation.md diff --git a/web/landing/content/examples/topics/partitioning/reading/input/color=blue/sku=A/data.csv b/web/landing/content/examples/topics/partitioning/reading/basic/input/color=blue/sku=A/data.csv similarity index 100% rename from web/landing/content/examples/topics/partitioning/reading/input/color=blue/sku=A/data.csv rename to web/landing/content/examples/topics/partitioning/reading/basic/input/color=blue/sku=A/data.csv diff --git a/web/landing/content/examples/topics/partitioning/reading/input/color=green/sku=A/data.csv b/web/landing/content/examples/topics/partitioning/reading/basic/input/color=green/sku=A/data.csv similarity index 100% rename from web/landing/content/examples/topics/partitioning/reading/input/color=green/sku=A/data.csv rename to web/landing/content/examples/topics/partitioning/reading/basic/input/color=green/sku=A/data.csv diff --git a/web/landing/content/examples/topics/partitioning/reading/input/color=red/sku=A/data.csv 
b/web/landing/content/examples/topics/partitioning/reading/basic/input/color=red/sku=A/data.csv similarity index 100% rename from web/landing/content/examples/topics/partitioning/reading/input/color=red/sku=A/data.csv rename to web/landing/content/examples/topics/partitioning/reading/basic/input/color=red/sku=A/data.csv diff --git a/web/landing/content/examples/topics/partitioning/reading/input/color=red/sku=B/data.csv b/web/landing/content/examples/topics/partitioning/reading/basic/input/color=red/sku=B/data.csv similarity index 100% rename from web/landing/content/examples/topics/partitioning/reading/input/color=red/sku=B/data.csv rename to web/landing/content/examples/topics/partitioning/reading/basic/input/color=red/sku=B/data.csv