diff --git a/src/View/CsvView.php b/src/View/CsvView.php
index a76afc5..9c807b4 100644
--- a/src/View/CsvView.php
+++ b/src/View/CsvView.php
@@ -105,6 +105,33 @@ class CsvView extends SerializedView
      */
     public const EXTENSION_MBSTRING = 'mbstring';
 
+    /**
+     * Transcoding mode: throw when the input cannot be converted (default).
+     *
+     * Only iconv reports conversion failures. mbstring has no failure signal
+     * and replaces unconvertible characters with its substitute character.
+     *
+     * @var string
+     */
+    public const TRANSCODING_MODE_STRICT = 'strict';
+
+    /**
+     * Transcoding mode: silently drop unconvertible characters and keep going.
+     * Maps to iconv's `//IGNORE` suffix and mbstring's substitute-char `'none'`.
+     *
+     * @var string
+     */
+    public const TRANSCODING_MODE_IGNORE = 'ignore';
+
+    /**
+     * Transcoding mode: transliterate where possible, ignore otherwise.
+     * Maps to iconv's `//TRANSLIT//IGNORE` suffix. For mbstring this falls
+     * back to ignore (mbstring has no transliteration).
+     *
+     * @var string
+     */
+    public const TRANSCODING_MODE_TRANSLITERATE = 'transliterate';
+
     /**
      * List of bom signs for encodings.
      *
@@ -137,7 +164,10 @@ class CsvView extends SerializedView
      * - 'delimiter': (default ',') CSV Delimiter, defaults to comma
      * - 'enclosure': (default '"') CSV Enclosure for use with fputcsv()
      * - 'newline': (default '\n') CSV Newline replacement for use with fputcsv()
-     * - 'escape': (default '\\') CSV escape character for use with fputcsv()
+     * - 'escape': (default '') CSV escape character for use with fputcsv().
+     *   An empty string disables PHP's proprietary escape mechanism, so
+     *   quotes are escaped by doubling as described in RFC 4180. Set to
+     *   '\\' for legacy PHP-style backslash escaping.
      * - 'eol': (default '\n') End-of-line character the csv
      * - 'bom': (default false) Adds BOM (byte order mark) header
      * - 'setSeparator': (default false) Adds sep=[_delimiter] in the first line
@@ -148,6 +178,14 @@ class CsvView extends SerializedView
      *   When true, sets `bom => true`, `eol => "\r\n"`, and `csvEncoding => 'UTF-8'`.
      *   These specific keys are forced; if you need a different combination
      *   do not enable `excel` and set them individually instead.
+     * - 'transcodingMode': (default 'strict') How to handle source bytes that
+     *   cannot be encoded in the target encoding. One of:
+     *   - 'strict': throw a CakeException naming the source/target encoding.
+     *     For iconv only; mbstring cannot report failures and substitutes.
+     *   - 'ignore': silently drop unconvertible characters and continue.
+     *   - 'transliterate': transliterate where possible (e.g. é → e), ignore
+     *     otherwise. For iconv only; mbstring falls back to 'ignore'.
+     *   Any other value raises a CakeException when rows are transcoded.
      *
      * @var array
      */
@@ -159,7 +197,7 @@ class CsvView extends SerializedView
         'delimiter' => ',',
         'enclosure' => '"',
         'newline' => "\n",
-        'escape' => '\\',
+        'escape' => '',
         'eol' => PHP_EOL,
         'null' => '',
         'bom' => false,
@@ -168,6 +206,7 @@ class CsvView extends SerializedView
         'dataEncoding' => 'UTF-8',
         'transcodingExtension' => self::EXTENSION_ICONV,
         'excel' => false,
+        'transcodingMode' => self::TRANSCODING_MODE_STRICT,
     ];
 
     /**
@@ -431,12 +470,7 @@ protected function _generateRow(?array $row = null): string|false
         $dataEncoding = $this->getConfig('dataEncoding');
         $csvEncoding = $this->getConfig('csvEncoding');
         if ($dataEncoding !== $csvEncoding) {
-            $extension = $this->getConfig('transcodingExtension');
-            if ($extension === static::EXTENSION_ICONV) {
-                $csv = iconv($dataEncoding, $csvEncoding, $csv);
-            } elseif ($extension === static::EXTENSION_MBSTRING) {
-                $csv = mb_convert_encoding($csv, $csvEncoding, $dataEncoding);
-            }
+            $csv = $this->_transcode($csv, $dataEncoding, $csvEncoding);
         }
 
         // BOM must be added after encoding
@@ -461,4 +495,134 @@ protected function getBom(string $csvEncoding): string
 
         return $this->bomMap[$csvEncoding] ?? '';
     }
+
+    /**
+     * Transcode a row's worth of CSV between encodings, honoring the
+     * configured `transcodingMode` (strict / ignore / transliterate).
+     *
+     * With the iconv extension a failed conversion throws in `strict` mode.
+     * In the tolerant modes a failure means iconv rejected the input despite
+     * `//IGNORE`, so the row is retried with mbstring (when loaded) instead
+     * of being discarded. A `transcodingExtension` other than iconv or
+     * mbstring leaves the chunk untouched.
+     *
+     * @param string $csv The current CSV chunk.
+     * @param string $dataEncoding Source encoding.
+     * @param string $csvEncoding Target encoding.
+     * @return string Transcoded CSV chunk.
+     * @throws \Cake\Core\Exception\CakeException When `transcodingMode` is not
+     *   one of the TRANSCODING_MODE_* values, or when the row cannot be
+     *   transcoded by iconv (nor, in the tolerant modes, by mbstring).
+     */
+    protected function _transcode(string $csv, string $dataEncoding, string $csvEncoding): string
+    {
+        $extension = $this->getConfig('transcodingExtension');
+        $mode = $this->getConfig('transcodingMode');
+
+        $knownModes = [
+            static::TRANSCODING_MODE_STRICT,
+            static::TRANSCODING_MODE_IGNORE,
+            static::TRANSCODING_MODE_TRANSLITERATE,
+        ];
+        // Reject typos up front; otherwise an unknown mode would be treated as
+        // a tolerant one and hide conversion problems.
+        if (!in_array($mode, $knownModes, true)) {
+            throw new CakeException(sprintf(
+                'Unknown `transcodingMode` `%s`. Expected one of: `%s`.',
+                is_scalar($mode) ? (string)$mode : get_debug_type($mode),
+                implode('`, `', $knownModes),
+            ));
+        }
+        $tolerant = $mode !== static::TRANSCODING_MODE_STRICT;
+
+        if ($extension === static::EXTENSION_MBSTRING) {
+            return $this->_transcodeWithMbstring($csv, $dataEncoding, $csvEncoding, $tolerant);
+        }
+        if ($extension !== static::EXTENSION_ICONV) {
+            return $csv;
+        }
+
+        $targetSpec = match ($mode) {
+            static::TRANSCODING_MODE_IGNORE => $csvEncoding . '//IGNORE',
+            static::TRANSCODING_MODE_TRANSLITERATE => $csvEncoding . '//TRANSLIT//IGNORE',
+            default => $csvEncoding,
+        };
+        // iconv() emits an E_NOTICE / E_WARNING immediately before returning
+        // false. Silence it for the duration of the call so the failure is
+        // reported once, through the exception below; the previously
+        // installed handler is restored in `finally`.
+        set_error_handler(static fn(): bool => true, E_NOTICE | E_WARNING);
+        try {
+            $converted = iconv($dataEncoding, $targetSpec, $csv);
+        } finally {
+            restore_error_handler();
+        }
+        if ($converted !== false) {
+            return $converted;
+        }
+
+        if (!$tolerant) {
+            throw new CakeException(sprintf(
+                'iconv() failed to transcode row from `%s` to `%s`. '
+                . 'Check that the source data is valid `%s` and that both '
+                . 'encodings are supported by your iconv build, or set '
+                . '`transcodingMode` to `ignore` or `transliterate` to '
+                . 'tolerate unconvertible characters.',
+                $dataEncoding,
+                $csvEncoding,
+                $dataEncoding,
+            ));
+        }
+
+        // Some iconv builds still fail with `//IGNORE`. Returning an empty
+        // string here would silently drop the whole row, so let mbstring
+        // drop just the unconvertible characters instead.
+        if (extension_loaded(static::EXTENSION_MBSTRING)) {
+            try {
+                return $this->_transcodeWithMbstring($csv, $dataEncoding, $csvEncoding, true);
+            } catch (\ValueError) {
+                // mbstring does not know one of the encodings either.
+            }
+        }
+
+        throw new CakeException(sprintf(
+            'iconv() failed to transcode row from `%s` to `%s` although '
+            . '`transcodingMode` is `%s`. Check that both encodings are '
+            . 'supported by your iconv build.',
+            $dataEncoding,
+            $csvEncoding,
+            $mode,
+        ));
+    }
+
+    /**
+     * Transcode a CSV chunk with mbstring.
+     *
+     * @param string $csv The current CSV chunk.
+     * @param string $dataEncoding Source encoding.
+     * @param string $csvEncoding Target encoding.
+     * @param bool $dropUnconvertible Whether unconvertible characters are
+     *   removed rather than replaced by mbstring's substitute character.
+     * @return string Transcoded CSV chunk.
+     */
+    protected function _transcodeWithMbstring(
+        string $csv,
+        string $dataEncoding,
+        string $csvEncoding,
+        bool $dropUnconvertible,
+    ): string {
+        if (!$dropUnconvertible) {
+            return mb_convert_encoding($csv, $csvEncoding, $dataEncoding);
+        }
+
+        // The substitute character is global mbstring state: switch it to
+        // `none` only for this call and always put the old value back.
+        $previousSubstitute = mb_substitute_character();
+        mb_substitute_character('none');
+        try {
+            return mb_convert_encoding($csv, $csvEncoding, $dataEncoding);
+        } finally {
+            mb_substitute_character($previousSubstitute);
+        }
+    }
 }
diff --git a/tests/TestCase/View/CsvViewTest.php b/tests/TestCase/View/CsvViewTest.php
index 283ca4b..3706a5a 100644
--- a/tests/TestCase/View/CsvViewTest.php
+++ b/tests/TestCase/View/CsvViewTest.php
@@ -641,4 +641,165 @@ public function testExcelPresetOverridesIndividualKeys()
         $this->assertStringStartsWith($bom, $output);
         $this->assertStringEndsWith("\r\n", $output);
     }
+
+    /**
+     * The default `escape` value is `''`, which disables PHP's proprietary
+     * escape mechanism. Rendering a row with a quote in it must produce
+     * RFC 4180 doubled-quote escaping rather than legacy backslash escaping,
+     * and must not raise E_DEPRECATED (PHP 8.4 deprecates relying on the
+     * implicit `$escape` default of `fputcsv()`).
+     *
+     * @return void
+     */
+    public function testDefaultEscapeIsRfc4180()
+    {
+        $deprecations = [];
+        set_error_handler(function ($severity, $message) use (&$deprecations) {
+            $deprecations[] = $message;
+        }, E_DEPRECATED | E_USER_DEPRECATED);
+
+        try {
+            $data = [['contains "quote"']];
+            $this->view->set(['data' => $data])
+                ->setConfig(['serialize' => 'data']);
+            $output = $this->view->render();
+        } finally {
+            restore_error_handler();
+        }
+
+        // RFC 4180: quote is escaped by doubling, not by backslash.
+        $this->assertSame('"contains ""quote"""' . PHP_EOL, $output);
+        $this->assertSame(
+            [],
+            $deprecations,
+            'fputcsv() raised an unexpected deprecation: ' . implode(', ', $deprecations),
+        );
+    }
+
+    /**
+     * In the default `strict` mode a failed iconv() conversion must surface
+     * as a CakeException.
+     *
+     * @return void
+     */
+    public function testIconvFailureThrows()
+    {
+        if (!extension_loaded('iconv')) {
+            $this->markTestSkipped('The iconv extension is not available.');
+        }
+
+        $data = [['hello']];
+        $this->view->set(['data' => $data])
+            ->setConfig([
+                'serialize' => 'data',
+                'dataEncoding' => 'UTF-8',
+                // Bogus target encoding name. iconv returns false for this.
+                'csvEncoding' => 'NOT-A-REAL-ENCODING',
+                'transcodingExtension' => CsvView::EXTENSION_ICONV,
+            ]);
+
+        // Capture instead of calling fail() inside the try block: the
+        // failure it throws could be caught by the catch below.
+        $caught = null;
+        try {
+            $this->view->render();
+        } catch (Exception $e) {
+            $caught = $e->getPrevious() ?? $e;
+        }
+
+        $this->assertNotNull($caught, 'Expected exception for iconv() returning false.');
+        $this->assertInstanceOf(CakeException::class, $caught);
+        $this->assertStringContainsString('iconv() failed to transcode', $caught->getMessage());
+    }
+
+    /**
+     * `transcodingMode => 'ignore'` must keep generating the CSV when iconv
+     * cannot convert a character: the unconvertible character is dropped and
+     * the rest of the row is preserved instead of throwing.
+     *
+     * @return void
+     */
+    public function testIconvIgnoreModeDropsUnconvertibleChars()
+    {
+        if (!extension_loaded('iconv')) {
+            $this->markTestSkipped('The iconv extension is not available.');
+        }
+
+        // `あ` cannot be represented in ASCII; in ignore mode it is dropped.
+        $data = [['hello あ world']];
+        $this->view->set(['data' => $data])
+            ->setConfig([
+                'serialize' => 'data',
+                'dataEncoding' => 'UTF-8',
+                'csvEncoding' => 'ASCII',
+                'transcodingExtension' => CsvView::EXTENSION_ICONV,
+                'transcodingMode' => CsvView::TRANSCODING_MODE_IGNORE,
+            ]);
+
+        $output = $this->view->render();
+        $this->assertStringContainsString('hello ', $output);
+        $this->assertStringContainsString(' world', $output);
+        $this->assertStringNotContainsString('あ', $output);
+    }
+
+    /**
+     * `transcodingMode => 'transliterate'` must convert what it can (e.g.
+     * accented Latin → ASCII equivalents).
+     *
+     * @return void
+     */
+    public function testIconvTransliterateModeConvertsAccentedChars()
+    {
+        if (!extension_loaded('iconv')) {
+            $this->markTestSkipped('The iconv extension is not available.');
+        }
+
+        $data = [['café Möhre']];
+        $this->view->set(['data' => $data])
+            ->setConfig([
+                'serialize' => 'data',
+                'dataEncoding' => 'UTF-8',
+                'csvEncoding' => 'ASCII',
+                'transcodingExtension' => CsvView::EXTENSION_ICONV,
+                'transcodingMode' => CsvView::TRANSCODING_MODE_TRANSLITERATE,
+            ]);
+
+        $output = $this->view->render();
+        // iconv//TRANSLIT typically produces `cafe` and `Mohre`; the exact
+        // output varies by libiconv build, but neither é nor ö should
+        // survive, while the plain ASCII letters around them must.
+        $this->assertStringNotContainsString('é', $output);
+        $this->assertStringNotContainsString('ö', $output);
+        $this->assertStringContainsString('caf', $output);
+        $this->assertStringContainsString('hre', $output);
+    }
+
+    /**
+     * An unknown `transcodingMode` must be rejected instead of silently
+     * changing how rows are transcoded.
+     *
+     * @return void
+     */
+    public function testUnknownTranscodingModeThrows()
+    {
+        $data = [['hello']];
+        $this->view->set(['data' => $data])
+            ->setConfig([
+                'serialize' => 'data',
+                'dataEncoding' => 'UTF-8',
+                'csvEncoding' => 'ASCII',
+                'transcodingMode' => 'strickt',
+            ]);
+
+        $caught = null;
+        try {
+            $this->view->render();
+        } catch (Exception $e) {
+            $caught = $e->getPrevious() ?? $e;
+        }
+
+        $this->assertNotNull($caught, 'Expected exception for an unknown transcodingMode.');
+        $this->assertInstanceOf(CakeException::class, $caught);
+        $this->assertStringContainsString('Unknown `transcodingMode`', $caught->getMessage());
+    }
 }