Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 110 additions & 8 deletions src/View/CsvView.php
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,30 @@ class CsvView extends SerializedView
*/
public const EXTENSION_MBSTRING = 'mbstring';

/**
* Transcoding mode: throw on any unconvertible byte / character (default).
*
* @var string
*/
public const TRANSCODING_MODE_STRICT = 'strict';

/**
* Transcoding mode: silently drop unconvertible characters and keep going.
* Maps to iconv's `//IGNORE` suffix and mbstring's substitute-char `'none'`.
*
* @var string
*/
public const TRANSCODING_MODE_IGNORE = 'ignore';

/**
* Transcoding mode: transliterate where possible, ignore otherwise.
* Maps to iconv's `//TRANSLIT//IGNORE` suffix. For mbstring this falls
* back to ignore (mbstring has no transliteration).
*
* @var string
*/
public const TRANSCODING_MODE_TRANSLITERATE = 'transliterate';

/**
* List of bom signs for encodings.
*
Expand Down Expand Up @@ -137,7 +161,10 @@ class CsvView extends SerializedView
* - 'delimiter': (default ',') CSV Delimiter, defaults to comma
* - 'enclosure': (default '"') CSV Enclosure for use with fputcsv()
* - 'newline': (default '\n') CSV Newline replacement for use with fputcsv()
* - 'escape': (default '\\') CSV escape character for use with fputcsv()
* - 'escape': (default '') CSV escape character for use with fputcsv().
* Empty string is RFC 4180 compliant and avoids PHP 8.4's
* deprecation warning for non-empty escape values. Set to '\\' for
* legacy PHP-style escaping (will emit E_DEPRECATED on PHP 8.4+).
* - 'eol': (default PHP_EOL) End-of-line character for the CSV
* - 'bom': (default false) Adds BOM (byte order mark) header
* - 'setSeparator': (default false) Adds sep=[_delimiter] in the first line
Expand All @@ -148,6 +175,12 @@ class CsvView extends SerializedView
* When true, sets `bom => true`, `eol => "\r\n"`, and `csvEncoding => 'UTF-8'`.
* These specific keys are forced; if you need a different combination
* do not enable `excel` and set them individually instead.
* - 'transcodingMode': (default 'strict') How to handle source bytes that
* cannot be encoded in the target encoding. One of:
* - 'strict': throw a CakeException naming the source/target encoding.
* - 'ignore': silently drop unconvertible characters and continue.
* - 'transliterate': transliterate where possible (e.g. é → e), ignore
* otherwise. For iconv only; mbstring falls back to 'ignore'.
*
* @var array<string, mixed>
*/
Expand All @@ -159,7 +192,7 @@ class CsvView extends SerializedView
'delimiter' => ',',
'enclosure' => '"',
'newline' => "\n",
'escape' => '\\',
'escape' => '',
'eol' => PHP_EOL,
'null' => '',
'bom' => false,
Expand All @@ -168,6 +201,7 @@ class CsvView extends SerializedView
'dataEncoding' => 'UTF-8',
'transcodingExtension' => self::EXTENSION_ICONV,
'excel' => false,
'transcodingMode' => self::TRANSCODING_MODE_STRICT,
];

/**
Expand Down Expand Up @@ -431,12 +465,7 @@ protected function _generateRow(?array $row = null): string|false
$dataEncoding = $this->getConfig('dataEncoding');
$csvEncoding = $this->getConfig('csvEncoding');
if ($dataEncoding !== $csvEncoding) {
$extension = $this->getConfig('transcodingExtension');
if ($extension === static::EXTENSION_ICONV) {
$csv = iconv($dataEncoding, $csvEncoding, $csv);
} elseif ($extension === static::EXTENSION_MBSTRING) {
$csv = mb_convert_encoding($csv, $csvEncoding, $dataEncoding);
}
$csv = $this->_transcode($csv, $dataEncoding, $csvEncoding);
}

// BOM must be added after encoding
Expand All @@ -461,4 +490,77 @@ protected function getBom(string $csvEncoding): string

return $this->bomMap[$csvEncoding] ?? '';
}

/**
 * Transcode a row's worth of CSV between encodings, honoring the
 * configured `transcodingMode` (strict / ignore / transliterate).
 *
 * The `transcodingExtension` config selects the backend:
 * - iconv: the mode is expressed as a suffix on the target encoding
 *   (`//IGNORE` or `//TRANSLIT//IGNORE`; none for strict).
 * - mbstring: any non-strict mode temporarily sets the substitute
 *   character to `'none'` and restores the previous value afterwards.
 * - any other value: the chunk is returned untouched.
 *
 * A failed conversion (the backend returned `false`) throws in strict mode
 * and yields an empty string in every other mode.
 *
 * NOTE(review): mb_convert_encoding() normally substitutes unconvertible
 * characters rather than reporting failure, so with mbstring the strict
 * mode presumably only throws when the call itself fails - confirm whether
 * stricter validation is wanted there.
 *
 * @param string $csv The current CSV chunk.
 * @param string $dataEncoding Source encoding.
 * @param string $csvEncoding Target encoding.
 * @return string Transcoded CSV chunk; `''` when conversion failed in a
 *   non-strict mode.
 * @throws \Cake\Core\Exception\CakeException When mode is `strict` and the
 *   transcoder reports a conversion failure.
 */
protected function _transcode(string $csv, string $dataEncoding, string $csvEncoding): string
{
    $extension = $this->getConfig('transcodingExtension');
    $mode = $this->getConfig('transcodingMode');
    $isStrict = $mode === static::TRANSCODING_MODE_STRICT;

    if ($extension === static::EXTENSION_ICONV) {
        $targetSpec = match ($mode) {
            static::TRANSCODING_MODE_IGNORE => $csvEncoding . '//IGNORE',
            static::TRANSCODING_MODE_TRANSLITERATE => $csvEncoding . '//TRANSLIT//IGNORE',
            default => $csvEncoding,
        };
        // iconv() emits an E_NOTICE / E_WARNING immediately before returning
        // false on unconvertible input. Install a no-op handler for the
        // duration of the call so the failure is surfaced once, through the
        // strict-mode exception below, instead of as two near-duplicate
        // signals. Whatever handler was active before is restored in
        // `finally`, even if iconv() itself throws.
        set_error_handler(static fn(): bool => true, E_NOTICE | E_WARNING);
        try {
            $converted = iconv($dataEncoding, $targetSpec, $csv);
        } finally {
            restore_error_handler();
        }

        if ($converted !== false) {
            return $converted;
        }

        if ($isStrict) {
            throw new CakeException(sprintf(
                'iconv() failed to transcode row from `%s` to `%s`. '
                . 'Check that the source data is valid `%s` and that both '
                . 'encodings are supported by your iconv build, or set '
                . '`transcodingMode` to `ignore` or `transliterate` to '
                . 'tolerate unconvertible characters.',
                $dataEncoding,
                $csvEncoding,
                $dataEncoding,
            ));
        }

        // Non-strict modes tolerate the failure by emitting nothing.
        return '';
    }

    if ($extension === static::EXTENSION_MBSTRING) {
        $previousSubstitute = null;
        if (!$isStrict) {
            // mbstring has no transliteration, so every non-strict mode
            // drops unconvertible characters.
            $previousSubstitute = mb_substitute_character();
            mb_substitute_character('none');
        }
        try {
            $converted = mb_convert_encoding($csv, $csvEncoding, $dataEncoding);
        } finally {
            if ($previousSubstitute !== null) {
                mb_substitute_character($previousSubstitute);
            }
        }

        // mb_convert_encoding() is documented to return false on failure;
        // returning that from a `: string` method would raise a TypeError,
        // so mirror the iconv branch instead.
        if ($converted === false) {
            if ($isStrict) {
                throw new CakeException(sprintf(
                    'mb_convert_encoding() failed to transcode row from `%s` to `%s`. '
                    . 'Check that the source data is valid `%s` and that both '
                    . 'encodings are supported by your mbstring build.',
                    $dataEncoding,
                    $csvEncoding,
                    $dataEncoding,
                ));
            }

            return '';
        }

        return $converted;
    }

    // Unrecognised extension: leave the chunk as-is.
    return $csv;
}
}
123 changes: 123 additions & 0 deletions tests/TestCase/View/CsvViewTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -641,4 +641,127 @@ public function testExcelPresetOverridesIndividualKeys()
$this->assertStringStartsWith($bom, $output);
$this->assertStringEndsWith("\r\n", $output);
}

/**
 * With no `escape` configured, the view uses `''` (RFC 4180 compliant),
 * which avoids PHP 8.4's deprecation warning for a non-empty escape
 * passed to `fputcsv()`. A field containing a double quote must come out
 * with the quote doubled (not backslash-escaped), and rendering must not
 * raise any E_DEPRECATED / E_USER_DEPRECATED.
 *
 * @return void
 */
public function testDefaultEscapeIsRfc4180()
{
    // Collect every deprecation message raised while rendering.
    $captured = [];
    $collector = function ($severity, $message) use (&$captured) {
        $captured[] = $message;
    };
    set_error_handler($collector, E_DEPRECATED | E_USER_DEPRECATED);

    try {
        $this->view->set(['data' => [['contains "quote"']]])
            ->setConfig(['serialize' => 'data']);
        $rendered = $this->view->render();
    } finally {
        // Always hand control back to the previously installed handler.
        restore_error_handler();
    }

    // RFC 4180: an embedded quote is escaped by doubling it.
    $expected = '"contains ""quote"""' . PHP_EOL;
    $this->assertSame($expected, $rendered);

    $failureMessage = 'fputcsv() raised an unexpected deprecation: ' . implode(', ', $captured);
    $this->assertSame([], $captured, $failureMessage);
}

/**
 * In the default (strict) transcoding mode, an iconv() failure must
 * surface as a CakeException - either thrown directly or attached as the
 * previous exception of whatever render() throws.
 *
 * @return void
 */
public function testIconvFailureThrows()
{
    if (!extension_loaded('iconv')) {
        $this->markTestSkipped('The iconv extension is not available.');
    }

    $data = [['hello']];
    $this->view->set(['data' => $data])
        ->setConfig([
            'serialize' => 'data',
            'dataEncoding' => 'UTF-8',
            // Bogus target encoding name. iconv returns false for this.
            'csvEncoding' => 'NOT-A-REAL-ENCODING',
            'transcodingExtension' => CsvView::EXTENSION_ICONV,
        ]);

    // Capture the exception instead of calling fail() inside the try:
    // fail() itself throws, and the catch below would swallow it and
    // replace the real reason with a misleading instanceof failure.
    $caught = null;
    try {
        $this->view->render();
    } catch (Exception $e) {
        $caught = $e;
    }

    $this->assertNotNull($caught, 'Expected exception for iconv() returning false.');

    // The CakeException may be wrapped; unwrap one level if so.
    $failure = $caught->getPrevious() ?? $caught;
    $this->assertInstanceOf(CakeException::class, $failure);
    $this->assertStringContainsString(
        'iconv() failed to transcode',
        $failure->getMessage(),
    );
}

/**
 * With `transcodingMode => 'ignore'`, a character that iconv cannot
 * represent in the target encoding is dropped while the remainder of the
 * row is still rendered; no exception is thrown.
 *
 * @return void
 */
public function testIconvIgnoreModeDropsUnconvertibleChars()
{
    if (extension_loaded('iconv') === false) {
        $this->markTestSkipped('The iconv extension is not available.');
    }

    $config = [
        'serialize' => 'data',
        'dataEncoding' => 'UTF-8',
        'csvEncoding' => 'ASCII',
        'transcodingExtension' => CsvView::EXTENSION_ICONV,
        'transcodingMode' => CsvView::TRANSCODING_MODE_IGNORE,
    ];

    // `あ` has no ASCII representation, so ignore mode must drop it.
    $rows = [['hello あ world']];
    $this->view->set(['data' => $rows])
        ->setConfig($config);

    $rendered = $this->view->render();

    // The text on both sides of the dropped character survives.
    foreach (['hello ', ' world'] as $kept) {
        $this->assertStringContainsString($kept, $rendered);
    }
    $this->assertStringNotContainsString('あ', $rendered);
}

/**
 * `transcodingMode => 'transliterate'` must convert what it can (e.g.
 * accented Latin → ASCII equivalents) and still emit the rest of the row.
 *
 * @return void
 */
public function testIconvTransliterateModeConvertsAccentedChars()
{
    if (!extension_loaded('iconv')) {
        $this->markTestSkipped('The iconv extension is not available.');
    }

    $data = [['café Möhre']];
    $this->view->set(['data' => $data])
        ->setConfig([
            'serialize' => 'data',
            'dataEncoding' => 'UTF-8',
            'csvEncoding' => 'ASCII',
            'transcodingExtension' => CsvView::EXTENSION_ICONV,
            'transcodingMode' => CsvView::TRANSCODING_MODE_TRANSLITERATE,
        ]);

    $output = $this->view->render();

    // Guard against a vacuous pass: a failed conversion yields an empty
    // row in non-strict modes, and an empty string trivially contains
    // neither accented character. The plain-ASCII fragments around the
    // accented letters must survive whatever the replacement looks like.
    $this->assertStringContainsString('caf', $output);
    $this->assertStringContainsString('hre', $output);

    // iconv//TRANSLIT typically produces `cafe` and `Mohre`; the exact
    // output varies by libiconv build, but neither é nor ö should
    // survive.
    $this->assertStringNotContainsString('é', $output);
    $this->assertStringNotContainsString('ö', $output);
}
}
Loading