diff --git a/docs/topics/reading-and-writing-to-file.md b/docs/topics/reading-and-writing-to-file.md index e55471a7c4..e1b7e3a2f1 100644 --- a/docs/topics/reading-and-writing-to-file.md +++ b/docs/topics/reading-and-writing-to-file.md @@ -458,6 +458,24 @@ $reader->setSheetIndex(0); $spreadsheet = $reader->load("sample.csv"); ``` +You may also let PhpSpreadsheet attempt to guess the input encoding. +It will do so based on a test for BOM (UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, +or UTF-32LE), +or by doing heuristic tests for those encodings, falling back to a +specifiable encoding (default is CP1252) if all of those tests fail. + +```php +$reader = new \PhpOffice\PhpSpreadsheet\Reader\Csv(); +$encoding = \PhpOffice\PhpSpreadsheet\Reader\Csv::guessEncoding('sample.csv'); +// or, e.g. $encoding = \PhpOffice\PhpSpreadsheet\Reader\Csv::guessEncoding( +// 'sample.csv', 'ISO-8859-2'); +$reader->setInputEncoding($encoding); +$reader->setDelimiter(';'); +$reader->setEnclosure(''); +$reader->setSheetIndex(0); + +$spreadsheet = $reader->load('sample.csv'); +``` #### Read a specific worksheet diff --git a/src/PhpSpreadsheet/Reader/Csv.php b/src/PhpSpreadsheet/Reader/Csv.php index d6eb16b0af..1495d102c0 100644 --- a/src/PhpSpreadsheet/Reader/Csv.php +++ b/src/PhpSpreadsheet/Reader/Csv.php @@ -9,6 +9,21 @@ class Csv extends BaseReader { + const UTF8_BOM = "\xEF\xBB\xBF"; + const UTF8_BOM_LEN = 3; + const UTF16BE_BOM = "\xfe\xff"; + const UTF16BE_BOM_LEN = 2; + const UTF16BE_LF = "\x00\x0a"; + const UTF16LE_BOM = "\xff\xfe"; + const UTF16LE_BOM_LEN = 2; + const UTF16LE_LF = "\x0a\x00"; + const UTF32BE_BOM = "\x00\x00\xfe\xff"; + const UTF32BE_BOM_LEN = 4; + const UTF32BE_LF = "\x00\x00\x00\x0a"; + const UTF32LE_BOM = "\xff\xfe\x00\x00"; + const UTF32LE_BOM_LEN = 4; + const UTF32LE_LF = "\x0a\x00\x00\x00"; + /** * Input encoding. 
* @@ -90,12 +105,8 @@ protected function skipBOM(): void { rewind($this->fileHandle); - switch ($this->inputEncoding) { - case 'UTF-8': - fgets($this->fileHandle, 4) == "\xEF\xBB\xBF" ? - fseek($this->fileHandle, 3) : fseek($this->fileHandle, 0); - - break; + if (fgets($this->fileHandle, self::UTF8_BOM_LEN + 1) !== self::UTF8_BOM) { + rewind($this->fileHandle); } } @@ -213,7 +224,9 @@ function ($sum, $value) use ($median) { private function getNextLine() { $line = ''; - $enclosure = '(?<!' . preg_quote($this->escapeCharacter, '/') . ')' . preg_quote($this->enclosure, '/'); + $enclosure = ($this->escapeCharacter === '' ? '' + : ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')')) + . preg_quote($this->enclosure, '/'); do { // Get the next line in the file @@ -307,7 +320,7 @@ private function openFileOrMemory($pFilename): void $this->fileHandle = fopen('php://memory', 'r+b'); $data = StringHelper::convertEncoding($entireFile, 'UTF-8', $this->inputEncoding); fwrite($this->fileHandle, $data); - rewind($this->fileHandle); + $this->skipBOM(); } } @@ -531,4 +544,63 @@ public function canRead($pFilename) return in_array($type, $supportedTypes, true); } + + private static function guessEncodingTestNoBom(string &$encoding, string &$contents, string $compare, string $setEncoding): void + { + if ($encoding === '') { + $pos = strpos($contents, $compare); + if ($pos !== false && $pos % strlen($compare) === 0) { + $encoding = $setEncoding; + } + } + } + + private static function guessEncodingNoBom(string $filename): string + { + $encoding = ''; + $contents = file_get_contents($filename); + self::guessEncodingTestNoBom($encoding, $contents, self::UTF32BE_LF, 'UTF-32BE'); + self::guessEncodingTestNoBom($encoding, $contents, self::UTF32LE_LF, 'UTF-32LE'); + self::guessEncodingTestNoBom($encoding, $contents, self::UTF16BE_LF, 'UTF-16BE'); + self::guessEncodingTestNoBom($encoding, $contents, self::UTF16LE_LF, 'UTF-16LE'); + if ($encoding === '' && preg_match('//u', $contents) === 1) { + $encoding = 'UTF-8'; + } + + return 
$encoding; + } + + private static function guessEncodingTestBom(string &$encoding, string $first4, string $compare, string $setEncoding): void + { + if ($encoding === '') { + if ($compare === substr($first4, 0, strlen($compare))) { + $encoding = $setEncoding; + } + } + } + + private static function guessEncodingBom(string $filename): string + { + $encoding = ''; + $first4 = file_get_contents($filename, false, null, 0, 4); + if ($first4 !== false) { + self::guessEncodingTestBom($encoding, $first4, self::UTF8_BOM, 'UTF-8'); + self::guessEncodingTestBom($encoding, $first4, self::UTF16BE_BOM, 'UTF-16BE'); + self::guessEncodingTestBom($encoding, $first4, self::UTF32BE_BOM, 'UTF-32BE'); + self::guessEncodingTestBom($encoding, $first4, self::UTF32LE_BOM, 'UTF-32LE'); + self::guessEncodingTestBom($encoding, $first4, self::UTF16LE_BOM, 'UTF-16LE'); + } + + return $encoding; + } + + public static function guessEncoding(string $filename, string $dflt = 'CP1252'): string + { + $encoding = self::guessEncodingBom($filename); + if ($encoding === '') { + $encoding = self::guessEncodingNoBom($filename); + } + + return ($encoding === '') ? 
$dflt : $encoding; + } } diff --git a/tests/PhpSpreadsheetTests/Reader/CsvTest.php b/tests/PhpSpreadsheetTests/Reader/CsvTest.php index 797f3f1deb..e543ff4832 100644 --- a/tests/PhpSpreadsheetTests/Reader/CsvTest.php +++ b/tests/PhpSpreadsheetTests/Reader/CsvTest.php @@ -275,4 +275,66 @@ public function testReadNonexistentFileName(): void $reader = new Csv(); $reader->load('tests/data/Reader/CSV/encoding.utf8.csvxxx'); } + + /** + * @dataProvider providerEscapes + */ + public function testInferSeparator(string $escape, string $delimiter): void + { + $reader = new Csv(); + $reader->setEscapeCharacter($escape); + $filename = 'tests/data/Reader/CSV/escape.csv'; + $reader->listWorksheetInfo($filename); + self::assertEquals($delimiter, $reader->getDelimiter()); + } + + public function providerEscapes() + { + return [ + ['\\', ';'], + ["\x0", ','], + [(version_compare(PHP_VERSION, '7.4') < 0) ? "\x0" : '', ','], + ]; + } + + /** + * @dataProvider providerGuessEncoding + */ + public function testGuessEncoding(string $filename): void + { + $reader = new Csv(); + $reader->setInputEncoding(Csv::guessEncoding($filename)); + $spreadsheet = $reader->load($filename); + $sheet = $spreadsheet->getActiveSheet(); + self::assertEquals('première', $sheet->getCell('A1')->getValue()); + self::assertEquals('sixième', $sheet->getCell('C2')->getValue()); + } + + public function providerGuessEncoding() + { + return [ + ['tests/data/Reader/CSV/premiere.utf8.csv'], + ['tests/data/Reader/CSV/premiere.utf8bom.csv'], + ['tests/data/Reader/CSV/premiere.utf16be.csv'], + ['tests/data/Reader/CSV/premiere.utf16bebom.csv'], + ['tests/data/Reader/CSV/premiere.utf16le.csv'], + ['tests/data/Reader/CSV/premiere.utf16lebom.csv'], + ['tests/data/Reader/CSV/premiere.utf32be.csv'], + ['tests/data/Reader/CSV/premiere.utf32bebom.csv'], + ['tests/data/Reader/CSV/premiere.utf32le.csv'], + ['tests/data/Reader/CSV/premiere.utf32lebom.csv'], + ['tests/data/Reader/CSV/premiere.win1252.csv'], + ]; + } + + public 
function testGuessEncodingDefltIso2(): void + { + $filename = 'tests/data/Reader/CSV/premiere.win1252.csv'; + $reader = new Csv(); + $reader->setInputEncoding(Csv::guessEncoding($filename, 'ISO-8859-2')); + $spreadsheet = $reader->load($filename); + $sheet = $spreadsheet->getActiveSheet(); + self::assertEquals('premičre', $sheet->getCell('A1')->getValue()); + self::assertEquals('sixičme', $sheet->getCell('C2')->getValue()); + } } diff --git a/tests/data/Reader/CSV/escape.csv b/tests/data/Reader/CSV/escape.csv new file mode 100644 index 0000000000..a8b0c08435 --- /dev/null +++ b/tests/data/Reader/CSV/escape.csv @@ -0,0 +1,4 @@ +a\"hello;hello;hello;\",b\"hello;hello;hello;\",c\"\hello;hello;hello;\" +a\"hello;hello;hello;\",b\"hello;hello;hello;\",c\"\hello;hello;hello;\",d +a\"hello;hello;hello;\",b\"hello;hello;hello;\",c\"\hello;hello;hello;\" +a\"hello;hello;hello;\",b\"hello;hello;hello;\",c\"\hello;hello;hello;\" diff --git a/tests/data/Reader/CSV/premiere.utf16be.csv b/tests/data/Reader/CSV/premiere.utf16be.csv new file mode 100644 index 0000000000..44c25684bc Binary files /dev/null and b/tests/data/Reader/CSV/premiere.utf16be.csv differ diff --git a/tests/data/Reader/CSV/premiere.utf16bebom.csv b/tests/data/Reader/CSV/premiere.utf16bebom.csv new file mode 100644 index 0000000000..2d63bbe12f Binary files /dev/null and b/tests/data/Reader/CSV/premiere.utf16bebom.csv differ diff --git a/tests/data/Reader/CSV/premiere.utf16le.csv b/tests/data/Reader/CSV/premiere.utf16le.csv new file mode 100644 index 0000000000..a5bb1ff12e Binary files /dev/null and b/tests/data/Reader/CSV/premiere.utf16le.csv differ diff --git a/tests/data/Reader/CSV/premiere.utf16lebom.csv b/tests/data/Reader/CSV/premiere.utf16lebom.csv new file mode 100644 index 0000000000..fe6bb5b6b7 Binary files /dev/null and b/tests/data/Reader/CSV/premiere.utf16lebom.csv differ diff --git a/tests/data/Reader/CSV/premiere.utf32be.csv b/tests/data/Reader/CSV/premiere.utf32be.csv new file mode 100644 index 
0000000000..d6517533f9 Binary files /dev/null and b/tests/data/Reader/CSV/premiere.utf32be.csv differ diff --git a/tests/data/Reader/CSV/premiere.utf32bebom.csv b/tests/data/Reader/CSV/premiere.utf32bebom.csv new file mode 100644 index 0000000000..83326b64e4 Binary files /dev/null and b/tests/data/Reader/CSV/premiere.utf32bebom.csv differ diff --git a/tests/data/Reader/CSV/premiere.utf32le.csv b/tests/data/Reader/CSV/premiere.utf32le.csv new file mode 100644 index 0000000000..64d29f13cf Binary files /dev/null and b/tests/data/Reader/CSV/premiere.utf32le.csv differ diff --git a/tests/data/Reader/CSV/premiere.utf32lebom.csv b/tests/data/Reader/CSV/premiere.utf32lebom.csv new file mode 100644 index 0000000000..25617c6e9b Binary files /dev/null and b/tests/data/Reader/CSV/premiere.utf32lebom.csv differ diff --git a/tests/data/Reader/CSV/premiere.utf8.csv b/tests/data/Reader/CSV/premiere.utf8.csv new file mode 100644 index 0000000000..c668120175 --- /dev/null +++ b/tests/data/Reader/CSV/premiere.utf8.csv @@ -0,0 +1,2 @@ +première,second,troisième +Quatrième,cinquième,sixième diff --git a/tests/data/Reader/CSV/premiere.utf8bom.csv b/tests/data/Reader/CSV/premiere.utf8bom.csv new file mode 100644 index 0000000000..4068e6c38e --- /dev/null +++ b/tests/data/Reader/CSV/premiere.utf8bom.csv @@ -0,0 +1,2 @@ +première,second,troisième +Quatrième,cinquième,sixième diff --git a/tests/data/Reader/CSV/premiere.win1252.csv b/tests/data/Reader/CSV/premiere.win1252.csv new file mode 100644 index 0000000000..908cb88fe2 --- /dev/null +++ b/tests/data/Reader/CSV/premiere.win1252.csv @@ -0,0 +1,2 @@ +premi�re,second,troisi�me +Quatri�me,cinqui�me,sixi�me