Add new optional beta email parser that's based on ImapEngine instead of Webklex

This commit is contained in:
johnnyq
2026-02-26 16:11:49 -05:00
parent 1ba19cc249
commit 9cb1ff7330
682 changed files with 101834 additions and 8 deletions

View File

@@ -0,0 +1,25 @@
BSD 2-Clause License
Copyright (c) 2018, Zaahid Bateson
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,67 @@
# zbateson/mb-wrapper
Charset conversion and string manipulation wrapper with a large defined set of aliases.
[![Tests](https://github.com/zbateson/mb-wrapper/actions/workflows/tests.yml/badge.svg)](https://github.com/zbateson/mb-wrapper/actions/workflows/tests.yml)
[![Code Coverage](https://scrutinizer-ci.com/g/zbateson/mb-wrapper/badges/coverage.png?b=master)](https://scrutinizer-ci.com/g/zbateson/mb-wrapper/?branch=master)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/zbateson/mb-wrapper/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/zbateson/mb-wrapper/?branch=master)
[![Total Downloads](https://poser.pugx.org/zbateson/mb-wrapper/downloads)](https://packagist.org/packages/zbateson/mb-wrapper)
[![Latest Stable Version](https://poser.pugx.org/zbateson/mb-wrapper/version)](https://packagist.org/packages/zbateson/mb-wrapper)
The goals of this project are to be:
* Well written
* Tested where possible
* Support as wide a range of charset aliases as possible
To include it for use in your project, please install via composer:
```
composer require zbateson/mb-wrapper
```
## PHP 7 Support Dropped
As of mb-wrapper 2.0, support for PHP 7 has been dropped.
## Requirements
mb-wrapper requires PHP 8.0 or newer. Tested on PHP 8.0, 8.1, 8.2, and 8.3 on GitHub Actions.
## New in 2.0
If converting or performing an operation on a string fails in iconv, an UnsupportedCharsetException is now thrown.
## Description
MbWrapper is intended for use wherever mb_* or iconv_* is used. It scans supported charsets returned by mb_list_encodings(), and prefers mb_* functions, but will fall back to iconv if a charset isn't supported by the mb_* functions.
A list of aliased charsets is maintained for both mb_* and iconv, where a supported charset exists for an alias. This is useful for mail and http parsing as other systems may report encodings not recognized by mb_* or iconv.
Charset lookup is done by removing non-alphanumeric characters as well, so UTF8 will always be matched to UTF-8, etc...
## Usage
The following wrapper methods are exposed:
* mb_convert_encoding, iconv with MbWrapper::convert
* mb_substr, iconv_substr with MbWrapper::getSubstr
* mb_strlen, iconv_strlen with MbWrapper::getLength
* mb_check_encoding, iconv (for verification) with MbWrapper::checkEncoding
```php
$mbWrapper = new \ZBateson\MbWrapper\MbWrapper();
$fromCharset = 'ISO-8859-1';
$toCharset = 'UTF-8';
$mbWrapper->convert('data', $fromCharset, $toCharset);
$mbWrapper->getLength('data', 'UTF-8');
$mbWrapper->getSubstr('data', 'UTF-8', 1, 2);
if ($mbWrapper->checkEncoding('data', 'UTF-8')) {
echo 'Compatible';
}
```
## License
BSD licensed - please see [license agreement](https://github.com/zbateson/mb-wrapper/blob/master/LICENSE).

View File

@@ -0,0 +1,35 @@
{
"name": "zbateson/mb-wrapper",
"description": "Wrapper for mbstring with fallback to iconv for encoding conversion and string manipulation",
"keywords": ["mb_convert_encoding", "charset", "encoding", "string", "mbstring", "iconv", "multibyte", "mb", "mime", "mail", "http"],
"license": "BSD-2-Clause",
"authors": [
{
"name": "Zaahid Bateson"
}
],
"require": {
"php": ">=8.0",
"symfony/polyfill-mbstring": "^1.9",
"symfony/polyfill-iconv": "^1.9"
},
"require-dev": {
"phpunit/phpunit": "^9.6|^10.0",
"friendsofphp/php-cs-fixer": "*",
"phpstan/phpstan": "*"
},
"suggest": {
"ext-mbstring": "For best support/performance",
"ext-iconv": "For best support/performance"
},
"autoload": {
"psr-4": {
"ZBateson\\MbWrapper\\": "src/"
}
},
"autoload-dev": {
"psr-4": {
"ZBateson\\MbWrapper\\": "tests/MbWrapper"
}
}
}

View File

@@ -0,0 +1,489 @@
<?php
/**
* This file is part of the ZBateson\MbWrapper project.
*
* @license http://opensource.org/licenses/bsd-license.php BSD
*/
namespace ZBateson\MbWrapper;
/**
 * Helper class for converting strings between charsets, finding a multibyte
 * string's length, and creating a substring.
 *
 * MbWrapper prefers PHP's mb_* extension first, and reverts to iconv_* if the
 * charsets aren't listed as supported by mb_list_encodings().
 *
 * A list of aliased charsets is maintained to support the greatest number of
 * charsets. In addition, when searching for a charset, separator characters
 * such as dashes are removed, and searches are always performed
 * case-insensitively. This is to support strange reported encodings in emails,
 * etc...
 *
 * @author Zaahid Bateson
 */
class MbWrapper
{
    /**
     * @var array<int|string, string> aliased charsets supported by
     *      mb_convert_encoding, keyed by the NORMALIZED alias (uppercased and
     *      stripped of any non-alphanumeric characters, so CP367 is equal to
     *      CP-367). Keys must therefore be uppercase alphanumerics only, or
     *      they can never be matched by a lookup. Purely numeric keys are
     *      stored by PHP as integer keys.
     *      Some of these translations may already be supported by
     *      mb_convert_encoding directly, but may not be supported in other
     *      implementations or versions since they're not part of documented
     *      support.
     */
    public static $mbAliases = [
        // supported but not included in mb_list_encodings for some reason...
        'CP850' => 'CP850',
        'GB2312' => 'GB18030',
        'SJIS2004' => 'SJIS-2004',
        // aliases
        'ANSIX341968' => 'ASCII',
        'ANSIX341986' => 'ASCII',
        'ARABIC' => 'ISO-8859-6',
        'ASMO708' => 'ISO-8859-6',
        'BIG5' => 'BIG-5',
        'BIG5TW' => 'BIG-5',
        'CESU8' => 'UTF-8',
        'CHINESE' => 'GB18030',
        'CP367' => 'ASCII',
        'CP819' => 'ISO-8859-1',
        'CP1251' => 'WINDOWS-1251',
        'CP1252' => 'WINDOWS-1252',
        'CP1254' => 'WINDOWS-1254',
        'CP1255' => 'ISO-8859-8',
        'CSASCII' => 'ASCII',
        'CSBIG5' => 'BIG-5',
        'CSIBM866' => 'CP866',
        'CSISO2022JP' => 'ISO-2022-JP',
        'CSISO2022KR' => 'ISO-2022-KR',
        'CSISO58GB231280' => 'GB18030',
        'CSISOLATIN1' => 'ISO-8859-1',
        'CSISOLATIN2' => 'ISO-8859-2',
        'CSISOLATIN3' => 'ISO-8859-3',
        'CSISOLATIN4' => 'ISO-8859-4',
        'CSISOLATIN5' => 'ISO-8859-9',
        'CSISOLATIN6' => 'ISO-8859-10',
        'CSISOLATINARABIC' => 'ISO-8859-6',
        'CSISOLATINCYRILLIC' => 'ISO-8859-5',
        'CSISOLATINGREEK' => 'ISO-8859-7',
        'CSISOLATINHEBREW' => 'ISO-8859-8',
        'CSKOI8R' => 'KOI8-R',
        'CSPC850MULTILINGUAL' => 'CP850',
        'CSSHIFTJIS' => 'SJIS',
        'CYRILLIC' => 'ISO-8859-5',
        'ECMA114' => 'ISO-8859-6',
        'ECMA118' => 'ISO-8859-7',
        'ELOT928' => 'ISO-8859-7',
        'EUCCN' => 'GB18030',
        'EUCGB2312CN' => 'GB18030',
        'GB180302000' => 'GB18030',
        'GB23121980' => 'GB18030',
        'GB231280' => 'GB18030',
        'GBK' => 'CP936',
        'GREEK8' => 'ISO-8859-7',
        'GREEK' => 'ISO-8859-7',
        'HEBREW' => 'ISO-8859-8',
        'HZGB2312' => 'HZ',
        'HZGB' => 'HZ',
        'IBM367' => 'ASCII',
        'IBM819' => 'ISO-8859-1',
        'IBM850' => 'CP850',
        'IBM866' => 'CP866',
        'ISO2022JP2004' => 'ISO-2022-JP-2004',
        'ISO646IRV1991' => 'ASCII',
        'ISO646US' => 'ASCII',
        'ISO8859' => 'ISO-8859-1',
        'ISO8859101992' => 'ISO-8859-10',
        'ISO885911987' => 'ISO-8859-1',
        'ISO8859141998' => 'ISO-8859-14',
        'ISO8859162001' => 'ISO-8859-16',
        'ISO885921987' => 'ISO-8859-2',
        'ISO885931988' => 'ISO-8859-3',
        'ISO885941988' => 'ISO-8859-4',
        'ISO885951988' => 'ISO-8859-5',
        'ISO885961987' => 'ISO-8859-6',
        'ISO885971987' => 'ISO-8859-7',
        'ISO885981988' => 'ISO-8859-8',
        'ISO88598I' => 'ISO-8859-8',
        'ISO885991989' => 'ISO-8859-9',
        'ISOCELTIC' => 'ISO-8859-14',
        'ISOIR100' => 'ISO-8859-1',
        'ISOIR101' => 'ISO-8859-2',
        'ISOIR109' => 'ISO-8859-3',
        'ISOIR110' => 'ISO-8859-4',
        'ISOIR126' => 'ISO-8859-7',
        'ISOIR127' => 'ISO-8859-6',
        'ISOIR138' => 'ISO-8859-8',
        'ISOIR144' => 'ISO-8859-5',
        'ISOIR148' => 'ISO-8859-9',
        'ISOIR157' => 'ISO-8859-10',
        'ISOIR199' => 'ISO-8859-14',
        'ISOIR226' => 'ISO-8859-16',
        'ISOIR58' => 'GB18030',
        'ISOIR6' => 'ASCII',
        'KOI8R' => 'KOI8-R',
        'KOREAN' => 'EUC-KR',
        'KSC56011987' => 'EUC-KR',
        'KSC5601' => 'EUC-KR',
        'KSX1001' => 'EUC-KR',
        'L1' => 'ISO-8859-1',
        'L2' => 'ISO-8859-2',
        'L3' => 'ISO-8859-3',
        'L4' => 'ISO-8859-4',
        'L5' => 'ISO-8859-9',
        'L6' => 'ISO-8859-10',
        'L8' => 'ISO-8859-14',
        'L10' => 'ISO-8859-16',
        'LATIN' => 'ISO-8859-1',
        'LATIN1' => 'ISO-8859-1',
        'LATIN2' => 'ISO-8859-2',
        'LATIN3' => 'ISO-8859-3',
        'LATIN4' => 'ISO-8859-4',
        'LATIN5' => 'ISO-8859-9',
        'LATIN6' => 'ISO-8859-10',
        'LATIN8' => 'ISO-8859-14',
        'LATIN10' => 'ISO-8859-16',
        'MS932' => 'CP932',
        // key must be uppercase: lookups use the uppercased, normalized name
        'MS936' => 'CP936',
        'MS950' => 'CP950',
        'MSKANJI' => 'CP932',
        'SHIFTJIS2004' => 'SJIS',
        'SHIFTJIS' => 'SJIS',
        'UJIS' => 'EUC-JP',
        'UNICODE11UTF7' => 'UTF-7',
        'US' => 'ASCII',
        'USASCII' => 'ASCII',
        'WE8MSWIN1252' => 'WINDOWS-1252',
        'WINDOWS1251' => 'WINDOWS-1251',
        'WINDOWS1252' => 'WINDOWS-1252',
        'WINDOWS1254' => 'WINDOWS-1254',
        'WINDOWS1255' => 'ISO-8859-8',
        '0' => 'WINDOWS-1252',
        '128' => 'SJIS',
        '129' => 'EUC-KR',
        '134' => 'GB18030',
        '136' => 'BIG-5',
        '161' => 'WINDOWS-1253',
        '162' => 'WINDOWS-1254',
        '177' => 'WINDOWS-1255',
        '178' => 'WINDOWS-1256',
        '186' => 'WINDOWS-1257',
        '204' => 'WINDOWS-1251',
        '222' => 'WINDOWS-874',
        '238' => 'WINDOWS-1250',
        '646' => 'ASCII',
        '850' => 'CP850',
        '866' => 'CP866',
        '932' => 'CP932',
        '936' => 'CP936',
        '950' => 'CP950',
        '1251' => 'WINDOWS-1251',
        '1252' => 'WINDOWS-1252',
        '1254' => 'WINDOWS-1254',
        '1255' => 'ISO-8859-8',
        '8859' => 'ISO-8859-1',
    ];

    /**
     * @var array<int|string, string> aliased charsets supported by iconv,
     *      keyed by the normalized alias (uppercased, alphanumerics only).
     */
    public static $iconvAliases = [
        // iconv aliases -- a lot of these may already be supported
        'CESU8' => 'UTF8',
        'CP154' => 'PT154',
        'CPGR' => 'CP869',
        'CPIS' => 'CP861',
        'CSHPROMAN8' => 'ROMAN8',
        'CSIBM037' => 'CP037',
        'CSIBM1026' => 'CP1026',
        'CSIBM424' => 'CP424',
        'CSIBM500' => 'CP500',
        'CSIBM860' => 'CP860',
        'CSIBM861' => 'CP861',
        'CSIBM863' => 'CP863',
        'CSIBM864' => 'CP864',
        'CSIBM865' => 'CP865',
        'CSIBM869' => 'CP869',
        'CSPC775BALTIC' => 'CP775',
        'CSPC862LATINHEBREW' => 'CP862',
        'CSPC8CODEPAGE437' => 'CP437',
        'CSPTCP154' => 'PT154',
        'CYRILLICASIAN' => 'PT154',
        'EBCDICCPBE' => 'CP500',
        'EBCDICCPCA' => 'CP037',
        'EBCDICCPCH' => 'CP500',
        'EBCDICCPHE' => 'CP424',
        'EBCDICCPNL' => 'CP037',
        'EBCDICCPUS' => 'CP037',
        'EBCDICCPWT' => 'CP037',
        'HKSCS' => 'BIG5HKSCS',
        'HPROMAN8' => 'ROMAN8',
        'IBM037' => 'CP037',
        'IBM039' => 'CP037',
        'IBM424' => 'CP424',
        'IBM437' => 'CP437',
        'IBM500' => 'CP500',
        'IBM775' => 'CP775',
        'IBM860' => 'CP860',
        'IBM861' => 'CP861',
        'IBM862' => 'CP862',
        'IBM863' => 'CP863',
        'IBM864' => 'CP864',
        'IBM865' => 'CP865',
        'IBM869' => 'CP869',
        'IBM1026' => 'CP1026',
        'IBM1140' => 'CP1140',
        'ISO2022JP2' => 'ISO2022JP2',
        'ISO8859112001' => 'ISO885911',
        'ISO885911' => 'ISO885911',
        'ISOIR166' => 'TIS620',
        'JOHAB' => 'CP1361',
        'MACCYRILLIC' => 'MACCYRILLIC',
        'MS1361' => 'CP1361',
        'MS949' => 'CP949',
        'PTCP154' => 'PT154',
        'R8' => 'ROMAN8',
        'ROMAN8' => 'ROMAN8',
        'THAI' => 'ISO885911',
        'TIS6200' => 'TIS620',
        'TIS62025290' => 'TIS620',
        'TIS62025291' => 'TIS620',
        'TIS620' => 'TIS620',
        'UHC' => 'CP949',
        'WINDOWS1250' => 'CP1250',
        'WINDOWS1253' => 'CP1253',
        'WINDOWS1256' => 'CP1256',
        'WINDOWS1257' => 'CP1257',
        'WINDOWS1258' => 'CP1258',
        '037' => 'CP037',
        '424' => 'CP424',
        '437' => 'CP437',
        '500' => 'CP500',
        '775' => 'CP775',
        '860' => 'CP860',
        '861' => 'CP861',
        '862' => 'CP862',
        '863' => 'CP863',
        '864' => 'CP864',
        '865' => 'CP865',
        '869' => 'CP869',
        '949' => 'CP949',
        '1026' => 'CP1026',
        '1140' => 'CP1140',
        '1250' => 'CP1250',
        '1253' => 'CP1253',
        '1256' => 'CP1256',
        '1257' => 'CP1257',
        '1258' => 'CP1258',
    ];

    /**
     * @var string[] normalized charset name => mb_* charset name.
     *      NOTE(review): no method in this class reads or writes this
     *      property; it is kept only because it is protected and may be
     *      relied upon by subclasses.
     */
    protected $mappedMbCharsets = [
        'UTF8' => 'UTF-8',
        'USASCII' => 'US-ASCII',
        'ISO88591' => 'ISO-8859-1',
    ];

    /**
     * @var string[]|null An array of encodings supported by the mb_*
     *      extension, as returned by mb_list_encodings(), with the key set to
     *      the charset's name after normalization by getNormalizedCharset().
     *      Null until the first MbWrapper instance is constructed.
     */
    private static ?array $mbListedEncodings = null;

    /**
     * Initializes the static mb_* encoding array on first construction.
     */
    public function __construct()
    {
        if (self::$mbListedEncodings === null) {
            $listed = \mb_list_encodings();
            $normalized = $this->getNormalizedCharset($listed);
            self::$mbListedEncodings = \array_combine($normalized, $listed);
        }
    }

    /**
     * The passed charset is uppercased, and stripped of non-alphanumeric
     * characters before being returned.
     *
     * When an array is passed, every element is normalized and an array is
     * returned.
     *
     * @param string|string[] $charset
     * @return string|string[]
     */
    private function getNormalizedCharset(string|array $charset) : string|array
    {
        $upper = \is_array($charset)
            ? \array_map('strtoupper', $charset)
            : \strtoupper($charset);
        // preg_replace accepts both a string and an array subject
        return \preg_replace('/[^A-Z0-9]+/', '', $upper);
    }

    /**
     * Converts $str with iconv, appending '//TRANSLIT//IGNORE' to the target
     * charset.
     *
     * @throws UnsupportedCharsetException if iconv returns false
     */
    private function iconv(string $fromCharset, string $toCharset, string $str) : string
    {
        $ret = @\iconv($fromCharset, $toCharset . '//TRANSLIT//IGNORE', $str);
        if ($ret === false) {
            throw new UnsupportedCharsetException("Unable to convert from charsets: $fromCharset to $toCharset");
        }
        return $ret;
    }

    /**
     * Returns the character count of $str in $charset using iconv_strlen.
     *
     * @throws UnsupportedCharsetException if iconv_strlen returns false
     */
    private function iconvStrlen(string $str, string $charset) : int
    {
        $ret = @\iconv_strlen($str, $charset . '//TRANSLIT//IGNORE');
        if ($ret === false) {
            throw new UnsupportedCharsetException("Charset $charset is not supported");
        }
        return $ret;
    }

    /**
     * Returns a substring of $str in $charset using iconv_substr.
     *
     * If iconv_substr fails and $start lies beyond the end of the string, an
     * empty string is returned instead of an exception being thrown.
     *
     * @throws UnsupportedCharsetException if iconv_substr (or the follow-up
     *         iconv_strlen check) fails for any other reason
     */
    private function iconvSubstr(string $str, string $charset, int $start, ?int $length = null) : string
    {
        $ret = @\iconv_substr($str, $start, $length, $charset . '//TRANSLIT//IGNORE');
        if ($ret === false) {
            $strLength = $this->iconvStrlen($str, $charset);
            if ($start > $strLength) {
                // returns empty to keep in line with mb_substr functionality
                return '';
            }
            throw new UnsupportedCharsetException("Charset $charset is not supported");
        }
        return $ret;
    }

    /**
     * Converts the passed string's charset from the passed $fromCharset to the
     * passed $toCharset
     *
     * The function attempts to use mb_convert_encoding if possible, and falls
     * back to iconv if not. An empty string is returned as-is without any
     * charset lookup being acted upon.
     *
     * @throws UnsupportedCharsetException if an iconv conversion fails, e.g.
     *         because the source or destination charset isn't supported
     */
    public function convert(string $str, string $fromCharset, string $toCharset) : string
    {
        if ($str === '') {
            return $str;
        }

        // there may be some mb-supported encodings not supported by iconv (on my libiconv for instance
        // HZ isn't supported), and so it may happen that failing an mb_convert_encoding, an iconv
        // may also fail even though both support an encoding separately.
        // For cases like that, a two-way encoding is done with UTF-8 as an intermediary.
        $from = $this->getMbCharset($fromCharset);
        $to = $this->getMbCharset($toCharset);

        if ($from !== false && $to !== false) {
            return \mb_convert_encoding($str, $to, $from);
        }
        if ($from !== false) {
            // only the source is known to mb_*: mb_* to UTF-8, then iconv to the target
            $utf8 = \mb_convert_encoding($str, 'UTF-8', $from);
            return $this->iconv('UTF-8', $this->getIconvAlias($toCharset), $utf8);
        }
        if ($to !== false) {
            // only the target is known to mb_*: iconv to UTF-8, then mb_* to the target
            $utf8 = $this->iconv($this->getIconvAlias($fromCharset), 'UTF-8', $str);
            return \mb_convert_encoding($utf8, $to, 'UTF-8');
        }
        return $this->iconv(
            $this->getIconvAlias($fromCharset),
            $this->getIconvAlias($toCharset),
            $str
        );
    }

    /**
     * Returns true if the passed string is valid in the $charset encoding.
     *
     * Either uses mb_check_encoding, or iconv if it's not a supported mb
     * encoding. In the iconv case the string is converted from the charset to
     * itself, and the check passes if that conversion doesn't return false.
     */
    public function checkEncoding(string $str, string $charset) : bool
    {
        $mb = $this->getMbCharset($charset);
        if ($mb !== false) {
            return \mb_check_encoding($str, $mb);
        }
        $ic = $this->getIconvAlias($charset);
        return (@\iconv($ic, $ic . '//TRANSLIT//IGNORE', $str) !== false);
    }

    /**
     * Uses either mb_strlen or iconv_strlen to return the number of characters
     * in the passed $str for the given $charset
     *
     * @throws UnsupportedCharsetException if iconv fails
     */
    public function getLength(string $str, string $charset) : int
    {
        $mb = $this->getMbCharset($charset);
        if ($mb !== false) {
            return \mb_strlen($str, $mb);
        }
        return $this->iconvStrlen($str, $this->getIconvAlias($charset));
    }

    /**
     * Uses either mb_substr or iconv_substr to create and return a substring of
     * the passed $str.
     *
     * If the offset provided in $start is greater than the length of the
     * string, an empty string is returned.
     *
     * @throws UnsupportedCharsetException if iconv fails
     */
    public function getSubstr(string $str, string $charset, int $start, ?int $length = null) : string
    {
        $mb = $this->getMbCharset($charset);
        if ($mb !== false) {
            return \mb_substr($str, $start, $length, $mb);
        }
        $ic = $this->getIconvAlias($charset);
        if ($ic === 'CP1258') {
            // iconv_substr fails with CP1258 for some reason, and returns only
            // a subset of characters (e.g. the first 5, instead of $length),
            // so the substring is taken in UTF-8 and converted back.
            $utf8 = $this->convert($str, $ic, 'UTF-8');
            return $this->convert($this->getSubstr($utf8, 'UTF-8', $start, $length), 'UTF-8', $ic);
        }
        return $this->iconvSubstr($str, $ic, $start, $length);
    }

    /**
     * Looks up a charset in the encodings listed by mb_list_encodings first,
     * and in self::$mbAliases second, using the normalized charset name.
     *
     * If the encoding is not found in either, the method will return false.
     *
     * On success, the method will return the charset name as accepted by mb_*.
     *
     * @return string|false
     */
    private function getMbCharset(string $cs) : string|false
    {
        $normalized = $this->getNormalizedCharset($cs);
        if (\array_key_exists($normalized, self::$mbListedEncodings)) {
            return self::$mbListedEncodings[$normalized];
        }
        if (\array_key_exists($normalized, self::$mbAliases)) {
            return self::$mbAliases[$normalized];
        }
        return false;
    }

    /**
     * Looks up the passed charset in self::$iconvAliases, returning the mapped
     * charset if applicable. Otherwise returns charset.
     *
     * @return string the mapped charset (if mapped) or $cs otherwise
     */
    private function getIconvAlias(string $cs) : string
    {
        $normalized = $this->getNormalizedCharset($cs);
        // check and read through the same (self::) binding so the key that was
        // tested is guaranteed to be the key that is read
        if (\array_key_exists($normalized, self::$iconvAliases)) {
            return self::$iconvAliases[$normalized];
        }
        return $cs;
    }
}

View File

@@ -0,0 +1,19 @@
<?php
/**
* This file is part of the ZBateson\MbWrapper project.
*
* @license http://opensource.org/licenses/bsd-license.php BSD
*/
namespace ZBateson\MbWrapper;
use RuntimeException;
/**
* Exception thrown if MbWrapper can't convert from or to a specified charset.
*
* Carries no state of its own beyond what RuntimeException provides.
*
* @author Zaahid Bateson
*/
class UnsupportedCharsetException extends RuntimeException
{
}