Fix filters with UTF-8 characters

This commit is contained in:
Sébastien Kergreis
2016-05-31 15:44:09 -05:00
parent 4987e245bb
commit 6af3b6dfdb
2 changed files with 32 additions and 7 deletions

View File

@@ -25,13 +25,13 @@ class Lexer
* @var array * @var array
*/ */
private $tokenMap = array( private $tokenMap = array(
"/^(\s+)/" => 'T_WHITESPACE', '/^(\s+)/' => 'T_WHITESPACE',
'/^([<=>]{0,2}[0-9]{4}-[0-9]{2}-[0-9]{2})/' => 'T_STRING', '/^([<=>]{0,2}[0-9]{4}-[0-9]{2}-[0-9]{2})/' => 'T_STRING',
'/^([<=>]{1,2}\w+)/' => 'T_STRING', '/^([<=>]{1,2}\w+)/u' => 'T_STRING',
'/^([<=>]{1,2}".+")/' => 'T_STRING', '/^([<=>]{1,2}".+")/' => 'T_STRING',
'/^("(.+)")/' => 'T_STRING', '/^("(.+)")/' => 'T_STRING',
"/^(\w+)/" => 'T_STRING', '/^(\w+)/u' => 'T_STRING',
"/^(#\d+)/" => 'T_STRING', '/^(#\d+)/' => 'T_STRING',
); );
/** /**
@@ -80,9 +80,10 @@ class Lexer
{ {
$tokens = array(); $tokens = array();
$this->offset = 0; $this->offset = 0;
$input_length = mb_strlen($input, 'UTF-8');
while (isset($input[$this->offset])) { while ($this->offset < $input_length) {
$result = $this->match(substr($input, $this->offset)); $result = $this->match(mb_substr($input, $this->offset, $input_length, 'UTF-8'));
if ($result === false) { if ($result === false) {
return array(); return array();
@@ -105,7 +106,7 @@ class Lexer
{ {
foreach ($this->tokenMap as $pattern => $name) { foreach ($this->tokenMap as $pattern => $name) {
if (preg_match($pattern, $string, $matches)) { if (preg_match($pattern, $string, $matches)) {
$this->offset += strlen($matches[1]); $this->offset += mb_strlen($matches[1], 'UTF-8');
return array( return array(
'match' => str_replace('"', '', $matches[1]), 'match' => str_replace('"', '', $matches[1]),

View File

@@ -178,4 +178,28 @@ class LexerTest extends Base
$this->assertSame($expected, $lexer->tokenize('date:<=2016-01-01 something else')); $this->assertSame($expected, $lexer->tokenize('date:<=2016-01-01 something else'));
} }
/**
 * Tokenizing a query made of accented UTF-8 letters must return the whole
 * input, unchanged, as a single entry under the default token.
 */
public function testTokenizeWithUtf8Letters()
{
    $query = 'àa éçùe';

    $lexer = new Lexer();
    $lexer->setDefaultToken('myDefaultToken');

    $tokens = $lexer->tokenize($query);

    // The full query (including the inner space) is expected back verbatim.
    $this->assertSame(array('myDefaultToken' => array($query)), $tokens);
}
/**
 * Tokenizing a query made of non-ASCII numeral and letter characters must
 * return the whole input, unchanged, as a single entry under the default token.
 */
public function testTokenizeWithUtf8Numbers()
{
    $query = '६Δↈ五一';

    $lexer = new Lexer();
    $lexer->setDefaultToken('myDefaultToken');

    $tokens = $lexer->tokenize($query);

    // Every multi-byte character must survive tokenization intact.
    $this->assertSame(array('myDefaultToken' => array($query)), $tokens);
}
} }