Fix filters with UTF-8 characters

2016-05-31 15:44:09 -05:00
parent 4987e245bb
commit 6af3b6dfdb
2 changed files with 32 additions and 7 deletions
--- a/app/Core/Filter/Lexer.php
+++ b/app/Core/Filter/Lexer.php
@@ -25,13 +25,13 @@ class Lexer
     * @var array
     */
    private $tokenMap = array(
-        "/^(\s+)/"                                       => 'T_WHITESPACE',
+        '/^(\s+)/'                                       => 'T_WHITESPACE',
        '/^([<=>]{0,2}[0-9]{4}-[0-9]{2}-[0-9]{2})/'      => 'T_STRING',
-        '/^([<=>]{1,2}\w+)/'                             => 'T_STRING',
+        '/^([<=>]{1,2}\w+)/u'                            => 'T_STRING',
        '/^([<=>]{1,2}".+")/'                            => 'T_STRING',
        '/^("(.+)")/'                                    => 'T_STRING',
-        "/^(\w+)/"                                       => 'T_STRING',
-        "/^(#\d+)/"                                      => 'T_STRING',
+        '/^(\w+)/u'                                      => 'T_STRING',
+        '/^(#\d+)/'                                      => 'T_STRING',
    );

    /**
@@ -80,9 +80,10 @@ class Lexer
    {
        $tokens = array();
        $this->offset = 0;
+        $input_length = mb_strlen($input, 'UTF-8');

-        while (isset($input[$this->offset])) {
-            $result = $this->match(substr($input, $this->offset));
+        while ($this->offset < $input_length) {
+            $result = $this->match(mb_substr($input, $this->offset, $input_length, 'UTF-8'));

            if ($result === false) {
                return array();
@@ -105,7 +106,7 @@ class Lexer
    {
        foreach ($this->tokenMap as $pattern => $name) {
            if (preg_match($pattern, $string, $matches)) {
-                $this->offset += strlen($matches[1]);
+                $this->offset += mb_strlen($matches[1], 'UTF-8');

                return array(
                    'match' => str_replace('"', '', $matches[1]),
--- a/tests/units/Core/Filter/LexerTest.php
+++ b/tests/units/Core/Filter/LexerTest.php
@@ -178,4 +178,28 @@ class LexerTest extends Base

        $this->assertSame($expected, $lexer->tokenize('date:<=2016-01-01 something else'));
    }
+
+    public function testTokenizeWithUtf8Letters()
+    {
+        // Accented Latin letters and the space between them must come back as one default-token value.
+        $input = 'àa éçùe';
+        $lexer = new Lexer();
+        $lexer->setDefaultToken('myDefaultToken');
+
+        $tokens = $lexer->tokenize($input);
+
+        $this->assertSame(array('myDefaultToken' => array($input)), $tokens);
+    }
+
+    public function testTokenizeWithUtf8Numbers()
+    {
+        // Non-ASCII digits, letters and numerals from several scripts must come back as one default-token value.
+        $input = '६Δↈ五一';
+        $lexer = new Lexer();
+        $lexer->setDefaultToken('myDefaultToken');
+
+        $tokens = $lexer->tokenize($input);
+
+        $this->assertSame(array('myDefaultToken' => array($input)), $tokens);
+    }
 }