<?php

/*
 * This file is part of the Symfony package.
 *
 * (c) Fabien Potencier <fabien@symfony.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace Symfony\Component\CssSelector;

use Symfony\Component\CssSelector\Exception\ParseException;

/**
 * Tokenizer lexes a CSS Selector to tokens.
 *
 * This component is a port of the Python lxml library,
 * which is copyright Infrae and distributed under the BSD license.
 *
 * @author Fabien Potencier <fabien@symfony.com>
 */
class Tokenizer
{
    /**
     * Takes a CSS selector and returns an array holding the Tokens
     * it contains.
     *
     * CSS comments are stripped before lexing, so token positions are byte
     * offsets into the comment-free selector.
     *
     * @param string $s The selector to lex.
     *
     * @return Token[]
     *
     * @throws ParseException When a quoted string is not closed or an unexpected character is found
     */
    public function tokenize($s)
    {
        // When mbstring overloads the string functions (bit 2 of
        // mbstring.func_overload), strlen()/substr()/strpos() stop being
        // byte-based. The lexer works on byte offsets, so force a single-byte
        // encoding for the duration of the call.
        $mbEncoding = null;
        if (function_exists('mb_internal_encoding') && ((int) ini_get('mbstring.func_overload')) & 2) {
            $mbEncoding = mb_internal_encoding();
            mb_internal_encoding('ASCII');
        }

        try {
            $tokens = $this->lex($s);
        } catch (\Exception $e) {
            // Restore the caller's encoding on failure too; catch-and-rethrow
            // is used instead of "finally" to stay compatible with old PHP.
            if (null !== $mbEncoding) {
                mb_internal_encoding($mbEncoding);
            }

            throw $e;
        }

        if (null !== $mbEncoding) {
            mb_internal_encoding($mbEncoding);
        }

        return $tokens;
    }

    /**
     * Runs the actual lexing loop over the selector.
     *
     * @param string $s The selector to lex.
     *
     * @return Token[]
     *
     * @throws ParseException When a quoted string is not closed or an unexpected character is found
     */
    private function lex($s)
    {
        $twoCharTokens = array('~=', '|=', '^=', '$=', '*=', '::', '!=');
        $oneCharTokens = array('>', '+', '~', ',', '.', '*', '=', '[', ']', '(', ')', '|', ':', '#');

        $tokens = array();
        $pos = 0;
        $s = preg_replace('#/\*.*?\*/#s', '', $s);
        $length = strlen($s);

        while (true) {
            // Skip whitespace, remembering where it started: it may have to be
            // reported as a descendant combinator below. Position 0 doubles as
            // "no whitespace", which is fine because leading whitespace never
            // acts as a combinator.
            if (preg_match('#\s+#A', $s, $match, 0, $pos)) {
                $precedingWhitespacePos = $pos;
                $pos += strlen($match[0]);
            } else {
                $precedingWhitespacePos = 0;
            }

            if ($pos >= $length) {
                return $tokens;
            }

            // An "an+b" expression (e.g. "2n+1", "-n+3"). A bare "n" is left to
            // the symbol lexer, and the negative lookahead keeps the pattern
            // from eating the head of an ordinary identifier such as "-nav".
            if (preg_match('#[+-]?\d*n(?:[+-]\d+)?(?![\w\-])#A', $s, $match, 0, $pos) && 'n' !== $match[0]) {
                $tokens[] = new Token('Symbol', $match[0], $pos);
                $pos += strlen($match[0]);

                continue;
            }

            $c = $s[$pos];
            $c2 = substr($s, $pos, 2);
            if (in_array($c2, $twoCharTokens, true)) {
                $tokens[] = new Token('Token', $c2, $pos);
                $pos += 2;

                continue;
            }

            if (in_array($c, $oneCharTokens, true)) {
                // Whitespace in front of ".", "#" or "[" is significant: emit
                // it as a " " token located where the whitespace began.
                if ($precedingWhitespacePos > 0 && in_array($c, array('.', '#', '['), true)) {
                    $tokens[] = new Token('Token', ' ', $precedingWhitespacePos);
                }
                $tokens[] = new Token('Token', $c, $pos);
                ++$pos;

                continue;
            }

            $oldPos = $pos;

            if ('"' === $c || "'" === $c) {
                // Quoted string
                list($sym, $pos) = $this->tokenizeEscapedString($s, $pos);
                $tokens[] = new Token('String', $sym, $oldPos);

                continue;
            }

            list($sym, $pos) = $this->tokenizeSymbol($s, $pos);
            $tokens[] = new Token('Symbol', $sym, $oldPos);
        }
    }

    /**
     * Tokenizes a quoted string (i.e. 'A string quoted with \' characters'),
     * and returns an array holding the unquoted string contained by $s and
     * the new position from which tokenizing should take over.
     *
     * @param string  $s   The selector string containing the quoted string.
     * @param integer $pos The position of the opening quote.
     *
     * @return array The unescaped string content and the position just after the closing quote
     *
     * @throws ParseException When expected closing is not found
     */
    private function tokenizeEscapedString($s, $pos)
    {
        $quote = $s[$pos];
        $start = $pos + 1;
        $pos = $start;

        while (true) {
            $next = strpos($s, $quote, $pos);
            if (false === $next) {
                throw new ParseException(sprintf('Expected closing %s for string in: %s', $quote, substr($s, $start)));
            }

            // The quote is escaped only when preceded by an odd number of
            // backslashes; an even run is a sequence of escaped backslashes
            // followed by the real closing quote.
            $backslashes = 0;
            for ($i = $next - 1; $i >= $start && '\\' === $s[$i]; --$i) {
                ++$backslashes;
            }

            if (1 === $backslashes % 2) {
                $pos = $next + 1;

                continue;
            }

            $result = substr($s, $start, $next - $start);
            if (false !== strpos($result, '\\')) {
                $result = $this->unescapeStringLiteral($result);
            }

            return array($result, $next + 1);
        }
    }

    /**
     * Unescapes a string literal and returns the unescaped string.
     *
     * Three kinds of escape are handled:
     *  - a backslash followed by 1 to 6 hexadecimal digits (and one optional
     *    trailing whitespace) becomes that code point, encoded as UTF-8;
     *  - a backslash followed by a newline is removed (line continuation);
     *  - a backslash followed by any other character becomes that character.
     *
     * A lone backslash at the very end of the literal is left untouched.
     *
     * @param string $literal The string literal to unescape.
     *
     * @return string
     */
    private function unescapeStringLiteral($literal)
    {
        return preg_replace_callback(
            '#\\\\(?:([A-Fa-f0-9]{1,6})(?:\r\n|\s)?|(\r\n|[^A-Fa-f0-9]))#',
            function ($matches) {
                // Group 2 is only present when the escape is not hexadecimal.
                if (isset($matches[2])) {
                    if (in_array($matches[2], array("\r\n", "\n", "\r", "\f"), true)) {
                        return '';
                    }

                    return $matches[2];
                }

                $code = hexdec($matches[1]);

                // Zero, surrogates and values beyond the Unicode range are not
                // valid code points: substitute U+FFFD as CSS prescribes.
                if (0 === $code || $code > 0x10FFFF || ($code >= 0xD800 && $code <= 0xDFFF)) {
                    return "\xEF\xBF\xBD";
                }

                if ($code < 0x80) {
                    return chr($code);
                }

                if ($code < 0x800) {
                    return chr(0xC0 | ($code >> 6))
                        .chr(0x80 | ($code & 0x3F));
                }

                if ($code < 0x10000) {
                    return chr(0xE0 | ($code >> 12))
                        .chr(0x80 | (($code >> 6) & 0x3F))
                        .chr(0x80 | ($code & 0x3F));
                }

                return chr(0xF0 | ($code >> 18))
                    .chr(0x80 | (($code >> 12) & 0x3F))
                    .chr(0x80 | (($code >> 6) & 0x3F))
                    .chr(0x80 | ($code & 0x3F));
            },
            $literal
        );
    }

    /**
     * Lexes selector $s and returns an array holding the name of the symbol
     * contained in it and the new position from which tokenizing should take
     * over.
     *
     * A symbol is the longest run of word characters and hyphens starting
     * at $pos.
     *
     * @param string  $s   The selector string.
     * @param integer $pos The position in $s at which the symbol starts.
     *
     * @return array The symbol name and the position just after it
     *
     * @throws ParseException When Unexpected symbol is found
     */
    private function tokenizeSymbol($s, $pos)
    {
        if (!preg_match('#[^\w\-]#', $s, $match, PREG_OFFSET_CAPTURE, $pos)) {
            // Goes to end of s
            return array(substr($s, $pos), strlen($s));
        }

        $end = $match[0][1];

        // The very first character cannot be part of a symbol.
        if ($end === $pos) {
            throw new ParseException(sprintf('Unexpected symbol: %s at %s', $s[$pos], $pos));
        }

        return array(substr($s, $pos, $end - $pos), $end);
    }
}