Crawler.php 18KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Symfony\Component\CssSelector\CssSelector;
  12. /**
  13. * Crawler eases navigation of a list of \DOMNode objects.
  14. *
  15. * @author Fabien Potencier <fabien@symfony.com>
  16. *
  17. * @api
  18. */
  19. class Crawler extends \SplObjectStorage
  20. {
  21. /**
  22. * @var string The current URI or the base href value
  23. */
  24. private $uri;
  /**
   * Constructor.
   *
   * Remembers the URI, then hands the initial node over to add().
   *
   * @param mixed  $node A Node to use as the base for the crawling
   * @param string $uri  The current URI or the base href value
   *
   * @api
   */
  public function __construct($node = null, $uri = null)
  {
      // Assign the URI before adding: parsing HTML content may replace it
      // with the document's <base href> value.
      $this->uri = $uri;

      $this->add($node);
  }
  /**
   * Empties the crawler by detaching every node it currently holds.
   *
   * @api
   */
  public function clear()
  {
      // Passing the storage to itself removes each contained object.
      $this->removeAll($this);
  }
  /**
   * Adds a node to the current list of nodes.
   *
   * Dispatches to the specialized add*() method matching the argument:
   * node lists, arrays, strings (parsed as content) and objects are
   * handled; any other value (including null) is silently ignored.
   *
   * @param null|\DOMNodeList|array|\DOMNode $node A node
   *
   * @api
   */
  public function add($node)
  {
      if ($node instanceof \DOMNodeList) {
          $this->addNodeList($node);

          return;
      }

      if (is_array($node)) {
          $this->addNodes($node);

          return;
      }

      if (is_string($node)) {
          $this->addContent($node);

          return;
      }

      if (is_object($node)) {
          $this->addNode($node);
      }
  }
  /**
   * Adds HTML/XML content.
   *
   * The content type decides which parser is used: a type containing "xml"
   * (but not "html") is parsed as XML, a type containing "html" is parsed
   * as HTML, and anything else is ignored. When no type is given,
   * "text/html" is assumed. The charset defaults to ISO-8859-1 unless the
   * type carries a "charset=" parameter.
   *
   * @param string      $content A string to parse as HTML/XML
   * @param null|string $type    The content type of the string
   *
   * @return null|void
   */
  public function addContent($content, $type = null)
  {
      if (empty($type)) {
          $type = 'text/html';
      }

      // DOM only for HTML/XML content
      if (!preg_match('/(x|ht)ml/i', $type, $matches)) {
          return null;
      }

      $charset = 'ISO-8859-1';
      // Parameter names are case-insensitive, so "Charset=" must match too.
      if (false !== $pos = stripos($type, 'charset=')) {
          $charset = substr($type, $pos + 8);
          if (false !== $pos = strpos($charset, ';')) {
              $charset = substr($charset, 0, $pos);
          }

          // Strip optional quoting/whitespace (e.g. charset="utf-8").
          $charset = trim($charset, " \t\"'");
          if ('' === $charset) {
              $charset = 'ISO-8859-1';
          }
      }

      // The pattern above is case-insensitive, so normalize the captured
      // prefix before comparing; otherwise "text/XML" is parsed as HTML.
      if ('x' === strtolower($matches[1])) {
          $this->addXmlContent($content, $charset);
      } else {
          $this->addHtmlContent($content, $charset);
      }
  }
  /**
   * Adds an HTML content to the list of nodes.
   *
   * The markup is parsed leniently: libxml parse errors are collected
   * internally instead of being raised as warnings. Empty (or
   * whitespace-only) content adds no node. If the parsed nodes contain a
   * <base> element with a non-empty href, that value replaces the
   * crawler's URI.
   *
   * @param string $content The HTML content
   * @param string $charset The charset
   *
   * @api
   */
  public function addHtmlContent($content, $charset = 'UTF-8')
  {
      $dom = new \DOMDocument('1.0', $charset);
      $dom->validateOnParse = true;

      // loadHTML() rejects an empty string, so skip parsing in that case.
      if ('' !== trim((string) $content)) {
          $previous = libxml_use_internal_errors(true);
          $dom->loadHTML($content);
          // Restore the caller's libxml error handling mode.
          libxml_use_internal_errors($previous);
      }

      $this->addDocument($dom);

      // Use XPath directly so that adding HTML does not require the optional
      // CssSelector component.
      $hrefs = $this->filterXPath('descendant-or-self::base')->extract(array('href'));
      foreach ($hrefs as $href) {
          // A <base> without href (e.g. only a target) must not wipe the URI.
          if ('' !== $href) {
              $this->uri = $href;
              break;
          }
      }
  }
  /**
   * Adds an XML content to the list of nodes.
   *
   * The markup is parsed leniently: libxml parse errors are collected
   * internally instead of being raised as warnings, and network access
   * during parsing is disabled. Empty (or whitespace-only) content adds
   * no node.
   *
   * Note: every occurrence of the text "xmlns" in the content is rewritten
   * to "ns" before parsing, which also affects occurrences outside of
   * namespace declarations.
   *
   * @param string $content The XML content
   * @param string $charset The charset
   *
   * @api
   */
  public function addXmlContent($content, $charset = 'UTF-8')
  {
      $dom = new \DOMDocument('1.0', $charset);
      $dom->validateOnParse = true;

      // loadXML() rejects an empty string, so skip parsing in that case.
      if ('' !== trim((string) $content)) {
          $previous = libxml_use_internal_errors(true);
          // remove the default namespace to make XPath expressions simpler;
          // LIBXML_NONET keeps the parser from fetching remote resources
          // referenced by untrusted documents.
          $dom->loadXML(str_replace('xmlns', 'ns', $content), LIBXML_NONET);
          // Restore the caller's libxml error handling mode.
          libxml_use_internal_errors($previous);
      }

      $this->addDocument($dom);
  }
  /**
   * Adds a \DOMDocument to the list of nodes.
   *
   * Only the document's root element is added; a document without a root
   * element contributes nothing.
   *
   * @param \DOMDocument $dom A \DOMDocument instance
   *
   * @api
   */
  public function addDocument(\DOMDocument $dom)
  {
      if (!$dom->documentElement) {
          return;
      }

      $this->addNode($dom->documentElement);
  }
  /**
   * Adds a \DOMNodeList to the list of nodes.
   *
   * Each entry of the list is added individually through addNode().
   *
   * @param \DOMNodeList $nodes A \DOMNodeList instance
   *
   * @api
   */
  public function addNodeList(\DOMNodeList $nodes)
  {
      foreach ($nodes as $item) {
          $this->addNode($item);
      }
  }
  /**
   * Adds an array of \DOMNode instances to the list of nodes.
   *
   * Every entry goes through add(), so nested arrays, node lists and
   * content strings are accepted as entries too.
   *
   * @param array $nodes An array of \DOMNode instances
   *
   * @api
   */
  public function addNodes(array $nodes)
  {
      foreach ($nodes as $entry) {
          $this->add($entry);
      }
  }
  /**
   * Adds a \DOMNode instance to the list of nodes.
   *
   * When a \DOMDocument is given, its root element is added instead of the
   * document itself; a document without a root element adds nothing.
   *
   * @param \DOMNode $node A \DOMNode instance
   *
   * @api
   */
  public function addNode(\DOMNode $node)
  {
      if ($node instanceof \DOMDocument) {
          $node = $node->documentElement;

          // An empty document has no root element; attaching null would fail.
          if (null === $node) {
              return;
          }
      }

      $this->attach($node);
  }
  /**
   * Returns a node given its position in the node list.
   *
   * @param integer $position The position
   *
   * @return Crawler A new instance of the Crawler with the selected node, or an empty Crawler if it does not exist.
   *
   * @api
   */
  public function eq($position)
  {
      // Stays null when no node sits at the requested position, which
      // yields an empty crawler below.
      $selected = null;

      foreach ($this as $index => $candidate) {
          if ($index == $position) {
              $selected = $candidate;
              break;
          }
      }

      return new static($selected, $this->uri);
  }
  /**
   * Calls an anonymous function on each node of the list.
   *
   * The anonymous function receives the node and its position as arguments,
   * in that order.
   *
   * Example:
   *
   *     $crawler->filter('h1')->each(function ($node, $i)
   *     {
   *         return $node->nodeValue;
   *     });
   *
   * @param \Closure $closure An anonymous function
   *
   * @return array An array of values returned by the anonymous function
   *
   * @api
   */
  public function each(\Closure $closure)
  {
      $results = array();

      foreach ($this as $index => $current) {
          $results[] = $closure($current, $index);
      }

      return $results;
  }
  /**
   * Reduces the list of nodes by calling an anonymous function.
   *
   * A node is dropped only when the anonymous function returns exactly
   * false; any other return value (including null) keeps it.
   *
   * @param \Closure $closure An anonymous function
   *
   * @return Crawler A Crawler instance with the selected nodes.
   *
   * @api
   */
  public function reduce(\Closure $closure)
  {
      $kept = array();

      foreach ($this as $index => $current) {
          if (false === $closure($current, $index)) {
              continue;
          }

          $kept[] = $current;
      }

      return new static($kept, $this->uri);
  }
  /**
   * Returns the first node of the current selection.
   *
   * @return Crawler A Crawler instance with the first selected node (empty if the selection is empty)
   *
   * @api
   */
  public function first()
  {
      $firstPosition = 0;

      return $this->eq($firstPosition);
  }
  /**
   * Returns the last node of the current selection.
   *
   * @return Crawler A Crawler instance with the last selected node (empty if the selection is empty)
   *
   * @api
   */
  public function last()
  {
      // For an empty selection this is -1, which matches no position.
      $lastPosition = count($this) - 1;

      return $this->eq($lastPosition);
  }
  /**
   * Returns the siblings nodes of the current selection.
   *
   * Only the first node of the selection is considered; the node itself is
   * not part of the result.
   *
   * @return Crawler A Crawler instance with the sibling nodes
   *
   * @throws \InvalidArgumentException When current node is empty
   *
   * @api
   */
  public function siblings()
  {
      if (0 == count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      // Walk forward from the parent's first child to cover both sides.
      $start = $this->getNode(0)->parentNode->firstChild;
      $found = $this->sibling($start);

      return new static($found, $this->uri);
  }
  /**
   * Returns the next siblings nodes of the current selection.
   *
   * Only the first node of the selection is considered.
   *
   * @return Crawler A Crawler instance with the next sibling nodes
   *
   * @throws \InvalidArgumentException When current node is empty
   *
   * @api
   */
  public function nextAll()
  {
      if (0 == count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $following = $this->sibling($this->getNode(0));

      return new static($following, $this->uri);
  }
  /**
   * Returns the previous sibling nodes of the current selection.
   *
   * Only the first node of the selection is considered; the result is
   * ordered from the nearest preceding sibling outwards.
   *
   * @return Crawler A Crawler instance with the previous sibling nodes
   *
   * @throws \InvalidArgumentException When current node is empty
   *
   * @api
   */
  public function previousAll()
  {
      if (0 == count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $preceding = $this->sibling($this->getNode(0), 'previousSibling');

      return new static($preceding, $this->uri);
  }
  /**
   * Returns the parents nodes of the current selection.
   *
   * Only the first node of the selection is considered. Ancestors are
   * listed from the closest one outwards; non-element ancestors and the
   * artificial "_root" wrapper element are skipped.
   *
   * @return Crawler A Crawler instance with the parents nodes of the current selection
   *
   * @throws \InvalidArgumentException When current node is empty
   *
   * @api
   */
  public function parents()
  {
      if (0 == count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $ancestors = array();

      for ($current = $this->getNode(0)->parentNode; $current; $current = $current->parentNode) {
          // nodeType 1 is an element node.
          if (1 !== $current->nodeType || '_root' === $current->nodeName) {
              continue;
          }

          $ancestors[] = $current;
      }

      return new static($ancestors, $this->uri);
  }
  /**
   * Returns the children nodes of the current selection.
   *
   * Only the first node of the selection is considered.
   *
   * @return Crawler A Crawler instance with the children nodes
   *
   * @throws \InvalidArgumentException When current node is empty
   *
   * @api
   */
  public function children()
  {
      if (0 == count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      // Start at the first child and collect it plus its following siblings.
      $firstChild = $this->getNode(0)->firstChild;
      $found = $this->sibling($firstChild);

      return new static($found, $this->uri);
  }
  /**
   * Returns the attribute value of the first node of the list.
   *
   * @param string $attribute The attribute name
   *
   * @return string The attribute value
   *
   * @throws \InvalidArgumentException When current node is empty
   *
   * @api
   */
  public function attr($attribute)
  {
      if (0 == count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);

      return $firstNode->getAttribute($attribute);
  }
  /**
   * Returns the node value of the first node of the list.
   *
   * @return string The node value
   *
   * @throws \InvalidArgumentException When current node is empty
   *
   * @api
   */
  public function text()
  {
      if (0 == count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);

      return $firstNode->nodeValue;
  }
  /**
   * Extracts information from the list of nodes.
   *
   * You can extract attributes or/and the node value (_text). With more
   * than one requested attribute each result entry is an array of values;
   * otherwise each entry is the single value itself.
   *
   * Example:
   *
   *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
   *
   * @param array $attributes An array of attributes
   *
   * @return array An array of extracted values
   *
   * @api
   */
  public function extract($attributes)
  {
      $attributes = (array) $attributes;
      $multiple = count($attributes) > 1;

      $rows = array();
      foreach ($this as $current) {
          $values = array();
          foreach ($attributes as $name) {
              $values[] = '_text' === $name ? $current->nodeValue : $current->getAttribute($name);
          }

          if ($multiple) {
              $rows[] = $values;
          } else {
              $rows[] = $values[0];
          }
      }

      return $rows;
  }
  /**
   * Filters the list of nodes with an XPath expression.
   *
   * The current nodes are deep-copied under a "_root" element of a fresh
   * document and the expression is evaluated against that document, so the
   * resulting nodes belong to the copy rather than to the original DOM.
   *
   * @param string $xpath An XPath expression
   *
   * @return Crawler A new instance of Crawler with the filtered list of nodes
   *
   * @api
   */
  public function filterXPath($xpath)
  {
      $scratch = new \DOMDocument('1.0', 'UTF-8');
      $wrapper = $scratch->createElement('_root');
      $wrapper = $scratch->appendChild($wrapper);

      foreach ($this as $current) {
          $copy = $scratch->importNode($current, true);
          $wrapper->appendChild($copy);
      }

      $evaluator = new \DOMXPath($scratch);
      $matches = $evaluator->query($xpath);

      return new static($matches, $this->uri);
  }
  /**
   * Filters the list of nodes with a CSS selector.
   *
   * The selector is converted to XPath by the CssSelector Symfony
   * Component, which therefore must be installed, and then passed to
   * filterXPath().
   *
   * @param string $selector A CSS selector
   *
   * @return Crawler A new instance of Crawler with the filtered list of nodes
   *
   * @throws \RuntimeException if the CssSelector Component is not available
   *
   * @api
   */
  public function filter($selector)
  {
      if (class_exists('Symfony\\Component\\CssSelector\\CssSelector')) {
          $xpath = CssSelector::toXPath($selector);

          return $this->filterXPath($xpath);
      }

      // @codeCoverageIgnoreStart
      throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector is not installed (you can use filterXPath instead).');
      // @codeCoverageIgnoreEnd
  }
  /**
   * Selects links by name or alt value for clickable images.
   *
   * Matches <a> elements whose normalized text contains the value as a
   * space-delimited run of words, and <a> elements wrapping an <img> whose
   * alt attribute does.
   *
   * @param string $value The link text
   *
   * @return Crawler A new instance of Crawler with the filtered list of nodes
   *
   * @api
   */
  public function selectLink($value)
  {
      // Padding with spaces on both sides makes contains() match whole words.
      $needle = static::xpathLiteral(' '.$value.' ');

      $byText = sprintf("//a[contains(concat(' ', normalize-space(string(.)), ' '), %s)] ", $needle);
      $byImageAlt = sprintf("| //a/img[contains(concat(' ', normalize-space(string(@alt)), ' '), %s)]/ancestor::a", $needle);

      return $this->filterXPath($byText.$byImageAlt);
  }
  /**
   * Selects a button by name or alt value for images.
   *
   * Matches:
   *  - <input type="submit|button"> whose value contains the text,
   *  - <input type="image"> whose alt contains the text,
   *  - <button> elements whose content contains the text,
   *  - any <input> or <button> whose id or name equals the value exactly.
   *
   * @param string $value The button text
   *
   * @return Crawler A new instance of Crawler with the filtered list of nodes
   *
   * @api
   */
  public function selectButton($value)
  {
      // Padding with spaces on both sides makes contains() match whole words.
      $needle = static::xpathLiteral(' '.$value.' ');
      // The id/name comparisons must be escaped as well: interpolating the
      // raw value between double quotes breaks (or alters) the expression
      // as soon as the value itself contains a double quote.
      $exact = static::xpathLiteral($value);

      $xpath = sprintf('//input[((@type="submit" or @type="button") and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $needle).
          sprintf('or (@type="image" and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $needle, $exact, $exact).
          sprintf('| //button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $needle, $exact, $exact);

      return $this->filterXPath($xpath);
  }
  /**
   * Returns a Link object for the first node in the list.
   *
   * The Link is built from that node, the crawler's URI and the method.
   *
   * @param string $method The method for the link (get by default)
   *
   * @return Link A Link instance
   *
   * @throws \InvalidArgumentException If the current node list is empty
   *
   * @api
   */
  public function link($method = 'get')
  {
      if (0 == count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      return new Link($this->getNode(0), $this->uri, $method);
  }
  /**
   * Returns an array of Link objects for the nodes in the list.
   *
   * Every Link is created with the crawler's URI and the 'get' method.
   *
   * @return array An array of Link instances
   *
   * @api
   */
  public function links()
  {
      $collected = array();

      foreach ($this as $current) {
          $link = new Link($current, $this->uri, 'get');
          $collected[] = $link;
      }

      return $collected;
  }
  /**
   * Returns a Form object for the first node in the list.
   *
   * When values are given they are applied to the form through setValues()
   * before it is returned.
   *
   * @param array  $values An array of values for the form fields
   * @param string $method The method for the form
   *
   * @return Form A Form instance
   *
   * @throws \InvalidArgumentException If the current node list is empty
   *
   * @api
   */
  public function form(array $values = null, $method = null)
  {
      if (0 == count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstNode = $this->getNode(0);
      $form = new Form($firstNode, $this->uri, $method);

      if (null === $values) {
          return $form;
      }

      $form->setValues($values);

      return $form;
  }
  /**
   * Converts string for XPath expressions.
   *
   * XPath 1.0 string literals have no escape mechanism, so the quoting
   * style is chosen from the content: single quotes when the string has no
   * apostrophe, double quotes when it has no double quote, and a concat()
   * of single-quoted pieces joined by "'" when it contains both.
   *
   * Examples:
   * <code>
   * echo Crawler::xpathLiteral('foo " bar');
   * //prints 'foo " bar'
   *
   * echo Crawler::xpathLiteral("foo ' bar");
   * //prints "foo ' bar"
   *
   * echo Crawler::xpathLiteral('a\'b"c');
   * //prints concat('a', "'", 'b"c')
   * </code>
   *
   * @param string $s String to be escaped
   *
   * @return string Converted string
   *
   */
  public static function xpathLiteral($s)
  {
      if (false === strpos($s, "'")) {
          return sprintf("'%s'", $s);
      }

      if (false === strpos($s, '"')) {
          return sprintf('"%s"', $s);
      }

      // Both quote kinds are present: single-quote every apostrophe-free
      // piece and put a double-quoted apostrophe between the pieces.
      $parts = array();
      foreach (explode("'", $s) as $piece) {
          $parts[] = sprintf("'%s'", $piece);
      }

      // implode() takes the separator first; the reversed legacy order is
      // no longer accepted by current PHP versions.
      return sprintf('concat(%s)', implode(', "\'", ', $parts));
  }
  /**
   * Returns the raw node stored at the given position.
   *
   * @param integer $position The position in the node list
   *
   * @return \DOMNode|null The node, or null when no node sits at that position
   */
  private function getNode($position)
  {
      foreach ($this as $index => $candidate) {
          if ($index == $position) {
              return $candidate;
          }
      // @codeCoverageIgnoreStart
      }

      return null;
      // @codeCoverageIgnoreEnd
  }
  /**
   * Collects element nodes by walking a sibling chain.
   *
   * Starting at the given node (inclusive), follows the $siblingDir
   * property until it runs out, keeping every element node except the
   * first node of the current selection.
   *
   * @param \DOMNode|null $node       The node to start from (null yields an empty result)
   * @param string        $siblingDir The property to follow: 'nextSibling' or 'previousSibling'
   *
   * @return array An array of \DOMNode instances
   */
  private function sibling($node, $siblingDir = 'nextSibling')
  {
      $nodes = array();

      // The reference node does not change while walking; look it up once.
      $origin = $this->getNode(0);

      // Testing before dereferencing makes a null start node (e.g. the
      // firstChild of a childless element) yield an empty list instead of
      // reading properties on null.
      for (; null !== $node; $node = $node->$siblingDir) {
          // nodeType 1 is an element node.
          if ($node !== $origin && 1 === $node->nodeType) {
              $nodes[] = $node;
          }
      }

      return $nodes;
  }
  620. }