You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

22478 lines
682 KiB

7 years ago
  1. <?php
  2. /**
  3. * @file
  4. * This file was auto-generated by generate-includes.php and includes all of
  5. * the core files required by HTML Purifier. Use this if performance is a
  6. * primary concern and you are using an opcode cache. PLEASE DO NOT EDIT THIS
  7. * FILE, changes will be overwritten the next time the script is run.
  8. *
  9. * @version 4.9.3
  10. *
  11. * @warning
  12. * You must *not* include any other HTML Purifier files before this file,
  13. * because 'require' not 'require_once' is used.
  14. *
  15. * @warning
  16. * This file requires that the include path contains the HTML Purifier
  17. * library directory; this is not auto-set.
  18. */
  19. /*! @mainpage
  20. *
  21. * HTML Purifier is an HTML filter that will take an arbitrary snippet of
  22. * HTML and rigorously test, validate and filter it into a version that
  23. * is safe for output onto webpages. It achieves this by:
  24. *
  25. * -# Lexing (parsing into tokens) the document,
  26. * -# Executing various strategies on the tokens:
  27. * -# Removing all elements not in the whitelist,
  28. * -# Making the tokens well-formed,
  29. * -# Fixing the nesting of the nodes, and
  30. * -# Validating attributes of the nodes; and
  31. * -# Generating HTML from the purified tokens.
  32. *
  33. * However, most users will only need to interface with the HTMLPurifier
  34. * and HTMLPurifier_Config.
  35. */
  36. /*
  37. HTML Purifier 4.9.3 - Standards Compliant HTML Filtering
  38. Copyright (C) 2006-2008 Edward Z. Yang
  39. This library is free software; you can redistribute it and/or
  40. modify it under the terms of the GNU Lesser General Public
  41. License as published by the Free Software Foundation; either
  42. version 2.1 of the License, or (at your option) any later version.
  43. This library is distributed in the hope that it will be useful,
  44. but WITHOUT ANY WARRANTY; without even the implied warranty of
  45. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  46. Lesser General Public License for more details.
  47. You should have received a copy of the GNU Lesser General Public
  48. License along with this library; if not, write to the Free Software
  49. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  50. */
  /**
   * Facade that coordinates HTML Purifier's subsystems in order to purify HTML.
   *
   * @note Configuration can be supplied at two points. Precedence, from
   *       lowest to highest:
   *          -# Instance: new HTMLPurifier($config)
   *          -# Invocation: purify($html, $config)
   *       The two configurations are entirely independent of each other and
   *       are *not* merged (this behavior may change in the future).
   *
   * @todo We need an easier way to inject strategies using the configuration
   *       object.
   */
  class HTMLPurifier
  {
      /**
       * Version of HTML Purifier.
       * @type string
       */
      public $version = '4.9.3';

      /**
       * Constant with version of HTML Purifier.
       */
      const VERSION = '4.9.3';

      /**
       * Global configuration object.
       * @type HTMLPurifier_Config
       */
      public $config;

      /**
       * Extra filter objects to run on HTML; kept for backwards compatibility
       * with addFilter().
       * @type HTMLPurifier_Filter[]
       */
      private $filters = array();

      /**
       * Single shared instance handed out by instance().
       * @type HTMLPurifier
       */
      private static $instance;

      /**
       * @type HTMLPurifier_Strategy_Core
       */
      protected $strategy;

      /**
       * @type HTMLPurifier_Generator
       */
      protected $generator;

      /**
       * Resultant context of the last purification run.
       * Holds an array of contexts if the last called method was purifyArray().
       * @type HTMLPurifier_Context
       */
      public $context;

      /**
       * Initializes the purifier.
       *
       * @param HTMLPurifier_Config|mixed $config Optional configuration used by
       *        every purify() call that does not pass its own. Anything that
       *        HTMLPurifier_Config::create() accepts may be given; when
       *        omitted, create() receives null.
       */
      public function __construct($config = null)
      {
          $this->config = HTMLPurifier_Config::create($config);
          $this->strategy = new HTMLPurifier_Strategy_Core();
      }

      /**
       * Adds a filter to process the output. First come, first served.
       *
       * Deprecated: every call raises an E_USER_WARNING pointing at the
       * Filter configuration namespace, then still records the filter.
       *
       * @param HTMLPurifier_Filter $filter HTMLPurifier_Filter object
       */
      public function addFilter($filter)
      {
          $message = 'HTMLPurifier->addFilter() is deprecated, use configuration directives'
              . ' in the Filter namespace or Filter.Custom';
          trigger_error($message, E_USER_WARNING);
          $this->filters[] = $filter;
      }

      /**
       * Filters an HTML snippet/document to be XSS-free and standards-compliant.
       *
       * @param string $html String of HTML to purify
       * @param HTMLPurifier_Config $config Config object for this operation;
       *        if omitted (or falsy), the config given at construction is used.
       *        Anything HTMLPurifier_Config::create() supports is accepted.
       *        The per-call config replaces the instance config; it is not
       *        merged into it.
       *
       * @return string Purified HTML
       */
      public function purify($html, $config = null)
      {
          // :TODO: make the config merge in, instead of replace
          if ($config) {
              $config = HTMLPurifier_Config::create($config);
          } else {
              $config = $this->config;
          }

          // The lexer implementation is partially environment dependent,
          // partially configuration dependent.
          $lexer = HTMLPurifier_Lexer::create($config);
          $context = new HTMLPurifier_Context();

          // Set up the HTML generator and expose it through the context.
          $this->generator = new HTMLPurifier_Generator($config, $context);
          $context->register('Generator', $this->generator);

          // Error collection is opt-in; it needs a locale for its messages.
          // (May get moved out if other facilities start using it.)
          if ($config->get('Core.CollectErrors')) {
              $factory = HTMLPurifier_LanguageFactory::instance();
              $locale = $factory->create($config, $context);
              $context->register('Locale', $locale);

              $collector = new HTMLPurifier_ErrorCollector($context);
              $context->register('ErrorCollector', $collector);
          }

          // The ID accumulator lives in the context because AttrValidator
          // can be called from many places.
          $accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
          $context->register('IDAccumulator', $accumulator);

          $html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);

          // Assemble the filter chain: flag-enabled built-ins first, then
          // Filter.Custom objects, then anything added through addFilter().
          $flags = $config->getBatch('Filter');
          $custom = $flags['Custom'];
          unset($flags['Custom']);

          $chain = array();
          foreach ($flags as $name => $enabled) {
              // Dotted keys are sub-directives, not filter switches.
              if ($enabled && strpos($name, '.') === false) {
                  $class = "HTMLPurifier_Filter_$name";
                  $chain[] = new $class;
              }
          }
          foreach ($custom as $filter) {
              // maybe "HTMLPurifier_Filter_$filter", but be consistent with AutoFormat
              $chain[] = $filter;
          }
          $chain = array_merge($chain, $this->filters);

          // maybe prepare(), but later
          foreach ($chain as $filter) {
              $html = $filter->preFilter($html, $config, $context);
          }

          // un-purified HTML -> un-purified tokens -> purified tokens -> HTML
          $tokens = $lexer->tokenizeHTML($html, $config, $context);
          $tokens = $this->strategy->execute($tokens, $config, $context);
          $html = $this->generator->generateFromTokens($tokens);

          // Post-filters run in the reverse order of the pre-filters.
          foreach (array_reverse($chain) as $filter) {
              $html = $filter->postFilter($html, $config, $context);
          }

          $html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
          $this->context =& $context;
          return $html;
      }

      /**
       * Filters an array of HTML snippets.
       *
       * Each entry is replaced by its purified form under the same key; the
       * context of every run is collected (also keyed) into $this->context.
       *
       * @param string[] $array_of_html Array of html snippets
       * @param HTMLPurifier_Config $config Optional config object for this operation.
       *        See HTMLPurifier::purify() for more details.
       *
       * @return string[] Array of purified HTML
       */
      public function purifyArray($array_of_html, $config = null)
      {
          $contexts = array();
          foreach ($array_of_html as $index => $snippet) {
              $array_of_html[$index] = $this->purify($snippet, $config);
              $contexts[$index] = $this->context;
          }
          $this->context = $contexts;
          return $array_of_html;
      }

      /**
       * Singleton for enforcing just one HTML Purifier in your system.
       *
       * @param HTMLPurifier|HTMLPurifier_Config $prototype Optional prototype
       *        HTMLPurifier instance to overload the singleton with, or an
       *        HTMLPurifier_Config instance to configure a freshly generated
       *        one with. Any truthy prototype replaces the stored instance.
       *
       * @return HTMLPurifier
       */
      public static function instance($prototype = null)
      {
          if ($prototype instanceof HTMLPurifier) {
              // Adopt the caller's purifier as the shared instance.
              self::$instance = $prototype;
          } elseif ($prototype) {
              // Treat the prototype as configuration for a new purifier.
              self::$instance = new HTMLPurifier($prototype);
          } elseif (!self::$instance) {
              // First use without a prototype: build a default purifier.
              self::$instance = new HTMLPurifier();
          }
          return self::$instance;
      }

      /**
       * Singleton for enforcing just one HTML Purifier in your system.
       *
       * @param HTMLPurifier|HTMLPurifier_Config $prototype Optional prototype
       *        HTMLPurifier instance to overload the singleton with, or an
       *        HTMLPurifier_Config instance to configure the generated
       *        version with.
       *
       * @return HTMLPurifier
       * @note Backwards compatibility, see instance()
       */
      public static function getInstance($prototype = null)
      {
          return self::instance($prototype);
      }
  }
  /**
   * Converts a stream of HTMLPurifier_Token into an HTMLPurifier_Node,
   * and back again.
   *
   * @note This transformation is not an equivalence. We mutate the input
   *       token stream to make it so; see all [MUT] markers in code.
   */
  class HTMLPurifier_Arborize
  {
      /**
       * Builds a node tree from a flat token list.
       *
       * A synthetic root node is created from the definition's info_parent;
       * every non-end token becomes a child of the innermost open node, and
       * start tokens open a new level. End tokens close the innermost node
       * and copy their position/armor data onto it.
       *
       * @param array $tokens Tokens to arrange; they are mutated in place.
       * @param object $config Must provide getHTMLDefinition().
       * @param object $context Unused here.
       * @return object The synthetic root node.
       */
      public static function arborize($tokens, $config, $context) {
          $definition = $config->getHTMLDefinition();
          $root = new HTMLPurifier_Token_Start($definition->info_parent);
          $open = array($root->toNode());

          foreach ($tokens as $token) {
              $token->skip = null; // [MUT]
              $token->carryover = null; // [MUT]

              if ($token instanceof HTMLPurifier_Token_End) {
                  $token->start = null; // [MUT]
                  $closed = array_pop($open);
                  //assert($closed->name === $token->name);
                  //assert(empty($token->attr));
                  $closed->endCol = $token->col;
                  $closed->endLine = $token->line;
                  $closed->endArmor = $token->armor;
              } else {
                  $node = $token->toNode();
                  $innermost = $open[count($open) - 1];
                  $innermost->children[] = $node;
                  if ($token instanceof HTMLPurifier_Token_Start) {
                      $open[] = $node;
                  }
              }
          }
          //assert(count($open) == 1);
          return $open[0];
      }

      /**
       * Serializes a node tree back into a flat token list.
       *
       * The walk is iterative: one queue of pending nodes is kept per depth
       * level, along with the end tokens still owed at that level. The
       * top-level node's own start and end tokens are not emitted; only its
       * descendants are.
       *
       * @param object $node Root node of the tree to flatten.
       * @param object $config Unused here.
       * @param object $context Unused here.
       * @return array Tokens in document order.
       */
      public static function flatten($node, $config, $context) {
          $depth = 0;
          $queues = array($depth => new HTMLPurifier_Queue(array($node)));
          $owedEnds = array();
          $tokens = array();

          do {
              while (!$queues[$depth]->isEmpty()) {
                  $node = $queues[$depth]->shift(); // FIFO
                  list($startToken, $endToken) = $node->toTokenPair();

                  // Depth 0 holds only the root, whose start tag is omitted.
                  if ($depth > 0) {
                      $tokens[] = $startToken;
                  }
                  if ($endToken !== null) {
                      $owedEnds[$depth][] = $endToken;
                  }

                  // Descend: the element's children become the next level's queue.
                  if ($node instanceof HTMLPurifier_Node_Element) {
                      $depth++;
                      $queues[$depth] = new HTMLPurifier_Queue();
                      foreach ($node->children as $child) {
                          $queues[$depth]->push($child);
                      }
                  }
              }

              // The current level is exhausted; step back out and emit the end
              // tokens owed at the level we return to (never those of level 0).
              $depth--;
              if ($depth && isset($owedEnds[$depth])) {
                  for (;;) {
                      $owed = array_pop($owedEnds[$depth]);
                      if (!$owed) {
                          break;
                      }
                      $tokens[] = $owed;
                  }
              }
          } while ($depth > 0);

          return $tokens;
      }
  }
  /**
   * Defines common attribute collections that modules reference.
   */
  class HTMLPurifier_AttrCollections
  {
      /**
       * Associative array of attribute collections, indexed by name.
       * @type array
       */
      public $info = array();

      /**
       * Performs all expansions on internal data for use by other inclusions.
       * It also collects all attribute collection extensions from modules.
       *
       * @param HTMLPurifier_AttrTypes $attr_types HTMLPurifier_AttrTypes instance
       * @param HTMLPurifier_HTMLModule[] $modules Hash array of HTMLPurifier_HTMLModule members
       */
      public function __construct($attr_types, $modules)
      {
          $this->doConstruct($attr_types, $modules);
      }

      /**
       * Does the actual construction work: gathers every module's
       * attr_collections into $this->info, then resolves inclusions and
       * type identifiers for each collection.
       *
       * @param HTMLPurifier_AttrTypes $attr_types HTMLPurifier_AttrTypes instance
       * @param HTMLPurifier_HTMLModule[] $modules Hash array of HTMLPurifier_HTMLModule members
       */
      public function doConstruct($attr_types, $modules)
      {
          // Load extensions from the modules.
          foreach ($modules as $module) {
              foreach ($module->attr_collections as $collName => $collection) {
                  if (!isset($this->info[$collName])) {
                      $this->info[$collName] = array();
                  }
                  foreach ($collection as $key => $value) {
                      // Index 0 is the include list; lists contributed by
                      // several modules are concatenated rather than replaced.
                      if ($key === 0 && isset($this->info[$collName][0])) {
                          $this->info[$collName][0] = array_merge(
                              $this->info[$collName][0],
                              $value
                          );
                      } else {
                          $this->info[$collName][$key] = $value;
                      }
                  }
              }
          }

          // Perform internal expansions and inclusions, one collection at a time.
          foreach (array_keys($this->info) as $collName) {
              // merge attribute collections that include others
              $this->performInclusions($this->info[$collName]);
              // replace string identifiers with actual attribute objects
              $this->expandIdentifiers($this->info[$collName], $attr_types);
          }
      }

      /**
       * Takes a reference to an attribute associative array and performs
       * all inclusions specified by the zero index.
       *
       * Attributes already present in $attr are never overwritten by an
       * included collection. Includes of includes are followed, and each
       * collection name is visited at most once.
       *
       * @param array &$attr Reference to attribute array
       */
      public function performInclusions(&$attr)
      {
          if (!isset($attr[0])) {
              return;
          }

          $pending = $attr[0];
          $visited = array(); // recursion guard
          $cursor = 0;

          while (isset($pending[$cursor])) {
              $source = $pending[$cursor];
              $cursor++;

              if (isset($visited[$source])) {
                  continue;
              }
              $visited[$source] = true;

              if (!isset($this->info[$source])) {
                  continue;
              }

              // Copy over whatever the target does not define itself. Because
              // $attr[0] is still set here, the source's own include list is
              // skipped by this check as well.
              foreach ($this->info[$source] as $key => $value) {
                  if (!isset($attr[$key])) {
                      $attr[$key] = $value;
                  }
              }

              // Queue the source's own includes for processing.
              if (isset($this->info[$source][0])) {
                  $pending = array_merge($pending, $this->info[$source][0]);
              }
          }

          unset($attr[0]);
      }

      /**
       * Expands all string identifiers in an attribute array by replacing
       * them with the appropriate values inside HTMLPurifier_AttrTypes.
       *
       * A '*' in an attribute name marks the attribute as required; the
       * asterisks are trimmed from the key. A definition of false, or one
       * whose type cannot be resolved, removes the attribute.
       *
       * @param array &$attr Reference to attribute array
       * @param HTMLPurifier_AttrTypes $attr_types HTMLPurifier_AttrTypes instance
       */
      public function expandIdentifiers(&$attr, $attr_types)
      {
          // Renaming re-adds entries to the array; track what has been
          // handled so nothing is processed twice.
          $handled = array();

          foreach ($attr as $name => $def) {
              // Skip the include list and anything already handled.
              if ($name === 0 || isset($handled[$name])) {
                  continue;
              }

              // Determine whether or not the attribute is required.
              $required = (strpos($name, '*') !== false);
              if ($required) {
                  // Rename the definition to its asterisk-free key.
                  unset($attr[$name]);
                  $name = trim($name, '*');
                  $attr[$name] = $def;
              }
              $handled[$name] = true;

              if (is_object($def)) {
                  // Already a literal object: keep it, preserving any
                  // previously set required flag.
                  $attr[$name]->required = ($required || $attr[$name]->required);
              } elseif ($def === false) {
                  unset($attr[$name]);
              } else {
                  $resolved = $attr_types->get($def);
                  if ($resolved) {
                      $attr[$name] = $resolved;
                      $attr[$name]->required = $required;
                  } else {
                      unset($attr[$name]);
                  }
              }
          }
      }
  }
  /**
   * Base class for all validating attribute definitions.
   *
   * This family of classes forms the core for not only HTML attribute validation,
   * but also any sort of string that needs to be validated or cleaned (which
   * means CSS properties and composite definitions are defined here too).
   * Besides defining (through code) what precisely makes the string valid,
   * subclasses are also responsible for cleaning the code if possible.
   */
  abstract class HTMLPurifier_AttrDef
  {
      /**
       * Tells us whether or not an HTML attribute is minimized.
       * Has no meaning in other contexts.
       * @type bool
       */
      public $minimized = false;

      /**
       * Tells us whether or not an HTML attribute is required.
       * Has no meaning in other contexts.
       * @type bool
       */
      public $required = false;

      /**
       * Validates and cleans passed string according to a definition.
       *
       * @param string $string String to be validated and cleaned.
       * @param HTMLPurifier_Config $config Mandatory HTMLPurifier_Config object.
       * @param HTMLPurifier_Context $context Mandatory HTMLPurifier_Context object.
       */
      abstract public function validate($string, $config, $context);

      /**
       * Convenience method that parses a string as if it were CDATA.
       *
       * Leading and trailing whitespace is trimmed, and every remaining line
       * feed, tab and carriage return is replaced by a single space. This
       * loosely follows <http://www.w3.org/TR/html4/types.html#h-6.2>. While
       * most useful for HTML attributes specified as CDATA, it can also be
       * applied to most CSS values.
       *
       * @note This method is not entirely standards compliant: trim() removes
       *       more types of whitespace than specified in the spec, and line
       *       feeds are turned into spaces rather than dropped. In practice,
       *       the former is rarely a problem, as those extra characters
       *       usually have already been removed by HTMLPurifier_Encoder.
       *
       * @warning This processing is inconsistent with XML's whitespace handling
       *          as specified by section 3.3.3 and referenced XHTML 1.0 section
       *          4.7. However, note that we are NOT necessarily
       *          parsing XML, thus, this behavior may still be correct. We
       *          assume that newlines have been normalized.
       *
       * @param string $string String to normalize.
       * @return string Normalized string.
       */
      public function parseCDATA($string)
      {
          $string = trim($string);
          $string = str_replace(array("\n", "\t", "\r"), ' ', $string);
          return $string;
      }

      /**
       * Factory method for creating this class from a string.
       *
       * @param string $string String construction info
       * @return HTMLPurifier_AttrDef Created AttrDef object corresponding to $string
       */
      public function make($string)
      {
          // Default implementation: return a flyweight of this object.
          // If $string has an effect on the returned object (i.e. you
          // need to overload this method), it is best
          // to clone or instantiate new copies. (Instantiation is safer.)
          return $this;
      }

      /**
       * Removes spaces from rgb(0, 0, 0) so that shorthand CSS properties work
       * properly. THIS IS A HACK!
       *
       * Handles the three-argument rgb()/hsl() and the four-argument
       * rgba()/hsla() forms. Both substitutions are always applied, so a value
       * that mixes the two forms (e.g. "rgba(0, 0, 0, 0.5) rgb(1, 2, 3)") has
       * every colour function compacted, not only the rgba()/hsla() ones.
       *
       * @param string $string a CSS colour definition
       * @return string
       */
      protected function mungeRgb($string)
      {
          // One numeric argument: digits, optional fraction, optional '%',
          // with optional surrounding whitespace. Adds three capture groups.
          $p = '\s*(\d+(\.\d+)?([%]?))\s*';

          // "rgba(" / "hsla(" can never match the "(rgb|hsl)\(" pattern (the
          // 'a' sits between the name and the parenthesis), so running the two
          // substitutions one after the other is safe.
          $string = preg_replace(
              '/(rgba|hsla)\('.$p.','.$p.','.$p.','.$p.'\)/',
              '\1(\2,\5,\8,\11)',
              $string
          );
          return preg_replace('/(rgb|hsl)\('.$p.','.$p.','.$p.'\)/', '\1(\2,\5,\8)', $string);
      }

      /**
       * Parses a possibly escaped CSS string and returns the "pure"
       * version of it.
       *
       * A backslash followed by one to six hex digits is decoded to the
       * corresponding character, and one whitespace character after the hex
       * digits is swallowed as the escape terminator. An escaped newline is
       * dropped, and any other escaped character is kept without its
       * backslash. A trailing lone backslash is preserved.
       *
       * @param string $string CSS string, possibly containing escapes
       * @return string
       */
      protected function expandCSSEscape($string)
      {
          // flexibly parse it
          $ret = '';
          for ($i = 0, $c = strlen($string); $i < $c; $i++) {
              if ($string[$i] === '\\') {
                  $i++;
                  if ($i >= $c) {
                      // Backslash was the last character; keep it verbatim.
                      $ret .= '\\';
                      break;
                  }
                  if (ctype_xdigit($string[$i])) {
                      // Collect up to six hex digits.
                      $code = $string[$i];
                      for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
                          if (!ctype_xdigit($string[$i])) {
                              break;
                          }
                          $code .= $string[$i];
                      }
                      // We have to be extremely careful when adding
                      // new characters, to make sure we're not breaking
                      // the encoding.
                      $char = HTMLPurifier_Encoder::unichr(hexdec($code));
                      if (HTMLPurifier_Encoder::cleanUTF8($char) === '') {
                          // Invalid character: drop the escape. The loop
                          // increment then also skips the character that
                          // follows the hex digits, whitespace or not.
                          continue;
                      }
                      $ret .= $char;
                      // $i now points past the hex digits. If that character is
                      // not whitespace, step back so the loop increment lands on
                      // it; whitespace is consumed as the terminator.
                      if ($i < $c && trim($string[$i]) !== '') {
                          $i--;
                      }
                      continue;
                  }
                  if ($string[$i] === "\n") {
                      // Escaped newline: drop it.
                      continue;
                  }
              }
              $ret .= $string[$i];
          }
          return $ret;
      }
  }
  /**
   * Processes an entire attribute array for corrections needing multiple values.
   *
   * Occasionally, a certain attribute will need to be removed and popped onto
   * another value. Instead of creating a complex return syntax for
   * HTMLPurifier_AttrDef, we just pass the whole attribute array to a
   * specialized object and have that do the special work. That is the
   * family of HTMLPurifier_AttrTransform.
   *
   * An attribute transformation can be assigned to run before or after
   * HTMLPurifier_AttrDef validation. See HTMLPurifier_HTMLDefinition for
   * more details.
   */
  abstract class HTMLPurifier_AttrTransform
  {
      /**
       * Abstract: makes changes to the attributes dependent on multiple values.
       *
       * @param array $attr Assoc array of attributes, usually from
       *        HTMLPurifier_Token_Tag::$attr
       * @param HTMLPurifier_Config $config Mandatory HTMLPurifier_Config object.
       * @param HTMLPurifier_Context $context Mandatory HTMLPurifier_Context object
       * @return array Processed attribute array.
       */
      abstract public function transform($attr, $config, $context);

      /**
       * Prepends CSS properties to the style attribute, creating the
       * attribute if it doesn't exist.
       *
       * @param array &$attr Attribute array to process (passed by reference)
       * @param string $css CSS to prepend
       */
      public function prependCSS(&$attr, $css)
      {
          // An absent (or null) style is treated as the empty string.
          $existing = '';
          if (isset($attr['style'])) {
              $existing = $attr['style'];
          }
          $attr['style'] = $css . $existing;
      }

      /**
       * Retrieves and removes an attribute.
       *
       * @param array &$attr Attribute array to process (passed by reference)
       * @param mixed $key Key of attribute to confiscate
       * @return mixed The removed value, or null when the key is not set
       *         (in which case the array is left untouched).
       */
      public function confiscateAttr(&$attr, $key)
      {
          $taken = null;
          if (isset($attr[$key])) {
              $taken = $attr[$key];
              unset($attr[$key]);
          }
          return $taken;
      }
  }
  /**
   * Provides lookup array of attribute types to HTMLPurifier_AttrDef objects.
   */
  class HTMLPurifier_AttrTypes
  {
      /**
       * Lookup array of attribute string identifiers to concrete implementations.
       * @type HTMLPurifier_AttrDef[]
       */
      protected $info = array();

      /**
       * Constructs the info array, supplying default implementations for attribute
       * types.
       */
      public function __construct()
      {
          // XXX This is kind of poor, since we don't actually /clone/
          // instances; instead, we use the supplied make() attribute. So,
          // the underlying class must know how to deal with arguments.
          // With the old implementation of Enum, that ignored its
          // arguments when handling a make dispatch, the IAlign
          // definition wouldn't work.

          // Types built with a no-argument constructor, registered before
          // the alignment enums. Each entry gets its own instance.
          $leading = array(
              // pseudo-types, must be instantiated via shorthand
              'Enum' => 'HTMLPurifier_AttrDef_Enum',
              'Bool' => 'HTMLPurifier_AttrDef_HTML_Bool',
              'CDATA' => 'HTMLPurifier_AttrDef_Text',
              'ID' => 'HTMLPurifier_AttrDef_HTML_ID',
              'Length' => 'HTMLPurifier_AttrDef_HTML_Length',
              'MultiLength' => 'HTMLPurifier_AttrDef_HTML_MultiLength',
              'NMTOKENS' => 'HTMLPurifier_AttrDef_HTML_Nmtokens',
              'Pixels' => 'HTMLPurifier_AttrDef_HTML_Pixels',
              'Text' => 'HTMLPurifier_AttrDef_Text',
              'URI' => 'HTMLPurifier_AttrDef_URI',
              'LanguageCode' => 'HTMLPurifier_AttrDef_Lang',
              'Color' => 'HTMLPurifier_AttrDef_HTML_Color',
          );
          foreach ($leading as $type => $class) {
              $this->info[$type] = new $class();
          }

          $this->info['IAlign'] = self::makeEnum('top,middle,bottom,left,right');
          $this->info['LAlign'] = self::makeEnum('top,bottom,left,right');

          // Remaining no-argument types, registered after the enums.
          $trailing = array(
              'FrameTarget' => 'HTMLPurifier_AttrDef_HTML_FrameTarget',
              // unimplemented aliases
              'ContentType' => 'HTMLPurifier_AttrDef_Text',
              'ContentTypes' => 'HTMLPurifier_AttrDef_Text',
              'Charsets' => 'HTMLPurifier_AttrDef_Text',
              'Character' => 'HTMLPurifier_AttrDef_Text',
              // "proprietary" types
              'Class' => 'HTMLPurifier_AttrDef_HTML_Class',
          );
          foreach ($trailing as $type => $class) {
              $this->info[$type] = new $class();
          }

          // number is really a positive integer (one or more digits)
          // FIXME: ^^ not always, see start and value of list items
          $this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true);
      }

      /**
       * Wraps an Enum over the given comma-separated values in a Clone
       * definition.
       *
       * @param string $in Comma-separated list of allowed values
       * @return HTMLPurifier_AttrDef_Clone
       */
      private static function makeEnum($in)
      {
          $values = explode(',', $in);
          $enum = new HTMLPurifier_AttrDef_Enum($values);
          return new HTMLPurifier_AttrDef_Clone($enum);
      }

      /**
       * Retrieves a type.
       *
       * Anything after the first '#' in $type is passed to the registered
       * implementation's make() as construction info; without a '#', make()
       * receives the empty string. An unknown type raises an E_USER_ERROR.
       *
       * @param string $type String type name, optionally "Name#extra"
       * @return HTMLPurifier_AttrDef Object AttrDef for type
       */
      public function get($type)
      {
          // Determine if there is any extra info tacked on.
          $pieces = explode('#', $type, 2);
          $type = $pieces[0];
          $string = isset($pieces[1]) ? $pieces[1] : '';

          if (isset($this->info[$type])) {
              return $this->info[$type]->make($string);
          }

          trigger_error('Cannot retrieve undefined attribute type ' . $type, E_USER_ERROR);
          return;
      }

      /**
       * Sets a new implementation for a type.
       *
       * @param string $type String type name
       * @param HTMLPurifier_AttrDef $impl Object AttrDef for type
       */
      public function set($type, $impl)
      {
          $this->info[$type] = $impl;
      }
  }
  /**
   * Validates the attributes of a token. Doesn't manage required attributes
   * very well. The only reason we factored this out was because RemoveForeignElements
   * also needed it besides ValidateAttributes.
   */
  class HTMLPurifier_AttrValidator
  {
      /**
       * Validates the attributes of a token, mutating it as necessary.
       *
       * Only start and empty tokens are processed; any other token is left
       * untouched. For those, the attributes go through the global and
       * element-local pre-transforms, per-attribute validation, and then the
       * global and element-local post-transforms. The token's attr array is
       * replaced once, at the very end.
       *
       * Context side effects: 'IDAccumulator' is registered if missing and
       * stays registered; 'CurrentAttr' is registered for the duration of the
       * attribute loop; 'CurrentToken' is registered only if the caller has
       * not done so, and in that case destroyed again before returning.
       *
       * @param HTMLPurifier_Token $token Token to validate.
       * @param HTMLPurifier_Config $config Instance of HTMLPurifier_Config
       * @param HTMLPurifier_Context $context Instance of HTMLPurifier_Context
       */
      public function validateToken($token, $config, $context)
      {
          $definition = $config->getHTMLDefinition();
          $e =& $context->get('ErrorCollector', true);

          // initialize IDAccumulator if necessary
          $ok =& $context->get('IDAccumulator', true);
          if (!$ok) {
              $id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
              $context->register('IDAccumulator', $id_accumulator);
          }

          // Nothing to validate on tokens that cannot carry attributes. This
          // check must come BEFORE CurrentToken is registered: returning after
          // the registration would leave a stale CurrentToken in the context,
          // which later calls would then mistake for a caller-owned one.
          if (!$token instanceof HTMLPurifier_Token_Start &&
              !$token instanceof HTMLPurifier_Token_Empty
          ) {
              return;
          }

          // initialize CurrentToken if necessary
          $current_token =& $context->get('CurrentToken', true);
          if (!$current_token) {
              $context->register('CurrentToken', $token);
          }

          // create alias to global definition array, see also $defs
          // DEFINITION CALL
          $d_defs = $definition->info_global_attr;

          // don't update token until the very end, to ensure an atomic update
          $attr = $token->attr;

          // do global transformations (pre)
          // nothing currently utilizes this
          $attr = $this->applyTransforms(
              $definition->info_attr_transform_pre,
              $attr,
              $config,
              $context,
              $e
          );

          // do local transformations only applicable to this element (pre)
          // ex. <p align="right"> to <p style="text-align:right;">
          $attr = $this->applyTransforms(
              $definition->info[$token->name]->attr_transform_pre,
              $attr,
              $config,
              $context,
              $e
          );

          // create alias to this element's attribute definition array, see
          // also $d_defs (global attribute definition array)
          // DEFINITION CALL
          $defs = $definition->info[$token->name]->attr;

          // $attr_key is registered by reference, so the context entry follows
          // the loop variable below as it advances.
          $attr_key = false;
          $context->register('CurrentAttr', $attr_key);

          // iterate through all the attribute keypairs
          // Watch out for name collisions: $key has previously been used
          foreach ($attr as $attr_key => $value) {
              // call the definition
              if (isset($defs[$attr_key])) {
                  // there is a local definition defined
                  if ($defs[$attr_key] === false) {
                      // We've explicitly been told not to allow this element.
                      // This is usually when there's a global definition
                      // that must be overridden.
                      // Theoretically speaking, we could have a
                      // AttrDef_DenyAll, but this is faster!
                      $result = false;
                  } else {
                      // validate according to the element's definition
                      $result = $defs[$attr_key]->validate(
                          $value,
                          $config,
                          $context
                      );
                  }
              } elseif (isset($d_defs[$attr_key])) {
                  // there is a global definition defined, validate according
                  // to the global definition
                  $result = $d_defs[$attr_key]->validate(
                      $value,
                      $config,
                      $context
                  );
              } else {
                  // system never heard of the attribute? DELETE!
                  $result = false;
              }

              // put the results into effect
              if ($result === false || $result === null) {
                  // this is a generic error message that should be replaced
                  // with more specific ones when possible
                  if ($e) {
                      $e->send(E_ERROR, 'AttrValidator: Attribute removed');
                  }
                  // remove the attribute
                  unset($attr[$attr_key]);
              } elseif (is_string($result)) {
                  // generally, if a substitution is happening, there
                  // was some sort of implicit correction going on. We'll
                  // delegate it to the attribute classes to say exactly what.
                  // simple substitution
                  $attr[$attr_key] = $result;
              } else {
                  // nothing happens
              }

              // we'd also want slightly more complicated substitution
              // involving an array as the return value,
              // although we're not sure how colliding attributes would
              // resolve (certain ones would be completely overridden,
              // others would prepend themselves).
          }

          $context->destroy('CurrentAttr');

          // post transforms
          // global (error reporting untested)
          $attr = $this->applyTransforms(
              $definition->info_attr_transform_post,
              $attr,
              $config,
              $context,
              $e
          );

          // local (error reporting untested)
          $attr = $this->applyTransforms(
              $definition->info[$token->name]->attr_transform_post,
              $attr,
              $config,
              $context,
              $e
          );

          $token->attr = $attr;

          // destroy CurrentToken if we made it ourselves
          if (!$current_token) {
              $context->destroy('CurrentToken');
          }
      }

      /**
       * Runs a list of attribute transforms in order, reporting each one that
       * actually changed the attributes to the error collector (if any).
       *
       * @param array $transforms Objects exposing transform($attr, $config, $context)
       * @param array $attr Attribute array to feed through the transforms
       * @param HTMLPurifier_Config $config Instance of HTMLPurifier_Config
       * @param HTMLPurifier_Context $context Instance of HTMLPurifier_Context
       * @param object|null $e Error collector, or a falsy value when none is active
       * @return array Attribute array after all transforms
       */
      private function applyTransforms($transforms, $attr, $config, $context, $e)
      {
          foreach ($transforms as $transform) {
              $before = $attr;
              $attr = $transform->transform($before, $config, $context);
              if ($e && $attr != $before) {
                  $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $before, $attr);
              }
          }
          return $attr;
      }
  }
  // constants are slow, so we use as few as possible
  if (!defined('HTMLPURIFIER_PREFIX')) {
      // Library root: the "standalone" directory that sits next to this file.
      // It is also put at the front of the include path, so relative
      // includes are looked up there first.
      define('HTMLPURIFIER_PREFIX', dirname(__FILE__) . '/standalone');
      set_include_path(HTMLPURIFIER_PREFIX . PATH_SEPARATOR . get_include_path());
  }

  // accommodations for versions earlier than 5.0.2
  // borrowed from PHP_Compat, LGPL licensed, by Aidan Lister <aidan@php.net>
  if (!defined('PHP_EOL')) {
      // Mirror what PHP itself defines: "\r\n" on Windows and "\n" on every
      // other platform. The previous shim mapped Darwin (Mac OS X) to a bare
      // "\r", which does not match the native constant on that platform.
      if (strtoupper(substr(PHP_OS, 0, 3)) === 'WIN') {
          define('PHP_EOL', "\r\n");
      } else {
          define('PHP_EOL', "\n");
      }
  }
  /**
   * Bootstrap helper for HTML Purifier: translates class names into library
   * file paths and installs the autoloader that requires those files.
   *
   * @note
   *      This class may be used without any other files from HTML Purifier.
   */
  class HTMLPurifier_Bootstrap
  {
      /**
       * Autoload callback: requires the file that getPath() maps $class to.
       * @param string $class Class to load
       * @return bool true when a file was required, false when getPath()
       *              found nothing to load
       */
      public static function autoload($class)
      {
          $relative = self::getPath($class);
          if ($relative) {
              // A plain 'require' would be cheaper, but it has been reported
              // to misbehave with Zend extensions such as Zend debugger and
              // APC, so the safer require_once is used.
              require_once HTMLPURIFIER_PREFIX . '/' . $relative;
              return true;
          }
          return false;
      }

      /**
       * Returns the path for a specific class, relative to HTMLPURIFIER_PREFIX.
       * @param string $class Class path to get
       * @return string|bool Relative file path, or false when the class name
       *                     does not start with "HTMLPurifier" or the file
       *                     does not exist
       */
      public static function getPath($class)
      {
          $is_library_class = strncmp('HTMLPurifier', $class, 12) === 0;
          if (!$is_library_class) {
              return false;
          }
          // Language classes live in their own directory, and underscores in
          // the language code become dashes in the file name.
          $is_language_class = strncmp('HTMLPurifier_Language_', $class, 22) === 0;
          if ($is_language_class) {
              $language_code = str_replace('_', '-', substr($class, 22));
              $file = 'HTMLPurifier/Language/classes/' . $language_code . '.php';
          } else {
              // Everything else: one directory level per underscore.
              $file = str_replace('_', '/', $class) . '.php';
          }
          return file_exists(HTMLPURIFIER_PREFIX . '/' . $file) ? $file : false;
      }

      /**
       * "Pre-registers" our autoloader on the SPL stack, i.e. places it ahead
       * of the autoloaders that are already registered.
       */
      public static function registerAutoload()
      {
          $callback = array('HTMLPurifier_Bootstrap', 'autoload');
          $existing = spl_autoload_functions();

          if ($existing === false) {
              // No SPL autoload stack yet: a plain registration is enough.
              spl_autoload_register($callback);
              return;
          }
          if (!function_exists('spl_autoload_unregister')) {
              // Without unregister support nothing is registered at all.
              return;
          }
          if (version_compare(PHP_VERSION, '5.3.0', '>=')) {
              // prepend flag exists, no need for shenanigans
              spl_autoload_register($callback, true, true);
              return;
          }

          // Older PHP: take every loader off the stack, register ours, then
          // put the previous loaders back behind it.
          $buggy = version_compare(PHP_VERSION, '5.2.11', '<');
          $compat = version_compare(PHP_VERSION, '5.1.2', '<=') &&
              version_compare(PHP_VERSION, '5.1.0', '>=');
          foreach ($existing as $loader) {
              if ($buggy && is_array($loader)) {
                  // :TRICKY: There are some compatibility issues and some
                  // places where we need to error out
                  $method = new ReflectionMethod($loader[0], $loader[1]);
                  if (!$method->isStatic()) {
                      throw new Exception(
                          'HTML Purifier autoloader registrar is not compatible
with non-static object methods due to PHP Bug #44144;
Please do not use HTMLPurifier.autoload.php (or any
file that includes this file); instead, place the code:
spl_autoload_register(array(\'HTMLPurifier_Bootstrap\', \'autoload\'))
after your own autoloaders.'
                      );
                  }
                  // Surprisingly, spl_autoload_register supports the
                  // Class::staticMethod callback format, although call_user_func doesn't
                  if ($compat) {
                      $loader = implode('::', $loader);
                  }
              }
              spl_autoload_unregister($loader);
          }
          spl_autoload_register($callback);
          foreach ($existing as $loader) {
              spl_autoload_register($loader);
          }
      }
  }
  /**
   * Base class for definition datatype objects. It supplies the run-once
   * setup() entry point; concrete definitions implement doSetup().
   */
  abstract class HTMLPurifier_Definition
  {
      /**
       * Has setup() been called yet? Flipped to true by setup() before
       * doSetup() runs.
       * @type bool
       */
      public $setup = false;

      /**
       * If true, write out the final definition object to the cache after
       * setup. This will be true only if all invocations to get a raw
       * definition object are also optimized. This does not cause file
       * system thrashing because on subsequent calls the cached object
       * is used and any writes to the raw definition object are short
       * circuited. See enduser-customize.html for the high-level
       * picture.
       * @type bool
       */
      public $optimized = null;

      /**
       * What type of definition is it?
       * @type string
       */
      public $type;

      /**
       * Builds the definition into its final form; this is the work the
       * constructor does not do.
       * @param HTMLPurifier_Config $config
       */
      abstract protected function doSetup($config);

      /**
       * Runs doSetup() the first time it is called and does nothing on
       * later calls.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          if (!$this->setup) {
              // Mark as set up before delegating, so the flag is already
              // true while doSetup() is running.
              $this->setup = true;
              $this->doSetup($config);
          }
      }
  }
  1050. /**
  1051. * Defines allowed CSS attributes and what their values are.
  1052. * @see HTMLPurifier_HTMLDefinition
  1053. */
  1054. class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
  1055. {
  1056. public $type = 'CSS';
  1057. /**
  1058. * Assoc array of attribute name to definition object.
  1059. * @type HTMLPurifier_AttrDef[]
  1060. */
  1061. public $info = array();
  /**
   * Constructs the info array. The meat of this class.
   *
   * Registers one validator object per supported CSS property in
   * $this->info, optionally adds the proprietary / tricky / trusted property
   * groups depending on configuration, wraps every registered validator in
   * the !important decorator and finally applies the allowed / forbidden
   * property lists via setupConfigStuff().
   *
   * Statement order matters here: the shorthand validators that receive
   * $config are created after the properties they build on (see the notes
   * in the body).
   *
   * @param HTMLPurifier_Config $config
   */
  protected function doSetup($config)
  {
      $this->info['text-align'] = new HTMLPurifier_AttrDef_Enum(
          array('left', 'right', 'center', 'justify'),
          false
      );

      // Chained assignment: all four side-specific properties share one
      // validator instance, which is also kept in $border_style so the
      // 'border-style' shorthand below can reuse it.
      $border_style =
          $this->info['border-bottom-style'] =
          $this->info['border-right-style'] =
          $this->info['border-left-style'] =
          $this->info['border-top-style'] = new HTMLPurifier_AttrDef_Enum(
              array(
                  'none',
                  'hidden',
                  'dotted',
                  'dashed',
                  'solid',
                  'double',
                  'groove',
                  'ridge',
                  'inset',
                  'outset'
              ),
              false
          );

      $this->info['border-style'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_style);

      $this->info['clear'] = new HTMLPurifier_AttrDef_Enum(
          array('none', 'left', 'right', 'both'),
          false
      );
      $this->info['float'] = new HTMLPurifier_AttrDef_Enum(
          array('none', 'left', 'right'),
          false
      );
      $this->info['font-style'] = new HTMLPurifier_AttrDef_Enum(
          array('normal', 'italic', 'oblique'),
          false
      );
      $this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum(
          array('normal', 'small-caps'),
          false
      );

      // Shared by 'list-style-image' and 'background-image' below: either
      // the keyword 'none' or a CSS URI.
      $uri_or_none = new HTMLPurifier_AttrDef_CSS_Composite(
          array(
              new HTMLPurifier_AttrDef_Enum(array('none')),
              new HTMLPurifier_AttrDef_CSS_URI()
          )
      );

      $this->info['list-style-position'] = new HTMLPurifier_AttrDef_Enum(
          array('inside', 'outside'),
          false
      );
      $this->info['list-style-type'] = new HTMLPurifier_AttrDef_Enum(
          array(
              'disc',
              'circle',
              'square',
              'decimal',
              'lower-roman',
              'upper-roman',
              'lower-alpha',
              'upper-alpha',
              'none'
          ),
          false
      );
      $this->info['list-style-image'] = $uri_or_none;

      // Shorthand validator built from $config, registered after the
      // individual list-style-* properties above.
      $this->info['list-style'] = new HTMLPurifier_AttrDef_CSS_ListStyle($config);

      $this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum(
          array('capitalize', 'uppercase', 'lowercase', 'none'),
          false
      );
      $this->info['color'] = new HTMLPurifier_AttrDef_CSS_Color();

      $this->info['background-image'] = $uri_or_none;
      $this->info['background-repeat'] = new HTMLPurifier_AttrDef_Enum(
          array('repeat', 'repeat-x', 'repeat-y', 'no-repeat')
      );
      $this->info['background-attachment'] = new HTMLPurifier_AttrDef_Enum(
          array('scroll', 'fixed')
      );
      $this->info['background-position'] = new HTMLPurifier_AttrDef_CSS_BackgroundPosition();

      // One "transparent or a color" validator shared by the four border
      // side colors and 'background-color'; kept in $border_color for the
      // 'border-color' shorthand.
      $border_color =
          $this->info['border-top-color'] =
          $this->info['border-bottom-color'] =
          $this->info['border-left-color'] =
          $this->info['border-right-color'] =
          $this->info['background-color'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_Enum(array('transparent')),
                  new HTMLPurifier_AttrDef_CSS_Color()
              )
          );

      // Shorthand validator built from $config, registered after the
      // individual background-* properties above.
      $this->info['background'] = new HTMLPurifier_AttrDef_CSS_Background($config);

      $this->info['border-color'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_color);

      // Shared validator for the four border side widths: a keyword or a
      // length constructed with '0' as its first argument.
      $border_width =
          $this->info['border-top-width'] =
          $this->info['border-bottom-width'] =
          $this->info['border-left-width'] =
          $this->info['border-right-width'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')),
                  new HTMLPurifier_AttrDef_CSS_Length('0') //disallow negative
              )
          );

      $this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_width);

      $this->info['letter-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(
          array(
              new HTMLPurifier_AttrDef_Enum(array('normal')),
              new HTMLPurifier_AttrDef_CSS_Length()
          )
      );

      $this->info['word-spacing'] = new HTMLPurifier_AttrDef_CSS_Composite(
          array(
              new HTMLPurifier_AttrDef_Enum(array('normal')),
              new HTMLPurifier_AttrDef_CSS_Length()
          )
      );

      $this->info['font-size'] = new HTMLPurifier_AttrDef_CSS_Composite(
          array(
              new HTMLPurifier_AttrDef_Enum(
                  array(
                      'xx-small',
                      'x-small',
                      'small',
                      'medium',
                      'large',
                      'x-large',
                      'xx-large',
                      'larger',
                      'smaller'
                  )
              ),
              new HTMLPurifier_AttrDef_CSS_Percentage(),
              new HTMLPurifier_AttrDef_CSS_Length()
          )
      );

      $this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(
          array(
              new HTMLPurifier_AttrDef_Enum(array('normal')),
              new HTMLPurifier_AttrDef_CSS_Number(true), // no negatives
              new HTMLPurifier_AttrDef_CSS_Length('0'),
              new HTMLPurifier_AttrDef_CSS_Percentage(true)
          )
      );

      // Shared validator for the four margin sides, reused by the 'margin'
      // shorthand.
      $margin =
          $this->info['margin-top'] =
          $this->info['margin-bottom'] =
          $this->info['margin-left'] =
          $this->info['margin-right'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_CSS_Length(),
                  new HTMLPurifier_AttrDef_CSS_Percentage(),
                  new HTMLPurifier_AttrDef_Enum(array('auto'))
              )
          );

      $this->info['margin'] = new HTMLPurifier_AttrDef_CSS_Multiple($margin);

      // non-negative
      $padding =
          $this->info['padding-top'] =
          $this->info['padding-bottom'] =
          $this->info['padding-left'] =
          $this->info['padding-right'] = new HTMLPurifier_AttrDef_CSS_Composite(
              array(
                  new HTMLPurifier_AttrDef_CSS_Length('0'),
                  new HTMLPurifier_AttrDef_CSS_Percentage(true)
              )
          );

      $this->info['padding'] = new HTMLPurifier_AttrDef_CSS_Multiple($padding);

      $this->info['text-indent'] = new HTMLPurifier_AttrDef_CSS_Composite(
          array(
              new HTMLPurifier_AttrDef_CSS_Length(),
              new HTMLPurifier_AttrDef_CSS_Percentage()
          )
      );

      // Width/height validator used as-is when CSS.MaxImgLength is null, and
      // as the non-img branch of the switch otherwise.
      $trusted_wh = new HTMLPurifier_AttrDef_CSS_Composite(
          array(
              new HTMLPurifier_AttrDef_CSS_Length('0'),
              new HTMLPurifier_AttrDef_CSS_Percentage(true),
              new HTMLPurifier_AttrDef_Enum(array('auto'))
          )
      );
      $max = $config->get('CSS.MaxImgLength');

      // All six sizing properties share whichever validator the ternary
      // yields. When a maximum is configured, the 'img' branch gets a length
      // constructed with $max as its second argument (and no percentage).
      $this->info['min-width'] =
      $this->info['max-width'] =
      $this->info['min-height'] =
      $this->info['max-height'] =
      $this->info['width'] =
      $this->info['height'] =
          $max === null ?
              $trusted_wh :
              new HTMLPurifier_AttrDef_Switch(
                  'img',
                  // For img tags:
                  new HTMLPurifier_AttrDef_CSS_Composite(
                      array(
                          new HTMLPurifier_AttrDef_CSS_Length('0', $max),
                          new HTMLPurifier_AttrDef_Enum(array('auto'))
                      )
                  ),
                  // For everyone else:
                  $trusted_wh
              );

      $this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration();

      $this->info['font-family'] = new HTMLPurifier_AttrDef_CSS_FontFamily();

      // this could use specialized code
      $this->info['font-weight'] = new HTMLPurifier_AttrDef_Enum(
          array(
              'normal',
              'bold',
              'bolder',
              'lighter',
              '100',
              '200',
              '300',
              '400',
              '500',
              '600',
              '700',
              '800',
              '900'
          ),
          false
      );

      // MUST be called after other font properties, as it references
      // a CSSDefinition object
      $this->info['font'] = new HTMLPurifier_AttrDef_CSS_Font($config);

      // same here
      $this->info['border'] =
      $this->info['border-bottom'] =
      $this->info['border-top'] =
      $this->info['border-left'] =
      $this->info['border-right'] = new HTMLPurifier_AttrDef_CSS_Border($config);

      $this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(
          array('collapse', 'separate')
      );

      $this->info['caption-side'] = new HTMLPurifier_AttrDef_Enum(
          array('top', 'bottom')
      );

      $this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(
          array('auto', 'fixed')
      );

      $this->info['vertical-align'] = new HTMLPurifier_AttrDef_CSS_Composite(
          array(
              new HTMLPurifier_AttrDef_Enum(
                  array(
                      'baseline',
                      'sub',
                      'super',
                      'top',
                      'text-top',
                      'middle',
                      'bottom',
                      'text-bottom'
                  )
              ),
              new HTMLPurifier_AttrDef_CSS_Length(),
              new HTMLPurifier_AttrDef_CSS_Percentage()
          )
      );

      $this->info['border-spacing'] = new HTMLPurifier_AttrDef_CSS_Multiple(new HTMLPurifier_AttrDef_CSS_Length(), 2);

      // These CSS properties don't work on many browsers, but we live
      // in THE FUTURE!
      $this->info['white-space'] = new HTMLPurifier_AttrDef_Enum(
          array('nowrap', 'normal', 'pre', 'pre-wrap', 'pre-line')
      );

      // Optional property groups, each enabled by its own CSS.* directive.
      if ($config->get('CSS.Proprietary')) {
          $this->doSetupProprietary($config);
      }

      if ($config->get('CSS.AllowTricky')) {
          $this->doSetupTricky($config);
      }

      if ($config->get('CSS.Trusted')) {
          $this->doSetupTrusted($config);
      }

      $allow_important = $config->get('CSS.AllowImportant');
      // wrap all attr-defs with decorator that handles !important
      foreach ($this->info as $k => $v) {
          $this->info[$k] = new HTMLPurifier_AttrDef_CSS_ImportantDecorator($v, $allow_important);
      }

      // Finally apply CSS.AllowedProperties / CSS.ForbiddenProperties to the
      // (now decorated) property list.
      $this->setupConfigStuff($config);
  }
  /**
   * Registers the property group that doSetup() adds when the
   * CSS.Proprietary directive is set: scrollbar colors, vendor-prefixed
   * opacity, filter, page-break-* and border radius properties.
   * @param HTMLPurifier_Config $config Not read by this method.
   */
  protected function doSetupProprietary($config)
  {
      // Internet Explorer only scrollbar colors; each property gets its own
      // color validator instance.
      $scrollbar_parts = array('arrow', 'base', 'darkshadow', 'face', 'highlight', 'shadow');
      foreach ($scrollbar_parts as $part) {
          $this->info['scrollbar-' . $part . '-color'] = new HTMLPurifier_AttrDef_CSS_Color();
      }

      // vendor specific prefixes of opacity
      foreach (array('-moz-opacity', '-khtml-opacity') as $property) {
          $this->info[$property] = new HTMLPurifier_AttrDef_CSS_AlphaValue();
      }

      // only opacity, for now
      $this->info['filter'] = new HTMLPurifier_AttrDef_CSS_Filter();

      // more CSS3
      // 'page-break-before' and 'page-break-after' share one validator.
      $page_break = new HTMLPurifier_AttrDef_Enum(
          array('auto', 'always', 'avoid', 'left', 'right')
      );
      $this->info['page-break-before'] = $page_break;
      $this->info['page-break-after'] = $page_break;

      $this->info['page-break-inside'] = new HTMLPurifier_AttrDef_Enum(array('auto', 'avoid'));

      $border_radius = new HTMLPurifier_AttrDef_CSS_Composite(
          array(
              new HTMLPurifier_AttrDef_CSS_Percentage(true), // disallow negative
              new HTMLPurifier_AttrDef_CSS_Length('0') // disallow negative
          )
      );

      // The four corner properties share one validator built with 2 as its
      // second argument.
      $corner_radius = new HTMLPurifier_AttrDef_CSS_Multiple($border_radius, 2);
      $corners = array(
          'border-bottom-left-radius',
          'border-bottom-right-radius',
          'border-top-right-radius',
          'border-top-left-radius'
      );
      foreach ($corners as $property) {
          $this->info[$property] = $corner_radius;
      }

      // TODO: support SLASH syntax
      $this->info['border-radius'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_radius, 4);
  }
  /**
   * Registers the property group that doSetup() adds when the
   * CSS.AllowTricky directive is set: display, visibility, overflow and
   * opacity.
   * @param HTMLPurifier_Config $config Not read by this method.
   */
  protected function doSetupTricky($config)
  {
      $display_values = array(
          'inline',
          'block',
          'list-item',
          'run-in',
          'compact',
          'marker',
          'table',
          'inline-block',
          'inline-table',
          'table-row-group',
          'table-header-group',
          'table-footer-group',
          'table-row',
          'table-column-group',
          'table-column',
          'table-cell',
          'table-caption',
          'none'
      );
      $this->info['display'] = new HTMLPurifier_AttrDef_Enum($display_values);

      $visibility_values = array('visible', 'hidden', 'collapse');
      $this->info['visibility'] = new HTMLPurifier_AttrDef_Enum($visibility_values);

      $overflow_values = array('visible', 'hidden', 'auto', 'scroll');
      $this->info['overflow'] = new HTMLPurifier_AttrDef_Enum($overflow_values);

      $this->info['opacity'] = new HTMLPurifier_AttrDef_CSS_AlphaValue();
  }
  /**
   * Registers the property group that doSetup() adds when the CSS.Trusted
   * directive is set: position, the four offset properties and z-index.
   * @param HTMLPurifier_Config $config Not read by this method.
   */
  protected function doSetupTrusted($config)
  {
      $this->info['position'] = new HTMLPurifier_AttrDef_Enum(
          array('static', 'relative', 'absolute', 'fixed')
      );

      // One validator (length, percentage or 'auto') shared by all four
      // offset properties.
      $offset = new HTMLPurifier_AttrDef_CSS_Composite(
          array(
              new HTMLPurifier_AttrDef_CSS_Length(),
              new HTMLPurifier_AttrDef_CSS_Percentage(),
              new HTMLPurifier_AttrDef_Enum(array('auto'))
          )
      );
      foreach (array('bottom', 'right', 'left', 'top') as $side) {
          $this->info[$side] = $offset;
      }

      // z-index: an integer or 'auto'.
      $this->info['z-index'] = new HTMLPurifier_AttrDef_CSS_Composite(
          array(
              new HTMLPurifier_AttrDef_Integer(),
              new HTMLPurifier_AttrDef_Enum(array('auto'))
          )
      );
  }
  /**
   * Performs extra config-based processing. Based off of
   * HTMLPurifier_HTMLDefinition.
   *
   * Narrows $this->info to the CSS.AllowedProperties lookup when that
   * directive is not null (warning about names this definition does not
   * know), then removes everything listed in CSS.ForbiddenProperties.
   *
   * @param HTMLPurifier_Config $config
   * @todo Refactor duplicate elements into common class (probably using
   *       composition, not inheritance).
   */
  protected function setupConfigStuff($config)
  {
      // Trailing hint appended to the "not supported" warning below.
      $support = "(for information on implementing this, see the " .
          "support forums) ";

      // setup allowed elements
      $allowed_properties = $config->get('CSS.AllowedProperties');
      if ($allowed_properties !== null) {
          foreach (array_keys($this->info) as $known) {
              if (!isset($allowed_properties[$known])) {
                  unset($this->info[$known]);
              }
              // Tick off every known property; whatever remains in the
              // lookup afterwards is a name we have no definition for.
              unset($allowed_properties[$known]);
          }
          // emit errors
          foreach (array_keys($allowed_properties) as $name) {
              // :TODO: Is this htmlspecialchars() call really necessary?
              $name = htmlspecialchars($name);
              trigger_error("Style attribute '$name' is not supported $support", E_USER_WARNING);
          }
      }

      $forbidden_properties = $config->get('CSS.ForbiddenProperties');
      if ($forbidden_properties === null) {
          return;
      }
      foreach (array_keys($this->info) as $known) {
          if (isset($forbidden_properties[$known])) {
              unset($this->info[$known]);
          }
      }
  }
  1482. }
  /**
   * Base class for child definitions: describes which child nodes are
   * allowed and validates a list of children against that description.
   */
  abstract class HTMLPurifier_ChildDef
  {
      /**
       * Type of child definition, usually right-most part of class name lowercase.
       * Used occasionally in terms of context.
       * @type string
       */
      public $type;

      /**
       * Indicates whether or not an empty array of children is okay.
       *
       * This is necessary for redundant checking when changes affecting
       * a child node may cause a parent node to now be disallowed.
       * @type bool
       */
      public $allow_empty;

      /**
       * Lookup array of all elements that this definition could possibly allow.
       * @type array
       */
      public $elements = array();

      /**
       * Get lookup of tag names that should not close this element automatically.
       * All other elements will do so.
       *
       * This base implementation simply hands back the $elements lookup.
       *
       * @param HTMLPurifier_Config $config HTMLPurifier_Config object
       * @return array
       */
      public function getAllowedElements($config)
      {
          $lookup = $this->elements;
          return $lookup;
      }

      /**
       * Validates nodes according to definition and returns modification.
       *
       * @param HTMLPurifier_Node[] $children Array of HTMLPurifier_Node
       * @param HTMLPurifier_Config $config HTMLPurifier_Config object
       * @param HTMLPurifier_Context $context HTMLPurifier_Context object
       * @return bool|array true to leave nodes as is, false to remove parent node, array of replacement children
       */
      abstract public function validateChildren($children, $config, $context);
  }
  1527. /**
  1528. * Configuration object that triggers customizable behavior.
  1529. *
  1530. * @warning This class is strongly defined: that means that the class
  1531. * will fail if an undefined directive is retrieved or set.
  1532. *
  1533. * @note Many classes that could (although many times don't) use the
  1534. * configuration object make it a mandatory parameter. This is
  1535. * because a configuration object should always be forwarded,
  1536. * otherwise, you run the risk of missing a parameter and then
  1537. * being stumped when a configuration directive doesn't work.
  1538. *
  1539. * @todo Reconsider some of the public member variables
  1540. */
  1541. class HTMLPurifier_Config
  1542. {
  1543. /**
  1544. * HTML Purifier's version
  1545. * @type string
  1546. */
  1547. public $version = '4.9.3';
  1548. /**
  1549. * Whether or not to automatically finalize
  1550. * the object if a read operation is done.
  1551. * @type bool
  1552. */
  1553. public $autoFinalize = true;
  1554. // protected member variables
  1555. /**
  1556. * Namespace indexed array of serials for specific namespaces.
  1557. * @see getSerial() for more info.
  1558. * @type string[]
  1559. */
  1560. protected $serials = array();
  1561. /**
  1562. * Serial for entire configuration object.
  1563. * @type string
  1564. */
  1565. protected $serial;
  1566. /**
  1567. * Parser for variables.
  1568. * @type HTMLPurifier_VarParser_Flexible
  1569. */
  1570. protected $parser = null;
  1571. /**
  1572. * Reference HTMLPurifier_ConfigSchema for value checking.
  1573. * @type HTMLPurifier_ConfigSchema
  1574. * @note This is public for introspective purposes. Please don't
  1575. * abuse!
  1576. */
  1577. public $def;
  1578. /**
  1579. * Indexed array of definitions.
  1580. * @type HTMLPurifier_Definition[]
  1581. */
  1582. protected $definitions;
  1583. /**
  1584. * Whether or not config is finalized.
  1585. * @type bool
  1586. */
  1587. protected $finalized = false;
  1588. /**
  1589. * Property list containing configuration directives.
  1590. * @type array
  1591. */
  1592. protected $plist;
  1593. /**
  1594. * Whether or not a set is taking place due to an alias lookup.
  1595. * @type bool
  1596. */
  1597. private $aliasMode;
  1598. /**
  1599. * Set to false if you do not want line and file numbers in errors.
  1600. * (useful when unit testing). This will also compress some errors
  1601. * and exceptions.
  1602. * @type bool
  1603. */
  1604. public $chatty = true;
  1605. /**
  1606. * Current lock; only gets to this namespace are allowed.
  1607. * @type string
  1608. */
  1609. private $lock;
  /**
   * Constructor
   * @param HTMLPurifier_ConfigSchema $definition ConfigSchema that defines
   *      what directives are allowed.
   * @param HTMLPurifier_PropertyList $parent Parent property list; when
   *      omitted (or falsy) the schema's defaultPlist is used.
   */
  public function __construct($definition, $parent = null)
  {
      if (!$parent) {
          $parent = $definition->defaultPlist;
      }
      $this->plist = new HTMLPurifier_PropertyList($parent);
      // keep a copy around for checking
      $this->def = $definition;
      $this->parser = new HTMLPurifier_VarParser_Flexible();
  }
  /**
   * Convenience constructor that creates a config object based on a mixed var
   * @param mixed $config Variable that defines the state of the config
   *                      object. Can be: a HTMLPurifier_Config() object,
   *                      an array of directives based on loadArray(),
   *                      or a string filename of an ini file.
   * @param HTMLPurifier_ConfigSchema $schema Schema object; when omitted the
   *                      config comes from createDefault().
   * @return HTMLPurifier_Config Configured object
   */
  public static function create($config, $schema = null)
  {
      // pass-through
      if ($config instanceof HTMLPurifier_Config) {
          return $config;
      }

      $ret = $schema
          ? new HTMLPurifier_Config($schema)
          : HTMLPurifier_Config::createDefault();

      // Strings go to loadIni(), arrays to loadArray(); any other value
      // leaves the fresh config untouched.
      if (is_string($config)) {
          $ret->loadIni($config);
      } elseif (is_array($config)) {
          $ret->loadArray($config);
      }
      return $ret;
  }
  /**
   * Creates a new config object that inherits from a previous one: it is
   * built from the given config's schema, with that config's property list
   * as the parent.
   * @param HTMLPurifier_Config $config Configuration object to inherit from.
   * @return HTMLPurifier_Config object with $config as its parent.
   */
  public static function inherit(HTMLPurifier_Config $config)
  {
      $schema = $config->def;
      $parent_plist = $config->plist;
      return new HTMLPurifier_Config($schema, $parent_plist);
  }
  /**
   * Convenience constructor that creates a default configuration object,
   * i.e. one built on HTMLPurifier_ConfigSchema::instance().
   * @return HTMLPurifier_Config default object.
   */
  public static function createDefault()
  {
      return new HTMLPurifier_Config(HTMLPurifier_ConfigSchema::instance());
  }
  /**
   * Retrieves a value from the configuration.
   *
   * Nothing (null) is returned, after an error has been triggered, when the
   * directive is undefined, is an alias, or lies outside the namespace of
   * the active lock.
   *
   * @param string $key String key
   * @param mixed $a Deprecated: directive name for the old two-argument
   *                 form; when given, "$key.$a" is looked up instead.
   *
   * @return mixed
   */
  public function get($key, $a = null)
  {
      if ($a !== null) {
          $this->triggerError(
              "Using deprecated API: use \$config->get('$key.$a') instead",
              E_USER_WARNING
          );
          $key = "$key.$a";
      }

      // Reads may finalize the config first.
      if (!$this->finalized) {
          $this->autoFinalize();
      }

      if (!isset($this->def->info[$key])) {
          // can't add % due to SimpleTest bug
          $this->triggerError(
              'Cannot retrieve value of undefined directive ' . htmlspecialchars($key),
              E_USER_WARNING
          );
          return null;
      }

      if (isset($this->def->info[$key]->isAlias)) {
          $this->triggerError(
              'Cannot get value from aliased directive, use real name ' . $this->def->info[$key]->key,
              E_USER_ERROR
          );
          return null;
      }

      if ($this->lock) {
          // While a lock is active, only its own namespace may be read.
          $segments = explode('.', $key);
          $ns = $segments[0];
          if ($ns !== $this->lock) {
              $this->triggerError(
                  'Cannot get value of namespace ' . $ns . ' when lock for ' .
                  $this->lock .
                  ' is active, this probably indicates a Definition setup method ' .
                  'is accessing directives that are not within its namespace',
                  E_USER_ERROR
              );
              return null;
          }
      }

      return $this->plist->get($key);
  }
  /**
   * Retrieves an array of directives to values from a given namespace
   *
   * @param string $namespace String namespace
   *
   * @return array Directive => value map; nothing (null) is returned, after
   *               a warning, when the namespace is not present.
   */
  public function getBatch($namespace)
  {
      if (!$this->finalized) {
          $this->autoFinalize();
      }

      $by_namespace = $this->getAll();
      if (isset($by_namespace[$namespace])) {
          return $by_namespace[$namespace];
      }

      $this->triggerError(
          'Cannot retrieve undefined namespace ' .
          htmlspecialchars($namespace),
          E_USER_WARNING
      );
      return null;
  }
  /**
   * Returns a SHA-1 signature of a segment of the configuration object
   * that uniquely identifies that particular configuration
   *
   * The signature is computed once per namespace and then served from
   * $this->serials.
   *
   * @param string $namespace Namespace to get serial for
   *
   * @return string
   * @note Revision is handled specially and is removed from the batch
   *       before processing!
   */
  public function getBatchSerial($namespace)
  {
      if (!empty($this->serials[$namespace])) {
          return $this->serials[$namespace];
      }

      $directives = $this->getBatch($namespace);
      // DefinitionRev is dropped before hashing.
      unset($directives['DefinitionRev']);
      $this->serials[$namespace] = sha1(serialize($directives));

      return $this->serials[$namespace];
  }
  /**
   * Produces (and memoises) a SHA-1 fingerprint covering every directive
   * of the configuration object.
   *
   * @return string
   */
  public function getSerial()
  {
      if (!empty($this->serial)) {
          return $this->serial;
      }
      $snapshot = serialize($this->getAll());
      $this->serial = sha1($snapshot);
      return $this->serial;
  }
  /**
   * Collects all directives into a two-level array indexed first by
   * namespace and then by directive name.
   *
   * @warning The whole property list is squashed and re-indexed on every
   *          call, so avoid this in hot paths.
   */
  public function getAll()
  {
      if (!$this->finalized) {
          $this->autoFinalize();
      }
      $grouped = array();
      foreach ($this->plist->squash() as $fullName => $val) {
          // split "Namespace.Directive" on the first dot only
          $parts = explode('.', $fullName, 2);
          $grouped[$parts[0]][$parts[1]] = $val;
      }
      return $grouped;
  }
  /**
   * Sets a value to configuration.
   *
   * Accepts the modern form set('Namespace.Directive', $value) as well as
   * the deprecated three-argument form set('Namespace', 'Directive', $value),
   * which is detected by the absence of a dot in $key and raises an
   * E_USER_NOTICE.
   *
   * Problems (finalized config, undefined directive, wrong type, value not
   * in the allowed list) are reported through triggerError() and leave the
   * configuration unchanged.
   *
   * @param string $key key (or namespace, in the deprecated form)
   * @param mixed $value value (or directive name, in the deprecated form)
   * @param mixed $a value, only used by the deprecated form
   */
  public function set($key, $value, $a = null)
  {
      if (strpos($key, '.') === false) {
          // deprecated API: shift the arguments into the modern layout
          $namespace = $key;
          $directive = $value;
          $value = $a;
          $key = "$key.$directive";
          $this->triggerError("Using deprecated API: use \$config->set('$key', ...) instead", E_USER_NOTICE);
      } else {
          list($namespace) = explode('.', $key);
      }
      if ($this->isFinalized('Cannot set directive after finalization')) {
          return;
      }
      if (!isset($this->def->info[$key])) {
          $this->triggerError(
              'Cannot set undefined directive ' . htmlspecialchars($key) . ' to value',
              E_USER_WARNING
          );
          return;
      }
      $def = $this->def->info[$key];

      if (isset($def->isAlias)) {
          if ($this->aliasMode) {
              // an alias pointing at another alias would recurse forever
              $this->triggerError(
                  'Double-aliases not allowed, please fix '.
                  'ConfigSchema bug with ' . $key,
                  E_USER_ERROR
              );
              return;
          }
          // forward to the real directive, flagging the recursion so that
          // a second alias hop is detected above
          $this->aliasMode = true;
          $this->set($def->key, $value);
          $this->aliasMode = false;
          $this->triggerError("$key is an alias, preferred directive name is {$def->key}", E_USER_NOTICE);
          return;
      }

      // Raw type might be negative when using the fully optimized form
      // of stdClass, which indicates allow_null == true
      $rtype = is_int($def) ? $def : $def->type;
      if ($rtype < 0) {
          $type = -$rtype;
          $allow_null = true;
      } else {
          $type = $rtype;
          $allow_null = isset($def->allow_null);
      }

      try {
          $value = $this->parser->parse($value, $type, $allow_null);
      } catch (HTMLPurifier_VarParserException $e) {
          $this->triggerError(
              'Value for ' . $key . ' is of invalid type, should be ' .
              HTMLPurifier_VarParser::getTypeName($type),
              E_USER_WARNING
          );
          return;
      }
      if (is_string($value) && is_object($def)) {
          // resolve value alias if defined
          if (isset($def->aliases[$value])) {
              $value = $def->aliases[$value];
          }
          // check to see if the value is allowed
          if (isset($def->allowed) && !isset($def->allowed[$value])) {
              $this->triggerError(
                  'Value not supported, valid values are: ' .
                  $this->_listify($def->allowed),
                  E_USER_WARNING
              );
              return;
          }
      }
      $this->plist->set($key, $value);

      // reset definitions if the directives they depend on changed
      // this is a very costly process, so it's discouraged
      // with finalization
      if ($namespace == 'HTML' || $namespace == 'CSS' || $namespace == 'URI') {
          $this->definitions[$namespace] = null;
      }

      // invalidate the memoised fingerprints: the per-namespace one and
      // the whole-configuration one read by getSerial()
      $this->serials[$namespace] = false;
      $this->serial = null;
  }
  /**
   * Error-reporting helper: renders the keys of a lookup array as a
   * comma-separated string.
   *
   * @param array $lookup Lookup array; only its keys are used
   *
   * @return string
   */
  private function _listify($lookup)
  {
      $names = array_keys($lookup);
      return implode(', ', $names);
  }
  /**
   * Shortcut for getDefinition('HTML', ...).
   *
   * @param bool $raw Ask for a copy that has not been set up yet. Only
   *             works if requested before the definition was set up.
   * @param bool $optimized When true the call may yield null, meaning a
   *             cached copy of the customised definition exists and no
   *             further edits are needed. maybeGetRawHTMLDefinition()
   *             states that intent more explicitly.
   *
   * @return HTMLPurifier_HTMLDefinition
   */
  public function getHTMLDefinition($raw = false, $optimized = false)
  {
      $definition = $this->getDefinition('HTML', $raw, $optimized);
      return $definition;
  }
  /**
   * Shortcut for getDefinition('CSS', ...).
   *
   * @param bool $raw Ask for a copy that has not been set up yet. Only
   *             works if requested before the definition was set up.
   * @param bool $optimized When true the call may yield null, meaning a
   *             cached copy of the customised definition exists and no
   *             further edits are needed. maybeGetRawCSSDefinition()
   *             states that intent more explicitly.
   *
   * @return HTMLPurifier_CSSDefinition
   */
  public function getCSSDefinition($raw = false, $optimized = false)
  {
      $definition = $this->getDefinition('CSS', $raw, $optimized);
      return $definition;
  }
  /**
   * Shortcut for getDefinition('URI', ...).
   *
   * @param bool $raw Ask for a copy that has not been set up yet. Only
   *             works if requested before the definition was set up.
   * @param bool $optimized When true the call may yield null, meaning a
   *             cached copy of the customised definition exists and no
   *             further edits are needed. maybeGetRawURIDefinition()
   *             states that intent more explicitly.
   *
   * @return HTMLPurifier_URIDefinition
   */
  public function getURIDefinition($raw = false, $optimized = false)
  {
      $definition = $this->getDefinition('URI', $raw, $optimized);
      return $definition;
  }
  /**
   * Retrieves a definition
   *
   * With $raw = false the fully set-up definition is returned, taken from
   * memory, then from the definition cache, and only then built from
   * scratch (and added to the cache). With $raw = true an un-set-up
   * definition is returned so the caller can customise it.
   *
   * @param string $type Type of definition: HTML, CSS, etc
   * @param bool $raw Whether or not definition should be returned raw
   * @param bool $optimized Only has an effect when $raw is true. Whether
   *        or not to return null if the result is already present in
   *        the cache. This is off by default for backwards
   *        compatibility reasons, but you need to do things this
   *        way in order to ensure that caching is done properly.
   *        Check out enduser-customize.html for more details.
   *        We probably won't ever change this default, as much as the
   *        maybe semantics is the "right thing to do."
   *
   * @throws HTMLPurifier_Exception on inconsistent arguments or when a raw
   *         definition is requested in a state where it cannot be served
   * @return HTMLPurifier_Definition|null null only when $optimized is true
   *         and a set-up or cached definition is already available
   */
  public function getDefinition($type, $raw = false, $optimized = false)
  {
      if ($optimized && !$raw) {
          throw new HTMLPurifier_Exception("Cannot set optimized = true when raw = false");
      }
      if (!$this->finalized) {
          $this->autoFinalize();
      }
      // temporarily suspend locks, so we can handle recursive definition calls
      $lock = $this->lock;
      $this->lock = null;
      $factory = HTMLPurifier_DefinitionCacheFactory::instance();
      $cache = $factory->create($type, $this);
      $this->lock = $lock;
      if (!$raw) {
          // full definition
          // ---------------
          // check if definition is in memory
          if (!empty($this->definitions[$type])) {
              $def = $this->definitions[$type];
              // check if the definition is setup
              if ($def->setup) {
                  return $def;
              } else {
                  $def->setup($this);
                  // only definitions obtained through the optimized raw
                  // path are written back to the cache here
                  if ($def->optimized) {
                      $cache->add($def, $this);
                  }
                  return $def;
              }
          }
          // check if definition is in cache
          $def = $cache->get($this);
          if ($def) {
              // definition in cache, save to memory and return it
              $this->definitions[$type] = $def;
              return $def;
          }
          // initialize it
          $def = $this->initDefinition($type);
          // set it up; the lock restricts get() to this namespace
          // while the definition's setup code runs
          $this->lock = $type;
          $def->setup($this);
          $this->lock = null;
          // save in cache
          $cache->add($def, $this);
          // return it
          return $def;
      } else {
          // raw definition
          // --------------
          // check preconditions
          $def = null;
          if ($optimized) {
              if (is_null($this->get($type . '.DefinitionID'))) {
                  // fatally error out if definition ID not set
                  throw new HTMLPurifier_Exception(
                      "Cannot retrieve raw version without specifying %$type.DefinitionID"
                  );
              }
          }
          if (!empty($this->definitions[$type])) {
              $def = $this->definitions[$type];
              if ($def->setup && !$optimized) {
                  $extra = $this->chatty ?
                      " (try moving this code block earlier in your initialization)" :
                      "";
                  throw new HTMLPurifier_Exception(
                      "Cannot retrieve raw definition after it has already been setup" .
                      $extra
                  );
              }
              if ($def->optimized === null) {
                  $extra = $this->chatty ? " (try flushing your cache)" : "";
                  throw new HTMLPurifier_Exception(
                      "Optimization status of definition is unknown" . $extra
                  );
              }
              if ($def->optimized !== $optimized) {
                  $msg = $optimized ? "optimized" : "unoptimized";
                  $extra = $this->chatty ?
                      " (this backtrace is for the first inconsistent call, which was for a $msg raw definition)"
                      : "";
                  throw new HTMLPurifier_Exception(
                      "Inconsistent use of optimized and unoptimized raw definition retrievals" . $extra
                  );
              }
          }
          // check if definition was in memory
          if ($def) {
              if ($def->setup) {
                  // invariant: $optimized === true (checked above)
                  return null;
              } else {
                  return $def;
              }
          }
          // if optimized, check if definition was in cache
          // (because we do the memory check first, this formulation
          // is prone to cache slamming, but I think
          // guaranteeing that either /all/ of the raw
          // setup code or /none/ of it is run is more important.)
          if ($optimized) {
              // This code path only gets run once; once we put
              // something in $definitions (which is guaranteed by the
              // trailing code), we always short-circuit above.
              $def = $cache->get($this);
              if ($def) {
                  // save the full definition for later, but don't
                  // return it yet
                  $this->definitions[$type] = $def;
                  return null;
              }
          }
          // check invariants for creation
          if (!$optimized) {
              if (!is_null($this->get($type . '.DefinitionID'))) {
                  if ($this->chatty) {
                      // $type is concatenated explicitly: single-quoted
                      // strings do not interpolate variables
                      $this->triggerError(
                          'Due to a documentation error in previous version of HTML Purifier, your ' .
                          'definitions are not being cached. If this is OK, you can remove the ' .
                          '%' . $type . '.DefinitionRev and %' . $type . '.DefinitionID declaration. Otherwise, ' .
                          'modify your code to use maybeGetRawDefinition, and test if the returned ' .
                          'value is null before making any edits (if it is null, that means that a ' .
                          'cached version is available, and no raw operations are necessary). See ' .
                          '<a href="http://htmlpurifier.org/docs/enduser-customize.html#optimized">' .
                          'Customize</a> for more details',
                          E_USER_WARNING
                      );
                  } else {
                      $this->triggerError(
                          "Useless DefinitionID declaration",
                          E_USER_WARNING
                      );
                  }
              }
          }
          // initialize it
          $def = $this->initDefinition($type);
          $def->optimized = $optimized;
          return $def;
      }
      // defensive: both branches above always return or throw
      throw new HTMLPurifier_Exception("The impossible happened!");
  }
  /**
   * Creates a fresh, un-set-up definition object of the requested kind and
   * registers it in $this->definitions.
   *
   * @param string $type Kind of definition to build: HTML, CSS or URI
   *
   * @return HTMLPurifier_CSSDefinition|HTMLPurifier_HTMLDefinition|HTMLPurifier_URIDefinition
   * @throws HTMLPurifier_Exception when $type is none of the supported kinds
   */
  private function initDefinition($type)
  {
      // quick checks failed, so a brand new object is needed
      switch ($type) {
          case 'HTML':
              $fresh = new HTMLPurifier_HTMLDefinition();
              break;
          case 'CSS':
              $fresh = new HTMLPurifier_CSSDefinition();
              break;
          case 'URI':
              $fresh = new HTMLPurifier_URIDefinition();
              break;
          default:
              throw new HTMLPurifier_Exception(
                  "Definition of $type type not supported"
              );
      }
      $this->definitions[$type] = $fresh;
      return $fresh;
  }
  /**
   * Requests the raw, optimized definition of the given type; equivalent
   * to getDefinition($name, true, true).
   *
   * @param string $name Type of definition: HTML, CSS, etc
   *
   * @return HTMLPurifier_Definition|null
   */
  public function maybeGetRawDefinition($name)
  {
      $raw = true;
      $optimized = true;
      return $this->getDefinition($name, $raw, $optimized);
  }
  /**
   * Requests the raw, optimized HTML definition; equivalent to
   * getDefinition('HTML', true, true).
   *
   * @return HTMLPurifier_HTMLDefinition
   */
  public function maybeGetRawHTMLDefinition()
  {
      $raw = true;
      $optimized = true;
      return $this->getDefinition('HTML', $raw, $optimized);
  }
  /**
   * Requests the raw, optimized CSS definition; equivalent to
   * getDefinition('CSS', true, true).
   *
   * @return HTMLPurifier_CSSDefinition
   */
  public function maybeGetRawCSSDefinition()
  {
      $raw = true;
      $optimized = true;
      return $this->getDefinition('CSS', $raw, $optimized);
  }
  /**
   * Requests the raw, optimized URI definition; equivalent to
   * getDefinition('URI', true, true).
   *
   * @return HTMLPurifier_URIDefinition
   */
  public function maybeGetRawURIDefinition()
  {
      $raw = true;
      $optimized = true;
      return $this->getDefinition('URI', $raw, $optimized);
  }
  /**
   * Bulk-loads directives from an array. Two entry shapes are understood:
   * "Namespace.Directive" => value (underscores in the key are treated as
   * dots), and "Namespace" => array(Directive => value, ...).
   *
   * @param array $config_array Configuration associative array
   */
  public function loadArray($config_array)
  {
      if ($this->isFinalized('Cannot load directives after finalization')) {
          return;
      }
      foreach ($config_array as $rawKey => $entry) {
          $name = str_replace('_', '.', $rawKey);
          if (strpos($name, '.') === false) {
              // bare namespace: the entry holds that namespace's directives
              foreach ($entry as $directive => $directiveValue) {
                  $this->set($name .'.'. $directive, $directiveValue);
              }
              continue;
          }
          $this->set($name, $entry);
      }
  }
  /**
   * Builds the list of array(namespace, directive) pairs that may be
   * exposed in a web form, filtered by an allow-list.
   *
   * Allow-list entries without a dot admit a whole namespace, entries
   * with a dot admit one directive, and dotted entries starting with "-"
   * exclude one directive. Aliases and the DefinitionID / DefinitionRev
   * directives are never included.
   *
   * @param array|string|bool $allowed Allow-list, a single entry as a
   *        string, or true to admit everything
   * @param HTMLPurifier_ConfigSchema $schema Schema to use, if not global copy
   *
   * @return array
   */
  public static function getAllowedDirectivesForForm($allowed, $schema = null)
  {
      if (!$schema) {
          $schema = HTMLPurifier_ConfigSchema::instance();
      }
      $filtering = ($allowed !== true);
      if ($filtering) {
          if (is_string($allowed)) {
              $allowed = array($allowed);
          }
          $nsWhitelist = array();
          $directiveWhitelist = array();
          $directiveBlacklist = array();
          foreach ($allowed as $entry) {
              if (strpos($entry, '.') === false) {
                  // namespace
                  $nsWhitelist[$entry] = true;
              } elseif ($entry[0] == '-') {
                  // excluded directive
                  $directiveBlacklist[substr($entry, 1)] = true;
              } else {
                  // admitted directive
                  $directiveWhitelist[$entry] = true;
              }
          }
      }
      $pairs = array();
      foreach ($schema->info as $name => $def) {
          list($ns, $directive) = explode('.', $name, 2);
          if ($filtering) {
              $qualified = "$ns.$directive";
              if (isset($directiveBlacklist[$qualified])) {
                  continue;
              }
              if (!(isset($directiveWhitelist[$qualified]) || isset($nsWhitelist[$ns]))) {
                  continue;
              }
          }
          if (isset($def->isAlias)) {
              continue;
          }
          if ($directive == 'DefinitionID' || $directive == 'DefinitionRev') {
              continue;
          }
          $pairs[] = array($ns, $directive);
      }
      return $pairs;
  }
  /**
   * Builds a new configuration object from values submitted through
   * ConfigForm ($_GET / $_POST).
   *
   * @param array $array $_GET or $_POST array to import
   * @param string|bool $index Index/name that the config variables are in
   * @param array|bool $allowed List of allowed namespaces/directives
   * @param bool $mq_fix Boolean whether or not to enable magic quotes fix
   * @param HTMLPurifier_ConfigSchema $schema Schema to use, if not global copy
   *
   * @return mixed Whatever HTMLPurifier_Config::create() returns
   */
  public static function loadArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null)
  {
      $prepared = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $schema);
      return HTMLPurifier_Config::create($prepared, $schema);
  }
  /**
   * Merges values submitted through a form ($_GET / $_POST) into this
   * object, using this object's own schema. NOT STATIC.
   *
   * @param array $array $_GET or $_POST array to import
   * @param string|bool $index Index/name that the config variables are in
   * @param array|bool $allowed List of allowed namespaces/directives
   * @param bool $mq_fix Boolean whether or not to enable magic quotes fix
   */
  public function mergeArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true)
  {
      $this->loadArray(
          HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix, $this->def)
      );
  }
  /**
   * Prepares an array from a form into something usable for the more
   * strict parts of HTMLPurifier_Config
   *
   * Only directives admitted by getAllowedDirectivesForForm() are copied.
   * A non-empty "Null_Namespace.Directive" entry forces the directive to
   * null; otherwise the "Namespace.Directive" entry is used when present.
   *
   * @param array $array $_GET or $_POST array to import
   * @param string|bool $index Index/name that the config variables are in
   * @param array|bool $allowed List of allowed namespaces/directives
   * @param bool $mq_fix Boolean whether or not to enable magic quotes fix
   * @param HTMLPurifier_ConfigSchema $schema Schema to use, if not global copy
   *
   * @return array Two-level array: namespace => directive => value
   */
  public static function prepareArrayFromForm($array, $index = false, $allowed = true, $mq_fix = true, $schema = null)
  {
      if ($index !== false) {
          $array = (isset($array[$index]) && is_array($array[$index])) ? $array[$index] : array();
      }
      // get_magic_quotes_gpc() still exists on PHP 7.4 but is deprecated
      // there (and always returns false), so only consult it on older
      // versions to avoid the deprecation notice
      $mq = $mq_fix
          && version_compare(PHP_VERSION, '7.4.0', '<')
          && function_exists('get_magic_quotes_gpc')
          && get_magic_quotes_gpc();

      $allowed = HTMLPurifier_Config::getAllowedDirectivesForForm($allowed, $schema);
      $ret = array();
      foreach ($allowed as $key) {
          list($ns, $directive) = $key;
          $skey = "$ns.$directive";
          if (!empty($array["Null_$skey"])) {
              // explicit "set to null" checkbox wins over any value
              $ret[$ns][$directive] = null;
              continue;
          }
          if (!isset($array[$skey])) {
              continue;
          }
          $value = $mq ? stripslashes($array[$skey]) : $array[$skey];
          $ret[$ns][$directive] = $value;
      }
      return $ret;
  }
  /**
   * Loads configuration values from an ini file
   *
   * The file is parsed with sections enabled and the result is handed to
   * loadArray(). If the file cannot be parsed an E_USER_WARNING is raised
   * and nothing is loaded.
   *
   * @param string $filename Name of ini file
   */
  public function loadIni($filename)
  {
      if ($this->isFinalized('Cannot load directives after finalization')) {
          return;
      }
      $array = parse_ini_file($filename, true);
      if ($array === false) {
          // parse_ini_file() signals failure with false; passing that on
          // to loadArray() would iterate over a non-array
          $this->triggerError(
              'Cannot load ini file ' . htmlspecialchars($filename),
              E_USER_WARNING
          );
          return;
      }
      $this->loadArray($array);
  }
  /**
   * Reports whether the configuration object has been finalized,
   * optionally raising an error when it has.
   *
   * @param string|bool $error Message to raise as E_USER_ERROR when the
   *        object is finalized, or false to stay silent
   *
   * @return bool
   */
  public function isFinalized($error = false)
  {
      $shouldComplain = $this->finalized && $error;
      if ($shouldComplain) {
          $this->triggerError($error, E_USER_ERROR);
      }
      return $this->finalized;
  }
  /**
   * Finalizes the configuration when the auto-finalize flag is on;
   * otherwise only forces the property list to re-squash.
   */
  public function autoFinalize()
  {
      if (!$this->autoFinalize) {
          $this->plist->squash(true);
          return;
      }
      $this->finalize();
  }
  /**
   * Marks the configuration object as finalized, after which further
   * changes are refused, and drops the value parser.
   */
  public function finalize()
  {
      $this->parser = null;
      $this->finalized = true;
  }
  /**
   * Produces a nicely formatted error message by supplying the
   * stack frame information OUTSIDE of HTMLPurifier_Config.
   *
   * The location suffix is only added when the object is in chatty mode
   * and a suitable frame with file and line information is found.
   *
   * @param string $msg An error message
   * @param int $no An error number
   */
  protected function triggerError($msg, $no)
  {
      // determine previous stack frame
      $extra = '';
      if ($this->chatty) {
          $trace = debug_backtrace();
          // zip(tail(trace), trace) -- but PHP is not Haskell har har
          for ($i = 0, $c = count($trace); $i < $c - 1; $i++) {
              // XXX this is not correct on some versions of HTML Purifier
              // frames for plain functions carry no 'class' key, so it
              // must be tested before being compared
              if (isset($trace[$i + 1]['class'])
                  && $trace[$i + 1]['class'] === 'HTMLPurifier_Config'
              ) {
                  continue;
              }
              $frame = $trace[$i];
              // 'file' and 'line' can be missing as well (e.g. calls made
              // from inside the engine); skip the suffix in that case
              if (isset($frame['line'], $frame['file'])) {
                  $extra = " invoked on line {$frame['line']} in file {$frame['file']}";
              }
              break;
          }
      }
      trigger_error($msg . $extra, $no);
  }
  /**
   * Serializes the configuration object so that it can be reconstituted
   * later. The HTML, CSS and URI definitions are requested first, in
   * that order.
   *
   * @return string
   */
  public function serialize()
  {
      foreach (array('HTML', 'CSS', 'URI') as $definitionType) {
          $this->getDefinition($definitionType);
      }
      return serialize($this);
  }
  2391. }
  /**
   * Configuration definition, defines directives and their defaults.
   */
  class HTMLPurifier_ConfigSchema
  {
      /**
       * Defaults of the directives, indexed by "Namespace.Directive".
       * @type array
       */
      public $defaults = array();

      /**
       * The default property list. Do not edit this property list.
       * @type HTMLPurifier_PropertyList
       */
      public $defaultPlist;

      /**
       * Definition of the directives.
       * The structure of this is:
       *
       *  array(
       *      'Namespace.Directive' => new stdClass(),
       *  )
       *
       * The stdClass may have the following properties:
       *
       *  - If isAlias isn't set:
       *      - type: Integer type of directive, see HTMLPurifier_VarParser for definitions
       *      - allow_null: If set, this directive allows null values
       *      - aliases: If set, an associative array of value aliases to real values
       *      - allowed: If set, a lookup array of allowed (string) values
       *  - If isAlias is set:
       *      - key: Full name of the directive this one aliases to
       *
       * In certain degenerate cases, stdClass will actually be an integer. In
       * that case, the value is equivalent to an stdClass with the type
       * property set to the integer. If the integer is negative, type is
       * equal to the absolute value of integer, and allow_null is true.
       *
       * This class is friendly with HTMLPurifier_Config. If you need introspection
       * about the schema, you're better off using the ConfigSchema_Interchange,
       * which uses more memory but has much richer information.
       * @type array
       */
      public $info = array();

      /**
       * Application-wide singleton
       * @type HTMLPurifier_ConfigSchema
       */
      protected static $singleton;

      public function __construct()
      {
          $this->defaultPlist = new HTMLPurifier_PropertyList();
      }

      /**
       * Unserializes the default ConfigSchema from the schema.ser file
       * below HTMLPURIFIER_PREFIX. Raises E_USER_ERROR (mentioning the
       * SHA-1 of the file contents) when unserialization fails.
       * @return HTMLPurifier_ConfigSchema
       */
      public static function makeFromSerial()
      {
          $contents = file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser');
          $r = unserialize($contents);
          if (!$r) {
              $hash = sha1($contents);
              trigger_error("Unserialization of configuration schema failed, sha1 of file was $hash", E_USER_ERROR);
          }
          return $r;
      }

      /**
       * Retrieves an instance of the application-wide configuration definition.
       * @param HTMLPurifier_ConfigSchema|bool|null $prototype Schema object to
       *        install as the singleton, true to force a reload from the
       *        serialized schema, or null to reuse (lazily creating) the
       *        current singleton
       * @return HTMLPurifier_ConfigSchema
       */
      public static function instance($prototype = null)
      {
          // true must be tested before the generic "not null" case,
          // otherwise the boolean itself would be stored as the singleton
          if ($prototype === true) {
              self::$singleton = self::makeFromSerial();
          } elseif ($prototype !== null) {
              self::$singleton = $prototype;
          } elseif (self::$singleton === null) {
              self::$singleton = self::makeFromSerial();
          }
          return self::$singleton;
      }

      /**
       * Defines a directive for configuration
       * @warning This method's signature is slightly different from the legacy
       *          define() static method! Beware!
       * @param string $key Name of directive
       * @param mixed $default Default value of directive
       * @param string|int $type Allowed type of the directive: either the
       *        integer type itself or a name looked up in
       *        HTMLPurifier_VarParser::$types
       * @param bool $allow_null Whether or not to allow null values
       */
      public function add($key, $default, $type, $allow_null)
      {
          $obj = new stdClass();
          $obj->type = is_int($type) ? $type : HTMLPurifier_VarParser::$types[$type];
          if ($allow_null) {
              $obj->allow_null = true;
          }
          $this->info[$key] = $obj;
          $this->defaults[$key] = $default;
          $this->defaultPlist->set($key, $default);
      }

      /**
       * Defines a directive value alias.
       *
       * Directive value aliases are convenient for developers because it lets
       * them set a directive to several values and get the same result.
       * @param string $key Name of Directive
       * @param array $aliases Hash of aliased values to the real alias
       */
      public function addValueAliases($key, $aliases)
      {
          if (!isset($this->info[$key]->aliases)) {
              $this->info[$key]->aliases = array();
          }
          foreach ($aliases as $alias => $real) {
              $this->info[$key]->aliases[$alias] = $real;
          }
      }

      /**
       * Defines a set of allowed values for a directive. Any previously
       * defined set is replaced.
       * @warning This is slightly different from the corresponding static
       *          method definition.
       * @param string $key Name of directive
       * @param array $allowed Lookup array of allowed values
       */
      public function addAllowedValues($key, $allowed)
      {
          $this->info[$key]->allowed = $allowed;
      }

      /**
       * Defines a directive alias for backwards compatibility
       * @param string $key Directive that will be aliased
       * @param string $new_key Directive that the alias will be to
       */
      public function addAlias($key, $new_key)
      {
          $obj = new stdClass;
          $obj->key = $new_key;
          $obj->isAlias = true;
          $this->info[$key] = $obj;
      }

      /**
       * Replaces any stdClass that only has the type property with type integer,
       * and any stdClass that only has type and allow_null with the negated
       * type integer.
       */
      public function postProcess()
      {
          foreach ($this->info as $key => $v) {
              if (count((array) $v) == 1) {
                  $this->info[$key] = $v->type;
              } elseif (count((array) $v) == 2 && isset($v->allow_null)) {
                  $this->info[$key] = -$v->type;
              }
          }
      }
  }
  2553. /**
  2554. * @todo Unit test
  2555. */
  2556. class HTMLPurifier_ContentSets
  2557. {
  2558. /**
  2559. * List of content set strings (pipe separators) indexed by name.
  2560. * @type array
  2561. */
  2562. public $info = array();
  2563. /**
  2564. * List of content set lookups (element => true) indexed by name.
  2565. * @type array
  2566. * @note This is in HTMLPurifier_HTMLDefinition->info_content_sets
  2567. */
  2568. public $lookup = array();
  2569. /**
  2570. * Synchronized list of defined content sets (keys of info).
  2571. * @type array
  2572. */
  2573. protected $keys = array();
  2574. /**
  2575. * Synchronized list of defined content values (values of info).
  2576. * @type array
  2577. */
  2578. protected $values = array();
  /**
   * Gathers the content sets declared by the given modules, expands
   * references between content sets until nothing changes any more, and
   * fills the lookup, info, keys and values members.
   * @param HTMLPurifier_HTMLModule[] $modules List of HTMLPurifier_HTMLModule
   *        (a single module is wrapped into a list)
   */
  public function __construct($modules)
  {
      if (!is_array($modules)) {
          $modules = array($modules);
      }
      // populate content_sets based on module hints
      // sorry, no way of overloading
      foreach ($modules as $module) {
          foreach ($module->content_sets as $setName => $definition) {
              $converted = $this->convertToLookup($definition);
              // extend the set if an earlier module already declared it
              $this->lookup[$setName] = isset($this->lookup[$setName])
                  ? array_merge($this->lookup[$setName], $converted)
                  : $converted;
          }
      }
      // replace members that are themselves content set names by the
      // members of that set; repeat until a pass changes nothing
      do {
          $before = $this->lookup;
          foreach ($this->lookup as $setName => $members) {
              $expansion = array();
              foreach ($members as $member => $unused) {
                  if (isset($this->lookup[$member])) {
                      $expansion += $this->lookup[$member];
                      unset($this->lookup[$setName][$member]);
                  }
              }
              $this->lookup[$setName] += $expansion;
          }
      } while ($before !== $this->lookup);

      foreach ($this->lookup as $setName => $members) {
          $this->info[$setName] = implode(' | ', array_keys($members));
      }
      $this->keys = array_keys($this->info);
      $this->values = array_values($this->info);
  }
  /**
   * Accepts a definition; generates and assigns a ChildDef for it
   *
   * When the content model is a string, every whole-word occurrence of a
   * known content set name is first replaced by that set's expansion.
   * Definitions that already have a child are left untouched.
   *
   * @param HTMLPurifier_ElementDef $def HTMLPurifier_ElementDef reference
   * @param HTMLPurifier_HTMLModule $module Module that defined the ElementDef
   */
  public function generateChildDef(&$def, $module)
  {
      if (!empty($def->child)) { // already done!
          return;
      }
      $content_model = $def->content_model;
      // with no content sets the alternation would be empty and match the
      // empty string at every word boundary, so skip the substitution
      if (is_string($content_model) && !empty($this->keys)) {
          // quote the names so they are always matched literally
          $alternatives = array();
          foreach ($this->keys as $name) {
              $alternatives[] = preg_quote($name, '/');
          }
          $def->content_model = preg_replace_callback(
              '/\b(' . implode('|', $alternatives) . ')\b/',
              array($this, 'generateChildDefCallback'),
              $content_model
          );
      }
      $def->child = $this->getChildDef($def, $module);
  }
  public function generateChildDefCallback($matches)
  {
      // Callback used by generateChildDef(): index 0 of the match array
      // is the full matched content-set name; return its expanded form.
      $setName = $matches[0];
      return $this->info[$setName];
  }
  2649. /**
  2650. * Instantiates a ChildDef based on content_model and content_model_type
  2651. * member variables in HTMLPurifier_ElementDef
  2652. * @note This will also defer to modules for custom HTMLPurifier_ChildDef
  2653. * subclasses that need content set expansion
  2654. * @param HTMLPurifier_ElementDef $def HTMLPurifier_ElementDef to have ChildDef extracted
  2655. * @param HTMLPurifier_HTMLModule $module Module that defined the ElementDef
  2656. * @return HTMLPurifier_ChildDef corresponding to ElementDef
  2657. */
  public function getChildDef($def, $module)
  {
      // Picks the ChildDef implementation named by
      // $def->content_model_type, falling back to the defining module,
      // and raises E_USER_ERROR (returning false) when nothing matches.
      $model = $def->content_model;

      // An object here is a misplaced literal ChildDef: complain, but use it.
      if (is_object($model)) {
          trigger_error(
              'Literal object child definitions should be stored in ElementDef->child not ElementDef->content_model',
              E_USER_NOTICE
          );
          return $model;
      }

      // Built-in types. Loose comparison mirrors switch semantics.
      $kind = $def->content_model_type;
      if ($kind == 'required') {
          return new HTMLPurifier_ChildDef_Required($model);
      }
      if ($kind == 'optional') {
          return new HTMLPurifier_ChildDef_Optional($model);
      }
      if ($kind == 'empty') {
          return new HTMLPurifier_ChildDef_Empty();
      }
      if ($kind == 'custom') {
          return new HTMLPurifier_ChildDef_Custom($model);
      }

      // Unknown type: ask the module, but only if it advertises support
      // (the flag saves a pointless method call).
      $fromModule = $module->defines_child_def ? $module->getChildDef($def) : false;
      if ($fromModule !== false) {
          return $fromModule;
      }

      trigger_error(
          'Could not determine which ChildDef class to instantiate',
          E_USER_ERROR
      );
      return false;
  }
  2694. /**
  2695. * Converts a string list of elements separated by pipes into
  2696. * a lookup array.
  2697. * @param string $string List of elements
  2698. * @return array Lookup array of elements
  2699. */
  protected function convertToLookup($string)
  {
      // Turns "a | b | c" into array('a' => true, 'b' => true, 'c' => true).
      // Spaces are stripped first, then the list is split on pipes.
      $names = explode('|', str_replace(' ', '', $string));
      return array_fill_keys($names, true);
  }
  2709. }
  2710. /**
  2711. * Registry object that contains information about the current context.
  2712. * @warning Is a bit buggy when variables are set to null: it thinks
  2713. * they don't exist! So use false instead, please.
  2714. * @note Since the variables Context deals with may not be objects,
  2715. * references are very important here! Do not remove!
  2716. */
  class HTMLPurifier_Context
  {
      /**
       * Map of registered names to the variable references bound to them.
       * @type array
       */
      private $_storage = array();

      /**
       * Binds a variable to a name. The variable is stored by reference,
       * so later changes to it are visible through get().
       * Raises E_USER_ERROR and keeps the existing binding when the name
       * is already taken.
       * @param string $name String name
       * @param mixed $ref Reference to variable to be registered
       */
      public function register($name, &$ref)
      {
          if (!array_key_exists($name, $this->_storage)) {
              $this->_storage[$name] =& $ref;
              return;
          }
          trigger_error(
              "Name $name produces collision, cannot re-register",
              E_USER_ERROR
          );
      }

      /**
       * Returns, by reference, the variable bound to a name.
       * For an unknown name null is returned; E_USER_ERROR is raised
       * first unless $ignore_error is set.
       * @param string $name String name
       * @param bool $ignore_error Boolean whether or not to ignore error
       * @return mixed
       */
      public function &get($name, $ignore_error = false)
      {
          if (array_key_exists($name, $this->_storage)) {
              return $this->_storage[$name];
          }
          if (!$ignore_error) {
              trigger_error(
                  "Attempted to retrieve non-existent variable $name",
                  E_USER_ERROR
              );
          }
          // returning by reference requires an actual variable
          $missing = null;
          return $missing;
      }

      /**
       * Removes a binding from the context. Raises E_USER_ERROR when the
       * name was never registered.
       * @param string $name String name
       */
      public function destroy($name)
      {
          if (array_key_exists($name, $this->_storage)) {
              unset($this->_storage[$name]);
              return;
          }
          trigger_error(
              "Attempted to destroy non-existent variable $name",
              E_USER_ERROR
          );
      }

      /**
       * Tells whether a name is currently registered.
       * @param string $name String name
       * @return bool
       */
      public function exists($name)
      {
          return array_key_exists($name, $this->_storage);
      }

      /**
       * Registers every entry of an associative array under its key.
       * @param array $context_array Assoc array of variables to load
       */
      public function loadArray($context_array)
      {
          foreach (array_keys($context_array) as $name) {
              $this->register($name, $context_array[$name]);
          }
      }
  }
  2795. /**
  2796. * Abstract class representing Definition cache managers that implements
  2797. * useful common methods and is a factory.
  2798. * @todo Create a separate maintenance file advanced users can use to
  2799. * cache their custom HTMLDefinition, which can be loaded
  2800. * via a configuration directive
  2801. * @todo Implement memcached
  2802. */
  abstract class HTMLPurifier_DefinitionCache
  {
      /**
       * Type of definition this cache instance is responsible for.
       * @type string
       */
      public $type;

      /**
       * @param string $type Type of definition objects this instance of the
       *      cache will handle.
       */
      public function __construct($type)
      {
          $this->type = $type;
      }

      /**
       * Builds the cache key for a configuration, in the form
       * "version,batch serial,definition revision".
       * @param HTMLPurifier_Config $config Instance of HTMLPurifier_Config
       * @return string
       */
      public function generateKey($config)
      {
          $parts = array(
              $config->version, // possibly replace with function calls
              $config->getBatchSerial($this->type),
              $config->get($this->type . '.DefinitionRev'),
          );
          return implode(',', $parts);
      }

      /**
       * Tests whether or not a key is old with respect to the configuration's
       * version and revision number.
       * @param string $key Key to test
       * @param HTMLPurifier_Config $config Instance of HTMLPurifier_Config to test against
       * @return bool
       */
      public function isOld($key, $config)
      {
          // A well-formed key has three comma-separated fields; anything
          // shorter is treated as stale.
          $fields = explode(',', $key, 3);
          if (count($fields) < 3) {
              return true;
          }
          list($version, $hash, $revision) = $fields;

          // any version difference, older or newer, makes the key old
          if (version_compare($version, $config->version) != 0) {
              return true;
          }

          // same version: old only when the serial matches and the stored
          // revision is behind the configured one
          return $hash == $config->getBatchSerial($this->type)
              && $revision < $config->get($this->type . '.DefinitionRev');
      }

      /**
       * Checks that a definition's type matches the cache's type.
       * @note Raises an error via trigger_error() on mismatch
       * @param HTMLPurifier_Definition $def Definition object to check
       * @return bool true if good, false if not
       */
      public function checkDefType($def)
      {
          if ($def->type === $this->type) {
              return true;
          }
          trigger_error("Cannot use definition of type {$def->type} in cache for {$this->type}");
          return false;
      }

      /**
       * Adds a definition object to the cache
       * @param HTMLPurifier_Definition $def
       * @param HTMLPurifier_Config $config
       */
      abstract public function add($def, $config);

      /**
       * Unconditionally saves a definition object to the cache
       * @param HTMLPurifier_Definition $def
       * @param HTMLPurifier_Config $config
       */
      abstract public function set($def, $config);

      /**
       * Replaces an object in the cache
       * @param HTMLPurifier_Definition $def
       * @param HTMLPurifier_Config $config
       */
      abstract public function replace($def, $config);

      /**
       * Retrieves a definition object from the cache
       * @param HTMLPurifier_Config $config
       */
      abstract public function get($config);

      /**
       * Removes a definition object from the cache
       * @param HTMLPurifier_Config $config
       */
      abstract public function remove($config);

      /**
       * Clears all objects from cache
       * @param HTMLPurifier_Config $config
       */
      abstract public function flush($config);

      /**
       * Clears all expired (older version or revision) objects from cache
       * @note Be careful implementing this method as flush. Flush must
       *       not interfere with other Definition types, and cleanup()
       *       should not be repeatedly called by userland code.
       * @param HTMLPurifier_Config $config
       */
      abstract public function cleanup($config);
  }
  2909. /**
  2910. * Responsible for creating definition caches.
  2911. */
  class HTMLPurifier_DefinitionCacheFactory
  {
      /**
       * Already-built caches, indexed by implementation name, then by
       * definition type.
       * @type array
       */
      protected $caches = array('Serializer' => array());

      /**
       * Registered implementations: short name => class name.
       * @type array
       */
      protected $implementations = array();

      /**
       * Decorators applied to every newly created cache, keyed by name.
       * @type HTMLPurifier_DefinitionCache_Decorator[]
       */
      protected $decorators = array();

      /**
       * Initialize default decorators
       */
      public function setup()
      {
          $this->addDecorator('Cleanup');
      }

      /**
       * Retrieves an instance of global definition cache factory.
       *
       * - null (default): return the shared instance, creating and setting
       *   it up on first use;
       * - true: discard the shared instance and build a fresh one;
       * - anything else: install the given value as the shared instance.
       *
       * @param HTMLPurifier_DefinitionCacheFactory|bool|null $prototype
       * @return HTMLPurifier_DefinitionCacheFactory
       */
      public static function instance($prototype = null)
      {
          static $instance;
          // The reset request must be tested before the generic
          // "prototype supplied" case; otherwise the literal true would be
          // stored as the shared instance.
          if ($prototype === true || ($prototype === null && $instance === null)) {
              $instance = new HTMLPurifier_DefinitionCacheFactory();
              $instance->setup();
          } elseif ($prototype !== null) {
              $instance = $prototype;
          }
          return $instance;
      }

      /**
       * Registers a new definition cache object
       * @param string $short Short name of cache object, for reference
       * @param string $long Full class name of cache object, for construction
       */
      public function register($short, $long)
      {
          $this->implementations[$short] = $long;
      }

      /**
       * Factory method that creates a cache object based on configuration.
       *
       * Returns a Null cache when Cache.DefinitionImpl is null. Otherwise
       * the cache for a given implementation/type pair is built once,
       * wrapped in all registered decorators, and reused on later calls.
       * An unregistered (or not yet loaded) implementation falls back to
       * the Serializer, with an E_USER_WARNING unless Serializer was the
       * requested name.
       *
       * @param string $type Name of definitions handled by cache
       * @param HTMLPurifier_Config $config Config instance
       * @return mixed
       */
      public function create($type, $config)
      {
          $method = $config->get('Cache.DefinitionImpl');
          if ($method === null) {
              return new HTMLPurifier_DefinitionCache_Null($type);
          }
          if (!empty($this->caches[$method][$type])) {
              return $this->caches[$method][$type];
          }
          if (isset($this->implementations[$method]) &&
              class_exists($class = $this->implementations[$method], false)) {
              $cache = new $class($type);
          } else {
              if ($method != 'Serializer') {
                  trigger_error("Unrecognized DefinitionCache $method, using Serializer instead", E_USER_WARNING);
              }
              $cache = new HTMLPurifier_DefinitionCache_Serializer($type);
          }
          foreach ($this->decorators as $decorator) {
              $new_cache = $decorator->decorate($cache);
              // Drop the old variable before rebinding the name: the
              // original code does this to prevent infinite recursion,
              // presumably because decorate() may hold $cache by
              // reference (its signature is not visible here).
              unset($cache);
              $cache = $new_cache;
          }
          $this->caches[$method][$type] = $cache;
          return $this->caches[$method][$type];
      }

      /**
       * Registers a decorator to add to all new cache objects
       * @param HTMLPurifier_DefinitionCache_Decorator|string $decorator An instance or the name of a decorator
       */
      public function addDecorator($decorator)
      {
          if (is_string($decorator)) {
              // a bare name is shorthand for the matching decorator class
              $class = "HTMLPurifier_DefinitionCache_Decorator_$decorator";
              $decorator = new $class;
          }
          $this->decorators[$decorator->name] = $decorator;
      }
  }
  3004. /**
  3005. * Represents a document type, contains information on which modules
  3006. * need to be loaded.
  3007. * @note This class is inspected by Printer_HTMLDefinition->renderDoctype.
  3008. * If structure changes, please update that function.
  3009. */
  class HTMLPurifier_Doctype
  {
      /**
       * Full name of the doctype.
       * @type string
       */
      public $name;

      /**
       * Standard modules used by this doctype, given either as string
       * identifiers or as literal module objects.
       * @type array
       */
      public $modules = array();

      /**
       * Modules used for tidying up code.
       * @type array
       */
      public $tidyModules = array();

      /**
       * Whether the language is XML-derived (i.e. XHTML).
       * @type bool
       */
      public $xml = true;

      /**
       * Alternative names for this doctype.
       * @type array
       */
      public $aliases = array();

      /**
       * Public DTD identifier.
       * @type string
       */
      public $dtdPublic;

      /**
       * System DTD identifier.
       * @type string
       */
      public $dtdSystem;

      /**
       * Stores each argument in the matching property, unchanged.
       * @param string $name
       * @param bool $xml
       * @param array $modules
       * @param array $tidyModules
       * @param array $aliases
       * @param string $dtd_public
       * @param string $dtd_system
       */
      public function __construct(
          $name = null,
          $xml = true,
          $modules = array(),
          $tidyModules = array(),
          $aliases = array(),
          $dtd_public = null,
          $dtd_system = null
      ) {
          // identity
          $this->name    = $name;
          $this->aliases = $aliases;
          $this->xml     = $xml;
          // module lists
          $this->modules     = $modules;
          $this->tidyModules = $tidyModules;
          // DTD identifiers
          $this->dtdPublic = $dtd_public;
          $this->dtdSystem = $dtd_system;
      }
  }
  class HTMLPurifier_DoctypeRegistry
  {
      /**
       * Doctype objects indexed by their name.
       * @type array
       */
      protected $doctypes;

      /**
       * Alias => real doctype name.
       * @type array
       */
      protected $aliases;

      /**
       * Registers a doctype to the registry
       * @note Accepts a fully-formed doctype object, or the
       *       parameters for constructing a doctype object
       * @param string $doctype Name of doctype or literal doctype object
       * @param bool $xml
       * @param array $modules Modules doctype will load
       * @param array $tidy_modules Modules doctype will load for certain modes
       * @param array $aliases Alias names for doctype
       * @param string $dtd_public
       * @param string $dtd_system
       * @return HTMLPurifier_Doctype Editable registered doctype
       */
      public function register(
          $doctype,
          $xml = true,
          $modules = array(),
          $tidy_modules = array(),
          $aliases = array(),
          $dtd_public = null,
          $dtd_system = null
      ) {
          if (!is_object($doctype)) {
              // Building from parameters: scalar shorthand for the three
              // list arguments is wrapped into one-element arrays.
              $doctype = new HTMLPurifier_Doctype(
                  $doctype,
                  $xml,
                  is_array($modules) ? $modules : array($modules),
                  is_array($tidy_modules) ? $tidy_modules : array($tidy_modules),
                  is_array($aliases) ? $aliases : array($aliases),
                  $dtd_public,
                  $dtd_system
              );
          }

          $this->doctypes[$doctype->name] = $doctype;
          $name = $doctype->name;

          // An alias never shadows a doctype registered under that name.
          foreach ($doctype->aliases as $alias) {
              if (!isset($this->doctypes[$alias])) {
                  $this->aliases[$alias] = $name;
              }
          }

          // The name now denotes a real doctype, so it stops being an alias.
          if (isset($this->aliases[$name])) {
              unset($this->aliases[$name]);
          }
          return $doctype;
      }

      /**
       * Retrieves reference to a doctype of a certain name
       * @note This function resolves aliases
       * @note When possible, use the more fully-featured make()
       * @param string $doctype Name of doctype
       * @return HTMLPurifier_Doctype Editable doctype object
       */
      public function get($doctype)
      {
          $resolved = isset($this->aliases[$doctype]) ? $this->aliases[$doctype] : $doctype;
          if (isset($this->doctypes[$resolved])) {
              return $this->doctypes[$resolved];
          }
          // Unknown name: raise the error, then hand back an anonymous
          // doctype carrying only that name.
          trigger_error('Doctype ' . htmlspecialchars($resolved) . ' does not exist', E_USER_ERROR);
          return new HTMLPurifier_Doctype($resolved);
      }

      /**
       * Returns a clone of the doctype selected by the configuration.
       * @note Use this function to get a copy of doctype that config
       *       can hold on to (this is necessary in order to tell
       *       Generator whether or not the current document is XML
       *       based or not).
       * @param HTMLPurifier_Config $config
       * @return HTMLPurifier_Doctype
       */
      public function make($config)
      {
          $name = $this->getDoctypeFromConfig($config);
          return clone $this->get($name);
      }

      /**
       * Retrieves the doctype from the configuration object
       * @param HTMLPurifier_Config $config
       * @return string
       */
      public function getDoctypeFromConfig($config)
      {
          // Explicit directives win, in this order.
          foreach (array('HTML.Doctype', 'HTML.CustomDoctype') as $directive) {
              $explicit = $config->get($directive);
              if (!empty($explicit)) {
                  return $explicit;
              }
          }
          // backwards-compatibility: derive the name from the two flags
          $family = $config->get('HTML.XHTML') ? 'XHTML 1.0' : 'HTML 4.01';
          $variant = $config->get('HTML.Strict') ? ' Strict' : ' Transitional';
          return $family . $variant;
      }
  }
  3198. /**
  3199. * Structure that stores an HTML element definition. Used by
  3200. * HTMLPurifier_HTMLDefinition and HTMLPurifier_HTMLModule.
  3201. * @note This class is inspected by HTMLPurifier_Printer_HTMLDefinition.
  3202. * Please update that class too.
  3203. * @warning If you add new properties to this class, you MUST update
  3204. * the mergeIn() method.
  3205. */
  class HTMLPurifier_ElementDef
  {
      /**
       * Does the definition work by itself, or is it created solely
       * for the purpose of merging into another definition?
       * @type bool
       */
      public $standalone = true;

      /**
       * Associative array of attribute name to HTMLPurifier_AttrDef.
       * @type array
       * @note Before being processed by HTMLPurifier_AttrCollections
       *       when modules are finalized during
       *       HTMLPurifier_HTMLDefinition->setup(), this array may also
       *       contain an array at index 0 that indicates which attribute
       *       collections to load into the full array. It may also
       *       contain string identifiers in lieu of HTMLPurifier_AttrDef,
       *       see HTMLPurifier_AttrTypes on how they are expanded during
       *       HTMLPurifier_HTMLDefinition->setup() processing.
       */
      public $attr = array();

      // XXX: Design note: currently, it's not possible to override
      // previously defined AttrTransforms without messing around with
      // the final generated config. This is by design; a previous version
      // used an associated list of attr_transform, but it was extremely
      // easy to accidentally override other attribute transforms by
      // forgetting to specify an index (and just using 0.) While we
      // could check this by checking the index number and complaining,
      // there is a second problem which is that it is not at all easy to
      // tell when something is getting overridden. Combine this with a
      // codebase where this isn't really being used, and it's perfect for
      // nuking.

      /**
       * List of tags HTMLPurifier_AttrTransform to be done before validation.
       * @type array
       */
      public $attr_transform_pre = array();

      /**
       * List of tags HTMLPurifier_AttrTransform to be done after validation.
       * @type array
       */
      public $attr_transform_post = array();

      /**
       * HTMLPurifier_ChildDef of this tag.
       * @type HTMLPurifier_ChildDef
       */
      public $child;

      /**
       * Abstract string representation of internal ChildDef rules.
       * @see HTMLPurifier_ContentSets for how this is parsed and then transformed
       * into an HTMLPurifier_ChildDef.
       * @warning This is a temporary variable that is not available after
       *      being processed by HTMLDefinition
       * @type string
       */
      public $content_model;

      /**
       * Value of $child->type, used to determine which ChildDef to use,
       * used in combination with $content_model.
       * @warning This must be lowercase
       * @warning This is a temporary variable that is not available after
       *      being processed by HTMLDefinition
       * @type string
       */
      public $content_model_type;

      /**
       * Does the element have a content model (#PCDATA | Inline)*? This
       * is important for chameleon ins and del processing in
       * HTMLPurifier_ChildDef_Chameleon. Dynamically set: modules don't
       * have to worry about this one.
       * @type bool
       */
      public $descendants_are_inline = false;

      /**
       * List of the names of required attributes this element has.
       * Dynamically populated by HTMLPurifier_HTMLDefinition::getElement()
       * @type array
       */
      public $required_attr = array();

      /**
       * Lookup table of tags excluded from all descendants of this tag.
       * @type array
       * @note SGML permits exclusions for all descendants, but this is
       *       not possible with DTDs or XML Schemas. W3C has elected to
       *       use complicated compositions of content_models to simulate
       *       exclusion for children, but we go the simpler, SGML-style
       *       route of flat-out exclusions, which correctly apply to
       *       all descendants and not just children. Note that the XHTML
       *       Modularization Abstract Modules are blithely unaware of such
       *       distinctions.
       */
      public $excludes = array();

      /**
       * This tag is explicitly auto-closed by the following tags.
       * @type array
       */
      public $autoclose = array();

      /**
       * If a foreign element is found in this element, test if it is
       * allowed by this sub-element; if it is, instead of closing the
       * current element, place it inside this element.
       * @type string
       */
      public $wrap;

      /**
       * Whether or not this is a formatting element affected by the
       * "Active Formatting Elements" algorithm.
       * @type bool
       */
      public $formatting;

      /**
       * Low-level factory constructor for creating new standalone element defs
       * @param string $content_model
       * @param string $content_model_type
       * @param array $attr
       * @return HTMLPurifier_ElementDef
       */
      public static function create($content_model, $content_model_type, $attr)
      {
          $def = new HTMLPurifier_ElementDef();
          $def->content_model = $content_model;
          $def->content_model_type = $content_model_type;
          $def->attr = $attr;
          return $def;
      }

      /**
       * Merges the values of another element definition into this one.
       * Values from the new element def take precedence if a value is
       * not mergeable.
       * @param HTMLPurifier_ElementDef $def
       */
      public function mergeIn($def)
      {
          // later keys takes precedence
          foreach ($def->attr as $k => $v) {
              if ($k === 0) {
                  // merge in the includes
                  // sorry, no way to override an include
                  foreach ($v as $v2) {
                      $this->attr[0][] = $v2;
                  }
                  continue;
              }
              if ($v === false) {
                  // false is the marker for "remove this attribute"
                  unset($this->attr[$k]);
                  continue;
              }
              $this->attr[$k] = $v;
          }
          $this->_mergeAssocArray($this->excludes, $def->excludes);
          $this->attr_transform_pre = array_merge($this->attr_transform_pre, $def->attr_transform_pre);
          $this->attr_transform_post = array_merge($this->attr_transform_post, $def->attr_transform_post);

          if (!empty($def->content_model)) {
              // "#SUPER" in the incoming model stands for the current one.
              // The cast keeps an unset (null) current model from reaching
              // str_replace(), which deprecates null arguments on PHP 8.1+;
              // the substitution result is the same empty string.
              $this->content_model =
                  str_replace("#SUPER", (string)$this->content_model, $def->content_model);
              // invalidate the ChildDef so it is regenerated
              $this->child = false;
          }
          if (!empty($def->content_model_type)) {
              $this->content_model_type = $def->content_model_type;
              $this->child = false;
          }
          if (!is_null($def->child)) {
              $this->child = $def->child;
          }
          if (!is_null($def->formatting)) {
              $this->formatting = $def->formatting;
          }
          if ($def->descendants_are_inline) {
              $this->descendants_are_inline = $def->descendants_are_inline;
          }
      }

      /**
       * Merges one array into another, removes values which equal false
       * @param $a1 Array by reference that is merged into
       * @param $a2 Array that merges into $a1
       */
      private function _mergeAssocArray(&$a1, $a2)
      {
          foreach ($a2 as $k => $v) {
              if ($v === false) {
                  unset($a1[$k]);
                  continue;
              }
              $a1[$k] = $v;
          }
      }
  }
  3393. /**
  3394. * A UTF-8 specific character encoder that handles cleaning and transforming.
  3395. * @note All functions in this class should be static.
  3396. */
  3397. class HTMLPurifier_Encoder
  3398. {
  3399. /**
  3400. * Constructor throws fatal error if you attempt to instantiate class
  3401. */
  private function __construct()
  {
      // Reaching this point is a usage error: report it as a user-level
      // error instead of yielding an instance.
      $message = 'Cannot instantiate encoder, call methods statically';
      trigger_error($message, E_USER_ERROR);
  }
  3406. /**
  3407. * Error-handler that mutes errors, alternative to shut-up operator.
  3408. */
  public static function muteErrorHandler()
  {
      // Intentionally empty: installed as a temporary error handler
      // (see unsafeIconv()) so that errors are swallowed.
  }
  3412. /**
  3413. * iconv wrapper which mutes errors, but doesn't work around bugs.
  3414. * @param string $in Input encoding
  3415. * @param string $out Output encoding
  3416. * @param string $text The text to convert
  3417. * @return string
  3418. */
  public static function unsafeIconv($in, $out, $text)
  {
      // Run iconv() with the no-op handler installed so conversion
      // errors stay silent, then put the previous handler back.
      $silencer = array('HTMLPurifier_Encoder', 'muteErrorHandler');
      set_error_handler($silencer);
      $converted = iconv($in, $out, $text);
      restore_error_handler();
      return $converted;
  }
  3426. /**
  3427. * iconv wrapper which mutes errors and works around bugs.
  3428. * @param string $in Input encoding
  3429. * @param string $out Output encoding
  3430. * @param string $text The text to convert
  3431. * @param int $max_chunk_size
  3432. * @return string
  3433. */
  public static function iconv($in, $out, $text, $max_chunk_size = 8000)
  {
      // Converts $text between encodings. When the local iconv is
      // reported as truncating, UTF-8 input is converted in chunks of at
      // most $max_chunk_size bytes; any other situation yields false.
      $code = self::testIconvTruncateBug();
      if ($code == self::ICONV_OK) {
          return self::unsafeIconv($in, $out, $text);
      }
      // The chunking workaround exists only for the truncation bug, and
      // only when the input character set is utf-8.
      if ($code != self::ICONV_TRUNCATES || $in != 'utf-8') {
          return false;
      }
      // The boundary search below looks back up to three bytes.
      if ($max_chunk_size < 4) {
          trigger_error('max_chunk_size is too small', E_USER_WARNING);
          return false;
      }
      $length = strlen($text);
      if ($length <= $max_chunk_size) {
          return self::unsafeIconv($in, $out, $text);
      }

      $converted = '';
      $offset = 0;
      while ($offset + $max_chunk_size < $length) {
          // Shrink the chunk by 0..3 bytes until the byte just past its
          // end is not a UTF-8 continuation byte (10xxxxxx), so that no
          // multibyte character is cut in half.
          $chunk_size = null;
          for ($back = 0; $back < 4; $back++) {
              $byte = ord($text[$offset + $max_chunk_size - $back]);
              if (($byte & 0xC0) != 0x80) {
                  $chunk_size = $max_chunk_size - $back;
                  break;
              }
          }
          if ($chunk_size === null) {
              return false; // rather confusing UTF-8...
          }
          $converted .= self::unsafeIconv($in, $out, substr($text, $offset, $chunk_size));
          $offset += $chunk_size;
      }
      // whatever is left fits into a single chunk
      $converted .= self::unsafeIconv($in, $out, substr($text, $offset));
      return $converted;
  }
  3483. /**
  3484. * Cleans a UTF-8 string for well-formedness and SGML validity
  3485. *
  3486. * It will parse according to UTF-8 and return a valid UTF8 string, with
  3487. * non-SGML codepoints excluded.
  3488. *
  3489. * Specifically, it will permit:
  3490. * \x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}
  3491. * Source: https://www.w3.org/TR/REC-xml/#NT-Char
  3492. * Arguably this function should be modernized to the HTML5 set
  3493. * of allowed characters:
  3494. * https://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
  3495. * which simultaneously expand and restrict the set of allowed characters.
  3496. *
  3497. * @param string $str The string to clean
  3498. * @param bool $force_php
  3499. * @return string
  3500. *
  3501. * @note Just for reference, the non-SGML code points are 0 to 31 and
  3502. * 127 to 159, inclusive. However, we allow code points 9, 10
  3503. * and 13, which are the tab, line feed and carriage return
  3504. * respectively. 128 and above the code points map to multibyte
  3505. * UTF-8 representations.
  3506. *
  3507. * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
  3508. * hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
  3509. * LGPL license. Notes on what changed are inside, but in general,
  3510. * the original code transformed UTF-8 text into an array of integer
  3511. * Unicode codepoints. Understandably, transforming that back to
  3512. * a string would be somewhat expensive, so the function was modded to
  3513. * directly operate on the string. However, this discourages code
  3514. * reuse, and the logic enumerated here would be useful for any
  3515. * function that needs to be able to understand UTF-8 characters.
  3516. * As of right now, only smart lossless character encoding converters
  3517. * would need that, and I'm probably not going to implement them.
  3518. */
  public static function cleanUTF8($str, $force_php = false)
  {
      // NOTE: $force_php is kept for interface compatibility; it is not
      // read anywhere in this method.

      // Fast path: PCRE in UTF-8 mode both validates the encoding and lets
      // us whitelist the allowed code point ranges. If the whole string
      // matches there is nothing to strip and we can return it untouched.
      if (preg_match(
          '/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du',
          $str
      )) {
          return $str;
      }

      // Slow path: byte-wise UTF-8 decoder that copies only well-formed,
      // allowed characters to $out.
      $mState = 0; // number of continuation octets still expected
      $mUcs4 = 0;  // code point being assembled
      $mBytes = 1; // total octets announced by the current lead octet

      // $char accumulates the raw octets of the character in progress so
      // that they can be appended verbatim once the character is accepted;
      // $mUcs4 holds the same character as an integer for validation.
      $out = '';
      $char = '';

      $len = strlen($str);
      for ($i = 0; $i < $len; $i++) {
          // Square-bracket offsets only: the curly-brace form ($str{$i})
          // is deprecated in PHP 7.4 and a parse error from PHP 8.0.
          $in = ord($str[$i]);
          $char .= $str[$i]; // append byte to char
          if (0 == $mState) {
              // Not inside a sequence: expect either a US-ASCII octet or
              // the lead octet of a multi-octet sequence.
              if (0 == (0x80 & ($in))) {
                  // US-ASCII. Drop control characters (0-31 and 127)
                  // except tab, line feed and carriage return.
                  if (($in <= 31 || $in == 127) &&
                      !($in == 9 || $in == 13 || $in == 10)
                  ) {
                      // control character, remove
                  } else {
                      $out .= $char;
                  }
                  // reset
                  $char = '';
                  $mBytes = 1;
              } elseif (0xC0 == (0xE0 & ($in))) {
                  // Lead octet of a 2 octet sequence
                  $mUcs4 = ($in);
                  $mUcs4 = ($mUcs4 & 0x1F) << 6;
                  $mState = 1;
                  $mBytes = 2;
              } elseif (0xE0 == (0xF0 & ($in))) {
                  // Lead octet of a 3 octet sequence
                  $mUcs4 = ($in);
                  $mUcs4 = ($mUcs4 & 0x0F) << 12;
                  $mState = 2;
                  $mBytes = 3;
              } elseif (0xF0 == (0xF8 & ($in))) {
                  // Lead octet of a 4 octet sequence
                  $mUcs4 = ($in);
                  $mUcs4 = ($mUcs4 & 0x07) << 18;
                  $mState = 3;
                  $mBytes = 4;
              } elseif (0xF8 == (0xFC & ($in))) {
                  // Lead octet of a 5 octet sequence.
                  //
                  // This is always illegal: the encoded code point is
                  // either not in shortest form or outside 0-0x10FFFF.
                  // Rather than resynchronizing here, consume the whole
                  // sequence and let the validation below reject it.
                  $mUcs4 = ($in);
                  $mUcs4 = ($mUcs4 & 0x03) << 24;
                  $mState = 4;
                  $mBytes = 5;
              } elseif (0xFC == (0xFE & ($in))) {
                  // Lead octet of a 6 octet sequence; handled like the
                  // 5 octet case above.
                  $mUcs4 = ($in);
                  $mUcs4 = ($mUcs4 & 1) << 30;
                  $mState = 5;
                  $mBytes = 6;
              } else {
                  // Neither US-ASCII nor a legal lead octet: discard it.
                  $mState = 0;
                  $mUcs4 = 0;
                  $mBytes = 1;
                  $char = '';
              }
          } else {
              // Inside a sequence: expect a continuation octet (10xxxxxx).
              if (0x80 == (0xC0 & ($in))) {
                  // Legal continuation: merge its six payload bits.
                  $shift = ($mState - 1) * 6;
                  $tmp = $in;
                  $tmp = ($tmp & 0x0000003F) << $shift;
                  $mUcs4 |= $tmp;

                  if (0 == --$mState) {
                      // Sequence complete; $mUcs4 is the final code point.

                      // Reject illegal sequences and code points.
                      // From Unicode 3.1, non-shortest form is illegal
                      if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                          ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                          ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                          (4 < $mBytes) ||
                          // From Unicode 3.2, surrogate characters = illegal
                          (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                          // Codepoints outside the Unicode range are illegal
                          ($mUcs4 > 0x10FFFF)
                      ) {
                          // illegal: emit nothing
                      } elseif (0xFEFF != $mUcs4 && // omit BOM
                          // keep only code points in the allowed ranges
                          (
                              0x9 == $mUcs4 ||
                              0xA == $mUcs4 ||
                              0xD == $mUcs4 ||
                              (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
                              // 7F-9F is not strictly prohibited by XML,
                              // but it is non-SGML, and thus we don't allow it
                              (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
                              (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) ||
                              (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
                          )
                      ) {
                          $out .= $char;
                      }
                      // reset decoder state for the next character
                      $mState = 0;
                      $mUcs4 = 0;
                      $mBytes = 1;
                      $char = '';
                  }
              } else {
                  // Sequence was cut short by a non-continuation octet.
                  // Discard what was collected (including this octet) and
                  // start over with the next one.
                  $mState = 0;
                  $mUcs4 = 0;
                  $mBytes = 1;
                  $char = '';
              }
          }
      }
      return $out;
  }
  /**
   * Encodes a single Unicode code point as a UTF-8 byte string.
   *
   * Bit layout produced for each length:
   *
   *   1 byte   0xxxxxxx                              0x000000..0x00007F
   *   2 bytes  110yyyyy 10xxxxxx                     0x000080..0x0007FF
   *   3 bytes  1110zzzz 10yyyyyy 10xxxxxx            0x000800..0x00FFFF
   *   4 bytes  11110www 10wwzzzz 10yyyyyy 10xxxxxx   0x010000..0x10FFFF
   *
   * @param int $code Code point to encode.
   * @return string The UTF-8 bytes, or an empty string when $code is
   *         negative, above 0x10FFFF, or a surrogate (0xD800..0xDFFF).
   * @note Based on Feyd's function at
   *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
   *       which is in public domain.
   * @note No filtering of non-SGML code points happens here; only the
   *       range and surrogate checks above are applied.
   */
  public static function unichr($code)
  {
      // Outside the Unicode range, or inside the surrogate block.
      $is_surrogate = ($code >= 0xD800 and $code <= 0xDFFF);
      if ($code > 0x10FFFF or $code < 0 or $is_surrogate) {
          return '';
      }

      // Plain ASCII is its own encoding.
      if ($code < 0x80) {
          return chr($code);
      }

      // Every multi-byte form ends with the low six bits.
      $last = chr(($code & 0x3F) | 0x80);
      if ($code < 0x800) {
          return chr((($code & 0x7FF) >> 6) | 0xC0) . $last;
      }

      // Three and four byte forms share the second-to-last byte too.
      $middle = chr((($code & 0xFC0) >> 6) | 0x80);
      if ($code < 0x10000) {
          return chr((($code >> 12) & 0x0F) | 0xE0) . $middle . $last;
      }

      return chr((($code >> 18) & 0x07) | 0xF0)
          . chr((($code >> 12) & 0x3F) | 0x80)
          . $middle
          . $last;
  }
  /**
   * Reports whether iconv can be used for transcoding.
   *
   * iconv counts as available when the function exists and
   * testIconvTruncateBug() does not report ICONV_UNUSABLE. The answer is
   * computed once and then served from a static cache.
   *
   * @return bool
   */
  public static function iconvAvailable()
  {
      static $iconv = null;
      if ($iconv !== null) {
          return $iconv;
      }
      $iconv = function_exists('iconv') && self::testIconvTruncateBug() != self::ICONV_UNUSABLE;
      return $iconv;
  }
  /**
   * Converts a string from the configured %Core.Encoding to UTF-8.
   *
   * @param string $str The string to convert
   * @param HTMLPurifier_Config $config Read for Core.Encoding and
   *        Test.ForceNoIconv
   * @param HTMLPurifier_Context $context Not read by this method
   * @return string The converted string; an empty string when iconv
   *         rejects the encoding. When no conversion route exists an
   *         E_USER_ERROR is raised and nothing is returned.
   */
  public static function convertToUTF8($str, $config, $context)
  {
      static $iconv = null;

      $encoding = $config->get('Core.Encoding');
      if ($encoding === 'utf-8') {
          // already in the target encoding
          return $str;
      }

      if ($iconv === null) {
          $iconv = self::iconvAvailable();
      }

      if ($iconv && !$config->get('Test.ForceNoIconv')) {
          // Converting *to* UTF-8 is unaffected by the iconv bugs, since
          // UTF-8 can represent every character.
          $converted = self::unsafeIconv($encoding, 'utf-8//IGNORE', $str);
          if ($converted === false) {
              // $encoding is not a valid encoding
              trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
              return '';
          }
          // Encodings such as Shift_JIS do not cover all of ASCII; map
          // the characters iconv substituted back to their byte-wise
          // ASCII/UTF-8 equivalents.
          return strtr($converted, self::testEncodingSupportsASCII($encoding));
      }

      if ($encoding === 'iso-8859-1') {
          // Latin-1 can be handled without iconv.
          return utf8_encode($str);
      }

      // No usable conversion route: explain why.
      $bug = HTMLPurifier_Encoder::testIconvTruncateBug();
      if ($bug != self::ICONV_OK) {
          trigger_error(
              'You have a buggy version of iconv, see https://bugs.php.net/bug.php?id=48147 ' .
              'and http://sourceware.org/bugzilla/show_bug.cgi?id=13541',
              E_USER_ERROR
          );
      } else {
          trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
      }
  }
  /**
   * Converts a UTF-8 string to the configured %Core.Encoding.
   *
   * @param string $str The string to convert
   * @param HTMLPurifier_Config $config Read for Core.Encoding,
   *        Core.EscapeNonASCIICharacters and Test.ForceNoIconv
   * @param HTMLPurifier_Context $context Not read by this method
   * @return string The converted string. When no conversion route exists
   *         an E_USER_ERROR is raised and nothing is returned.
   * @note Currently, this is a lossy conversion, with unexpressable
   *       characters being omitted.
   */
  public static function convertFromUTF8($str, $config, $context)
  {
      static $iconv = null;

      $encoding = $config->get('Core.Encoding');

      // Optionally entity-ize everything outside ASCII before transcoding.
      $escape = $config->get('Core.EscapeNonASCIICharacters');
      if ($escape) {
          $str = self::convertToASCIIDumbLossless($str);
      }

      if ($encoding === 'utf-8') {
          return $str;
      }

      if ($iconv === null) {
          $iconv = self::iconvAvailable();
      }

      if ($iconv && !$config->get('Test.ForceNoIconv')) {
          // Undo the fix applied in convertToUTF8, otherwise iconv will
          // choke on the characters the target encoding lacks.
          $ascii_fix = self::testEncodingSupportsASCII($encoding);
          if (!$escape && !empty($ascii_fix)) {
              // Map every UTF-8 key of the fix table to the empty string
              // and strip those characters from the input.
              $clear_fix = array_fill_keys(array_keys($ascii_fix), '');
              $str = strtr($str, $clear_fix);
          }
          $str = strtr($str, array_flip($ascii_fix));
          // Regular conversion
          return self::iconv('utf-8', $encoding . '//IGNORE', $str);
      }

      if ($encoding === 'iso-8859-1') {
          return utf8_decode($str);
      }

      // It is tempting to fall back to an ASCII representation here, but
      // that is not valid for every encoding, so take the conservative
      // route instead of forcing %Core.EscapeNonASCIICharacters on.
      trigger_error('Encoding not supported', E_USER_ERROR);
  }
  /**
   * Rewrites a UTF-8 string as pure ASCII by turning every non-ASCII
   * character into a decimal numeric character reference.
   *
   * @param string $str UTF-8 string to be converted to ASCII
   * @return string ASCII string with non-ASCII characters entity-ized
   * @warning Adapted from MediaWiki, claiming fair use: this is a common
   *          algorithm. If you disagree with this license fudgery,
   *          implement it yourself.
   * @note Uses decimal numeric entities since they are best supported.
   * @note This is a DUMB function: it entity-izes every non-ASCII
   *       character, whether or not the target encoding could have
   *       represented it directly.
   * @note Related to cleanUTF8(), but this method assumes $str is
   *       already well-formed UTF-8 and performs no validation.
   */
  public static function convertToASCIIDumbLossless($str)
  {
      $ascii = '';
      $codepoint = 0; // code point being assembled
      $pending = 0;   // continuation bytes still expected

      for ($pos = 0, $total = strlen($str); $pos < $total; $pos++) {
          $byte = ord($str[$pos]);

          if ($byte <= 0x7F) {
              // 0xxx xxxx: ASCII, copy through
              $ascii .= $str[$pos];
              $pending = 0;
              continue;
          }

          if ($byte > 0xBF) {
              // Lead byte: keep its payload bits and note how many
              // continuation bytes follow.
              if ($byte <= 0xDF) {        // 110x xxxx
                  $codepoint = $byte & 0x1F;
                  $pending = 1;
              } elseif ($byte <= 0xEF) {  // 1110 xxxx
                  $codepoint = $byte & 0x0F;
                  $pending = 2;
              } else {                    // 1111 0xxx
                  $codepoint = $byte & 0x07;
                  $pending = 3;
              }
              continue;
          }

          // 10xx xxxx: continuation byte, append its six payload bits
          $codepoint = ($codepoint << 6) + ($byte & 0x3F);
          if (--$pending <= 0) {
              $ascii .= '&#' . $codepoint . ';';
          }
      }

      return $ascii;
  }
  3879. /** No bugs detected in iconv. */
  3880. const ICONV_OK = 0;
  3881. /** Iconv truncates output if converting from UTF-8 to another
  3882. * character set with //IGNORE, and a non-encodable character is found */
  3883. const ICONV_TRUNCATES = 1;
  3884. /** Iconv does not support //IGNORE, making it unusable for
  3885. * transcoding purposes */
  3886. const ICONV_UNUSABLE = 2;
  /**
   * Probes iconv for the glibc //IGNORE bug and classifies the result.
   *
   * glibc iconv does not handle the //IGNORE suffix correctly: instead
   * of skipping an unconvertible character it returns EILSEQ after
   * consuming some input and expects to be restarted as for E2BIG. Old
   * PHP versions ignored the errno and returned the fragment, which
   * shows up as mysteriously truncated output. That can be worked around
   * by feeding iconv chunks of about 8000 characters, as long as PHP
   * keeps ignoring the error code; once PHP honours it, iconv is
   * unusable.
   *
   * The probe converts a Greek alpha followed by 9000 "a" characters to
   * ASCII and inspects the length of what comes back. The verdict is
   * cached in a static variable.
   *
   * @return int One of ICONV_OK, ICONV_TRUNCATES or ICONV_UNUSABLE.
   */
  public static function testIconvTruncateBug()
  {
      static $code = null;
      if ($code !== null) {
          return $code;
      }

      // Must go through unsafeIconv: the checked iconv wrapper would
      // recurse back into this test.
      $r = self::unsafeIconv('utf-8', 'ascii//IGNORE', "\xCE\xB1" . str_repeat('a', 9000));
      if ($r === false) {
          $code = self::ICONV_UNUSABLE;
          return $code;
      }

      $c = strlen($r);
      if ($c < 9000) {
          $code = self::ICONV_TRUNCATES;
      } elseif ($c > 9000) {
          // More output than input characters: report it, leave $code unset.
          trigger_error(
              'Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: ' .
              'include your iconv version as per phpversion()',
              E_USER_ERROR
          );
      } else {
          $code = self::ICONV_OK;
      }
      return $code;
  }
  /**
   * Expensive check of whether an encoding covers all of printable
   * ASCII. 7/8-bit encodings like Shift_JIS fail it and need special
   * processing; variable width encodings should never fail.
   *
   * @param string $encoding Encoding name to test, as per iconv format
   * @param bool $bypass When true, skip the cache and the precompiled
   *        answers and always probe iconv.
   * @return array|bool Map of UTF-8 character => ASCII character it
   *         stands in for, usable to "undo" overzealous iconv output;
   *         false when iconv cannot convert to $encoding at all.
   */
  public static function testEncodingSupportsASCII($encoding, $bypass = false)
  {
      // Every iconv call below is the unsafe variant, which is fine:
      //  - ICONV_OK: no difference.
      //  - ICONV_TRUNCATES: inputs are single characters, so the bug
      //    cannot trigger.
      //  - ICONV_UNUSABLE: this method is irrelevant.
      static $encodings = array();

      if (!$bypass) {
          if (isset($encodings[$encoding])) {
              return $encodings[$encoding];
          }
          $lenc = strtolower($encoding);
          // Precompiled answers for well-known encodings.
          $precompiled = array(
              'shift_jis' => array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'),
              'johab' => array("\xE2\x82\xA9" => '\\'),
          );
          if (isset($precompiled[$lenc])) {
              return $precompiled[$lenc];
          }
          if (strpos($lenc, 'iso-8859-') === 0) {
              return array();
          }
      }

      if (self::unsafeIconv('UTF-8', $encoding, 'a') === false) {
          return false;
      }

      $ret = array();
      foreach (range(0x20, 0x7E) as $ord) { // all printable ASCII chars
          $c = chr($ord); // UTF-8 char
          $r = self::unsafeIconv('UTF-8', $encoding . '//IGNORE', $c); // initial conversion

          $unsupported = ($r === '');
          if (!$unsupported && $r === $c) {
              // Needed for iconv implementations that do not omit
              // characters missing from the target character set.
              $unsupported = (self::unsafeIconv($encoding, 'UTF-8//IGNORE', $r) !== $c);
          }

          if ($unsupported) {
              // Reverse engineer which UTF-8 character this byte means
              // in $encoding. Assumes no variable width encoding lacks
              // ASCII support.
              $ret[self::unsafeIconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
          }
      }

      $encodings[$encoding] = $ret;
      return $ret;
  }
  3978. }
  /**
   * Holds the lookup table that maps entity names to the characters
   * they represent.
   */
  class HTMLPurifier_EntityLookup
  {
      /**
       * Assoc array of entity name to character represented.
       * @type array
       */
      public $table;

      /**
       * Loads the lookup table by unserializing the contents of a file.
       *
       * @param bool|string $file Path of the serialized table; any falsy
       *        value selects the bundled
       *        HTMLPURIFIER_PREFIX/HTMLPurifier/EntityLookup/entities.ser
       * @note The serialized contents are versioned, but were generated
       *       using the maintenance script generate_entity_file.php
       * @warning This is not in constructor to help enforce the Singleton
       */
      public function setup($file = false)
      {
          $path = $file
              ? $file
              : HTMLPURIFIER_PREFIX . '/HTMLPurifier/EntityLookup/entities.ser';
          $this->table = unserialize(file_get_contents($path));
      }

      /**
       * Returns the shared lookup instance, creating and loading it on
       * first use.
       *
       * @param bool|HTMLPurifier_EntityLookup $prototype When truthy,
       *        replaces the shared instance with this object.
       * @return HTMLPurifier_EntityLookup
       */
      public static function instance($prototype = false)
      {
          // no references, since PHP doesn't copy unless modified
          static $instance = null;

          if ($prototype) {
              $instance = $prototype;
              return $instance;
          }

          if (!$instance) {
              $instance = new HTMLPurifier_EntityLookup();
              $instance->setup();
          }
          return $instance;
      }
  }
  4021. // if want to implement error collecting here, we'll need to use some sort
  4022. // of global data (probably trigger_error) because it's impossible to pass
  4023. // $config or $context to the callback functions.
  /**
   * Handles referencing and dereferencing character entities
   */
  class HTMLPurifier_EntityParser
  {
      /**
       * Entity lookup table, fetched lazily on the first named entity.
       * @type HTMLPurifier_EntityLookup
       */
      protected $_entity_lookup;

      /**
       * Callback regex string for entities in text.
       * @type string
       */
      protected $_textEntitiesRegex;

      /**
       * Callback regex string for entities in attributes.
       * @type string
       */
      protected $_attrEntitiesRegex;

      /**
       * Regex matching a semicolon-optional entity name directly after
       * an ampersand; used to retry an unknown named entity as a known
       * prefix.
       * @type string
       */
      protected $_semiOptionalPrefixRegex;

      // LEGACY MEMBERS

      /**
       * Callback regex string for parsing entities.
       * Capture groups: 1. hex 2. dec 3. string (XML style)
       * @type string
       */
      protected $_substituteEntitiesRegex =
          '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';

      /**
       * Decimal to parsed string conversion table for special entities.
       * @type array
       */
      protected $_special_dec2str =
          array(
              34 => '"',
              38 => '&',
              39 => "'",
              60 => '<',
              62 => '>'
          );

      /**
       * Stripped entity names to decimal conversion table for special entities.
       * @type array
       */
      protected $_special_ent2dec =
          array(
              'quot' => 34,
              'amp' => 38,
              'lt' => 60,
              'gt' => 62
          );

      /**
       * Builds the text, attribute and prefix regexes around the list of
       * entity names whose trailing semicolon is optional.
       */
      public function __construct() {
          // From
          // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
          $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";

          // NB: three empty captures so the name lands in group 4, the
          // same position it has in the two regexes below.
          $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";

          $this->_textEntitiesRegex =
              '/&(?:'.
              // hex
              '[#]x([a-fA-F0-9]+);?|'.
              // dec
              '[#]0*(\d+);?|'.
              // string (mandatory semicolon)
              // NB: order matters: match semicolon preferentially
              '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
              // string (optional semicolon)
              "($semi_optional)".
              ')/';

          $this->_attrEntitiesRegex =
              '/&(?:'.
              // hex
              '[#]x([a-fA-F0-9]+);?|'.
              // dec
              '[#]0*(\d+);?|'.
              // string (mandatory semicolon)
              // NB: order matters: match semicolon preferentially
              '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
              // string (optional semicolon)
              // don't match if trailing is equals or alphanumeric (URL
              // like)
              "($semi_optional)(?![=;A-Za-z0-9])".
              ')/';
      }

      /**
       * Substitute entities with the parsed equivalents. Use this on
       * textual data in an HTML document (as opposed to attributes.)
       *
       * @param string $string String to have entities parsed.
       * @return string Parsed string.
       */
      public function substituteTextEntities($string)
      {
          return preg_replace_callback(
              $this->_textEntitiesRegex,
              array($this, 'entityCallback'),
              $string
          );
      }

      /**
       * Substitute entities with the parsed equivalents. Use this on
       * attribute contents in documents.
       *
       * @param string $string String to have entities parsed.
       * @return string Parsed string.
       */
      public function substituteAttrEntities($string)
      {
          return preg_replace_callback(
              $this->_attrEntitiesRegex,
              array($this, 'entityCallback'),
              $string
          );
      }

      /**
       * Callback for substituteTextEntities() and
       * substituteAttrEntities() that does the work.
       *
       * @param array $matches PCRE matches array, with 0 the entire match,
       *        1 a hex value, 2 a dec value, 3 a name followed by a
       *        semicolon and 4 a semicolon-optional name. PCRE omits
       *        trailing groups that did not participate, so any of 2-4
       *        may be absent.
       * @return string Replacement string.
       */
      protected function entityCallback($matches)
      {
          $entity = $matches[0];
          // Explicit isset checks instead of @-suppression: user error
          // handlers are still invoked for suppressed errors.
          $hex_part = isset($matches[1]) ? $matches[1] : null;
          $dec_part = isset($matches[2]) ? $matches[2] : null;
          if (!empty($matches[3])) {
              $named_part = $matches[3];
          } else {
              $named_part = isset($matches[4]) ? $matches[4] : null;
          }

          if ($hex_part !== null && $hex_part !== "") {
              return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
          }
          if ($dec_part !== null && $dec_part !== "") {
              return HTMLPurifier_Encoder::unichr((int) $dec_part);
          }

          if (!$this->_entity_lookup) {
              $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
          }
          if (isset($this->_entity_lookup->table[$named_part])) {
              return $this->_entity_lookup->table[$named_part];
          }

          // The exact name is unknown, so test whether a
          // semicolon-optional entity is a prefix of it. Only do this
          // for group 3 matches: the retry itself produces group 4
          // matches, and retrying those would loop forever.
          if (!empty($matches[3])) {
              return preg_replace_callback(
                  $this->_semiOptionalPrefixRegex,
                  array($this, 'entityCallback'),
                  $entity
              );
          }
          return $entity;
      }

      // LEGACY CODE BELOW

      /**
       * Substitutes non-special entities with their parsed equivalents.
       * Running this on every parsed section would be expensive, so it
       * is meant to be run once before everything else.
       *
       * @param string $string String to have non-special entities parsed.
       * @return string Parsed string.
       */
      public function substituteNonSpecialEntities($string)
      {
          // it will try to detect missing semicolons, but don't rely on it
          return preg_replace_callback(
              $this->_substituteEntitiesRegex,
              array($this, 'nonSpecialEntityCallback'),
              $string
          );
      }

      /**
       * Callback function for substituteNonSpecialEntities() that does the work.
       *
       * @param array $matches PCRE matches array, with 0 the entire match, and
       *        either index 1, 2 or 3 set with a hex value, dec value,
       *        or string (respectively).
       * @return string Replacement string.
       */
      protected function nonSpecialEntityCallback($matches)
      {
          // replaces all but the special entities (quot, amp, apos, lt, gt)
          $entity = $matches[0];
          $is_num = (isset($entity[1]) && $entity[1] === '#');
          if ($is_num) {
              $is_hex = (isset($entity[2]) && $entity[2] === 'x');
              $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
              // abort for special characters
              if (isset($this->_special_dec2str[$code])) {
                  return $entity;
              }
              return HTMLPurifier_Encoder::unichr($code);
          }

          if (isset($this->_special_ent2dec[$matches[3]])) {
              return $entity;
          }
          if (!$this->_entity_lookup) {
              $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
          }
          if (isset($this->_entity_lookup->table[$matches[3]])) {
              return $this->_entity_lookup->table[$matches[3]];
          }
          return $entity;
      }

      /**
       * Substitutes only special entities with their parsed equivalents.
       *
       * @notice We try to avoid calling this function because otherwise, it
       * would have to be called a lot (for every parsed section).
       *
       * @param string $string String to have special entities parsed.
       * @return string Parsed string.
       */
      public function substituteSpecialEntities($string)
      {
          return preg_replace_callback(
              $this->_substituteEntitiesRegex,
              array($this, 'specialEntityCallback'),
              $string
          );
      }

      /**
       * Callback function for substituteSpecialEntities() that does the work.
       *
       * This callback has same syntax as nonSpecialEntityCallback().
       *
       * @param array $matches PCRE-style matches array, with 0 the entire match, and
       *        either index 1, 2 or 3 set with a hex value, dec value,
       *        or string (respectively).
       * @return string Replacement string.
       */
      protected function specialEntityCallback($matches)
      {
          $entity = $matches[0];
          $is_num = (isset($entity[1]) && $entity[1] === '#');
          if ($is_num) {
              $is_hex = (isset($entity[2]) && $entity[2] === 'x');
              $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
              return isset($this->_special_dec2str[$int]) ?
                  $this->_special_dec2str[$int] :
                  $entity;
          }
          return isset($this->_special_ent2dec[$matches[3]]) ?
              $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] :
              $entity;
      }
  }
  4279. /**
  4280. * Error collection class that enables HTML Purifier to report HTML
  4281. * problems back to the user
  4282. */
  4283. class HTMLPurifier_ErrorCollector
  4284. {
  4285. /**
  4286. * Identifiers for the returned error array. These are purposely numeric
  4287. * so list() can be used.
  4288. */
  4289. const LINENO = 0;
  4290. const SEVERITY = 1;
  4291. const MESSAGE = 2;
  4292. const CHILDREN = 3;
  4293. /**
  4294. * @type array
  4295. */
  4296. protected $errors;
  4297. /**
  4298. * @type array
  4299. */
  4300. protected $_current;
  4301. /**
  4302. * @type array
  4303. */
  4304. protected $_stacks = array(array());
  4305. /**
  4306. * @type HTMLPurifier_Language
  4307. */
  4308. protected $locale;
  4309. /**
  4310. * @type HTMLPurifier_Generator
  4311. */
  4312. protected $generator;
  4313. /**
  4314. * @type HTMLPurifier_Context
  4315. */
  4316. protected $context;
  4317. /**
  4318. * @type array
  4319. */
  4320. protected $lines = array();
  /**
   * Stores the context, takes the 'Locale' entry from it, and points
   * both the error list and the current-stack cursor at the first
   * (root) stack.
   *
   * @param HTMLPurifier_Context $context
   */
  public function __construct($context)
  {
      $this->context = $context;
      $this->locale =& $context->get('Locale');
      // $errors and $_current both alias the root stack.
      $this->errors =& $this->_stacks[0];
      $this->_current =& $this->_stacks[0];
  }
  /**
   * Sends an error message to the collector for later use.
   *
   * The message is localized through the locale object, recorded in the
   * flat error list, and also attached to an error struct keyed by the
   * line and column the error occurred at. Any arguments after $msg are
   * passed on as formatting arguments for the message.
   *
   * @param int $severity Error severity, PHP error style (don't use E_USER_)
   * @param string $msg Error message text
   */
  public function send($severity, $msg)
  {
      // Extra call arguments become formatting arguments keyed from 1:
      // drop $severity, then remove $msg from slot 0.
      $args = array();
      if (func_num_args() > 2) {
          $args = array_slice(func_get_args(), 1);
          unset($args[0]);
      }

      // Work out where the error happened: prefer the current token,
      // fall back to the line/column stored in the context.
      $token = $this->context->get('CurrentToken', true);
      if ($token) {
          $line = $token->line;
          $col = $token->col;
      } else {
          $line = $this->context->get('CurrentLine', true);
          $col = $this->context->get('CurrentCol', true);
      }
      $attr = $this->context->get('CurrentAttr', true);

      // perform special substitutions, also add custom parameters
      $subst = array();
      if ($token !== null) {
          $args['CurrentToken'] = $token;
      }
      if ($attr !== null) {
          $subst['$CurrentAttr.Name'] = $attr;
          if (isset($token->attr[$attr])) {
              $subst['$CurrentAttr.Value'] = $token->attr[$attr];
          }
      }

      $msg = empty($args)
          ? $this->locale->getMessage($msg)
          : $this->locale->formatMessage($msg, $args);
      if (!empty($subst)) {
          $msg = strtr($msg, $subst);
      }

      // Flat record (numerically indexed)
      $this->_current[] = array(
          self::LINENO => $line,
          self::SEVERITY => $severity,
          self::MESSAGE => $msg,
          self::CHILDREN => array()
      );

      // NEW CODE BELOW ...
      // Top-level errors are either:
      //  TOKEN type, if $value is set appropriately, or
      //  "syntax" type, if $value is null
      $new_struct = new HTMLPurifier_ErrorStruct();
      $new_struct->type = HTMLPurifier_ErrorStruct::TOKEN;
      if ($token) {
          $new_struct->value = clone $token;
      }

      if (is_int($line) && is_int($col)) {
          // Reuse the struct registered at this position, if any.
          if (!isset($this->lines[$line][$col])) {
              $this->lines[$line][$col] = $new_struct;
          }
          $struct = $this->lines[$line][$col];
          // These ksorts may present a performance problem
          ksort($this->lines[$line], SORT_NUMERIC);
      } else {
          // No usable position: everything is collected under line -1.
          if (!isset($this->lines[-1])) {
              $this->lines[-1] = $new_struct;
          }
          $struct = $this->lines[-1];
      }
      ksort($this->lines, SORT_NUMERIC);

      // Now, check if we need to operate on a lower structure
      if (!empty($attr)) {
          $struct = $struct->getChild(HTMLPurifier_ErrorStruct::ATTR, $attr);
          if (!$struct->value) {
              $struct->value = array($attr, 'PUT VALUE HERE');
          }
      }
      // NOTE: $cssprop is not assigned anywhere in this method, so as
      // written this branch is never taken.
      if (!empty($cssprop)) {
          $struct = $struct->getChild(HTMLPurifier_ErrorStruct::CSSPROP, $cssprop);
          if (!$struct->value) {
              // if we tokenize CSS this might be a little more difficult to do
              $struct->value = array($cssprop, 'PUT VALUE HERE');
          }
      }

      // Ok, structs are all setup, now time to register the error
      $struct->addError($severity, $msg);
  }
  4417. /**
  4418. * Retrieves raw error data for custom formatter to use
  4419. */
  4420. public function getRaw()
  4421. {
  4422. return $this->errors;
  4423. }
  4424. /**
  4425. * Default HTML formatting implementation for error messages
  4426. * @param HTMLPurifier_Config $config Configuration, vital for HTML output nature
  4427. * @param array $errors Errors array to display; used for recursion.
  4428. * @return string
  4429. */
  4430. public function getHTMLFormatted($config, $errors = null)
  4431. {
  4432. $ret = array();
  4433. $this->generator = new HTMLPurifier_Generator($config, $this->context);
  4434. if ($errors === null) {
  4435. $errors = $this->errors;
  4436. }
  4437. // 'At line' message needs to be removed
  4438. // generation code for new structure goes here. It needs to be recursive.
  4439. foreach ($this->lines as $line => $col_array) {
  4440. if ($line == -1) {
  4441. continue;
  4442. }
  4443. foreach ($col_array as $col => $struct) {
  4444. $this->_renderStruct($ret, $struct, $line, $col);
  4445. }
  4446. }
  4447. if (isset($this->lines[-1])) {
  4448. $this->_renderStruct($ret, $this->lines[-1]);
  4449. }
  4450. if (empty($errors)) {
  4451. return '<p>' . $this->locale->getMessage('ErrorCollector: No errors') . '</p>';
  4452. } else {
  4453. return '<ul><li>' . implode('</li><li>', $ret) . '</li></ul>';
  4454. }
  4455. }
  4456. private function _renderStruct(&$ret, $struct, $line = null, $col = null)
  4457. {
  4458. $stack = array($struct);
  4459. $context_stack = array(array());
  4460. while ($current = array_pop($stack)) {
  4461. $context = array_pop($context_stack);
  4462. foreach ($current->errors as $error) {
  4463. list($severity, $msg) = $error;
  4464. $string = '';
  4465. $string .= '<div>';
  4466. // W3C uses an icon to indicate the severity of the error.
  4467. $error = $this->locale->getErrorName($severity);
  4468. $string .= "<span class=\"error e$severity\"><strong>$error</strong></span> ";
  4469. if (!is_null($line) && !is_null($col)) {
  4470. $string .= "<em class=\"location\">Line $line, Column $col: </em> ";
  4471. } else {
  4472. $string .= '<em class="location">End of Document: </em> ';
  4473. }
  4474. $string .= '<strong class="description">' . $this->generator->escape($msg) . '</strong> ';
  4475. $string .= '</div>';
  4476. // Here, have a marker for the character on the column appropriate.
  4477. // Be sure to clip extremely long lines.
  4478. //$string .= '<pre>';
  4479. //$string .= '';
  4480. //$string .= '</pre>';
  4481. $ret[] = $string;
  4482. }
  4483. foreach ($current->children as $array) {
  4484. $context[] = $current;
  4485. $stack = array_merge($stack, array_reverse($array, true));
  4486. for ($i = count($array); $i > 0; $i--) {
  4487. $context_stack[] = $context;
  4488. }
  4489. }
  4490. }
  4491. }
  4492. }
  4493. /**
  4494. * Records errors for particular segments of an HTML document such as tokens,
  4495. * attributes or CSS properties. They can contain error structs (which apply
  4496. * to components of what they represent), but their main purpose is to hold
  4497. * errors applying to whatever struct is being used.
  4498. */
  4499. class HTMLPurifier_ErrorStruct
  4500. {
  4501. /**
  4502. * Possible values for $children first-key. Note that top-level structures
  4503. * are automatically token-level.
  4504. */
  4505. const TOKEN = 0;
  4506. const ATTR = 1;
  4507. const CSSPROP = 2;
  4508. /**
  4509. * Type of this struct.
  4510. * @type string
  4511. */
  4512. public $type;
  4513. /**
  4514. * Value of the struct we are recording errors for. There are various
  4515. * values for this:
  4516. * - TOKEN: Instance of HTMLPurifier_Token
  4517. * - ATTR: array('attr-name', 'value')
  4518. * - CSSPROP: array('prop-name', 'value')
  4519. * @type mixed
  4520. */
  4521. public $value;
  4522. /**
  4523. * Errors registered for this structure.
  4524. * @type array
  4525. */
  4526. public $errors = array();
  4527. /**
  4528. * Child ErrorStructs that are from this structure. For example, a TOKEN
  4529. * ErrorStruct would contain ATTR ErrorStructs. This is a multi-dimensional
  4530. * array in structure: [TYPE]['identifier']
  4531. * @type array
  4532. */
  4533. public $children = array();
  4534. /**
  4535. * @param string $type
  4536. * @param string $id
  4537. * @return mixed
  4538. */
  4539. public function getChild($type, $id)
  4540. {
  4541. if (!isset($this->children[$type][$id])) {
  4542. $this->children[$type][$id] = new HTMLPurifier_ErrorStruct();
  4543. $this->children[$type][$id]->type = $type;
  4544. }
  4545. return $this->children[$type][$id];
  4546. }
  4547. /**
  4548. * @param int $severity
  4549. * @param string $message
  4550. */
  4551. public function addError($severity, $message)
  4552. {
  4553. $this->errors[] = array($severity, $message);
  4554. }
  4555. }
  4556. /**
  4557. * Global exception class for HTML Purifier; any exceptions we throw
  4558. * are from here.
  4559. */
  4560. class HTMLPurifier_Exception extends Exception
  4561. {
  4562. }
  4563. /**
  4564. * Represents a pre or post processing filter on HTML Purifier's output
  4565. *
  4566. * Sometimes, a little ad-hoc fixing of HTML has to be done before
  4567. * it gets sent through HTML Purifier: you can use filters to achieve
  4568. * this effect. For instance, YouTube videos can be preserved using
  4569. * this manner. You could have used a decorator for this task, but
  4570. * PHP's support for them is not terribly robust, so we're going
  4571. * to just loop through the filters.
  4572. *
  4573. * Filters should be exited first in, last out. If there are three filters,
  4574. * named 1, 2 and 3, the order of execution should go 1->preFilter,
  4575. * 2->preFilter, 3->preFilter, purify, 3->postFilter, 2->postFilter,
  4576. * 1->postFilter.
  4577. *
  4578. * @note Methods are not declared abstract as it is perfectly legitimate
  4579. * for an implementation not to want anything to happen on a step
  4580. */
  4581. class HTMLPurifier_Filter
  4582. {
  4583. /**
  4584. * Name of the filter for identification purposes.
  4585. * @type string
  4586. */
  4587. public $name;
  4588. /**
  4589. * Pre-processor function, handles HTML before HTML Purifier
  4590. * @param string $html
  4591. * @param HTMLPurifier_Config $config
  4592. * @param HTMLPurifier_Context $context
  4593. * @return string
  4594. */
  4595. public function preFilter($html, $config, $context)
  4596. {
  4597. return $html;
  4598. }
  4599. /**
  4600. * Post-processor function, handles HTML after HTML Purifier
  4601. * @param string $html
  4602. * @param HTMLPurifier_Config $config
  4603. * @param HTMLPurifier_Context $context
  4604. * @return string
  4605. */
  4606. public function postFilter($html, $config, $context)
  4607. {
  4608. return $html;
  4609. }
  4610. }
  4611. /**
  4612. * Generates HTML from tokens.
  4613. * @todo Refactor interface so that configuration/context is determined
  4614. * upon instantiation, no need for messy generateFromTokens() calls
  4615. * @todo Make some of the more internal functions protected, and have
  4616. * unit tests work around that
  4617. */
  4618. class HTMLPurifier_Generator
  4619. {
  4620. /**
  4621. * Whether or not generator should produce XML output.
  4622. * @type bool
  4623. */
  4624. private $_xhtml = true;
  4625. /**
  4626. * :HACK: Whether or not generator should comment the insides of <script> tags.
  4627. * @type bool
  4628. */
  4629. private $_scriptFix = false;
  4630. /**
  4631. * Cache of HTMLDefinition during HTML output to determine whether or
  4632. * not attributes should be minimized.
  4633. * @type HTMLPurifier_HTMLDefinition
  4634. */
  4635. private $_def;
  4636. /**
  4637. * Cache of %Output.SortAttr.
  4638. * @type bool
  4639. */
  4640. private $_sortAttr;
  4641. /**
  4642. * Cache of %Output.FlashCompat.
  4643. * @type bool
  4644. */
  4645. private $_flashCompat;
  4646. /**
  4647. * Cache of %Output.FixInnerHTML.
  4648. * @type bool
  4649. */
  4650. private $_innerHTMLFix;
  4651. /**
  4652. * Stack for keeping track of object information when outputting IE
  4653. * compatibility code.
  4654. * @type array
  4655. */
  4656. private $_flashStack = array();
  4657. /**
  4658. * Configuration for the generator
  4659. * @type HTMLPurifier_Config
  4660. */
  4661. protected $config;
  4662. /**
  4663. * @param HTMLPurifier_Config $config
  4664. * @param HTMLPurifier_Context $context
  4665. */
  4666. public function __construct($config, $context)
  4667. {
  4668. $this->config = $config;
  4669. $this->_scriptFix = $config->get('Output.CommentScriptContents');
  4670. $this->_innerHTMLFix = $config->get('Output.FixInnerHTML');
  4671. $this->_sortAttr = $config->get('Output.SortAttr');
  4672. $this->_flashCompat = $config->get('Output.FlashCompat');
  4673. $this->_def = $config->getHTMLDefinition();
  4674. $this->_xhtml = $this->_def->doctype->xml;
  4675. }
  4676. /**
  4677. * Generates HTML from an array of tokens.
  4678. * @param HTMLPurifier_Token[] $tokens Array of HTMLPurifier_Token
  4679. * @return string Generated HTML
  4680. */
  4681. public function generateFromTokens($tokens)
  4682. {
  4683. if (!$tokens) {
  4684. return '';
  4685. }
  4686. // Basic algorithm
  4687. $html = '';
  4688. for ($i = 0, $size = count($tokens); $i < $size; $i++) {
  4689. if ($this->_scriptFix && $tokens[$i]->name === 'script'
  4690. && $i + 2 < $size && $tokens[$i+2] instanceof HTMLPurifier_Token_End) {
  4691. // script special case
  4692. // the contents of the script block must be ONE token
  4693. // for this to work.
  4694. $html .= $this->generateFromToken($tokens[$i++]);
  4695. $html .= $this->generateScriptFromToken($tokens[$i++]);
  4696. }
  4697. $html .= $this->generateFromToken($tokens[$i]);
  4698. }
  4699. // Tidy cleanup
  4700. if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
  4701. $tidy = new Tidy;
  4702. $tidy->parseString(
  4703. $html,
  4704. array(
  4705. 'indent'=> true,
  4706. 'output-xhtml' => $this->_xhtml,
  4707. 'show-body-only' => true,
  4708. 'indent-spaces' => 2,
  4709. 'wrap' => 68,
  4710. ),
  4711. 'utf8'
  4712. );
  4713. $tidy->cleanRepair();
  4714. $html = (string) $tidy; // explicit cast necessary
  4715. }
  4716. // Normalize newlines to system defined value
  4717. if ($this->config->get('Core.NormalizeNewlines')) {
  4718. $nl = $this->config->get('Output.Newline');
  4719. if ($nl === null) {
  4720. $nl = PHP_EOL;
  4721. }
  4722. if ($nl !== "\n") {
  4723. $html = str_replace("\n", $nl, $html);
  4724. }
  4725. }
  4726. return $html;
  4727. }
  4728. /**
  4729. * Generates HTML from a single token.
  4730. * @param HTMLPurifier_Token $token HTMLPurifier_Token object.
  4731. * @return string Generated HTML
  4732. */
  4733. public function generateFromToken($token)
  4734. {
  4735. if (!$token instanceof HTMLPurifier_Token) {
  4736. trigger_error('Cannot generate HTML from non-HTMLPurifier_Token object', E_USER_WARNING);
  4737. return '';
  4738. } elseif ($token instanceof HTMLPurifier_Token_Start) {
  4739. $attr = $this->generateAttributes($token->attr, $token->name);
  4740. if ($this->_flashCompat) {
  4741. if ($token->name == "object") {
  4742. $flash = new stdClass();
  4743. $flash->attr = $token->attr;
  4744. $flash->param = array();
  4745. $this->_flashStack[] = $flash;
  4746. }
  4747. }
  4748. return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
  4749. } elseif ($token instanceof HTMLPurifier_Token_End) {
  4750. $_extra = '';
  4751. if ($this->_flashCompat) {
  4752. if ($token->name == "object" && !empty($this->_flashStack)) {
  4753. // doesn't do anything for now
  4754. }
  4755. }
  4756. return $_extra . '</' . $token->name . '>';
  4757. } elseif ($token instanceof HTMLPurifier_Token_Empty) {
  4758. if ($this->_flashCompat && $token->name == "param" && !empty($this->_flashStack)) {
  4759. $this->_flashStack[count($this->_flashStack)-1]->param[$token->attr['name']] = $token->attr['value'];
  4760. }
  4761. $attr = $this->generateAttributes($token->attr, $token->name);
  4762. return '<' . $token->name . ($attr ? ' ' : '') . $attr .
  4763. ( $this->_xhtml ? ' /': '' ) // <br /> v. <br>
  4764. . '>';
  4765. } elseif ($token instanceof HTMLPurifier_Token_Text) {
  4766. return $this->escape($token->data, ENT_NOQUOTES);
  4767. } elseif ($token instanceof HTMLPurifier_Token_Comment) {
  4768. return '<!--' . $token->data . '-->';
  4769. } else {
  4770. return '';
  4771. }
  4772. }
  4773. /**
  4774. * Special case processor for the contents of script tags
  4775. * @param HTMLPurifier_Token $token HTMLPurifier_Token object.
  4776. * @return string
  4777. * @warning This runs into problems if there's already a literal
  4778. * --> somewhere inside the script contents.
  4779. */
  4780. public function generateScriptFromToken($token)
  4781. {
  4782. if (!$token instanceof HTMLPurifier_Token_Text) {
  4783. return $this->generateFromToken($token);
  4784. }
  4785. // Thanks <http://lachy.id.au/log/2005/05/script-comments>
  4786. $data = preg_replace('#//\s*$#', '', $token->data);
  4787. return '<!--//--><![CDATA[//><!--' . "\n" . trim($data) . "\n" . '//--><!]]>';
  4788. }
  4789. /**
  4790. * Generates attribute declarations from attribute array.
  4791. * @note This does not include the leading or trailing space.
  4792. * @param array $assoc_array_of_attributes Attribute array
  4793. * @param string $element Name of element attributes are for, used to check
  4794. * attribute minimization.
  4795. * @return string Generated HTML fragment for insertion.
  4796. */
  4797. public function generateAttributes($assoc_array_of_attributes, $element = '')
  4798. {
  4799. $html = '';
  4800. if ($this->_sortAttr) {
  4801. ksort($assoc_array_of_attributes);
  4802. }
  4803. foreach ($assoc_array_of_attributes as $key => $value) {
  4804. if (!$this->_xhtml) {
  4805. // Remove namespaced attributes
  4806. if (strpos($key, ':') !== false) {
  4807. continue;
  4808. }
  4809. // Check if we should minimize the attribute: val="val" -> val
  4810. if ($element && !empty($this->_def->info[$element]->attr[$key]->minimized)) {
  4811. $html .= $key . ' ';
  4812. continue;
  4813. }
  4814. }
  4815. // Workaround for Internet Explorer innerHTML bug.
  4816. // Essentially, Internet Explorer, when calculating
  4817. // innerHTML, omits quotes if there are no instances of
  4818. // angled brackets, quotes or spaces. However, when parsing
  4819. // HTML (for example, when you assign to innerHTML), it
  4820. // treats backticks as quotes. Thus,
  4821. // <img alt="``" />
  4822. // becomes
  4823. // <img alt=`` />
  4824. // becomes
  4825. // <img alt='' />
  4826. // Fortunately, all we need to do is trigger an appropriate
  4827. // quoting style, which we do by adding an extra space.
  4828. // This also is consistent with the W3C spec, which states
  4829. // that user agents may ignore leading or trailing
  4830. // whitespace (in fact, most don't, at least for attributes
  4831. // like alt, but an extra space at the end is barely
  4832. // noticeable). Still, we have a configuration knob for
  4833. // this, since this transformation is not necesary if you
  4834. // don't process user input with innerHTML or you don't plan
  4835. // on supporting Internet Explorer.
  4836. if ($this->_innerHTMLFix) {
  4837. if (strpos($value, '`') !== false) {
  4838. // check if correct quoting style would not already be
  4839. // triggered
  4840. if (strcspn($value, '"\' <>') === strlen($value)) {
  4841. // protect!
  4842. $value .= ' ';
  4843. }
  4844. }
  4845. }
  4846. $html .= $key.'="'.$this->escape($value).'" ';
  4847. }
  4848. return rtrim($html);
  4849. }
  4850. /**
  4851. * Escapes raw text data.
  4852. * @todo This really ought to be protected, but until we have a facility
  4853. * for properly generating HTML here w/o using tokens, it stays
  4854. * public.
  4855. * @param string $string String data to escape for HTML.
  4856. * @param int $quote Quoting style, like htmlspecialchars. ENT_NOQUOTES is
  4857. * permissible for non-attribute output.
  4858. * @return string escaped data.
  4859. */
  4860. public function escape($string, $quote = null)
  4861. {
  4862. // Workaround for APC bug on Mac Leopard reported by sidepodcast
  4863. // http://htmlpurifier.org/phorum/read.php?3,4823,4846
  4864. if ($quote === null) {
  4865. $quote = ENT_COMPAT;
  4866. }
  4867. return htmlspecialchars($string, $quote, 'UTF-8');
  4868. }
  4869. }
  4870. /**
  4871. * Definition of the purified HTML that describes allowed children,
  4872. * attributes, and many other things.
  4873. *
  4874. * Conventions:
  4875. *
  4876. * All member variables that are prefixed with info
  4877. * (including the main $info array) are used by HTML Purifier internals
  4878. * and should not be directly edited when customizing the HTMLDefinition.
  4879. * They can usually be set via configuration directives or custom
  4880. * modules.
  4881. *
  4882. * On the other hand, member variables without the info prefix are used
  4883. * internally by the HTMLDefinition and MUST NOT be used by other HTML
  4884. * Purifier internals. Many of them, however, are public, and may be
  4885. * edited by userspace code to tweak the behavior of HTMLDefinition.
  4886. *
  4887. * @note This class is inspected by Printer_HTMLDefinition; please
  4888. * update that class if things here change.
  4889. *
  4890. * @warning Directives that change this object's structure must be in
  4891. * the HTML or Attr namespace!
  4892. */
  4893. class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
  4894. {
  4895. // FULLY-PUBLIC VARIABLES ---------------------------------------------
  4896. /**
  4897. * Associative array of element names to HTMLPurifier_ElementDef.
  4898. * @type HTMLPurifier_ElementDef[]
  4899. */
  4900. public $info = array();
  4901. /**
  4902. * Associative array of global attribute name to attribute definition.
  4903. * @type array
  4904. */
  4905. public $info_global_attr = array();
  4906. /**
  4907. * String name of parent element HTML will be going into.
  4908. * @type string
  4909. */
  4910. public $info_parent = 'div';
  4911. /**
  4912. * Definition for parent element, allows parent element to be a
  4913. * tag that's not allowed inside the HTML fragment.
  4914. * @type HTMLPurifier_ElementDef
  4915. */
  4916. public $info_parent_def;
  4917. /**
  4918. * String name of element used to wrap inline elements in block context.
  4919. * @type string
  4920. * @note This is rarely used except for BLOCKQUOTEs in strict mode
  4921. */
  4922. public $info_block_wrapper = 'p';
  4923. /**
  4924. * Associative array of deprecated tag name to HTMLPurifier_TagTransform.
  4925. * @type array
  4926. */
  4927. public $info_tag_transform = array();
  4928. /**
  4929. * Indexed list of HTMLPurifier_AttrTransform to be performed before validation.
  4930. * @type HTMLPurifier_AttrTransform[]
  4931. */
  4932. public $info_attr_transform_pre = array();
  4933. /**
  4934. * Indexed list of HTMLPurifier_AttrTransform to be performed after validation.
  4935. * @type HTMLPurifier_AttrTransform[]
  4936. */
  4937. public $info_attr_transform_post = array();
  4938. /**
  4939. * Nested lookup array of content set name (Block, Inline) to
  4940. * element name to whether or not it belongs in that content set.
  4941. * @type array
  4942. */
  4943. public $info_content_sets = array();
  4944. /**
  4945. * Indexed list of HTMLPurifier_Injector to be used.
  4946. * @type HTMLPurifier_Injector[]
  4947. */
  4948. public $info_injector = array();
  4949. /**
  4950. * Doctype object
  4951. * @type HTMLPurifier_Doctype
  4952. */
  4953. public $doctype;
  4954. // RAW CUSTOMIZATION STUFF --------------------------------------------
  4955. /**
  4956. * Adds a custom attribute to a pre-existing element
  4957. * @note This is strictly convenience, and does not have a corresponding
  4958. * method in HTMLPurifier_HTMLModule
  4959. * @param string $element_name Element name to add attribute to
  4960. * @param string $attr_name Name of attribute
  4961. * @param mixed $def Attribute definition, can be string or object, see
  4962. * HTMLPurifier_AttrTypes for details
  4963. */
  4964. public function addAttribute($element_name, $attr_name, $def)
  4965. {
  4966. $module = $this->getAnonymousModule();
  4967. if (!isset($module->info[$element_name])) {
  4968. $element = $module->addBlankElement($element_name);
  4969. } else {
  4970. $element = $module->info[$element_name];
  4971. }
  4972. $element->attr[$attr_name] = $def;
  4973. }
  4974. /**
  4975. * Adds a custom element to your HTML definition
  4976. * @see HTMLPurifier_HTMLModule::addElement() for detailed
  4977. * parameter and return value descriptions.
  4978. */
  4979. public function addElement($element_name, $type, $contents, $attr_collections, $attributes = array())
  4980. {
  4981. $module = $this->getAnonymousModule();
  4982. // assume that if the user is calling this, the element
  4983. // is safe. This may not be a good idea
  4984. $element = $module->addElement($element_name, $type, $contents, $attr_collections, $attributes);
  4985. return $element;
  4986. }
  4987. /**
  4988. * Adds a blank element to your HTML definition, for overriding
  4989. * existing behavior
  4990. * @param string $element_name
  4991. * @return HTMLPurifier_ElementDef
  4992. * @see HTMLPurifier_HTMLModule::addBlankElement() for detailed
  4993. * parameter and return value descriptions.
  4994. */
  4995. public function addBlankElement($element_name)
  4996. {
  4997. $module = $this->getAnonymousModule();
  4998. $element = $module->addBlankElement($element_name);
  4999. return $element;
  5000. }
  5001. /**
  5002. * Retrieves a reference to the anonymous module, so you can
  5003. * bust out advanced features without having to make your own
  5004. * module.
  5005. * @return HTMLPurifier_HTMLModule
  5006. */
  5007. public function getAnonymousModule()
  5008. {
  5009. if (!$this->_anonModule) {
  5010. $this->_anonModule = new HTMLPurifier_HTMLModule();
  5011. $this->_anonModule->name = 'Anonymous';
  5012. }
  5013. return $this->_anonModule;
  5014. }
  5015. private $_anonModule = null;
  5016. // PUBLIC BUT INTERNAL VARIABLES --------------------------------------
  5017. /**
  5018. * @type string
  5019. */
  5020. public $type = 'HTML';
  5021. /**
  5022. * @type HTMLPurifier_HTMLModuleManager
  5023. */
  5024. public $manager;
  5025. /**
  5026. * Performs low-cost, preliminary initialization.
  5027. */
  5028. public function __construct()
  5029. {
  5030. $this->manager = new HTMLPurifier_HTMLModuleManager();
  5031. }
  5032. /**
  5033. * @param HTMLPurifier_Config $config
  5034. */
  5035. protected function doSetup($config)
  5036. {
  5037. $this->processModules($config);
  5038. $this->setupConfigStuff($config);
  5039. unset($this->manager);
  5040. // cleanup some of the element definitions
  5041. foreach ($this->info as $k => $v) {
  5042. unset($this->info[$k]->content_model);
  5043. unset($this->info[$k]->content_model_type);
  5044. }
  5045. }
  5046. /**
  5047. * Extract out the information from the manager
  5048. * @param HTMLPurifier_Config $config
  5049. */
  5050. protected function processModules($config)
  5051. {
  5052. if ($this->_anonModule) {
  5053. // for user specific changes
  5054. // this is late-loaded so we don't have to deal with PHP4
  5055. // reference wonky-ness
  5056. $this->manager->addModule($this->_anonModule);
  5057. unset($this->_anonModule);
  5058. }
  5059. $this->manager->setup($config);
  5060. $this->doctype = $this->manager->doctype;
  5061. foreach ($this->manager->modules as $module) {
  5062. foreach ($module->info_tag_transform as $k => $v) {
  5063. if ($v === false) {
  5064. unset($this->info_tag_transform[$k]);
  5065. } else {
  5066. $this->info_tag_transform[$k] = $v;
  5067. }
  5068. }
  5069. foreach ($module->info_attr_transform_pre as $k => $v) {
  5070. if ($v === false) {
  5071. unset($this->info_attr_transform_pre[$k]);
  5072. } else {
  5073. $this->info_attr_transform_pre[$k] = $v;
  5074. }
  5075. }
  5076. foreach ($module->info_attr_transform_post as $k => $v) {
  5077. if ($v === false) {
  5078. unset($this->info_attr_transform_post[$k]);
  5079. } else {
  5080. $this->info_attr_transform_post[$k] = $v;
  5081. }
  5082. }
  5083. foreach ($module->info_injector as $k => $v) {
  5084. if ($v === false) {
  5085. unset($this->info_injector[$k]);
  5086. } else {
  5087. $this->info_injector[$k] = $v;
  5088. }
  5089. }
  5090. }
  5091. $this->info = $this->manager->getElements();
  5092. $this->info_content_sets = $this->manager->contentSets->lookup;
  5093. }
  5094. /**
  5095. * Sets up stuff based on config. We need a better way of doing this.
  5096. * @param HTMLPurifier_Config $config
  5097. */
  5098. protected function setupConfigStuff($config)
  5099. {
  5100. $block_wrapper = $config->get('HTML.BlockWrapper');
  5101. if (isset($this->info_content_sets['Block'][$block_wrapper])) {
  5102. $this->info_block_wrapper = $block_wrapper;
  5103. } else {
  5104. trigger_error(
  5105. 'Cannot use non-block element as block wrapper',
  5106. E_USER_ERROR
  5107. );
  5108. }
  5109. $parent = $config->get('HTML.Parent');
  5110. $def = $this->manager->getElement($parent, true);
  5111. if ($def) {
  5112. $this->info_parent = $parent;
  5113. $this->info_parent_def = $def;
  5114. } else {
  5115. trigger_error(
  5116. 'Cannot use unrecognized element as parent',
  5117. E_USER_ERROR
  5118. );
  5119. $this->info_parent_def = $this->manager->getElement($this->info_parent, true);
  5120. }
  5121. // support template text
  5122. $support = "(for information on implementing this, see the support forums) ";
  5123. // setup allowed elements -----------------------------------------
  5124. $allowed_elements = $config->get('HTML.AllowedElements');
  5125. $allowed_attributes = $config->get('HTML.AllowedAttributes'); // retrieve early
  5126. if (!is_array($allowed_elements) && !is_array($allowed_attributes)) {
  5127. $allowed = $config->get('HTML.Allowed');
  5128. if (is_string($allowed)) {
  5129. list($allowed_elements, $allowed_attributes) = $this->parseTinyMCEAllowedList($allowed);
  5130. }
  5131. }
  5132. if (is_array($allowed_elements)) {
  5133. foreach ($this->info as $name => $d) {
  5134. if (!isset($allowed_elements[$name])) {
  5135. unset($this->info[$name]);
  5136. }
  5137. unset($allowed_elements[$name]);
  5138. }
  5139. // emit errors
  5140. foreach ($allowed_elements as $element => $d) {
  5141. $element = htmlspecialchars($element); // PHP doesn't escape errors, be careful!
  5142. trigger_error("Element '$element' is not supported $support", E_USER_WARNING);
  5143. }
  5144. }
  5145. // setup allowed attributes ---------------------------------------
  5146. $allowed_attributes_mutable = $allowed_attributes; // by copy!
  5147. if (is_array($allowed_attributes)) {
  5148. // This actually doesn't do anything, since we went away from
  5149. // global attributes. It's possible that userland code uses
  5150. // it, but HTMLModuleManager doesn't!
  5151. foreach ($this->info_global_attr as $attr => $x) {
  5152. $keys = array($attr, "*@$attr", "*.$attr");
  5153. $delete = true;
  5154. foreach ($keys as $key) {
  5155. if ($delete && isset($allowed_attributes[$key])) {
  5156. $delete = false;
  5157. }
  5158. if (isset($allowed_attributes_mutable[$key])) {
  5159. unset($allowed_attributes_mutable[$key]);
  5160. }
  5161. }
  5162. if ($delete) {
  5163. unset($this->info_global_attr[$attr]);
  5164. }
  5165. }
  5166. foreach ($this->info as $tag => $info) {
  5167. foreach ($info->attr as $attr => $x) {
  5168. $keys = array("$tag@$attr", $attr, "*@$attr", "$tag.$attr", "*.$attr");
  5169. $delete = true;
  5170. foreach ($keys as $key) {
  5171. if ($delete && isset($allowed_attributes[$key])) {
  5172. $delete = false;
  5173. }
  5174. if (isset($allowed_attributes_mutable[$key])) {
  5175. unset($allowed_attributes_mutable[$key]);
  5176. }
  5177. }
  5178. if ($delete) {
  5179. if ($this->info[$tag]->attr[$attr]->required) {
  5180. trigger_error(
  5181. "Required attribute '$attr' in element '$tag' " .
  5182. "was not allowed, which means '$tag' will not be allowed either",
  5183. E_USER_WARNING
  5184. );
  5185. }
  5186. unset($this->info[$tag]->attr[$attr]);
  5187. }
  5188. }
  5189. }
  5190. // emit errors
  5191. foreach ($allowed_attributes_mutable as $elattr => $d) {
  5192. $bits = preg_split('/[.@]/', $elattr, 2);
  5193. $c = count($bits);
  5194. switch ($c) {
  5195. case 2:
  5196. if ($bits[0] !== '*') {
  5197. $element = htmlspecialchars($bits[0]);
  5198. $attribute = htmlspecialchars($bits[1]);
  5199. if (!isset($this->info[$element])) {
  5200. trigger_error(
  5201. "Cannot allow attribute '$attribute' if element " .
  5202. "'$element' is not allowed/supported $support"
  5203. );
  5204. } else {
  5205. trigger_error(
  5206. "Attribute '$attribute' in element '$element' not supported $support",
  5207. E_USER_WARNING
  5208. );
  5209. }
  5210. break;
  5211. }
  5212. // otherwise fall through
  5213. case 1:
  5214. $attribute = htmlspecialchars($bits[0]);
  5215. trigger_error(
  5216. "Global attribute '$attribute' is not ".
  5217. "supported in any elements $support",
  5218. E_USER_WARNING
  5219. );
  5220. break;
  5221. }
  5222. }
  5223. }
  5224. // setup forbidden elements ---------------------------------------
  5225. $forbidden_elements = $config->get('HTML.ForbiddenElements');
  5226. $forbidden_attributes = $config->get('HTML.ForbiddenAttributes');
  5227. foreach ($this->info as $tag => $info) {
  5228. if (isset($forbidden_elements[$tag])) {
  5229. unset($this->info[$tag]);
  5230. continue;
  5231. }
  5232. foreach ($info->attr as $attr => $x) {
  5233. if (isset($forbidden_attributes["$tag@$attr"]) ||
  5234. isset($forbidden_attributes["*@$attr"]) ||
  5235. isset($forbidden_attributes[$attr])
  5236. ) {
  5237. unset($this->info[$tag]->attr[$attr]);
  5238. continue;
  5239. } elseif (isset($forbidden_attributes["$tag.$attr"])) { // this segment might get removed eventually
  5240. // $tag.$attr are not user supplied, so no worries!
  5241. trigger_error(
  5242. "Error with $tag.$attr: tag.attr syntax not supported for " .
  5243. "HTML.ForbiddenAttributes; use tag@attr instead",
  5244. E_USER_WARNING
  5245. );
  5246. }
  5247. }
  5248. }
  5249. foreach ($forbidden_attributes as $key => $v) {
  5250. if (strlen($key) < 2) {
  5251. continue;
  5252. }
  5253. if ($key[0] != '*') {
  5254. continue;
  5255. }
  5256. if ($key[1] == '.') {
  5257. trigger_error(
  5258. "Error with $key: *.attr syntax not supported for HTML.ForbiddenAttributes; use attr instead",
  5259. E_USER_WARNING
  5260. );
  5261. }
  5262. }
  5263. // setup injectors -----------------------------------------------------
  5264. foreach ($this->info_injector as $i => $injector) {
  5265. if ($injector->checkNeeded($config) !== false) {
  5266. // remove injector that does not have it's required
  5267. // elements/attributes present, and is thus not needed.
  5268. unset($this->info_injector[$i]);
  5269. }
  5270. }
  5271. }
  /**
   * Parses a TinyMCE-flavored Allowed Elements and Attributes list into
   * separate lists for processing. Format is element[attr1|attr2],element2...
   * @warning Although it's largely drawn from TinyMCE's implementation,
   *      it is different, and you'll probably have to modify your lists
   * @param string $list String list to parse. Entries are separated by
   *      commas or newlines; spaces and tabs are stripped before parsing.
   * @return array Two-item array: a lookup of element names
   *      (name => true) and a lookup of attributes
   *      ("element.attr" => true). The wildcard element '*' is never
   *      added to the element lookup, but its attributes are recorded
   *      as "*.attr".
   * @todo Give this its own class, probably static interface
   */
  public function parseTinyMCEAllowedList($list)
  {
      $list = str_replace(array(' ', "\t"), '', $list);
      $elements = array();
      $attributes = array();
      $chunks = preg_split('/(,|[\n\r]+)/', $list);
      foreach ($chunks as $chunk) {
          if (empty($chunk)) {
              continue;
          }
          // split "element[attr1|attr2]" into the element name and the
          // bracketed attribute list; a chunk without '[' is a bare element
          if (!strpos($chunk, '[')) {
              $element = $chunk;
              $attr = false;
          } else {
              list($element, $attr) = explode('[', $chunk);
          }
          if ($element !== '*') {
              $elements[$element] = true;
          }
          if (!$attr) {
              continue;
          }
          // remove the trailing ], but only when it is actually present:
          // blindly dropping the last character would corrupt the final
          // attribute name of a chunk whose closing bracket is missing
          if (substr($attr, -1) === ']') {
              $attr = substr($attr, 0, -1);
          }
          $attr = explode('|', $attr);
          foreach ($attr as $key) {
              $attributes["$element.$key"] = true;
          }
      }
      return array($elements, $attributes);
  }
  5312. }
  /**
   * Represents an XHTML 1.1 module, with information on elements, tags
   * and attributes.
   * @note Even though this is technically XHTML 1.1, it is also used for
   *       regular HTML parsing. We are using modulization as a convenient
   *       way to represent the internals of HTMLDefinition, and our
   *       implementation is by no means conforming and does not directly
   *       use the normative DTDs or XML schemas.
   * @note The public variables in a module should almost directly
   *       correspond to the variables in HTMLPurifier_HTMLDefinition.
   *       However, the prefix info carries no special meaning in these
   *       objects (include it anyway if that's the correspondence though).
   * @todo Consider making some member functions protected
   */
  class HTMLPurifier_HTMLModule
  {
      // -- Overloadable ----------------------------------------------------

      /**
       * Short unique string identifier of the module.
       * @type string
       */
      public $name;

      /**
       * Informally, a list of elements this module changes.
       * Not used in any significant way.
       * @type array
       */
      public $elements = array();

      /**
       * Associative array of element names to element definitions.
       * Some definitions may be incomplete, to be merged in later
       * with the full definition.
       * @type array
       */
      public $info = array();

      /**
       * Associative array of content set names to content set additions.
       * This is commonly used to, say, add an A element to the Inline
       * content set. This corresponds to an internal variable $content_sets
       * and NOT info_content_sets member variable of HTMLDefinition.
       * @type array
       */
      public $content_sets = array();

      /**
       * Associative array of attribute collection names to attribute
       * collection additions. More rarely used for adding attributes to
       * the global collections. Example is the StyleAttribute module adding
       * the style attribute to the Core. Corresponds to HTMLDefinition's
       * attr_collections->info, since the object's data is only info,
       * with extra behavior associated with it.
       * @type array
       */
      public $attr_collections = array();

      /**
       * Associative array of deprecated tag name to HTMLPurifier_TagTransform.
       * @type array
       */
      public $info_tag_transform = array();

      /**
       * List of HTMLPurifier_AttrTransform to be performed before validation.
       * @type array
       */
      public $info_attr_transform_pre = array();

      /**
       * List of HTMLPurifier_AttrTransform to be performed after validation.
       * @type array
       */
      public $info_attr_transform_post = array();

      /**
       * List of HTMLPurifier_Injector to be performed during well-formedness fixing.
       * An injector will only be invoked if all of its pre-requisites are met;
       * if an injector fails setup, there will be no error; it will simply be
       * silently disabled.
       * @type array
       */
      public $info_injector = array();

      /**
       * Boolean flag that indicates whether or not getChildDef is implemented.
       * For optimization reasons: may save a call to a function. Be sure
       * to set it if you do implement getChildDef(), otherwise it will have
       * no effect!
       * @type bool
       */
      public $defines_child_def = false;

      /**
       * Boolean flag whether or not this module is safe. If it is not safe, all
       * of its members are unsafe. Modules are safe by default (this might be
       * slightly dangerous, but it doesn't make much sense to force HTML Purifier,
       * which is based off of safe HTML, to explicitly say, "This is safe," even
       * though there are modules which are "unsafe")
       *
       * @type bool
       * @note Previously, safety could be applied at an element level granularity.
       *       We've removed this ability, so in order to add "unsafe" elements
       *       or attributes, a dedicated module with this property set to false
       *       must be used.
       */
      public $safe = true;

      /**
       * Retrieves a proper HTMLPurifier_ChildDef subclass based on
       * content_model and content_model_type member variables of
       * the HTMLPurifier_ElementDef class. There is a similar function
       * in HTMLPurifier_HTMLDefinition.
       * @param HTMLPurifier_ElementDef $def
       * @return HTMLPurifier_ChildDef|bool ChildDef subclass; this base
       *         implementation always returns false
       */
      public function getChildDef($def)
      {
          return false;
      }

      // -- Convenience -----------------------------------------------------

      /**
       * Convenience function that sets up a new element
       * @param string $element Name of element to add
       * @param string|bool $type What content set should element be registered to?
       *              Set as false to skip this step.
       * @param string $contents Allowed children in form of:
       *              "$content_model_type: $content_model". A non-string
       *              value is assigned directly as the element's child
       *              definition.
       * @param array $attr_includes What attribute collections to register to
       *              element?
       * @param array $attr What unique attributes does the element define?
       * @see HTMLPurifier_ElementDef:: for in-depth descriptions of these parameters.
       * @return HTMLPurifier_ElementDef Created element definition object, so you
       *         can set advanced parameters
       */
      public function addElement($element, $type, $contents, $attr_includes = array(), $attr = array())
      {
          $this->elements[] = $element;
          // parse content_model
          list($content_model_type, $content_model) = $this->parseContents($contents);
          // merge in attribute inclusions
          $this->mergeInAttrIncludes($attr, $attr_includes);
          // add element to content sets
          if ($type) {
              $this->addElementToContentSet($element, $type);
          }
          // create element
          $this->info[$element] = HTMLPurifier_ElementDef::create(
              $content_model,
              $content_model_type,
              $attr
          );
          // literal object $contents means direct child manipulation
          if (!is_string($contents)) {
              $this->info[$element]->child = $contents;
          }
          return $this->info[$element];
      }

      /**
       * Convenience function that creates a totally blank, non-standalone
       * element. If the element is already defined in this module an error
       * is triggered and the existing definition is returned untouched.
       * @param string $element Name of element to create
       * @return HTMLPurifier_ElementDef Created element
       */
      public function addBlankElement($element)
      {
          if (!isset($this->info[$element])) {
              $this->elements[] = $element;
              $this->info[$element] = new HTMLPurifier_ElementDef();
              $this->info[$element]->standalone = false;
          } else {
              trigger_error("Definition for $element already exists in module, cannot redefine");
          }
          return $this->info[$element];
      }

      /**
       * Convenience function that registers an element to a content set.
       * Elements of one set are joined with ' | '.
       * @param string $element Element to register
       * @param string $type Name content set (warning: case sensitive, usually upper-case
       *        first letter)
       */
      public function addElementToContentSet($element, $type)
      {
          if (!isset($this->content_sets[$type])) {
              $this->content_sets[$type] = '';
          } else {
              $this->content_sets[$type] .= ' | ';
          }
          $this->content_sets[$type] .= $element;
      }

      /**
       * Convenience function that transforms single-string contents
       * into separate content model and content model type
       * @param string $contents Allowed children in form of:
       *                  "$content_model_type: $content_model"
       * @return array Two-item array: lower-cased content model type and
       *         content model. When no ':' is present the whole string is
       *         taken as the type and the content model is ''.
       * @note If contents is an object, an array of two nulls will be
       *       returned, and the caller needs to take the original $contents
       *       and use it directly.
       */
      public function parseContents($contents)
      {
          if (!is_string($contents)) {
              return array(null, null);
          } // defer
          switch ($contents) {
              // check for shorthand content model forms
              case 'Empty':
                  return array('empty', '');
              case 'Inline':
                  return array('optional', 'Inline | #PCDATA');
              case 'Flow':
                  return array('optional', 'Flow | #PCDATA');
          }
          // split on the first ':' only, so that the content model is kept
          // whole and a missing ':' does not read an undefined offset
          $parts = explode(':', $contents, 2);
          $content_model_type = strtolower(trim($parts[0]));
          $content_model = isset($parts[1]) ? trim($parts[1]) : '';
          return array($content_model_type, $content_model);
      }

      /**
       * Convenience function that merges a list of attribute includes into
       * an attribute array. The includes are stored at index 0 of $attr.
       * @param array $attr Reference to attr array to modify
       * @param array $attr_includes Array of includes / string include to merge in
       */
      public function mergeInAttrIncludes(&$attr, $attr_includes)
      {
          if (!is_array($attr_includes)) {
              // normalize a single include (or an empty value) to a list
              if (empty($attr_includes)) {
                  $attr_includes = array();
              } else {
                  $attr_includes = array($attr_includes);
              }
          }
          $attr[0] = $attr_includes;
      }

      /**
       * Convenience function that generates a lookup table with boolean
       * true as value.
       * @param string $list List of values to turn into a lookup
       * @note You can also pass an arbitrary number of arguments in
       *       place of the regular argument
       * @return array array equivalent of list; null values are skipped
       */
      public function makeLookup($list)
      {
          if (is_string($list)) {
              // variadic form: every argument is one value
              $list = func_get_args();
          }
          $ret = array();
          foreach ($list as $value) {
              if (is_null($value)) {
                  continue;
              }
              $ret[$value] = true;
          }
          return $ret;
      }

      /**
       * Lazy load construction of the module after determining whether
       * or not it's needed, and also when a finalized configuration object
       * is available. The base implementation does nothing.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
      }
  }
  /**
   * Manages HTMLPurifier_HTMLModule instances: keeps the registry of known
   * modules, activates the ones belonging to the configured doctype (plus
   * config-enabled and user-added modules) in setup(), and assembles merged
   * element definitions from them in getElement()/getElements().
   */
  class HTMLPurifier_HTMLModuleManager
  {
      /**
       * @type HTMLPurifier_DoctypeRegistry
       */
      public $doctypes;

      /**
       * Instance of current doctype: the object produced by
       * $this->doctypes->make() in setup(). Its ->modules and
       * ->tidyModules lists are read there.
       * @type object
       */
      public $doctype;

      /**
       * @type HTMLPurifier_AttrTypes
       */
      public $attrTypes;

      /**
       * Active instances of modules for the specified doctype are
       * indexed, by name, in this array.
       * @type HTMLPurifier_HTMLModule[]
       */
      public $modules = array();

      /**
       * Array of recognized HTMLPurifier_HTMLModule instances,
       * indexed by module's name property. This array is usually lazy
       * loaded, but a user can overload a module by pre-emptively
       * registering it.
       * @type HTMLPurifier_HTMLModule[]
       */
      public $registeredModules = array();

      /**
       * List of names of extra modules that were added by the user
       * using addModule(). These get unconditionally merged into the current doctype, whatever
       * it may be.
       * @type string[]
       */
      public $userModules = array();

      /**
       * Associative array of element name to list of modules that have
       * definitions for the element; this array is dynamically filled.
       * @type array
       */
      public $elementLookup = array();

      /**
       * List of prefixes we should use for registering small names.
       * @type array
       */
      public $prefixes = array('HTMLPurifier_HTMLModule_');

      /**
       * @type HTMLPurifier_ContentSets
       */
      public $contentSets;

      /**
       * @type HTMLPurifier_AttrCollections
       */
      public $attrCollections;

      /**
       * If set to true, unsafe elements and attributes will be allowed.
       * @type bool
       */
      public $trusted = false;

      /**
       * Creates the editable internal objects and registers the built-in
       * doctypes together with their module lists.
       */
      public function __construct()
      {
          // editable internal objects
          $this->attrTypes = new HTMLPurifier_AttrTypes();
          $this->doctypes = new HTMLPurifier_DoctypeRegistry();

          // setup basic modules
          $common = array(
              'CommonAttributes', 'Text', 'Hypertext', 'List',
              'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
              'StyleAttribute',
              // Unsafe:
              'Scripting', 'Object', 'Forms',
              // Sorta legacy, but present in strict:
              'Name',
          );
          $transitional = array('Legacy', 'Target', 'Iframe');
          $xml = array('XMLCommonAttributes');
          $non_xml = array('NonXMLCommonAttributes');

          // setup basic doctypes
          $this->doctypes->register(
              'HTML 4.01 Transitional',
              false,
              array_merge($common, $transitional, $non_xml),
              array('Tidy_Transitional', 'Tidy_Proprietary'),
              array(),
              '-//W3C//DTD HTML 4.01 Transitional//EN',
              'http://www.w3.org/TR/html4/loose.dtd'
          );

          $this->doctypes->register(
              'HTML 4.01 Strict',
              false,
              array_merge($common, $non_xml),
              array('Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
              array(),
              '-//W3C//DTD HTML 4.01//EN',
              'http://www.w3.org/TR/html4/strict.dtd'
          );

          $this->doctypes->register(
              'XHTML 1.0 Transitional',
              true,
              array_merge($common, $transitional, $xml, $non_xml),
              array('Tidy_Transitional', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Name'),
              array(),
              '-//W3C//DTD XHTML 1.0 Transitional//EN',
              'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
          );

          $this->doctypes->register(
              'XHTML 1.0 Strict',
              true,
              array_merge($common, $xml, $non_xml),
              array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Strict', 'Tidy_Proprietary', 'Tidy_Name'),
              array(),
              '-//W3C//DTD XHTML 1.0 Strict//EN',
              'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
          );

          $this->doctypes->register(
              'XHTML 1.1',
              true,
              // Iframe is a real XHTML 1.1 module, despite being
              // "transitional"!
              array_merge($common, $xml, array('Ruby', 'Iframe')),
              array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict', 'Tidy_Name'), // Tidy_XHTML1_1
              array(),
              '-//W3C//DTD XHTML 1.1//EN',
              'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
          );
      }

      /**
       * Registers a module to the recognized module list, useful for
       * overloading pre-existing modules.
       * @param $module Mixed: string module name, with or without
       *                HTMLPurifier_HTMLModule prefix, or instance of
       *                subclass of HTMLPurifier_HTMLModule.
       * @param $overload Boolean whether or not to overload previous modules.
       *                  If this is not set, and you do overload a module,
       *                  HTML Purifier will complain with a warning.
       * @note This function will not call autoload, you must instantiate
       *       (and thus invoke) autoload outside the method.
       * @note If a string is passed as a module name, different variants
       *       will be tested in this order:
       *          - Check for HTMLPurifier_HTMLModule_$name
       *          - Check all prefixes with $name in order they were added
       *          - Check for literal object name
       *          - Throw fatal error
       *       If your object name collides with an internal class, specify
       *       your module manually. All modules must have been included
       *       externally: registerModule will not perform inclusions for you!
       */
      public function registerModule($module, $overload = false)
      {
          if (is_string($module)) {
              // attempt to load the module
              $original_module = $module;
              $ok = false;
              foreach ($this->prefixes as $prefix) {
                  $module = $prefix . $original_module;
                  if (class_exists($module)) {
                      $ok = true;
                      break;
                  }
              }
              if (!$ok) {
                  // no prefixed class found: try the name as a literal class
                  $module = $original_module;
                  if (!class_exists($module)) {
                      trigger_error(
                          $original_module . ' module does not exist',
                          E_USER_ERROR
                      );
                      return;
                  }
              }
              $module = new $module();
          }
          if (empty($module->name)) {
              trigger_error('Module instance of ' . get_class($module) . ' must have name');
              return;
          }
          if (!$overload && isset($this->registeredModules[$module->name])) {
              trigger_error('Overloading ' . $module->name . ' without explicit overload parameter', E_USER_WARNING);
          }
          // modules are indexed by their own name property
          $this->registeredModules[$module->name] = $module;
      }

      /**
       * Adds a module to the current doctype by first registering it,
       * and then tacking it on to the active doctype
       * @param $module Mixed: string module name or module instance, see
       *                registerModule(). For an instance, its name is what
       *                gets recorded in $userModules.
       */
      public function addModule($module)
      {
          $this->registerModule($module);
          if (is_object($module)) {
              $module = $module->name;
          }
          $this->userModules[] = $module;
      }

      /**
       * Adds a class prefix that registerModule() will use to resolve a
       * string name to a concrete class
       * @param string $prefix Class name prefix to append to the list
       */
      public function addPrefix($prefix)
      {
          $this->prefixes[] = $prefix;
      }

      /**
       * Performs processing on modules, after being called you may
       * use getElement() and getElements()
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          $this->trusted = $config->get('HTML.Trusted');

          // generate
          $this->doctype = $this->doctypes->make($config);
          $modules = $this->doctype->modules;

          // take out the default modules that aren't allowed
          $lookup = $config->get('HTML.AllowedModules');
          $special_cases = $config->get('HTML.CoreModules');

          if (is_array($lookup)) {
              foreach ($modules as $k => $m) {
                  // core modules are kept regardless of the allow-list
                  if (isset($special_cases[$m])) {
                      continue;
                  }
                  if (!isset($lookup[$m])) {
                      unset($modules[$k]);
                  }
              }
          }

          // custom modules
          if ($config->get('HTML.Proprietary')) {
              $modules[] = 'Proprietary';
          }
          if ($config->get('HTML.SafeObject')) {
              $modules[] = 'SafeObject';
          }
          if ($config->get('HTML.SafeEmbed')) {
              $modules[] = 'SafeEmbed';
          }
          if ($config->get('HTML.SafeScripting') !== array()) {
              $modules[] = 'SafeScripting';
          }
          if ($config->get('HTML.Nofollow')) {
              $modules[] = 'Nofollow';
          }
          if ($config->get('HTML.TargetBlank')) {
              $modules[] = 'TargetBlank';
          }
          // NB: HTML.TargetNoreferrer and HTML.TargetNoopener must be AFTER HTML.TargetBlank
          // so that its post-attr-transform gets run afterwards.
          if ($config->get('HTML.TargetNoreferrer')) {
              $modules[] = 'TargetNoreferrer';
          }
          if ($config->get('HTML.TargetNoopener')) {
              $modules[] = 'TargetNoopener';
          }

          // merge in custom modules
          $modules = array_merge($modules, $this->userModules);

          foreach ($modules as $module) {
              $this->processModule($module);
              $this->modules[$module]->setup($config);
          }

          foreach ($this->doctype->tidyModules as $module) {
              $this->processModule($module);
              $this->modules[$module]->setup($config);
          }

          // prepare any injectors: instantiate those given by short name
          // and re-index each module's list by injector name
          foreach ($this->modules as $module) {
              $n = array();
              foreach ($module->info_injector as $injector) {
                  if (!is_object($injector)) {
                      $class = "HTMLPurifier_Injector_$injector";
                      $injector = new $class;
                  }
                  $n[$injector->name] = $injector;
              }
              $module->info_injector = $n;
          }

          // setup lookup table based on all valid modules
          foreach ($this->modules as $module) {
              foreach ($module->info as $name => $def) {
                  if (!isset($this->elementLookup[$name])) {
                      $this->elementLookup[$name] = array();
                  }
                  $this->elementLookup[$name][] = $module->name;
              }
          }

          // note the different choice
          $this->contentSets = new HTMLPurifier_ContentSets(
              // content set assembly deals with all possible modules,
              // not just ones deemed to be "safe"
              $this->modules
          );
          $this->attrCollections = new HTMLPurifier_AttrCollections(
              $this->attrTypes,
              // there is no way to directly disable a global attribute,
              // but using AllowedAttributes or simply not including
              // the module in your custom doctype should be sufficient
              $this->modules
          );
      }

      /**
       * Takes a module and adds it to the active module collection,
       * registering it if necessary.
       * @param $module Mixed: string module name, or module instance. An
       *                instance is always (re-)registered and then
       *                activated under its name property.
       */
      public function processModule($module)
      {
          if (is_object($module)) {
              // an object cannot be used as an array key, so it has to be
              // handled before any lookup: register it, then continue with
              // the name it was registered under
              $this->registerModule($module);
              if (empty($module->name)) {
                  // registerModule() has already reported the missing name
                  return;
              }
              $module = $module->name;
          } elseif (!isset($this->registeredModules[$module])) {
              $this->registerModule($module);
          }
          $this->modules[$module] = $this->registeredModules[$module];
      }

      /**
       * Retrieves merged element definitions.
       * @return Array of HTMLPurifier_ElementDef, indexed by element name
       */
      public function getElements()
      {
          $elements = array();
          foreach ($this->modules as $module) {
              if (!$this->trusted && !$module->safe) {
                  continue;
              }
              foreach ($module->info as $name => $v) {
                  if (isset($elements[$name])) {
                      continue;
                  }
                  $elements[$name] = $this->getElement($name);
              }
          }

          // remove dud elements, this happens when an element that
          // appeared to be safe actually wasn't
          foreach ($elements as $n => $v) {
              if ($v === false) {
                  unset($elements[$n]);
              }
          }

          return $elements;
      }

      /**
       * Retrieves a single merged element definition
       * @param string $name Name of element
       * @param bool $trusted Boolean trusted overriding parameter: set to true
       *                      if you want the full version of an element;
       *                      null (default) uses the manager's own setting
       * @return HTMLPurifier_ElementDef|bool Merged HTMLPurifier_ElementDef,
       *         or false if no module supplies a usable definition
       * @note You may notice that modules are getting iterated over twice (once
       *       in getElements() and once here). This is because the definition
       *       of one element is assembled here by merging the contributions
       *       of every module listed for it in $elementLookup.
       */
      public function getElement($name, $trusted = null)
      {
          if (!isset($this->elementLookup[$name])) {
              return false;
          }

          // setup global state variables
          $def = false;
          if ($trusted === null) {
              $trusted = $this->trusted;
          }

          // iterate through each module that has registered itself to this
          // element
          foreach ($this->elementLookup[$name] as $module_name) {
              $module = $this->modules[$module_name];

              // refuse to create/merge from a module that is deemed unsafe--
              // pretend the module doesn't exist--when trusted mode is not on.
              if (!$trusted && !$module->safe) {
                  continue;
              }

              // clone is used because, ideally speaking, the original
              // definition should not be modified. Usually, this will
              // make no difference, but for consistency's sake
              $new_def = clone $module->info[$name];

              if (!$def && $new_def->standalone) {
                  $def = $new_def;
              } elseif ($def) {
                  // This will occur even if $new_def is standalone. In practice,
                  // this will usually result in a full replacement.
                  $def->mergeIn($new_def);
              } else {
                  // :TODO:
                  // non-standalone definitions that don't have a standalone
                  // to merge into could be deferred to the end
                  // HOWEVER, it is perfectly valid for a non-standalone
                  // definition to lack a standalone definition, even
                  // after all processing: this allows us to safely
                  // specify extra attributes for elements that may not be
                  // enabled all in one place. In particular, this might
                  // be the case for trusted elements. WARNING: care must
                  // be taken that the /extra/ definitions are all safe.
                  continue;
              }

              // attribute value expansions
              $this->attrCollections->performInclusions($def->attr);
              $this->attrCollections->expandIdentifiers($def->attr, $this->attrTypes);

              // descendants_are_inline, for ChildDef_Chameleon
              if (is_string($def->content_model) &&
                  strpos($def->content_model, 'Inline') !== false) {
                  if ($name != 'del' && $name != 'ins') {
                      // this is for you, ins/del
                      $def->descendants_are_inline = true;
                  }
              }

              $this->contentSets->generateChildDef($def, $module);
          }

          // This can occur if there is a blank definition, but no base to
          // mix it in with
          if (!$def) {
              return false;
          }

          // add information on required attributes
          foreach ($def->attr as $attr_name => $attr_def) {
              if ($attr_def->required) {
                  $def->required_attr[] = $attr_name;
              }
          }

          return $def;
      }
  }
  /**
   * Component of HTMLPurifier_AttrContext that accumulates IDs to prevent dupes
   * @note In Slashdot-speak, dupe means duplicate.
   * @note The default constructor does not accept $config or $context objects:
   *       you must use the static build() factory method to perform initialization.
   */
  class HTMLPurifier_IDAccumulator
  {
      /**
       * Lookup table of IDs we've accumulated (ID => true).
       * @public
       */
      public $ids = array();

      /**
       * Factory: creates an accumulator that is pre-loaded with the IDs
       * listed in the Attr.IDBlacklist directive.
       * @param HTMLPurifier_Config $config Instance of HTMLPurifier_Config
       * @param HTMLPurifier_Context $context Instance of HTMLPurifier_Context
       *        (not consulted by this method)
       * @return HTMLPurifier_IDAccumulator Fully initialized HTMLPurifier_IDAccumulator
       */
      public static function build($config, $context)
      {
          $accumulator = new HTMLPurifier_IDAccumulator();
          $blacklist = $config->get('Attr.IDBlacklist');
          $accumulator->load($blacklist);
          return $accumulator;
      }

      /**
       * Records an ID, unless it has been recorded before.
       * @param string $id ID to be added.
       * @return bool status, true if success, false if there's a dupe
       */
      public function add($id)
      {
          $is_new = !isset($this->ids[$id]);
          if ($is_new) {
              $this->ids[$id] = true;
          }
          return $is_new;
      }

      /**
       * Marks every ID of a list as seen.
       * @param $array_of_ids Array of IDs to load
       * @note This function doesn't care about duplicates
       */
      public function load($array_of_ids)
      {
          foreach ($array_of_ids as $seen) {
              $this->ids[$seen] = true;
          }
      }
  }
  6034. /**
  6035. * Injects tokens into the document while parsing for well-formedness.
  6036. * This enables "formatter-like" functionality such as auto-paragraphing,
  6037. * smiley-ification and linkification to take place.
  6038. *
  6039. * A note on how handlers create changes; this is done by assigning a new
  6040. * value to the $token reference. These values can take a variety of forms and
  6041. * are best described in the HTMLPurifier_Strategy_MakeWellFormed->processToken()
  6042. * documentation.
  6043. *
  6044. * @todo Allow injectors to request a re-run on their output. This
  6045. * would help if an operation is recursive.
  6046. */
  6047. abstract class HTMLPurifier_Injector
  6048. {
  6049. /**
  6050. * Advisory name of injector, this is for friendly error messages.
  6051. * @type string
  6052. */
  6053. public $name;
  6054. /**
  6055. * @type HTMLPurifier_HTMLDefinition
  6056. */
  6057. protected $htmlDefinition;
  6058. /**
  6059. * Reference to CurrentNesting variable in Context. This is an array
  6060. * list of tokens that we are currently "inside"
  6061. * @type array
  6062. */
  6063. protected $currentNesting;
  6064. /**
  6065. * Reference to current token.
  6066. * @type HTMLPurifier_Token
  6067. */
  6068. protected $currentToken;
  6069. /**
  6070. * Reference to InputZipper variable in Context.
  6071. * @type HTMLPurifier_Zipper
  6072. */
  6073. protected $inputZipper;
  6074. /**
  6075. * Array of elements and attributes this injector creates and therefore
  6076. * need to be allowed by the definition. Takes form of
  6077. * array('element' => array('attr', 'attr2'), 'element2')
  6078. * @type array
  6079. */
  6080. public $needed = array();
  6081. /**
  6082. * Number of elements to rewind backwards (relative).
  6083. * @type bool|int
  6084. */
  6085. protected $rewindOffset = false;
  /**
   * Requests a rewind to an earlier spot so that processing is performed
   * again from there. This is useful if you deleted a node, and now need
   * to see if this change affected any earlier nodes. Rewinding does not
   * affect other injectors, and can result in infinite loops if not used
   * carefully. The value is only stored here; it is handed out (and
   * cleared) by getRewindOffset().
   * @param bool|int $offset
   * @warning HTML Purifier will prevent you from fast-forwarding with this
   *          function.
   */
  public function rewindOffset($offset)
  {
      $this->rewindOffset = $offset;
  }
  6099. /**
  6100. * Retrieves rewind offset, and then unsets it.
  6101. * @return bool|int
  6102. */
  6103. public function getRewindOffset()
  6104. {
  6105. $r = $this->rewindOffset;
  6106. $this->rewindOffset = false;
  6107. return $r;
  6108. }
  6109. /**
  6110. * Prepares the injector by giving it the config and context objects:
  6111. * this allows references to important variables to be made within
  6112. * the injector. This function also checks if the HTML environment
  6113. * will work with the Injector (see checkNeeded()).
  6114. * @param HTMLPurifier_Config $config
  6115. * @param HTMLPurifier_Context $context
  6116. * @return bool|string Boolean false if success, string of missing needed element/attribute if failure
  6117. */
  6118. public function prepare($config, $context)
  6119. {
  6120. $this->htmlDefinition = $config->getHTMLDefinition();
  6121. // Even though this might fail, some unit tests ignore this and
  6122. // still test checkNeeded, so be careful. Maybe get rid of that
  6123. // dependency.
  6124. $result = $this->checkNeeded($config);
  6125. if ($result !== false) {
  6126. return $result;
  6127. }
  6128. $this->currentNesting =& $context->get('CurrentNesting');
  6129. $this->currentToken =& $context->get('CurrentToken');
  6130. $this->inputZipper =& $context->get('InputZipper');
  6131. return false;
  6132. }
  6133. /**
  6134. * This function checks if the HTML environment
  6135. * will work with the Injector: if p tags are not allowed, the
  6136. * Auto-Paragraphing injector should not be enabled.
  6137. * @param HTMLPurifier_Config $config
  6138. * @return bool|string Boolean false if success, string of missing needed element/attribute if failure
  6139. */
  6140. public function checkNeeded($config)
  6141. {
  6142. $def = $config->getHTMLDefinition();
  6143. foreach ($this->needed as $element => $attributes) {
  6144. if (is_int($element)) {
  6145. $element = $attributes;
  6146. }
  6147. if (!isset($def->info[$element])) {
  6148. return $element;
  6149. }
  6150. if (!is_array($attributes)) {
  6151. continue;
  6152. }
  6153. foreach ($attributes as $name) {
  6154. if (!isset($def->info[$element]->attr[$name])) {
  6155. return "$element.$name";
  6156. }
  6157. }
  6158. }
  6159. return false;
  6160. }
  6161. /**
  6162. * Tests if the context node allows a certain element
  6163. * @param string $name Name of element to test for
  6164. * @return bool True if element is allowed, false if it is not
  6165. */
  6166. public function allowsElement($name)
  6167. {
  6168. if (!empty($this->currentNesting)) {
  6169. $parent_token = array_pop($this->currentNesting);
  6170. $this->currentNesting[] = $parent_token;
  6171. $parent = $this->htmlDefinition->info[$parent_token->name];
  6172. } else {
  6173. $parent = $this->htmlDefinition->info_parent_def;
  6174. }
  6175. if (!isset($parent->child->elements[$name]) || isset($parent->excludes[$name])) {
  6176. return false;
  6177. }
  6178. // check for exclusion
  6179. for ($i = count($this->currentNesting) - 2; $i >= 0; $i--) {
  6180. $node = $this->currentNesting[$i];
  6181. $def = $this->htmlDefinition->info[$node->name];
  6182. if (isset($def->excludes[$name])) {
  6183. return false;
  6184. }
  6185. }
  6186. return true;
  6187. }
  6188. /**
  6189. * Iterator function, which starts with the next token and continues until
  6190. * you reach the end of the input tokens.
  6191. * @warning Please prevent previous references from interfering with this
  6192. * functions by setting $i = null beforehand!
  6193. * @param int $i Current integer index variable for inputTokens
  6194. * @param HTMLPurifier_Token $current Current token variable.
  6195. * Do NOT use $token, as that variable is also a reference
  6196. * @return bool
  6197. */
  6198. protected function forward(&$i, &$current)
  6199. {
  6200. if ($i === null) {
  6201. $i = count($this->inputZipper->back) - 1;
  6202. } else {
  6203. $i--;
  6204. }
  6205. if ($i < 0) {
  6206. return false;
  6207. }
  6208. $current = $this->inputZipper->back[$i];
  6209. return true;
  6210. }
  6211. /**
  6212. * Similar to _forward, but accepts a third parameter $nesting (which
  6213. * should be initialized at 0) and stops when we hit the end tag
  6214. * for the node $this->inputIndex starts in.
  6215. * @param int $i Current integer index variable for inputTokens
  6216. * @param HTMLPurifier_Token $current Current token variable.
  6217. * Do NOT use $token, as that variable is also a reference
  6218. * @param int $nesting
  6219. * @return bool
  6220. */
  6221. protected function forwardUntilEndToken(&$i, &$current, &$nesting)
  6222. {
  6223. $result = $this->forward($i, $current);
  6224. if (!$result) {
  6225. return false;
  6226. }
  6227. if ($nesting === null) {
  6228. $nesting = 0;
  6229. }
  6230. if ($current instanceof HTMLPurifier_Token_Start) {
  6231. $nesting++;
  6232. } elseif ($current instanceof HTMLPurifier_Token_End) {
  6233. if ($nesting <= 0) {
  6234. return false;
  6235. }
  6236. $nesting--;
  6237. }
  6238. return true;
  6239. }
  6240. /**
  6241. * Iterator function, starts with the previous token and continues until
  6242. * you reach the beginning of input tokens.
  6243. * @warning Please prevent previous references from interfering with this
  6244. * functions by setting $i = null beforehand!
  6245. * @param int $i Current integer index variable for inputTokens
  6246. * @param HTMLPurifier_Token $current Current token variable.
  6247. * Do NOT use $token, as that variable is also a reference
  6248. * @return bool
  6249. */
  6250. protected function backward(&$i, &$current)
  6251. {
  6252. if ($i === null) {
  6253. $i = count($this->inputZipper->front) - 1;
  6254. } else {
  6255. $i--;
  6256. }
  6257. if ($i < 0) {
  6258. return false;
  6259. }
  6260. $current = $this->inputZipper->front[$i];
  6261. return true;
  6262. }
  6263. /**
  6264. * Handler that is called when a text token is processed
  6265. */
  6266. public function handleText(&$token)
  6267. {
  6268. }
  6269. /**
  6270. * Handler that is called when a start or empty token is processed
  6271. */
  6272. public function handleElement(&$token)
  6273. {
  6274. }
  6275. /**
  6276. * Handler that is called when an end token is processed
  6277. */
  6278. public function handleEnd(&$token)
  6279. {
  6280. $this->notifyEnd($token);
  6281. }
  6282. /**
  6283. * Notifier that is called when an end token is processed
  6284. * @param HTMLPurifier_Token $token Current token variable.
  6285. * @note This differs from handlers in that the token is read-only
  6286. * @deprecated
  6287. */
  6288. public function notifyEnd($token)
  6289. {
  6290. }
  6291. }
  6292. /**
  6293. * Represents a language and defines localizable string formatting and
  6294. * other functions, as well as the localized messages for HTML Purifier.
  6295. */
  6296. class HTMLPurifier_Language
  6297. {
  6298. /**
  6299. * ISO 639 language code of language. Prefers shortest possible version.
  6300. * @type string
  6301. */
  6302. public $code = 'en';
  6303. /**
  6304. * Fallback language code.
  6305. * @type bool|string
  6306. */
  6307. public $fallback = false;
  6308. /**
  6309. * Array of localizable messages.
  6310. * @type array
  6311. */
  6312. public $messages = array();
  6313. /**
  6314. * Array of localizable error codes.
  6315. * @type array
  6316. */
  6317. public $errorNames = array();
  6318. /**
  6319. * True if no message file was found for this language, so English
  6320. * is being used instead. Check this if you'd like to notify the
  6321. * user that they've used a non-supported language.
  6322. * @type bool
  6323. */
  6324. public $error = false;
  6325. /**
  6326. * Has the language object been loaded yet?
  6327. * @type bool
  6328. * @todo Make it private, fix usage in HTMLPurifier_LanguageTest
  6329. */
  6330. public $_loaded = false;
  6331. /**
  6332. * @type HTMLPurifier_Config
  6333. */
  6334. protected $config;
  6335. /**
  6336. * @type HTMLPurifier_Context
  6337. */
  6338. protected $context;
  6339. /**
  6340. * @param HTMLPurifier_Config $config
  6341. * @param HTMLPurifier_Context $context
  6342. */
  6343. public function __construct($config, $context)
  6344. {
  6345. $this->config = $config;
  6346. $this->context = $context;
  6347. }
  6348. /**
  6349. * Loads language object with necessary info from factory cache
  6350. * @note This is a lazy loader
  6351. */
  6352. public function load()
  6353. {
  6354. if ($this->_loaded) {
  6355. return;
  6356. }
  6357. $factory = HTMLPurifier_LanguageFactory::instance();
  6358. $factory->loadLanguage($this->code);
  6359. foreach ($factory->keys as $key) {
  6360. $this->$key = $factory->cache[$this->code][$key];
  6361. }
  6362. $this->_loaded = true;
  6363. }
  6364. /**
  6365. * Retrieves a localised message.
  6366. * @param string $key string identifier of message
  6367. * @return string localised message
  6368. */
  6369. public function getMessage($key)
  6370. {
  6371. if (!$this->_loaded) {
  6372. $this->load();
  6373. }
  6374. if (!isset($this->messages[$key])) {
  6375. return "[$key]";
  6376. }
  6377. return $this->messages[$key];
  6378. }
  6379. /**
  6380. * Retrieves a localised error name.
  6381. * @param int $int error number, corresponding to PHP's error reporting
  6382. * @return string localised message
  6383. */
  6384. public function getErrorName($int)
  6385. {
  6386. if (!$this->_loaded) {
  6387. $this->load();
  6388. }
  6389. if (!isset($this->errorNames[$int])) {
  6390. return "[Error: $int]";
  6391. }
  6392. return $this->errorNames[$int];
  6393. }
  6394. /**
  6395. * Converts an array list into a string readable representation
  6396. * @param array $array
  6397. * @return string
  6398. */
  6399. public function listify($array)
  6400. {
  6401. $sep = $this->getMessage('Item separator');
  6402. $sep_last = $this->getMessage('Item separator last');
  6403. $ret = '';
  6404. for ($i = 0, $c = count($array); $i < $c; $i++) {
  6405. if ($i == 0) {
  6406. } elseif ($i + 1 < $c) {
  6407. $ret .= $sep;
  6408. } else {
  6409. $ret .= $sep_last;
  6410. }
  6411. $ret .= $array[$i];
  6412. }
  6413. return $ret;
  6414. }
  6415. /**
  6416. * Formats a localised message with passed parameters
  6417. * @param string $key string identifier of message
  6418. * @param array $args Parameters to substitute in
  6419. * @return string localised message
  6420. * @todo Implement conditionals? Right now, some messages make
  6421. * reference to line numbers, but those aren't always available
  6422. */
  6423. public function formatMessage($key, $args = array())
  6424. {
  6425. if (!$this->_loaded) {
  6426. $this->load();
  6427. }
  6428. if (!isset($this->messages[$key])) {
  6429. return "[$key]";
  6430. }
  6431. $raw = $this->messages[$key];
  6432. $subst = array();
  6433. $generator = false;
  6434. foreach ($args as $i => $value) {
  6435. if (is_object($value)) {
  6436. if ($value instanceof HTMLPurifier_Token) {
  6437. // factor this out some time
  6438. if (!$generator) {
  6439. $generator = $this->context->get('Generator');
  6440. }
  6441. if (isset($value->name)) {
  6442. $subst['$'.$i.'.Name'] = $value->name;
  6443. }
  6444. if (isset($value->data)) {
  6445. $subst['$'.$i.'.Data'] = $value->data;
  6446. }
  6447. $subst['$'.$i.'.Compact'] =
  6448. $subst['$'.$i.'.Serialized'] = $generator->generateFromToken($value);
  6449. // a more complex algorithm for compact representation
  6450. // could be introduced for all types of tokens. This
  6451. // may need to be factored out into a dedicated class
  6452. if (!empty($value->attr)) {
  6453. $stripped_token = clone $value;
  6454. $stripped_token->attr = array();
  6455. $subst['$'.$i.'.Compact'] = $generator->generateFromToken($stripped_token);
  6456. }
  6457. $subst['$'.$i.'.Line'] = $value->line ? $value->line : 'unknown';
  6458. }
  6459. continue;
  6460. } elseif (is_array($value)) {
  6461. $keys = array_keys($value);
  6462. if (array_keys($keys) === $keys) {
  6463. // list
  6464. $subst['$'.$i] = $this->listify($value);
  6465. } else {
  6466. // associative array
  6467. // no $i implementation yet, sorry
  6468. $subst['$'.$i.'.Keys'] = $this->listify($keys);
  6469. $subst['$'.$i.'.Values'] = $this->listify(array_values($value));
  6470. }
  6471. continue;
  6472. }
  6473. $subst['$' . $i] = $value;
  6474. }
  6475. return strtr($raw, $subst);
  6476. }
  6477. }
  6478. /**
  6479. * Class responsible for generating HTMLPurifier_Language objects, managing
  6480. * caching and fallbacks.
  6481. * @note Thanks to MediaWiki for the general logic, although this version
  6482. * has been entirely rewritten
  6483. * @todo Serialized cache for languages
  6484. */
  6485. class HTMLPurifier_LanguageFactory
  6486. {
  6487. /**
  6488. * Cache of language code information used to load HTMLPurifier_Language objects.
  6489. * Structure is: $factory->cache[$language_code][$key] = $value
  6490. * @type array
  6491. */
  6492. public $cache;
  6493. /**
  6494. * Valid keys in the HTMLPurifier_Language object. Designates which
  6495. * variables to slurp out of a message file.
  6496. * @type array
  6497. */
  6498. public $keys = array('fallback', 'messages', 'errorNames');
  6499. /**
  6500. * Instance to validate language codes.
  6501. * @type HTMLPurifier_AttrDef_Lang
  6502. *
  6503. */
  6504. protected $validator;
  6505. /**
  6506. * Cached copy of dirname(__FILE__), directory of current file without
  6507. * trailing slash.
  6508. * @type string
  6509. */
  6510. protected $dir;
  6511. /**
  6512. * Keys whose contents are a hash map and can be merged.
  6513. * @type array
  6514. */
  6515. protected $mergeable_keys_map = array('messages' => true, 'errorNames' => true);
  6516. /**
  6517. * Keys whose contents are a list and can be merged.
  6518. * @value array lookup
  6519. */
  6520. protected $mergeable_keys_list = array();
  6521. /**
  6522. * Retrieve sole instance of the factory.
  6523. * @param HTMLPurifier_LanguageFactory $prototype Optional prototype to overload sole instance with,
  6524. * or bool true to reset to default factory.
  6525. * @return HTMLPurifier_LanguageFactory
  6526. */
  6527. public static function instance($prototype = null)
  6528. {
  6529. static $instance = null;
  6530. if ($prototype !== null) {
  6531. $instance = $prototype;
  6532. } elseif ($instance === null || $prototype == true) {
  6533. $instance = new HTMLPurifier_LanguageFactory();
  6534. $instance->setup();
  6535. }
  6536. return $instance;
  6537. }
  6538. /**
  6539. * Sets up the singleton, much like a constructor
  6540. * @note Prevents people from getting this outside of the singleton
  6541. */
  6542. public function setup()
  6543. {
  6544. $this->validator = new HTMLPurifier_AttrDef_Lang();
  6545. $this->dir = HTMLPURIFIER_PREFIX . '/HTMLPurifier';
  6546. }
  6547. /**
  6548. * Creates a language object, handles class fallbacks
  6549. * @param HTMLPurifier_Config $config
  6550. * @param HTMLPurifier_Context $context
  6551. * @param bool|string $code Code to override configuration with. Private parameter.
  6552. * @return HTMLPurifier_Language
  6553. */
  6554. public function create($config, $context, $code = false)
  6555. {
  6556. // validate language code
  6557. if ($code === false) {
  6558. $code = $this->validator->validate(
  6559. $config->get('Core.Language'),
  6560. $config,
  6561. $context
  6562. );
  6563. } else {
  6564. $code = $this->validator->validate($code, $config, $context);
  6565. }
  6566. if ($code === false) {
  6567. $code = 'en'; // malformed code becomes English
  6568. }
  6569. $pcode = str_replace('-', '_', $code); // make valid PHP classname
  6570. static $depth = 0; // recursion protection
  6571. if ($code == 'en') {
  6572. $lang = new HTMLPurifier_Language($config, $context);
  6573. } else {
  6574. $class = 'HTMLPurifier_Language_' . $pcode;
  6575. $file = $this->dir . '/Language/classes/' . $code . '.php';
  6576. if (file_exists($file) || class_exists($class, false)) {
  6577. $lang = new $class($config, $context);
  6578. } else {
  6579. // Go fallback
  6580. $raw_fallback = $this->getFallbackFor($code);
  6581. $fallback = $raw_fallback ? $raw_fallback : 'en';
  6582. $depth++;
  6583. $lang = $this->create($config, $context, $fallback);
  6584. if (!$raw_fallback) {
  6585. $lang->error = true;
  6586. }
  6587. $depth--;
  6588. }
  6589. }
  6590. $lang->code = $code;
  6591. return $lang;
  6592. }
  6593. /**
  6594. * Returns the fallback language for language
  6595. * @note Loads the original language into cache
  6596. * @param string $code language code
  6597. * @return string|bool
  6598. */
  6599. public function getFallbackFor($code)
  6600. {
  6601. $this->loadLanguage($code);
  6602. return $this->cache[$code]['fallback'];
  6603. }
  6604. /**
  6605. * Loads language into the cache, handles message file and fallbacks
  6606. * @param string $code language code
  6607. */
  6608. public function loadLanguage($code)
  6609. {
  6610. static $languages_seen = array(); // recursion guard
  6611. // abort if we've already loaded it
  6612. if (isset($this->cache[$code])) {
  6613. return;
  6614. }
  6615. // generate filename
  6616. $filename = $this->dir . '/Language/messages/' . $code . '.php';
  6617. // default fallback : may be overwritten by the ensuing include
  6618. $fallback = ($code != 'en') ? 'en' : false;
  6619. // load primary localisation
  6620. if (!file_exists($filename)) {
  6621. // skip the include: will rely solely on fallback
  6622. $filename = $this->dir . '/Language/messages/en.php';
  6623. $cache = array();
  6624. } else {
  6625. include $filename;
  6626. $cache = compact($this->keys);
  6627. }
  6628. // load fallback localisation
  6629. if (!empty($fallback)) {
  6630. // infinite recursion guard
  6631. if (isset($languages_seen[$code])) {
  6632. trigger_error(
  6633. 'Circular fallback reference in language ' .
  6634. $code,
  6635. E_USER_ERROR
  6636. );
  6637. $fallback = 'en';
  6638. }
  6639. $language_seen[$code] = true;
  6640. // load the fallback recursively
  6641. $this->loadLanguage($fallback);
  6642. $fallback_cache = $this->cache[$fallback];
  6643. // merge fallback with current language
  6644. foreach ($this->keys as $key) {
  6645. if (isset($cache[$key]) && isset($fallback_cache[$key])) {
  6646. if (isset($this->mergeable_keys_map[$key])) {
  6647. $cache[$key] = $cache[$key] + $fallback_cache[$key];
  6648. } elseif (isset($this->mergeable_keys_list[$key])) {
  6649. $cache[$key] = array_merge($fallback_cache[$key], $cache[$key]);
  6650. }
  6651. } else {
  6652. $cache[$key] = $fallback_cache[$key];
  6653. }
  6654. }
  6655. }
  6656. // save to cache for later retrieval
  6657. $this->cache[$code] = $cache;
  6658. return;
  6659. }
  6660. }
  6661. /**
  6662. * Represents a measurable length, with a string numeric magnitude
  6663. * and a unit. This object is immutable.
  6664. */
  6665. class HTMLPurifier_Length
  6666. {
  6667. /**
  6668. * String numeric magnitude.
  6669. * @type string
  6670. */
  6671. protected $n;
  6672. /**
  6673. * String unit. False is permitted if $n = 0.
  6674. * @type string|bool
  6675. */
  6676. protected $unit;
  6677. /**
  6678. * Whether or not this length is valid. Null if not calculated yet.
  6679. * @type bool
  6680. */
  6681. protected $isValid;
  6682. /**
  6683. * Array Lookup array of units recognized by CSS 2.1
  6684. * @type array
  6685. */
  6686. protected static $allowedUnits = array(
  6687. 'em' => true, 'ex' => true, 'px' => true, 'in' => true,
  6688. 'cm' => true, 'mm' => true, 'pt' => true, 'pc' => true
  6689. );
  6690. /**
  6691. * @param string $n Magnitude
  6692. * @param bool|string $u Unit
  6693. */
  6694. public function __construct($n = '0', $u = false)
  6695. {
  6696. $this->n = (string) $n;
  6697. $this->unit = $u !== false ? (string) $u : false;
  6698. }
  6699. /**
  6700. * @param string $s Unit string, like '2em' or '3.4in'
  6701. * @return HTMLPurifier_Length
  6702. * @warning Does not perform validation.
  6703. */
  6704. public static function make($s)
  6705. {
  6706. if ($s instanceof HTMLPurifier_Length) {
  6707. return $s;
  6708. }
  6709. $n_length = strspn($s, '1234567890.+-');
  6710. $n = substr($s, 0, $n_length);
  6711. $unit = substr($s, $n_length);
  6712. if ($unit === '') {
  6713. $unit = false;
  6714. }
  6715. return new HTMLPurifier_Length($n, $unit);
  6716. }
  6717. /**
  6718. * Validates the number and unit.
  6719. * @return bool
  6720. */
  6721. protected function validate()
  6722. {
  6723. // Special case:
  6724. if ($this->n === '+0' || $this->n === '-0') {
  6725. $this->n = '0';
  6726. }
  6727. if ($this->n === '0' && $this->unit === false) {
  6728. return true;
  6729. }
  6730. if (!ctype_lower($this->unit)) {
  6731. $this->unit = strtolower($this->unit);
  6732. }
  6733. if (!isset(HTMLPurifier_Length::$allowedUnits[$this->unit])) {
  6734. return false;
  6735. }
  6736. // Hack:
  6737. $def = new HTMLPurifier_AttrDef_CSS_Number();
  6738. $result = $def->validate($this->n, false, false);
  6739. if ($result === false) {
  6740. return false;
  6741. }
  6742. $this->n = $result;
  6743. return true;
  6744. }
  6745. /**
  6746. * Returns string representation of number.
  6747. * @return string
  6748. */
  6749. public function toString()
  6750. {
  6751. if (!$this->isValid()) {
  6752. return false;
  6753. }
  6754. return $this->n . $this->unit;
  6755. }
  6756. /**
  6757. * Retrieves string numeric magnitude.
  6758. * @return string
  6759. */
  6760. public function getN()
  6761. {
  6762. return $this->n;
  6763. }
  6764. /**
  6765. * Retrieves string unit.
  6766. * @return string
  6767. */
  6768. public function getUnit()
  6769. {
  6770. return $this->unit;
  6771. }
  6772. /**
  6773. * Returns true if this length unit is valid.
  6774. * @return bool
  6775. */
  6776. public function isValid()
  6777. {
  6778. if ($this->isValid === null) {
  6779. $this->isValid = $this->validate();
  6780. }
  6781. return $this->isValid;
  6782. }
  6783. /**
  6784. * Compares two lengths, and returns 1 if greater, -1 if less and 0 if equal.
  6785. * @param HTMLPurifier_Length $l
  6786. * @return int
  6787. * @warning If both values are too large or small, this calculation will
  6788. * not work properly
  6789. */
  6790. public function compareTo($l)
  6791. {
  6792. if ($l === false) {
  6793. return false;
  6794. }
  6795. if ($l->unit !== $this->unit) {
  6796. $converter = new HTMLPurifier_UnitConverter();
  6797. $l = $converter->convert($l, $this->unit);
  6798. if ($l === false) {
  6799. return false;
  6800. }
  6801. }
  6802. return $this->n - $l->n;
  6803. }
  6804. }
  6805. /**
  6806. * Forgivingly lexes HTML (SGML-style) markup into tokens.
  6807. *
  6808. * A lexer parses a string of SGML-style markup and converts them into
  6809. * corresponding tokens. It doesn't check for well-formedness, although its
  6810. * internal mechanism may make this automatic (such as the case of
  6811. * HTMLPurifier_Lexer_DOMLex). There are several implementations to choose
  6812. * from.
  6813. *
  6814. * A lexer is HTML-oriented: it might work with XML, but it's not
  6815. * recommended, as we adhere to a subset of the specification for optimization
  6816. * reasons. This might change in the future. Also, most tokenizers are not
  6817. * expected to handle DTDs or PIs.
  6818. *
  6819. * This class should not be directly instantiated, but you may use create() to
  6820. * retrieve a default copy of the lexer. Being a supertype, this class
  6821. * does not actually define any implementation, but offers commonly used
  6822. * convenience functions for subclasses.
  6823. *
  6824. * @note The unit tests will instantiate this class for testing purposes, as
  6825. * many of the utility functions require a class to be instantiated.
  6826. * This means that, even though this class is not runnable, it will
  6827. * not be declared abstract.
  6828. *
  6829. * @par
  6830. *
  6831. * @note
  6832. * We use tokens rather than create a DOM representation because DOM would:
  6833. *
  6834. * @par
  6835. * -# Require more processing and memory to create,
  6836. * -# Is not streamable, and
  6837. * -# Has the entire document structure (html and body not needed).
  6838. *
  6839. * @par
  6840. * However, DOM is helpful in that it makes it easy to move around nodes
  6841. * without a lot of lookaheads to see when a tag is closed. This is a
  6842. * limitation of the token system and some workarounds would be nice.
  6843. */
  6844. class HTMLPurifier_Lexer
  6845. {
  6846. /**
  6847. * Whether or not this lexer implements line-number/column-number tracking.
  6848. * If it does, set to true.
  6849. */
  6850. public $tracksLineNumbers = false;
  6851. // -- STATIC ----------------------------------------------------------
  6852. /**
  6853. * Retrieves or sets the default Lexer as a Prototype Factory.
  6854. *
  6855. * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
  6856. * a few exceptions involving special features that only DirectLex
  6857. * implements.
  6858. *
  6859. * @note The behavior of this class has changed, rather than accepting
  6860. * a prototype object, it now accepts a configuration object.
  6861. * To specify your own prototype, set %Core.LexerImpl to it.
  6862. * This change in behavior de-singletonizes the lexer object.
  6863. *
  6864. * @param HTMLPurifier_Config $config
  6865. * @return HTMLPurifier_Lexer
  6866. * @throws HTMLPurifier_Exception
  6867. */
  6868. public static function create($config)
  6869. {
  6870. if (!($config instanceof HTMLPurifier_Config)) {
  6871. $lexer = $config;
  6872. trigger_error(
  6873. "Passing a prototype to
  6874. HTMLPurifier_Lexer::create() is deprecated, please instead
  6875. use %Core.LexerImpl",
  6876. E_USER_WARNING
  6877. );
  6878. } else {
  6879. $lexer = $config->get('Core.LexerImpl');
  6880. }
  6881. $needs_tracking =
  6882. $config->get('Core.MaintainLineNumbers') ||
  6883. $config->get('Core.CollectErrors');
  6884. $inst = null;
  6885. if (is_object($lexer)) {
  6886. $inst = $lexer;
  6887. } else {
  6888. if (is_null($lexer)) {
  6889. do {
  6890. // auto-detection algorithm
  6891. if ($needs_tracking) {
  6892. $lexer = 'DirectLex';
  6893. break;
  6894. }
  6895. if (class_exists('DOMDocument', false) &&
  6896. method_exists('DOMDocument', 'loadHTML') &&
  6897. !extension_loaded('domxml')
  6898. ) {
  6899. // check for DOM support, because while it's part of the
  6900. // core, it can be disabled compile time. Also, the PECL
  6901. // domxml extension overrides the default DOM, and is evil
  6902. // and nasty and we shan't bother to support it
  6903. $lexer = 'DOMLex';
  6904. } else {
  6905. $lexer = 'DirectLex';
  6906. }
  6907. } while (0);
  6908. } // do..while so we can break
  6909. // instantiate recognized string names
  6910. switch ($lexer) {
  6911. case 'DOMLex':
  6912. $inst = new HTMLPurifier_Lexer_DOMLex();
  6913. break;
  6914. case 'DirectLex':
  6915. $inst = new HTMLPurifier_Lexer_DirectLex();
  6916. break;
  6917. case 'PH5P':
  6918. $inst = new HTMLPurifier_Lexer_PH5P();
  6919. break;
  6920. default:
  6921. throw new HTMLPurifier_Exception(
  6922. "Cannot instantiate unrecognized Lexer type " .
  6923. htmlspecialchars($lexer)
  6924. );
  6925. }
  6926. }
  6927. if (!$inst) {
  6928. throw new HTMLPurifier_Exception('No lexer was instantiated');
  6929. }
  6930. // once PHP DOM implements native line numbers, or we
  6931. // hack out something using XSLT, remove this stipulation
  6932. if ($needs_tracking && !$inst->tracksLineNumbers) {
  6933. throw new HTMLPurifier_Exception(
  6934. 'Cannot use lexer that does not support line numbers with ' .
  6935. 'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
  6936. );
  6937. }
  6938. return $inst;
  6939. }
  6940. // -- CONVENIENCE MEMBERS ---------------------------------------------
  6941. public function __construct()
  6942. {
  6943. $this->_entity_parser = new HTMLPurifier_EntityParser();
  6944. }
/**
 * Most common entity to raw value conversion table for special entities.
 * Applied with strtr() in parseData() before the entity parser is
 * consulted; covers the quote, ampersand, angle-bracket entities and three
 * spellings of the numeric apostrophe entity.
 * @type array
 */
protected $_special_entity2str =
    array(
        '&quot;' => '"',
        '&amp;' => '&',
        '&lt;' => '<',
        '&gt;' => '>',
        '&#39;' => "'",
        '&#039;' => "'",
        '&#x27;' => "'"
    );
  6959. public function parseText($string, $config) {
  6960. return $this->parseData($string, false, $config);
  6961. }
  6962. public function parseAttr($string, $config) {
  6963. return $this->parseData($string, true, $config);
  6964. }
  6965. /**
  6966. * Parses special entities into the proper characters.
  6967. *
  6968. * This string will translate escaped versions of the special characters
  6969. * into the correct ones.
  6970. *
  6971. * @param string $string String character data to be parsed.
  6972. * @return string Parsed character data.
  6973. */
  6974. public function parseData($string, $is_attr, $config)
  6975. {
  6976. // following functions require at least one character
  6977. if ($string === '') {
  6978. return '';
  6979. }
  6980. // subtracts amps that cannot possibly be escaped
  6981. $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
  6982. ($string[strlen($string) - 1] === '&' ? 1 : 0);
  6983. if (!$num_amp) {
  6984. return $string;
  6985. } // abort if no entities
  6986. $num_esc_amp = substr_count($string, '&amp;');
  6987. $string = strtr($string, $this->_special_entity2str);
  6988. // code duplication for sake of optimization, see above
  6989. $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
  6990. ($string[strlen($string) - 1] === '&' ? 1 : 0);
  6991. if ($num_amp_2 <= $num_esc_amp) {
  6992. return $string;
  6993. }
  6994. // hmm... now we have some uncommon entities. Use the callback.
  6995. if ($config->get('Core.LegacyEntityDecoder')) {
  6996. $string = $this->_entity_parser->substituteSpecialEntities($string);
  6997. } else {
  6998. if ($is_attr) {
  6999. $string = $this->_entity_parser->substituteAttrEntities($string);
  7000. } else {
  7001. $string = $this->_entity_parser->substituteTextEntities($string);
  7002. }
  7003. }
  7004. return $string;
  7005. }
/**
 * Lexes an HTML string into tokens.
 *
 * This base implementation performs no lexing: it only raises an
 * E_USER_ERROR ('Call to abstract class').
 *
 * @param $string String HTML.
 * @param HTMLPurifier_Config $config
 * @param HTMLPurifier_Context $context
 * @return HTMLPurifier_Token[] array representation of HTML.
 */
public function tokenizeHTML($string, $config, $context)
{
    trigger_error('Call to abstract class', E_USER_ERROR);
}
  7017. /**
  7018. * Translates CDATA sections into regular sections (through escaping).
  7019. * @param string $string HTML string to process.
  7020. * @return string HTML with CDATA sections escaped.
  7021. */
  7022. protected static function escapeCDATA($string)
  7023. {
  7024. return preg_replace_callback(
  7025. '/<!\[CDATA\[(.+?)\]\]>/s',
  7026. array('HTMLPurifier_Lexer', 'CDATACallback'),
  7027. $string
  7028. );
  7029. }
  7030. /**
  7031. * Special CDATA case that is especially convoluted for <script>
  7032. * @param string $string HTML string to process.
  7033. * @return string HTML with CDATA sections escaped.
  7034. */
  7035. protected static function escapeCommentedCDATA($string)
  7036. {
  7037. return preg_replace_callback(
  7038. '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
  7039. array('HTMLPurifier_Lexer', 'CDATACallback'),
  7040. $string
  7041. );
  7042. }
  7043. /**
  7044. * Special Internet Explorer conditional comments should be removed.
  7045. * @param string $string HTML string to process.
  7046. * @return string HTML with conditional comments removed.
  7047. */
  7048. protected static function removeIEConditional($string)
  7049. {
  7050. return preg_replace(
  7051. '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
  7052. '',
  7053. $string
  7054. );
  7055. }
  7056. /**
  7057. * Callback function for escapeCDATA() that does the work.
  7058. *
  7059. * @warning Though this is public in order to let the callback happen,
  7060. * calling it directly is not recommended.
  7061. * @param array $matches PCRE matches array, with index 0 the entire match
  7062. * and 1 the inside of the CDATA section.
  7063. * @return string Escaped internals of the CDATA section.
  7064. */
  7065. protected static function CDATACallback($matches)
  7066. {
  7067. // not exactly sure why the character set is needed, but whatever
  7068. return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
  7069. }
  7070. /**
  7071. * Takes a piece of HTML and normalizes it by converting entities, fixing
  7072. * encoding, extracting bits, and other good stuff.
  7073. * @param string $html HTML.
  7074. * @param HTMLPurifier_Config $config
  7075. * @param HTMLPurifier_Context $context
  7076. * @return string
  7077. * @todo Consider making protected
  7078. */
  7079. public function normalize($html, $config, $context)
  7080. {
  7081. // normalize newlines to \n
  7082. if ($config->get('Core.NormalizeNewlines')) {
  7083. $html = str_replace("\r\n", "\n", $html);
  7084. $html = str_replace("\r", "\n", $html);
  7085. }
  7086. if ($config->get('HTML.Trusted')) {
  7087. // escape convoluted CDATA
  7088. $html = $this->escapeCommentedCDATA($html);
  7089. }
  7090. // escape CDATA
  7091. $html = $this->escapeCDATA($html);
  7092. $html = $this->removeIEConditional($html);
  7093. // extract body from document if applicable
  7094. if ($config->get('Core.ConvertDocumentToFragment')) {
  7095. $e = false;
  7096. if ($config->get('Core.CollectErrors')) {
  7097. $e =& $context->get('ErrorCollector');
  7098. }
  7099. $new_html = $this->extractBody($html);
  7100. if ($e && $new_html != $html) {
  7101. $e->send(E_WARNING, 'Lexer: Extracted body');
  7102. }
  7103. $html = $new_html;
  7104. }
  7105. // expand entities that aren't the big five
  7106. if ($config->get('Core.LegacyEntityDecoder')) {
  7107. $html = $this->_entity_parser->substituteNonSpecialEntities($html);
  7108. }
  7109. // clean into wellformed UTF-8 string for an SGML context: this has
  7110. // to be done after entity expansion because the entities sometimes
  7111. // represent non-SGML characters (horror, horror!)
  7112. $html = HTMLPurifier_Encoder::cleanUTF8($html);
  7113. // if processing instructions are to removed, remove them now
  7114. if ($config->get('Core.RemoveProcessingInstructions')) {
  7115. $html = preg_replace('#<\?.+?\?>#s', '', $html);
  7116. }
  7117. $hidden_elements = $config->get('Core.HiddenElements');
  7118. if ($config->get('Core.AggressivelyRemoveScript') &&
  7119. !($config->get('HTML.Trusted') || !$config->get('Core.RemoveScriptContents')
  7120. || empty($hidden_elements["script"]))) {
  7121. $html = preg_replace('#<script[^>]*>.*?</script>#i', '', $html);
  7122. }
  7123. return $html;
  7124. }
  7125. /**
  7126. * Takes a string of HTML (fragment or document) and returns the content
  7127. * @todo Consider making protected
  7128. */
  7129. public function extractBody($html)
  7130. {
  7131. $matches = array();
  7132. $result = preg_match('|(.*?)<body[^>]*>(.*)</body>|is', $html, $matches);
  7133. if ($result) {
  7134. // Make sure it's not in a comment
  7135. $comment_start = strrpos($matches[1], '<!--');
  7136. $comment_end = strrpos($matches[1], '-->');
  7137. if ($comment_start === false ||
  7138. ($comment_end !== false && $comment_end > $comment_start)) {
  7139. return $matches[2];
  7140. }
  7141. }
  7142. return $html;
  7143. }
  7144. }
  7145. /**
  7146. * Abstract base node class that all others inherit from.
  7147. *
  7148. * Why do we not use the DOM extension? (1) It is not always available,
  7149. * (2) it has funny constraints on the data it can represent,
  7150. * whereas we want a maximally flexible representation, and (3) its
  7151. * interface is a bit cumbersome.
  7152. */
  7153. abstract class HTMLPurifier_Node
  7154. {
  7155. /**
  7156. * Line number of the start token in the source document
  7157. * @type int
  7158. */
  7159. public $line;
  7160. /**
  7161. * Column number of the start token in the source document. Null if unknown.
  7162. * @type int
  7163. */
  7164. public $col;
  7165. /**
  7166. * Lookup array of processing that this token is exempt from.
  7167. * Currently, valid values are "ValidateAttributes".
  7168. * @type array
  7169. */
  7170. public $armor = array();
  7171. /**
  7172. * When true, this node should be ignored as non-existent.
  7173. *
  7174. * Who is responsible for ignoring dead nodes? FixNesting is
  7175. * responsible for removing them before passing on to child
  7176. * validators.
  7177. */
  7178. public $dead = false;
  7179. /**
  7180. * Returns a pair of start and end tokens, where the end token
  7181. * is null if it is not necessary. Does not include children.
  7182. * @type array
  7183. */
  7184. abstract public function toTokenPair();
  7185. }
  7186. /**
  7187. * Class that handles operations involving percent-encoding in URIs.
  7188. *
  7189. * @warning
  7190. * Be careful when reusing instances of PercentEncoder. The object
  7191. * you use for normalize() SHOULD NOT be used for encode(), or
  7192. * vice-versa.
  7193. */
  7194. class HTMLPurifier_PercentEncoder
  7195. {
  7196. /**
  7197. * Reserved characters to preserve when using encode().
  7198. * @type array
  7199. */
  7200. protected $preserve = array();
  7201. /**
  7202. * String of characters that should be preserved while using encode().
  7203. * @param bool $preserve
  7204. */
  7205. public function __construct($preserve = false)
  7206. {
  7207. // unreserved letters, ought to const-ify
  7208. for ($i = 48; $i <= 57; $i++) { // digits
  7209. $this->preserve[$i] = true;
  7210. }
  7211. for ($i = 65; $i <= 90; $i++) { // upper-case
  7212. $this->preserve[$i] = true;
  7213. }
  7214. for ($i = 97; $i <= 122; $i++) { // lower-case
  7215. $this->preserve[$i] = true;
  7216. }
  7217. $this->preserve[45] = true; // Dash -
  7218. $this->preserve[46] = true; // Period .
  7219. $this->preserve[95] = true; // Underscore _
  7220. $this->preserve[126]= true; // Tilde ~
  7221. // extra letters not to escape
  7222. if ($preserve !== false) {
  7223. for ($i = 0, $c = strlen($preserve); $i < $c; $i++) {
  7224. $this->preserve[ord($preserve[$i])] = true;
  7225. }
  7226. }
  7227. }
  7228. /**
  7229. * Our replacement for urlencode, it encodes all non-reserved characters,
  7230. * as well as any extra characters that were instructed to be preserved.
  7231. * @note
  7232. * Assumes that the string has already been normalized, making any
  7233. * and all percent escape sequences valid. Percents will not be
  7234. * re-escaped, regardless of their status in $preserve
  7235. * @param string $string String to be encoded
  7236. * @return string Encoded string.
  7237. */
  7238. public function encode($string)
  7239. {
  7240. $ret = '';
  7241. for ($i = 0, $c = strlen($string); $i < $c; $i++) {
  7242. if ($string[$i] !== '%' && !isset($this->preserve[$int = ord($string[$i])])) {
  7243. $ret .= '%' . sprintf('%02X', $int);
  7244. } else {
  7245. $ret .= $string[$i];
  7246. }
  7247. }
  7248. return $ret;
  7249. }
  7250. /**
  7251. * Fix up percent-encoding by decoding unreserved characters and normalizing.
  7252. * @warning This function is affected by $preserve, even though the
  7253. * usual desired behavior is for this not to preserve those
  7254. * characters. Be careful when reusing instances of PercentEncoder!
  7255. * @param string $string String to normalize
  7256. * @return string
  7257. */
  7258. public function normalize($string)
  7259. {
  7260. if ($string == '') {
  7261. return '';
  7262. }
  7263. $parts = explode('%', $string);
  7264. $ret = array_shift($parts);
  7265. foreach ($parts as $part) {
  7266. $length = strlen($part);
  7267. if ($length < 2) {
  7268. $ret .= '%25' . $part;
  7269. continue;
  7270. }
  7271. $encoding = substr($part, 0, 2);
  7272. $text = substr($part, 2);
  7273. if (!ctype_xdigit($encoding)) {
  7274. $ret .= '%25' . $part;
  7275. continue;
  7276. }
  7277. $int = hexdec($encoding);
  7278. if (isset($this->preserve[$int])) {
  7279. $ret .= chr($int) . $text;
  7280. continue;
  7281. }
  7282. $encoding = strtoupper($encoding);
  7283. $ret .= '%' . $encoding . $text;
  7284. }
  7285. return $ret;
  7286. }
  7287. }
  7288. /**
  7289. * Generic property list implementation
  7290. */
  7291. class HTMLPurifier_PropertyList
  7292. {
  7293. /**
  7294. * Internal data-structure for properties.
  7295. * @type array
  7296. */
  7297. protected $data = array();
  7298. /**
  7299. * Parent plist.
  7300. * @type HTMLPurifier_PropertyList
  7301. */
  7302. protected $parent;
  7303. /**
  7304. * Cache.
  7305. * @type array
  7306. */
  7307. protected $cache;
  7308. /**
  7309. * @param HTMLPurifier_PropertyList $parent Parent plist
  7310. */
  7311. public function __construct($parent = null)
  7312. {
  7313. $this->parent = $parent;
  7314. }
  7315. /**
  7316. * Recursively retrieves the value for a key
  7317. * @param string $name
  7318. * @throws HTMLPurifier_Exception
  7319. */
  7320. public function get($name)
  7321. {
  7322. if ($this->has($name)) {
  7323. return $this->data[$name];
  7324. }
  7325. // possible performance bottleneck, convert to iterative if necessary
  7326. if ($this->parent) {
  7327. return $this->parent->get($name);
  7328. }
  7329. throw new HTMLPurifier_Exception("Key '$name' not found");
  7330. }
  7331. /**
  7332. * Sets the value of a key, for this plist
  7333. * @param string $name
  7334. * @param mixed $value
  7335. */
  7336. public function set($name, $value)
  7337. {
  7338. $this->data[$name] = $value;
  7339. }
  7340. /**
  7341. * Returns true if a given key exists
  7342. * @param string $name
  7343. * @return bool
  7344. */
  7345. public function has($name)
  7346. {
  7347. return array_key_exists($name, $this->data);
  7348. }
  7349. /**
  7350. * Resets a value to the value of it's parent, usually the default. If
  7351. * no value is specified, the entire plist is reset.
  7352. * @param string $name
  7353. */
  7354. public function reset($name = null)
  7355. {
  7356. if ($name == null) {
  7357. $this->data = array();
  7358. } else {
  7359. unset($this->data[$name]);
  7360. }
  7361. }
  7362. /**
  7363. * Squashes this property list and all of its property lists into a single
  7364. * array, and returns the array. This value is cached by default.
  7365. * @param bool $force If true, ignores the cache and regenerates the array.
  7366. * @return array
  7367. */
  7368. public function squash($force = false)
  7369. {
  7370. if ($this->cache !== null && !$force) {
  7371. return $this->cache;
  7372. }
  7373. if ($this->parent) {
  7374. return $this->cache = array_merge($this->parent->squash($force), $this->data);
  7375. } else {
  7376. return $this->cache = $this->data;
  7377. }
  7378. }
  7379. /**
  7380. * Returns the parent plist.
  7381. * @return HTMLPurifier_PropertyList
  7382. */
  7383. public function getParent()
  7384. {
  7385. return $this->parent;
  7386. }
  7387. /**
  7388. * Sets the parent plist.
  7389. * @param HTMLPurifier_PropertyList $plist Parent plist
  7390. */
  7391. public function setParent($plist)
  7392. {
  7393. $this->parent = $plist;
  7394. }
  7395. }
  7396. /**
  7397. * Property list iterator. Do not instantiate this class directly.
  7398. */
  7399. class HTMLPurifier_PropertyListIterator extends FilterIterator
  7400. {
  7401. /**
  7402. * @type int
  7403. */
  7404. protected $l;
  7405. /**
  7406. * @type string
  7407. */
  7408. protected $filter;
  7409. /**
  7410. * @param Iterator $iterator Array of data to iterate over
  7411. * @param string $filter Optional prefix to only allow values of
  7412. */
  7413. public function __construct(Iterator $iterator, $filter = null)
  7414. {
  7415. parent::__construct($iterator);
  7416. $this->l = strlen($filter);
  7417. $this->filter = $filter;
  7418. }
  7419. /**
  7420. * @return bool
  7421. */
  7422. public function accept()
  7423. {
  7424. $key = $this->getInnerIterator()->key();
  7425. if (strncmp($key, $this->filter, $this->l) !== 0) {
  7426. return false;
  7427. }
  7428. return true;
  7429. }
  7430. }
  7431. /**
  7432. * A simple array-backed queue, based off of the classic Okasaki
  7433. * persistent amortized queue. The basic idea is to maintain two
  7434. * stacks: an input stack and an output stack. When the output
  7435. * stack runs out, reverse the input stack and use it as the output
  7436. * stack.
  7437. *
  7438. * We don't use the SPL implementation because it's only supported
  7439. * on PHP 5.3 and later.
  7440. *
  7441. * Exercise: Prove that push/pop on this queue take amortized O(1) time.
  7442. *
  7443. * Exercise: Extend this queue to be a deque, while preserving amortized
  7444. * O(1) time. Some care must be taken on rebalancing to avoid quadratic
  7445. * behaviour caused by repeatedly shuffling data from the input stack
  7446. * to the output stack and back.
  7447. */
  7448. class HTMLPurifier_Queue {
  7449. private $input;
  7450. private $output;
  7451. public function __construct($input = array()) {
  7452. $this->input = $input;
  7453. $this->output = array();
  7454. }
  7455. /**
  7456. * Shifts an element off the front of the queue.
  7457. */
  7458. public function shift() {
  7459. if (empty($this->output)) {
  7460. $this->output = array_reverse($this->input);
  7461. $this->input = array();
  7462. }
  7463. if (empty($this->output)) {
  7464. return NULL;
  7465. }
  7466. return array_pop($this->output);
  7467. }
  7468. /**
  7469. * Pushes an element onto the front of the queue.
  7470. */
  7471. public function push($x) {
  7472. array_push($this->input, $x);
  7473. }
  7474. /**
  7475. * Checks if it's empty.
  7476. */
  7477. public function isEmpty() {
  7478. return empty($this->input) && empty($this->output);
  7479. }
  7480. }
  7481. /**
  7482. * Supertype for classes that define a strategy for modifying/purifying tokens.
  7483. *
  7484. * While HTMLPurifier's core purpose is fixing HTML into something proper,
  7485. * strategies provide plug points for extra configuration or even extra
  7486. * features, such as custom tags, custom parsing of text, etc.
  7487. */
  7488. abstract class HTMLPurifier_Strategy
  7489. {
  7490. /**
  7491. * Executes the strategy on the tokens.
  7492. *
  7493. * @param HTMLPurifier_Token[] $tokens Array of HTMLPurifier_Token objects to be operated on.
  7494. * @param HTMLPurifier_Config $config
  7495. * @param HTMLPurifier_Context $context
  7496. * @return HTMLPurifier_Token[] Processed array of token objects.
  7497. */
  7498. abstract public function execute($tokens, $config, $context);
  7499. }
  7500. /**
  7501. * This is in almost every respect equivalent to an array except
  7502. * that it keeps track of which keys were accessed.
  7503. *
  7504. * @warning For the sake of backwards compatibility with early versions
  7505. * of PHP 5, you must not use the $hash[$key] syntax; if you do
  7506. * our version of offsetGet is never called.
  7507. */
  7508. class HTMLPurifier_StringHash extends ArrayObject
  7509. {
  7510. /**
  7511. * @type array
  7512. */
  7513. protected $accessed = array();
  7514. /**
  7515. * Retrieves a value, and logs the access.
  7516. * @param mixed $index
  7517. * @return mixed
  7518. */
  7519. public function offsetGet($index)
  7520. {
  7521. $this->accessed[$index] = true;
  7522. return parent::offsetGet($index);
  7523. }
  7524. /**
  7525. * Returns a lookup array of all array indexes that have been accessed.
  7526. * @return array in form array($index => true).
  7527. */
  7528. public function getAccessed()
  7529. {
  7530. return $this->accessed;
  7531. }
  7532. /**
  7533. * Resets the access array.
  7534. */
  7535. public function resetAccessed()
  7536. {
  7537. $this->accessed = array();
  7538. }
  7539. }
  7540. /**
  7541. * Parses string hash files. File format is as such:
  7542. *
  7543. * DefaultKeyValue
  7544. * KEY: Value
  7545. * KEY2: Value2
  7546. * --MULTILINE-KEY--
  7547. * Multiline
  7548. * value.
  7549. *
  7550. * Which would output something similar to:
  7551. *
  7552. * array(
  7553. * 'ID' => 'DefaultKeyValue',
  7554. * 'KEY' => 'Value',
  7555. * 'KEY2' => 'Value2',
  7556. * 'MULTILINE-KEY' => "Multiline\nvalue.\n",
  7557. * )
  7558. *
  7559. * We use this as an easy to use file-format for configuration schema
  7560. * files, but the class itself is usage agnostic.
  7561. *
  7562. * You can use ---- to forcibly terminate parsing of a single string-hash;
  7563. * this marker is used in multi string-hashes to delimit boundaries.
  7564. */
  7565. class HTMLPurifier_StringHashParser
  7566. {
  7567. /**
  7568. * @type string
  7569. */
  7570. public $default = 'ID';
  7571. /**
  7572. * Parses a file that contains a single string-hash.
  7573. * @param string $file
  7574. * @return array
  7575. */
  7576. public function parseFile($file)
  7577. {
  7578. if (!file_exists($file)) {
  7579. return false;
  7580. }
  7581. $fh = fopen($file, 'r');
  7582. if (!$fh) {
  7583. return false;
  7584. }
  7585. $ret = $this->parseHandle($fh);
  7586. fclose($fh);
  7587. return $ret;
  7588. }
  7589. /**
  7590. * Parses a file that contains multiple string-hashes delimited by '----'
  7591. * @param string $file
  7592. * @return array
  7593. */
  7594. public function parseMultiFile($file)
  7595. {
  7596. if (!file_exists($file)) {
  7597. return false;
  7598. }
  7599. $ret = array();
  7600. $fh = fopen($file, 'r');
  7601. if (!$fh) {
  7602. return false;
  7603. }
  7604. while (!feof($fh)) {
  7605. $ret[] = $this->parseHandle($fh);
  7606. }
  7607. fclose($fh);
  7608. return $ret;
  7609. }
  7610. /**
  7611. * Internal parser that acepts a file handle.
  7612. * @note While it's possible to simulate in-memory parsing by using
  7613. * custom stream wrappers, if such a use-case arises we should
  7614. * factor out the file handle into its own class.
  7615. * @param resource $fh File handle with pointer at start of valid string-hash
  7616. * block.
  7617. * @return array
  7618. */
  7619. protected function parseHandle($fh)
  7620. {
  7621. $state = false;
  7622. $single = false;
  7623. $ret = array();
  7624. do {
  7625. $line = fgets($fh);
  7626. if ($line === false) {
  7627. break;
  7628. }
  7629. $line = rtrim($line, "\n\r");
  7630. if (!$state && $line === '') {
  7631. continue;
  7632. }
  7633. if ($line === '----') {
  7634. break;
  7635. }
  7636. if (strncmp('--#', $line, 3) === 0) {
  7637. // Comment
  7638. continue;
  7639. } elseif (strncmp('--', $line, 2) === 0) {
  7640. // Multiline declaration
  7641. $state = trim($line, '- ');
  7642. if (!isset($ret[$state])) {
  7643. $ret[$state] = '';
  7644. }
  7645. continue;
  7646. } elseif (!$state) {
  7647. $single = true;
  7648. if (strpos($line, ':') !== false) {
  7649. // Single-line declaration
  7650. list($state, $line) = explode(':', $line, 2);
  7651. $line = trim($line);
  7652. } else {
  7653. // Use default declaration
  7654. $state = $this->default;
  7655. }
  7656. }
  7657. if ($single) {
  7658. $ret[$state] = $line;
  7659. $single = false;
  7660. $state = false;
  7661. } else {
  7662. $ret[$state] .= "$line\n";
  7663. }
  7664. } while (!feof($fh));
  7665. return $ret;
  7666. }
  7667. }
  7668. /**
  7669. * Defines a mutation of an obsolete tag into a valid tag.
  7670. */
  7671. abstract class HTMLPurifier_TagTransform
  7672. {
  7673. /**
  7674. * Tag name to transform the tag to.
  7675. * @type string
  7676. */
  7677. public $transform_to;
  7678. /**
  7679. * Transforms the obsolete tag into the valid tag.
  7680. * @param HTMLPurifier_Token_Tag $tag Tag to be transformed.
  7681. * @param HTMLPurifier_Config $config Mandatory HTMLPurifier_Config object
  7682. * @param HTMLPurifier_Context $context Mandatory HTMLPurifier_Context object
  7683. */
  7684. abstract public function transform($tag, $config, $context);
  7685. /**
  7686. * Prepends CSS properties to the style attribute, creating the
  7687. * attribute if it doesn't exist.
  7688. * @warning Copied over from AttrTransform, be sure to keep in sync
  7689. * @param array $attr Attribute array to process (passed by reference)
  7690. * @param string $css CSS to prepend
  7691. */
  7692. protected function prependCSS(&$attr, $css)
  7693. {
  7694. $attr['style'] = isset($attr['style']) ? $attr['style'] : '';
  7695. $attr['style'] = $css . $attr['style'];
  7696. }
  7697. }
  7698. /**
  7699. * Abstract base token class that all others inherit from.
  7700. */
  7701. abstract class HTMLPurifier_Token
  7702. {
  7703. /**
  7704. * Line number node was on in source document. Null if unknown.
  7705. * @type int
  7706. */
  7707. public $line;
  7708. /**
  7709. * Column of line node was on in source document. Null if unknown.
  7710. * @type int
  7711. */
  7712. public $col;
  7713. /**
  7714. * Lookup array of processing that this token is exempt from.
  7715. * Currently, valid values are "ValidateAttributes" and
  7716. * "MakeWellFormed_TagClosedError"
  7717. * @type array
  7718. */
  7719. public $armor = array();
  7720. /**
  7721. * Used during MakeWellFormed. See Note [Injector skips]
  7722. * @type
  7723. */
  7724. public $skip;
  7725. /**
  7726. * @type
  7727. */
  7728. public $rewind;
  7729. /**
  7730. * @type
  7731. */
  7732. public $carryover;
  7733. /**
  7734. * @param string $n
  7735. * @return null|string
  7736. */
  7737. public function __get($n)
  7738. {
  7739. if ($n === 'type') {
  7740. trigger_error('Deprecated type property called; use instanceof', E_USER_NOTICE);
  7741. switch (get_class($this)) {
  7742. case 'HTMLPurifier_Token_Start':
  7743. return 'start';
  7744. case 'HTMLPurifier_Token_Empty':
  7745. return 'empty';
  7746. case 'HTMLPurifier_Token_End':
  7747. return 'end';
  7748. case 'HTMLPurifier_Token_Text':
  7749. return 'text';
  7750. case 'HTMLPurifier_Token_Comment':
  7751. return 'comment';
  7752. default:
  7753. return null;
  7754. }
  7755. }
  7756. }
  7757. /**
  7758. * Sets the position of the token in the source document.
  7759. * @param int $l
  7760. * @param int $c
  7761. */
  7762. public function position($l = null, $c = null)
  7763. {
  7764. $this->line = $l;
  7765. $this->col = $c;
  7766. }
  7767. /**
  7768. * Convenience function for DirectLex settings line/col position.
  7769. * @param int $l
  7770. * @param int $c
  7771. */
  7772. public function rawPosition($l, $c)
  7773. {
  7774. if ($c === -1) {
  7775. $l++;
  7776. }
  7777. $this->line = $l;
  7778. $this->col = $c;
  7779. }
  7780. /**
  7781. * Converts a token into its corresponding node.
  7782. */
  7783. abstract public function toNode();
  7784. }
  7785. /**
  7786. * Factory for token generation.
  7787. *
  7788. * @note Doing some benchmarking indicates that the new operator is much
  7789. * slower than the clone operator (even discounting the cost of the
  7790. * constructor). This class is for that optimization.
  7791. * Other then that, there's not much point as we don't
  7792. * maintain parallel HTMLPurifier_Token hierarchies (the main reason why
  7793. * you'd want to use an abstract factory).
  7794. * @todo Port DirectLex to use this
  7795. */
  7796. class HTMLPurifier_TokenFactory
  7797. {
  7798. // p stands for prototype
  7799. /**
  7800. * @type HTMLPurifier_Token_Start
  7801. */
  7802. private $p_start;
  7803. /**
  7804. * @type HTMLPurifier_Token_End
  7805. */
  7806. private $p_end;
  7807. /**
  7808. * @type HTMLPurifier_Token_Empty
  7809. */
  7810. private $p_empty;
  7811. /**
  7812. * @type HTMLPurifier_Token_Text
  7813. */
  7814. private $p_text;
  7815. /**
  7816. * @type HTMLPurifier_Token_Comment
  7817. */
  7818. private $p_comment;
  7819. /**
  7820. * Generates blank prototypes for cloning.
  7821. */
  7822. public function __construct()
  7823. {
  7824. $this->p_start = new HTMLPurifier_Token_Start('', array());
  7825. $this->p_end = new HTMLPurifier_Token_End('');
  7826. $this->p_empty = new HTMLPurifier_Token_Empty('', array());
  7827. $this->p_text = new HTMLPurifier_Token_Text('');
  7828. $this->p_comment = new HTMLPurifier_Token_Comment('');
  7829. }
  7830. /**
  7831. * Creates a HTMLPurifier_Token_Start.
  7832. * @param string $name Tag name
  7833. * @param array $attr Associative array of attributes
  7834. * @return HTMLPurifier_Token_Start Generated HTMLPurifier_Token_Start
  7835. */
  7836. public function createStart($name, $attr = array())
  7837. {
  7838. $p = clone $this->p_start;
  7839. $p->__construct($name, $attr);
  7840. return $p;
  7841. }
  7842. /**
  7843. * Creates a HTMLPurifier_Token_End.
  7844. * @param string $name Tag name
  7845. * @return HTMLPurifier_Token_End Generated HTMLPurifier_Token_End
  7846. */
  7847. public function createEnd($name)
  7848. {
  7849. $p = clone $this->p_end;
  7850. $p->__construct($name);
  7851. return $p;
  7852. }
  7853. /**
  7854. * Creates a HTMLPurifier_Token_Empty.
  7855. * @param string $name Tag name
  7856. * @param array $attr Associative array of attributes
  7857. * @return HTMLPurifier_Token_Empty Generated HTMLPurifier_Token_Empty
  7858. */
  7859. public function createEmpty($name, $attr = array())
  7860. {
  7861. $p = clone $this->p_empty;
  7862. $p->__construct($name, $attr);
  7863. return $p;
  7864. }
  7865. /**
  7866. * Creates a HTMLPurifier_Token_Text.
  7867. * @param string $data Data of text token
  7868. * @return HTMLPurifier_Token_Text Generated HTMLPurifier_Token_Text
  7869. */
  7870. public function createText($data)
  7871. {
  7872. $p = clone $this->p_text;
  7873. $p->__construct($data);
  7874. return $p;
  7875. }
  7876. /**
  7877. * Creates a HTMLPurifier_Token_Comment.
  7878. * @param string $data Data of comment token
  7879. * @return HTMLPurifier_Token_Comment Generated HTMLPurifier_Token_Comment
  7880. */
  7881. public function createComment($data)
  7882. {
  7883. $p = clone $this->p_comment;
  7884. $p->__construct($data);
  7885. return $p;
  7886. }
  7887. }
  7888. /**
  7889. * HTML Purifier's internal representation of a URI.
  7890. * @note
  7891. * Internal data-structures are completely escaped. If the data needs
  7892. * to be used in a non-URI context (which is very unlikely), be sure
  7893. * to decode it first. The URI may not necessarily be well-formed until
  7894. * validate() is called.
  7895. */
  7896. class HTMLPurifier_URI
  7897. {
  7898. /**
  7899. * @type string
  7900. */
  7901. public $scheme;
  7902. /**
  7903. * @type string
  7904. */
  7905. public $userinfo;
  7906. /**
  7907. * @type string
  7908. */
  7909. public $host;
  7910. /**
  7911. * @type int
  7912. */
  7913. public $port;
  7914. /**
  7915. * @type string
  7916. */
  7917. public $path;
  7918. /**
  7919. * @type string
  7920. */
  7921. public $query;
  7922. /**
  7923. * @type string
  7924. */
  7925. public $fragment;
  7926. /**
  7927. * @param string $scheme
  7928. * @param string $userinfo
  7929. * @param string $host
  7930. * @param int $port
  7931. * @param string $path
  7932. * @param string $query
  7933. * @param string $fragment
  7934. * @note Automatically normalizes scheme and port
  7935. */
  7936. public function __construct($scheme, $userinfo, $host, $port, $path, $query, $fragment)
  7937. {
  7938. $this->scheme = is_null($scheme) || ctype_lower($scheme) ? $scheme : strtolower($scheme);
  7939. $this->userinfo = $userinfo;
  7940. $this->host = $host;
  7941. $this->port = is_null($port) ? $port : (int)$port;
  7942. $this->path = $path;
  7943. $this->query = $query;
  7944. $this->fragment = $fragment;
  7945. }
  7946. /**
  7947. * Retrieves a scheme object corresponding to the URI's scheme/default
  7948. * @param HTMLPurifier_Config $config
  7949. * @param HTMLPurifier_Context $context
  7950. * @return HTMLPurifier_URIScheme Scheme object appropriate for validating this URI
  7951. */
  7952. public function getSchemeObj($config, $context)
  7953. {
  7954. $registry = HTMLPurifier_URISchemeRegistry::instance();
  7955. if ($this->scheme !== null) {
  7956. $scheme_obj = $registry->getScheme($this->scheme, $config, $context);
  7957. if (!$scheme_obj) {
  7958. return false;
  7959. } // invalid scheme, clean it out
  7960. } else {
  7961. // no scheme: retrieve the default one
  7962. $def = $config->getDefinition('URI');
  7963. $scheme_obj = $def->getDefaultScheme($config, $context);
  7964. if (!$scheme_obj) {
  7965. if ($def->defaultScheme !== null) {
  7966. // something funky happened to the default scheme object
  7967. trigger_error(
  7968. 'Default scheme object "' . $def->defaultScheme . '" was not readable',
  7969. E_USER_WARNING
  7970. );
  7971. } // suppress error if it's null
  7972. return false;
  7973. }
  7974. }
  7975. return $scheme_obj;
  7976. }
  /**
   * Scheme-independent validation of this URI. Each component is checked
   * or percent-encoded in place so that the object ends up in a form
   * compatible with the generic syntax of RFC 3986.
   *
   * @param HTMLPurifier_Config $config
   * @param HTMLPurifier_Context $context
   * @return bool As written this always returns true: components that do
   *              not validate are dropped or re-encoded, never rejected.
   */
  public function validate($config, $context)
  {
      // Character sets taken from the RFC 3986 ABNF. They are handed to
      // HTMLPurifier_PercentEncoder (presumably as the set of characters
      // that may stay unencoded -- confirm against that class).
      $sub_delims = '!$&\'()*+,;=';
      $pchar = $sub_delims . ':@';

      // --- host: a host that fails validation is treated as absent
      if ($this->host !== null) {
          $host_validator = new HTMLPurifier_AttrDef_URI_Host();
          $checked_host = $host_validator->validate($this->host, $config, $context);
          $this->host = ($checked_host === false) ? null : $checked_host;
      }

      // --- scheme
      // Whether the scheme is in the registry is deliberately NOT checked
      // here, because a URIFilter may still turn a disallowed URI into an
      // allowed one. The only thing done is dropping a scheme that equals
      // the default scheme when there is no usable host, since relative
      // paths behave poorly when a scheme is present.
      // (Parentheses only spell out the precedence PHP already applies.)
      if (($this->scheme !== null && $this->host === null) || $this->host === '') {
          $uri_def = $config->getDefinition('URI');
          if ($uri_def->defaultScheme === $this->scheme) {
              $this->scheme = null;
          }
      }

      // --- userinfo
      if ($this->userinfo !== null) {
          $userinfo_encoder = new HTMLPurifier_PercentEncoder($sub_delims . ':');
          $this->userinfo = $userinfo_encoder->encode($this->userinfo);
      }

      // --- port: anything outside 1..65535 is discarded
      if ($this->port !== null && ($this->port < 1 || $this->port > 65535)) {
          $this->port = null;
      }

      // --- path
      $path_encoder = new HTMLPurifier_PercentEncoder($pchar . '/');
      if ($this->host !== null) {
          // path-abempty (this branch also covers $this->host === ''), e.g.
          //      http://www.example.com/my/path
          //      //www.example.com/my/path
          //      file:///my/path
          //      ///my/path
          // Whether the empty-host forms are acceptable is decided per
          // scheme later on.
          $this->path = $path_encoder->encode($this->path);
      } elseif ($this->path === '') {
          // path-empty; reassigned just to be safe
          $this->path = '';
      } elseif ($this->path[0] === '/') {
          // path-absolute, e.g. http:/my/path or /my/path.
          // A leading '//' without a host can appear when the host was
          // stripped out above (http://my/path, //my/path); the path is
          // blanked in that case.
          $starts_with_two_slashes = strlen($this->path) >= 2 && $this->path[1] === '/';
          $this->path = $starts_with_two_slashes
              ? ''
              : $path_encoder->encode($this->path);
      } elseif ($this->scheme !== null) {
          // path-rootless, e.g. http:my/path
          $this->path = $path_encoder->encode($this->path);
      } else {
          // path-noscheme, e.g. my/path: the first segment is encoded with
          // a set that lacks ':' so it cannot be mistaken for a scheme.
          $first_segment_encoder = new HTMLPurifier_PercentEncoder($sub_delims . '@');
          $slash_at = strpos($this->path, '/');
          if ($slash_at === false) {
              $this->path = $first_segment_encoder->encode($this->path);
          } else {
              $head = substr($this->path, 0, $slash_at);
              $tail = substr($this->path, $slash_at);
              $this->path = $first_segment_encoder->encode($head) . $path_encoder->encode($tail);
          }
      }

      // --- query and fragment share one encoder
      $query_fragment_encoder = new HTMLPurifier_PercentEncoder($pchar . '/?');
      if ($this->query !== null) {
          $this->query = $query_fragment_encoder->encode($this->query);
      }
      if ($this->fragment !== null) {
          $this->fragment = $query_fragment_encoder->encode($this->fragment);
      }

      return true;
  }
  /**
   * Serializes this URI object back into a string.
   * @return string URI appropriate for output
   */
  public function toString()
  {
      $out = '';

      if ($this->scheme !== null) {
          $out .= $this->scheme . ':';
      }

      // An authority is emitted only when the host is non-null. A null host
      // (http:foo-bar) and an empty-string host (http:///foo-bar) render
      // differently, which is why '' still produces the '//' prefix.
      // How browsers parse the result depends on the scheme (file:///foo
      // versus http:///foo), so getting that right is left to the scheme
      // objects.
      if ($this->host !== null) {
          $out .= '//';
          if ($this->userinfo !== null) {
              $out .= $this->userinfo . '@';
          }
          $out .= $this->host;
          if ($this->port !== null) {
              $out .= ':' . $this->port;
          }
      }

      $out .= $this->path;

      if ($this->query !== null) {
          $out .= '?' . $this->query;
      }
      if ($this->fragment !== null) {
          $out .= '#' . $this->fragment;
      }

      return $out;
  }
  /**
   * Tells whether this URL can be regarded as 'local': either it has no
   * host at all, or its host is identical to the host recorded on the URI
   * definition obtained from the configuration.
   *
   * No scheme checking is performed, so this is mostly suitable for
   * metadata that does not care about protocol security. isBenign is
   * probably what you actually want.
   * @param HTMLPurifier_Config $config
   * @param HTMLPurifier_Context $context
   * @return bool
   */
  public function isLocal($config, $context)
  {
      if ($this->host === null) {
          return true;
      }
      // Strict comparison: the hosts must be the same string.
      return $config->getDefinition('URI')->host === $this->host;
  }
  /**
   * Returns true if this URL should be considered a 'benign' URL,
   * that is:
   *
   *      - It is a local URL (isLocal), and
   *      - It has an equal or better level of security than the default
   *        scheme of the URI definition.
   *
   * @param HTMLPurifier_Config $config
   * @param HTMLPurifier_Context $context
   * @return bool False when the URL is not local, when no scheme object
   *              can be obtained for it, or when the default scheme is
   *              secure and this URL's scheme is not; true otherwise.
   */
  public function isBenign($config, $context)
  {
      if (!$this->isLocal($config, $context)) {
          return false;
      }

      $scheme_obj = $this->getSchemeObj($config, $context);
      if (!$scheme_obj) {
          return false; // conservative approach
      }

      $current_scheme_obj = $config->getDefinition('URI')->getDefaultScheme($config, $context);

      // getDefaultScheme() may yield no object (the registry returns null
      // for a scheme it cannot provide). Reading ->secure on that would
      // raise a warning, so the downgrade check only runs when a default
      // scheme object exists. Without one there is no secure baseline to
      // compare against and the URL is still reported as benign, which is
      // the value the unguarded code produced.
      if ($current_scheme_obj && $current_scheme_obj->secure && !$scheme_obj->secure) {
          return false;
      }

      return true;
  }
  8175. }
  /**
   * Definition object for URI handling: keeps the base URI, home host and
   * default scheme, plus the URI filters that are run before and after
   * scheme validation.
   */
  class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition
  {

      public $type = 'URI';

      /** Active filters run by filter(), keyed by filter name. */
      protected $filters = array();

      /** Active filters run by postFilter(), keyed by filter name. */
      protected $postFilters = array();

      /** Filters known to this definition but not yet activated, keyed by name. */
      protected $registeredFilters = array();

      /**
       * HTMLPurifier_URI object of the base specified at %URI.Base
       */
      public $base;

      /**
       * String host to consider "home" base, derived off of $base
       */
      public $host;

      /**
       * Name of default scheme based on %URI.DefaultScheme and %URI.Base
       */
      public $defaultScheme;

      /**
       * Registers the built-in filters; they become active only in
       * setupFilters().
       */
      public function __construct()
      {
          $builtin_filters = array(
              new HTMLPurifier_URIFilter_DisableExternal(),
              new HTMLPurifier_URIFilter_DisableExternalResources(),
              new HTMLPurifier_URIFilter_DisableResources(),
              new HTMLPurifier_URIFilter_HostBlacklist(),
              new HTMLPurifier_URIFilter_SafeIframe(),
              new HTMLPurifier_URIFilter_MakeAbsolute(),
              new HTMLPurifier_URIFilter_Munge(),
          );
          foreach ($builtin_filters as $builtin) {
              $this->registerFilter($builtin);
          }
      }

      /**
       * Makes a filter available under its name without activating it.
       * @param HTMLPurifier_URIFilter $filter
       */
      public function registerFilter($filter)
      {
          $this->registeredFilters[$filter->name] = $filter;
      }

      /**
       * Prepares a filter and, unless preparation returns false, activates
       * it in the pre- or post-validation list according to $filter->post.
       * @param HTMLPurifier_URIFilter $filter
       * @param HTMLPurifier_Config $config
       */
      public function addFilter($filter, $config)
      {
          // Only a strict false vetoes the filter; null is accepted for
          // backwards compatibility.
          if ($filter->prepare($config) === false) {
              return;
          }
          if ($filter->post) {
              $this->postFilters[$filter->name] = $filter;
          } else {
              $this->filters[$filter->name] = $filter;
          }
      }

      /**
       * @param HTMLPurifier_Config $config
       */
      protected function doSetup($config)
      {
          $this->setupMemberVariables($config);
          $this->setupFilters($config);
      }

      /**
       * Activates every registered filter that is either flagged
       * always_load or whose %URI.<name> directive is neither false nor
       * null, then discards the registration list.
       * @param HTMLPurifier_Config $config
       */
      protected function setupFilters($config)
      {
          foreach ($this->registeredFilters as $filter_name => $candidate) {
              if (!$candidate->always_load) {
                  $directive = $config->get('URI.' . $filter_name);
                  if ($directive === false || $directive === null) {
                      continue;
                  }
              }
              $this->addFilter($candidate, $config);
          }
          unset($this->registeredFilters);
      }

      /**
       * Fills $host, $base and $defaultScheme from %URI.Host, %URI.Base and
       * %URI.DefaultScheme. Values derived from the base URI take effect
       * only where the explicit setting is null.
       * @param HTMLPurifier_Config $config
       */
      protected function setupMemberVariables($config)
      {
          $this->host = $config->get('URI.Host');

          $configured_base = $config->get('URI.Base');
          if ($configured_base !== null) {
              $uri_parser = new HTMLPurifier_URIParser();
              $this->base = $uri_parser->parse($configured_base);
              $this->defaultScheme = $this->base->scheme;
              if ($this->host === null) {
                  $this->host = $this->base->host;
              }
          }

          if ($this->defaultScheme === null) {
              $this->defaultScheme = $config->get('URI.DefaultScheme');
          }
      }

      /**
       * Looks up the scheme object for $defaultScheme in the scheme registry.
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return HTMLPurifier_URIScheme|null Whatever the registry returns
       */
      public function getDefaultScheme($config, $context)
      {
          $registry = HTMLPurifier_URISchemeRegistry::instance();
          return $registry->getScheme($this->defaultScheme, $config, $context);
      }

      /**
       * Runs the pre-validation filters in order, stopping at the first
       * one that returns a falsy value.
       * @param HTMLPurifier_URI $uri Passed by reference; filters may change it
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool False as soon as one filter fails, true otherwise
       */
      public function filter(&$uri, $config, $context)
      {
          foreach ($this->filters as $active_filter) {
              if (!$active_filter->filter($uri, $config, $context)) {
                  return false;
              }
          }
          return true;
      }

      /**
       * Same as filter(), but for the post-validation filters.
       * @param HTMLPurifier_URI $uri Passed by reference; filters may change it
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool False as soon as one filter fails, true otherwise
       */
      public function postFilter(&$uri, $config, $context)
      {
          foreach ($this->postFilters as $active_filter) {
              if (!$active_filter->filter($uri, $config, $context)) {
                  return false;
              }
          }
          return true;
      }
  }
  /**
   * Base class for chainable filters that do custom URI processing.
   *
   * A filter may transform a URI object or reject (blacklist) it. A filter
   * named Foo needs a matching configuration directive %URI.Foo unless its
   * always_load flag is true.
   *
   * Context entries that may be available while URIFilters run:
   *
   *      - EmbeddedURI: true if the URI is an embedded resource that will
   *        be loaded automatically on page load
   *      - CurrentToken: a reference to the token currently being processed
   *      - CurrentAttr: the name of the attribute currently being processed
   *      - CurrentCSSProperty: the name of the CSS property currently
   *        being processed (if applicable)
   *
   * @warning Filters run before scheme object validation. If a filter
   *      needs a specific scheme object, it must check that the object
   *      exists. This ordering is what lets filters convert proprietary
   *      URI schemes into regular ones.
   */
  abstract class HTMLPurifier_URIFilter
  {

      /**
       * Unique identifier of filter.
       * @type string
       */
      public $name;

      /**
       * True if this filter should be run after scheme validation.
       * @type bool
       */
      public $post = false;

      /**
       * True if this filter should always be loaded, which allows a filter
       * named Foo to exist without a corresponding %URI.Foo directive.
       * @type bool
       */
      public $always_load = false;

      /**
       * Initialization hook. A return value of false means the filter
       * should not be considered active. This base implementation does
       * nothing and reports success.
       * @param HTMLPurifier_Config $config
       * @return bool
       */
      public function prepare($config)
      {
          return true;
      }

      /**
       * Filters a URI object.
       * @param HTMLPurifier_URI $uri Reference to URI object variable
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool Whether processing should continue: false means the
       *         URL is no good, true means carry on. All changes are
       *         committed directly on the URI object.
       */
      abstract public function filter(&$uri, $config, $context);
  }
  /**
   * Parses a URI into the components and fragment identifier as specified
   * by RFC 3986.
   */
  class HTMLPurifier_URIParser
  {

      /**
       * Instance of HTMLPurifier_PercentEncoder to do normalization with.
       */
      protected $percentEncoder;

      public function __construct()
      {
          $this->percentEncoder = new HTMLPurifier_PercentEncoder();
      }

      /**
       * Parses a URI.
       * @param $uri string URI to parse
       * @return HTMLPurifier_URI|bool Representation of the URI, or false if
       *         the splitting regexp does not match. The representation has
       *         not been validated yet and may not conform to RFC.
       */
      public function parse($uri)
      {
          $uri = $this->percentEncoder->normalize($uri);

          // Regexp is as per Appendix B.
          // Note that ["<>] are an addition to the RFC's recommended
          // characters, because they represent external delimiters.
          $r_URI = '!'.
              '(([a-zA-Z0-9\.\+\-]+):)?'. // 2. Scheme
              '(//([^/?#"<>]*))?'. // 4. Authority
              '([^?#"<>]*)'.       // 5. Path
              '(\?([^#"<>]*))?'.   // 7. Query
              '(#([^"<>]*))?'.     // 8. Fragment
              '!';

          $matches = array();
          $result = preg_match($r_URI, $uri, $matches);

          if (!$result) return false; // *really* invalid URI

          // Separate out the parts. The odd-numbered groups include their
          // delimiter ("http:", "//...", "?...", "#..."), so testing them
          // with empty() reliably tells "absent" from "present but empty".
          $scheme     = !empty($matches[1]) ? $matches[2] : null;
          $authority  = !empty($matches[3]) ? $matches[4] : null;
          $path       = $matches[5]; // always present, can be empty
          $query      = !empty($matches[6]) ? $matches[7] : null;
          $fragment   = !empty($matches[8]) ? $matches[9] : null;

          // further parse authority
          if ($authority !== null) {
              $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
              $matches = array();
              preg_match($r_authority, $authority, $matches);
              $userinfo   = !empty($matches[1]) ? $matches[2] : null;
              // The host group carries no delimiter, so empty() must not be
              // used on it: empty('0') is true and would turn the host "0"
              // into ''. Take the captured text verbatim; it is only
              // missing if the match itself failed.
              $host       = isset($matches[3]) ? $matches[3] : '';
              $port       = !empty($matches[4]) ? (int) $matches[5] : null;
          } else {
              $port = $host = $userinfo = null;
          }

          return new HTMLPurifier_URI(
              $scheme, $userinfo, $host, $port, $path, $query, $fragment);
      }
  }
  /**
   * Validator for the components of a URI for a specific scheme
   */
  abstract class HTMLPurifier_URIScheme
  {

      /**
       * Scheme's default port (integer). An explicit port equal to the
       * default port is removed from the URI.
       * @type int
       */
      public $default_port = null;

      /**
       * Whether or not URIs of this scheme are locatable by a browser:
       * http and ftp are accessible, while mailto and news are not.
       * @type bool
       */
      public $browsable = false;

      /**
       * Whether or not data transmitted over this scheme is encrypted.
       * https is secure, http is not.
       * @type bool
       */
      public $secure = false;

      /**
       * Whether or not the URI always uses <hier_part>; resolves edge cases
       * when making relative URIs absolute.
       * @type bool
       */
      public $hierarchical = false;

      /**
       * Whether or not the URI may omit a hostname when the scheme is
       * explicitly specified, a la file:///path/to/file. As of writing,
       * 'file' is the only scheme that browsers support this properly for.
       * @type bool
       */
      public $may_omit_host = false;

      /**
       * Validates the components of a URI for a specific scheme.
       * @param HTMLPurifier_URI $uri Reference to a HTMLPurifier_URI object
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool success or failure
       */
      abstract public function doValidate(&$uri, $config, $context);

      /**
       * Public entry point for scheme validation. Applies the defaults
       * shared by all schemes (default-port removal, missing-host repair)
       * and then delegates to doValidate(). Don't overload this method.
       * @param HTMLPurifier_URI $uri Reference to a HTMLPurifier_URI object
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool success or failure
       */
      public function validate(&$uri, $config, $context)
      {
          // Loose comparison on purpose, exactly as the ports are compared
          // throughout: a port equal to the default is dropped.
          if ($this->default_port == $uri->port) {
              $uri->port = null;
          }

          // kludge: browsers do funny things when the scheme but not the
          // authority is set
          $scheme_absent = is_null($uri->scheme);
          $host_blank = ($uri->host === '');

          // With a scheme present, a missing host is an error unless the
          // scheme may omit it. Without a scheme, a *blank* host is an
          // error regardless, since it renders as '///path', which most
          // browsers read as 'http://path'.
          $host_needs_repair =
              (!$this->may_omit_host && !$scheme_absent && ($host_blank || is_null($uri->host)))
              || ($scheme_absent && $host_blank);

          if ($host_needs_repair) {
              // A scheme-less URI whose path does not begin with '//' can
              // simply lose its host. For '////path' the host cannot be
              // nulled without changing the meaning, so that case is
              // handled like the scheme-present one below.
              $can_drop_host = $scheme_absent && substr($uri->path, 0, 2) != '//';

              if ($can_drop_host) {
                  $uri->host = null;
              } else {
                  // try to substitute the configured hostname
                  $configured_host = $config->get('URI.Host');
                  if (is_null($configured_host)) {
                      // nothing sensible can be done, reject the URL
                      return false;
                  }
                  $uri->host = $configured_host;
              }
          }

          return $this->doValidate($uri, $config, $context);
      }
  }
  /**
   * Registry for retrieving specific URI scheme validator objects.
   */
  class HTMLPurifier_URISchemeRegistry
  {

      /**
       * Retrieve sole instance of the registry.
       * @param HTMLPurifier_URISchemeRegistry $prototype Optional prototype to overload sole instance with,
       *                   or bool true to reset to default registry.
       * @return HTMLPurifier_URISchemeRegistry
       * @note Pass a registry object $prototype with a compatible interface and
       *       the function will copy it and return it all further times.
       */
      public static function instance($prototype = null)
      {
          static $instance = null;
          if ($prototype === true) {
              // Reset request. This has to be tested before the generic
              // "a prototype was given" branch, otherwise the boolean
              // itself would be stored and returned as the registry.
              $instance = new HTMLPurifier_URISchemeRegistry();
          } elseif ($prototype !== null) {
              $instance = $prototype;
          } elseif ($instance === null) {
              // first use: lazily create the default registry
              $instance = new HTMLPurifier_URISchemeRegistry();
          }
          return $instance;
      }

      /**
       * Cache of retrieved schemes.
       * @type HTMLPurifier_URIScheme[]
       */
      protected $schemes = array();

      /**
       * Retrieves a scheme validator object
       * @param string $scheme String scheme name like http or mailto
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return HTMLPurifier_URIScheme|null Null when the scheme is not
       *         allowed or no validator class exists for it
       */
      public function getScheme($scheme, $config, $context)
      {
          if (!$config) {
              $config = HTMLPurifier_Config::createDefault();
          }

          // important, otherwise attacker could include arbitrary file
          $allowed_schemes = $config->get('URI.AllowedSchemes');
          if (!$config->get('URI.OverrideAllowedSchemes') &&
              !isset($allowed_schemes[$scheme])
          ) {
              return;
          }

          // Served from the cache; with %URI.OverrideAllowedSchemes on,
          // this is the only way a scheme outside the allowed list can be
          // returned (it must have been register()ed).
          if (isset($this->schemes[$scheme])) {
              return $this->schemes[$scheme];
          }
          // Nothing cached: never derive a class name from a scheme that
          // is not explicitly allowed.
          if (!isset($allowed_schemes[$scheme])) {
              return;
          }

          $class = 'HTMLPurifier_URIScheme_' . $scheme;
          if (!class_exists($class)) {
              return;
          }
          $this->schemes[$scheme] = new $class();
          return $this->schemes[$scheme];
      }

      /**
       * Registers a custom scheme to the cache, bypassing reflection.
       * @param string $scheme Scheme name
       * @param HTMLPurifier_URIScheme $scheme_obj
       */
      public function register($scheme, $scheme_obj)
      {
          $this->schemes[$scheme] = $scheme_obj;
      }
  }
  /**
   * Converts lengths between the unit systems defined by CSS.
   */
  class HTMLPurifier_UnitConverter
  {

      const ENGLISH = 1;
      const METRIC = 2;
      const DIGITAL = 3;

      /**
       * Units information array. Units are grouped into measuring systems
       * (English, Metric); each unit maps to its conversion factor relative
       * to the smallest unit of its system. The numeric keys are special:
       * they hold the data needed to hop into another system, as a triple
       * (unit to leave from, multiplier, unit arrived at). Memory grows
       * O(n^2) with the number of systems, which is fine because there are
       * only a few.
       */
      protected static $units = array(
          self::ENGLISH => array(
              'px' => 3, // This is as per CSS 2.1 and Firefox. Your mileage may vary
              'pt' => 4,
              'pc' => 48,
              'in' => 288,
              self::METRIC => array('pt', '0.352777778', 'mm'),
          ),
          self::METRIC => array(
              'mm' => 1,
              'cm' => 10,
              self::ENGLISH => array('mm', '2.83464567', 'pt'),
          ),
      );

      /**
       * Minimum bcmath precision for output.
       * @type int
       */
      protected $outputPrecision;

      /**
       * Bcmath precision for internal calculations.
       * @type int
       */
      protected $internalPrecision;

      /**
       * Whether or not BCMath is available.
       * @type bool
       */
      private $bcmath;

      /**
       * @param int $output_precision Minimum number of significant figures in results
       * @param int $internal_precision Scale used for intermediate calculations
       * @param bool $force_no_bcmath When true, BCMath is not used even if present
       */
      public function __construct($output_precision = 4, $internal_precision = 10, $force_no_bcmath = false)
      {
          $this->outputPrecision = $output_precision;
          $this->internalPrecision = $internal_precision;
          $this->bcmath = !$force_no_bcmath && function_exists('bcmul');
      }

      /**
       * Converts a length object of one unit into another unit.
       * @param HTMLPurifier_Length $length
       *      Instance of HTMLPurifier_Length to convert. You must validate()
       *      it before passing it here!
       * @param string $to_unit
       *      Unit to convert to.
       * @return HTMLPurifier_Length|bool The converted length, or false when
       *      the input is invalid or either unit is unknown.
       * @note
       *      About precision: the conversion tracks the significant figures
       *      of the incoming value and tries to keep them. Results are
       *      fairly accurate up to nine digits. Some caveats:
       *          - Zero padding produced by the significant-figure tracking
       *            is stripped from the result.
       *          - A value with fewer sigfigs than $outputPrecision is
       *            treated as if it had $outputPrecision of them, so extra
       *            decimals may appear.
       */
      public function convert($length, $to_unit)
      {
          if (!$length->isValid()) {
              return false;
          }

          $n = $length->getN();
          $unit = $length->getUnit();

          // Zero (or a unit-less value) is zero in every unit.
          if ($n === '0' || $unit === false) {
              return new HTMLPurifier_Length('0', false);
          }

          // Find the measuring system of the source and of the target unit.
          $state = false;
          $dest_state = false;
          foreach (self::$units as $system => $table) {
              if (isset($table[$unit])) {
                  $state = $system;
              }
              if (isset($table[$to_unit])) {
                  $dest_state = $system;
              }
          }
          if (!$state || !$dest_state) {
              return false;
          }

          // Precision of the incoming number, needed for the final rounding;
          // never less than the configured output precision.
          $sigfigs = $this->getSigFigs($n);
          $sigfigs = ($sigfigs < $this->outputPrecision) ? $this->outputPrecision : $sigfigs;

          // BCMath's scale counts decimals only. For numbers below 1 the
          // scale is widened by the number of leading decimal places so the
          // amount of guard digits never drops below internalPrecision.
          $log = (int)floor(log(abs($n), 10));
          $cp = ($log < 0) ? $this->internalPrecision - $log : $this->internalPrecision; // internal precision

          // At most two passes: one inside the source system and, if a
          // system change is required, one inside the target system.
          for ($pass = 0; $pass < 2; $pass++) {
              $same_system = ($dest_state === $state);

              // Unit to reach within the current system: the target itself,
              // or the unit from which the hop to the other system starts.
              $dest_unit = $same_system ? $to_unit : self::$units[$state][$dest_state][0];

              if ($dest_unit !== $unit) {
                  $factor = $this->div(self::$units[$state][$unit], self::$units[$state][$dest_unit], $cp);
                  $n = $this->mul($n, $factor, $cp);
                  $unit = $dest_unit;
              }

              // Output was zero, so bail out early. Shouldn't ever happen.
              if ($n === '') {
                  $n = '0';
                  $unit = $to_unit;
                  break;
              }

              // Plain in-system conversion: done.
              if ($same_system) {
                  break;
              }

              if ($pass !== 0) {
                  // The system we hopped into did not contain the unit.
                  // This should never happen!
                  return false;
              }

              // First pass only: hop over to the target system.
              $n = $this->mul($n, self::$units[$state][$dest_state][1], $cp);
              $unit = self::$units[$state][$dest_state][2];
              $state = $dest_state;
          }

          // Post-condition: $unit == $to_unit
          if ($unit !== $to_unit) {
              return false;
          }

          $n = $this->round($n, $sigfigs);
          if (strpos($n, '.') !== false) {
              $n = rtrim($n, '0');
          }
          $n = rtrim($n, '.');

          return new HTMLPurifier_Length($n, $unit);
      }

      /**
       * Returns the number of significant figures in a string number.
       * @param string $n Decimal number
       * @return int number of sigfigs
       */
      public function getSigFigs($n)
      {
          $stripped = ltrim($n, '0+-');
          $dot = strpos($stripped, '.'); // decimal position

          if ($dot === false) {
              // integer: trailing zeroes are not significant
              return strlen(rtrim($stripped, '0'));
          }

          $count = strlen(ltrim($stripped, '0.'));
          // When digits precede the point, the point itself is still part
          // of the string that was measured; do not count it.
          return ($dot !== 0) ? $count - 1 : $count;
      }

      /**
       * Adds two numbers, using arbitrary precision when available.
       * @param string $s1
       * @param string $s2
       * @param int $scale
       * @return string
       */
      private function add($s1, $s2, $scale)
      {
          return $this->bcmath
              ? bcadd($s1, $s2, $scale)
              : $this->scale((float)$s1 + (float)$s2, $scale);
      }

      /**
       * Multiplies two numbers, using arbitrary precision when available.
       * @param string $s1
       * @param string $s2
       * @param int $scale
       * @return string
       */
      private function mul($s1, $s2, $scale)
      {
          return $this->bcmath
              ? bcmul($s1, $s2, $scale)
              : $this->scale((float)$s1 * (float)$s2, $scale);
      }

      /**
       * Divides two numbers, using arbitrary precision when available.
       * @param string $s1
       * @param string $s2
       * @param int $scale
       * @return string
       */
      private function div($s1, $s2, $scale)
      {
          return $this->bcmath
              ? bcdiv($s1, $s2, $scale)
              : $this->scale((float)$s1 / (float)$s2, $scale);
      }

      /**
       * Rounds a number according to the number of sigfigs it should have,
       * using arbitrary precision when available.
       * @param float $n
       * @param int $sigfigs
       * @return string
       */
      private function round($n, $sigfigs)
      {
          $new_log = (int)floor(log(abs($n), 10)); // Number of digits left of decimal - 1
          $rp = $sigfigs - $new_log - 1; // Number of decimal places needed
          $neg = $n < 0 ? '-' : ''; // Negative sign

          if (!$this->bcmath) {
              return $this->scale(round($n, $sigfigs - $new_log - 1), $rp + 1);
          }

          if ($rp >= 0) {
              // Add half a unit in the last kept place, then truncate.
              $n = bcadd($n, $neg . '0.' . str_repeat('0', $rp) . '5', $rp + 1);
              $n = bcdiv($n, '1', $rp);
          } else {
              // This algorithm partially depends on the standardized
              // form of numbers that comes out of bcmath.
              $n = bcadd($n, $neg . '5' . str_repeat('0', $new_log - $sigfigs), 0);
              $n = substr($n, 0, $sigfigs + strlen($neg)) . str_repeat('0', $new_log - $sigfigs + 1);
          }
          return $n;
      }

      /**
       * Scales a float to $scale digits right of decimal point, like BCMath.
       * @param float $r
       * @param int $scale
       * @return string
       */
      private function scale($r, $scale)
      {
          if ($scale >= 0) {
              return sprintf('%.' . $scale . 'f', (float)$r);
          }

          // sprintf's f conversion has no negative precision, so this case
          // is handled by hand. Start from the integer digits.
          $digits = sprintf('%.0f', (float)$r);

          // Floating point loss means the value will probably look like
          // 4652999999999.9234. One digit more than needed is taken and
          // used to round the digit before it.
          $precise = (string)round(substr($digits, 0, strlen($digits) + $scale), -1);

          // Drop the zero produced by the rounding and pad back to length.
          return substr($precise, 0, -1) . str_repeat('0', -$scale + 1);
      }
  }
  8824. /**
  8825. * Parses string representations into their corresponding native PHP
  8826. * variable type. The base implementation does a simple type-check.
  8827. */
  class HTMLPurifier_VarParser
  {

      const STRING = 1;
      const ISTRING = 2;
      const TEXT = 3;
      const ITEXT = 4;
      const INT = 5;
      const FLOAT = 6;
      const BOOL = 7;
      const LOOKUP = 8;
      const ALIST = 9;
      const HASH = 10;
      const MIXED = 11;

      /**
       * Maps string type names onto the integer type constants, so callers
       * may pass either form to parse().
       */
      public static $types = array(
          'string' => self::STRING,
          'istring' => self::ISTRING,
          'text' => self::TEXT,
          'itext' => self::ITEXT,
          'int' => self::INT,
          'float' => self::FLOAT,
          'bool' => self::BOOL,
          'lookup' => self::LOOKUP,
          'list' => self::ALIST,
          'hash' => self::HASH,
          'mixed' => self::MIXED
      );

      /**
       * Set (type constant => true) of the types that are string-like.
       */
      public static $stringTypes = array(
          self::STRING => true,
          self::ISTRING => true,
          self::TEXT => true,
          self::ITEXT => true,
      );

      /**
       * Validates a variable against a type and returns the checked value.
       *
       * The value is first handed to parseImplementation(); the result is
       * then sanity-checked against the requested type. ISTRING and ITEXT
       * values are lower-cased before being returned.
       *
       * @param mixed $var Variable to validate
       * @param int|string $type Type constant, or a key of self::$types
       * @param bool $allow_null Whether a null result is acceptable
       * @return mixed Validated value (null only when $allow_null is true)
       * @throws HTMLPurifier_VarParserException
       */
      final public function parse($var, $type, $allow_null = false)
      {
          // Translate a symbolic type name into its integer constant.
          if (is_string($type)) {
              if (isset(self::$types[$type])) {
                  $type = self::$types[$type];
              } else {
                  throw new HTMLPurifier_VarParserException("Invalid type '$type'");
              }
          }

          $parsed = $this->parseImplementation($var, $type, $allow_null);
          if ($allow_null && $parsed === null) {
              return null;
          }

          // Basic post-conditions: every successful case returns from inside
          // the switch; a `break` means the value has the wrong PHP type.
          switch ($type) {
              case (self::STRING):
              case (self::ISTRING):
              case (self::TEXT):
              case (self::ITEXT):
                  if (is_string($parsed)) {
                      $insensitive = ($type == self::ISTRING || $type == self::ITEXT);
                      return $insensitive ? strtolower($parsed) : $parsed;
                  }
                  break;
              case (self::INT):
                  if (is_int($parsed)) {
                      return $parsed;
                  }
                  break;
              case (self::FLOAT):
                  if (is_float($parsed)) {
                      return $parsed;
                  }
                  break;
              case (self::BOOL):
                  if (is_bool($parsed)) {
                      return $parsed;
                  }
                  break;
              case (self::LOOKUP):
              case (self::ALIST):
              case (self::HASH):
                  if (!is_array($parsed)) {
                      break;
                  }
                  if ($type === self::LOOKUP) {
                      // every value of a lookup table must be exactly true
                      foreach ($parsed as $flag) {
                          if ($flag !== true) {
                              $this->error('Lookup table contains value other than true');
                          }
                      }
                  } elseif ($type === self::ALIST) {
                      // a list must be keyed 0..n-1 in order
                      $indices = array_keys($parsed);
                      if (array_keys($indices) !== $indices) {
                          $this->error('Indices for list are not uniform');
                      }
                  }
                  return $parsed;
              case (self::MIXED):
                  return $parsed;
              default:
                  $this->errorInconsistent(get_class($this), $type);
          }
          $this->errorGeneric($parsed, $type);
      }

      /**
       * Hook that performs the actual parsing. This base version hands the
       * value back untouched.
       * @param mixed $var
       * @param int $type
       * @param bool $allow_null
       * @return mixed
       */
      protected function parseImplementation($var, $type, $allow_null)
      {
          return $var;
      }

      /**
       * Raises a parser exception carrying the given message.
       * @param string $msg
       * @throws HTMLPurifier_VarParserException
       */
      protected function error($msg)
      {
          throw new HTMLPurifier_VarParserException($msg);
      }

      /**
       * Raises an exception reporting a type that parse() does not handle.
       * @param string $class
       * @param int $type
       * @throws HTMLPurifier_Exception
       */
      protected function errorInconsistent($class, $type)
      {
          $name = self::getTypeName($type);
          throw new HTMLPurifier_Exception("Inconsistency in $class: $name not implemented");
      }

      /**
       * Reports that a value did not have the expected type.
       * @param mixed $var
       * @param int $type
       */
      protected function errorGeneric($var, $type)
      {
          $expected = self::getTypeName($type);
          $actual = gettype($var);
          $this->error("Expected type $expected, got $actual");
      }

      /**
       * Returns the string name of a type constant, or 'unknown'.
       * @param int $type
       * @return string
       */
      public static function getTypeName($type)
      {
          static $lookup;
          if (!$lookup) {
              // built lazily on first use: integer constant => name
              $lookup = array_flip(self::$types);
          }
          return isset($lookup[$type]) ? $lookup[$type] : 'unknown';
      }
  }
  9009. /**
  9010. * Exception type for HTMLPurifier_VarParser
  9011. */
  class HTMLPurifier_VarParserException extends HTMLPurifier_Exception
  {
      // Marker subclass: adds no members of its own.
  }
  9015. /**
  9016. * A zipper is a purely-functional data structure which contains
  9017. * a focus that can be efficiently manipulated. It is known as
  9018. * a "one-hole context". This mutable variant implements a zipper
  9019. * for a list as a pair of two arrays, laid out as follows:
  9020. *
  9021. * Base list: 1 2 3 4 [ ] 6 7 8 9
  9022. * Front list: 1 2 3 4
  9023. * Back list: 9 8 7 6
  9024. *
  9025. * User is expected to keep track of the "current element" and properly
  9026. * fill it back in as necessary. (ToDo: Maybe it's more user friendly
  9027. * to implicitly track the current element?)
  9028. *
  9029. * Nota bene: the current class gets confused if you try to store NULLs
  9030. * in the list.
  9031. */
  class HTMLPurifier_Zipper
  {
      public $front, $back;

      /**
       * @param array $front Elements before the hole, in list order
       * @param array $back Elements after the hole, in reverse list order
       */
      public function __construct($front, $back) {
          $this->front = $front;
          $this->back = $back;
      }

      /**
       * Builds a zipper whose hole sits at index 0 of the given array.
       * @param array $array Array to zipper-ify.
       * @return array Pair of (zipper, element that was at index 0).
       */
      static public function fromArray($array) {
          $zipper = new self(array(), array_reverse($array));
          // popping the first element opens the hole at position 0
          $first = $zipper->delete();
          return array($zipper, $first);
      }

      /**
       * Flattens the zipper into a plain array, placing $t in the hole
       * unless it is NULL.
       * @param mixed $t Element to put in the hole, or NULL for none
       * @return array
       */
      public function toArray($t = NULL) {
          $result = $this->front;
          if ($t !== NULL) {
              $result[] = $t;
          }
          // the back list is stored reversed, so walk it from its end
          $idx = count($this->back);
          while (--$idx >= 0) {
              $result[] = $this->back[$idx];
          }
          return $result;
      }

      /**
       * Moves the hole one element forward.
       * @param mixed $t Element to fill the current hole with (NULL: none)
       * @return mixed Former contents of the new hole, NULL if none remain.
       */
      public function next($t) {
          if ($t !== NULL) {
              array_push($this->front, $t);
          }
          if (empty($this->back)) {
              return NULL;
          }
          return array_pop($this->back);
      }

      /**
       * Moves the hole forward repeatedly.
       * @param mixed $t Element to fill the current hole with
       * @param int $n Number of positions to advance
       * @return mixed Former contents of the hole $n positions away
       */
      public function advance($t, $n) {
          for ($step = 0; $step < $n; $step++) {
              $t = $this->next($t);
          }
          return $t;
      }

      /**
       * Moves the hole one element backward.
       * @param mixed $t Element to fill the current hole with (NULL: none)
       * @return mixed Former contents of the new hole, NULL if none remain.
       */
      public function prev($t) {
          if ($t !== NULL) {
              array_push($this->back, $t);
          }
          if (empty($this->front)) {
              return NULL;
          }
          return array_pop($this->front);
      }

      /**
       * Discards the current hole; the hole moves to the following element.
       * @return mixed Former contents of the new hole, NULL if none remain.
       */
      public function delete() {
          if (empty($this->back)) {
              return NULL;
          }
          return array_pop($this->back);
      }

      /**
       * Tells whether nothing is left after the hole.
       * @return bool
       */
      public function done() {
          return empty($this->back);
      }

      /**
       * Inserts an element directly before the hole (NULL is ignored).
       * @param mixed $t Element to insert
       */
      public function insertBefore($t) {
          if ($t === NULL) {
              return;
          }
          array_push($this->front, $t);
      }

      /**
       * Inserts an element directly after the hole (NULL is ignored).
       * @param mixed $t Element to insert
       */
      public function insertAfter($t) {
          if ($t === NULL) {
              return;
          }
          array_push($this->back, $t);
      }

      /**
       * Removes $delete elements starting with the hole's contents and puts
       * the elements of $replacement in their place. The hole ends up on
       * the first replacement element.
       *
       * @param mixed $t Current contents of the hole
       * @param int $delete Number of elements to remove
       * @param array $replacement Elements to insert
       * @return array Pair of (removed elements, new contents of the hole)
       */
      public function splice($t, $delete, $replacement) {
          // removal: collect the hole contents, then pull in the next one
          $removed = array();
          $current = $t;
          for ($remaining = $delete; $remaining > 0; $remaining--) {
              $removed[] = $current;
              $current = $this->delete();
          }
          // insertion: walk the replacement backwards, pushing whatever is
          // currently in the hole behind it each time
          $idx = count($replacement);
          while (--$idx >= 0) {
              $this->insertAfter($current);
              $current = $replacement[$idx];
          }
          return array($removed, $current);
      }
  }
  9158. /**
  9159. * Validates the HTML attribute style, otherwise known as CSS.
  9160. * @note We don't implement the whole CSS specification, so it might be
  9161. * difficult to reuse this component in the context of validating
  9162. * actual stylesheet declarations.
  9163. * @note If we were really serious about validating the CSS, we would
  9164. * tokenize the styles and then parse the tokens. Obviously, we
  9165. * are not doing that. Doing that could seriously harm performance,
  9166. * but would make these components a lot more viable for a CSS
  9167. * filtering solution.
  9168. */
  class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
  {

      /**
       * Splits a style attribute into declarations, validates each one
       * against the CSS definition and reassembles the survivors.
       *
       * @param string $css
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string Rebuilt declaration list, or false if empty
       */
      public function validate($css, $config, $context)
      {
          $css = $this->parseCDATA($css);

          $definition = $config->getCSSDefinition();
          $allow_duplicates = $config->get("CSS.AllowDuplicates");

          // Split on semicolons, except those inside a quoted string. The
          // scanner jumps from one interesting character (; ' ") to the next.
          $len = strlen($css);
          $accum = "";
          $declarations = array();
          $quoted = false; // the open quote character, or false
          $pos = 0;
          while ($pos < $len) {
              $span = strcspn($css, ";'\"", $pos);
              $accum .= substr($css, $pos, $span);
              $pos += $span;
              if ($pos == $len) {
                  break;
              }
              $char = $css[$pos];
              $pos++; // step over the character just examined
              if ($quoted) {
                  $accum .= $char;
                  if ($char == $quoted) {
                      $quoted = false;
                  }
              } elseif ($char == ";") {
                  $declarations[] = $accum;
                  $accum = "";
              } else {
                  // opening quote
                  $accum .= $char;
                  $quoted = $char;
              }
          }
          if ($accum != "") {
              $declarations[] = $accum;
          }

          $propvalues = array();
          $new_declarations = '';

          /**
           * Name of the current CSS property being validated. It is
           * registered with the context, so it must stay this variable.
           */
          $property = false;
          $context->register('CurrentCSSProperty', $property);

          foreach ($declarations as $declaration) {
              // skip empty chunks and chunks without a "name:" part
              if (!$declaration || !strpos($declaration, ':')) {
                  continue;
              }
              $pieces = explode(':', $declaration, 2);
              $property = trim($pieces[0]);
              $value = trim($pieces[1]);

              // look the property up as written; retry in lower case only
              // when it is not already all lower case
              $ok = isset($definition->info[$property]);
              if (!$ok && !ctype_lower($property)) {
                  $property = strtolower($property);
                  $ok = isset($definition->info[$property]);
              }
              if (!$ok) {
                  continue;
              }

              // 'inherit' is accepted for every property without asking
              // the property's own validator
              if (strtolower(trim($value)) === 'inherit') {
                  $result = 'inherit';
              } else {
                  $result = $definition->info[$property]->validate(
                      $value,
                      $config,
                      $context
                  );
              }
              if ($result === false) {
                  continue;
              }

              if ($allow_duplicates) {
                  $new_declarations .= "$property:$result;";
              } else {
                  // a later declaration replaces an earlier one
                  $propvalues[$property] = $result;
              }
          }

          $context->destroy('CurrentCSSProperty');

          // de-duplicated declarations are written out only now
          foreach ($propvalues as $prop => $value) {
              $new_declarations .= "$prop:$value;";
          }

          return $new_declarations ? $new_declarations : false;
      }
  }
  9277. /**
  9278. * Dummy AttrDef that mimics another AttrDef, BUT it generates clones
  9279. * with make.
  9280. */
  class HTMLPurifier_AttrDef_Clone extends HTMLPurifier_AttrDef
  {
      /**
       * The definition being mimicked.
       * @type HTMLPurifier_AttrDef
       */
      protected $clone;

      /**
       * @param HTMLPurifier_AttrDef $clone Definition to wrap
       */
      public function __construct($clone)
      {
          $this->clone = $clone;
      }

      /**
       * Delegates validation to the wrapped definition.
       * @param string $v
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($v, $config, $context)
      {
          $wrapped = $this->clone;
          return $wrapped->validate($v, $config, $context);
      }

      /**
       * Produces a copy of the wrapped definition; $string is not used.
       * @param string $string
       * @return HTMLPurifier_AttrDef
       */
      public function make($string)
      {
          $copy = clone $this->clone;
          return $copy;
      }
  }
  9314. // Enum = Enumerated
  9315. /**
  9316. * Validates a keyword against a list of valid values.
  9317. * @warning The case-insensitive compare of this function uses PHP's
  9318. * built-in strtolower and ctype_lower functions, which may
  9319. * cause problems with international comparisons
  9320. */
  class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
  {

      /**
       * Lookup table of valid values (value => original index).
       * @type array
       * @todo Make protected
       */
      public $valid_values = array();

      /**
       * Whether matching is case sensitive. When false, input is
       * lower-cased before the lookup.
       */
      protected $case_sensitive = false; // values according to W3C spec

      /**
       * @param array $valid_values List of valid values
       * @param bool $case_sensitive Whether or not case sensitive
       */
      public function __construct($valid_values = array(), $case_sensitive = false)
      {
          // flipped so that membership is a key lookup
          $this->valid_values = array_flip($valid_values);
          $this->case_sensitive = $case_sensitive;
      }

      /**
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string The trimmed (and possibly lower-cased) value,
       *         or false when it is not in the list
       */
      public function validate($string, $config, $context)
      {
          $candidate = trim($string);
          if (!$this->case_sensitive && !ctype_lower($candidate)) {
              // we may want to do full case-insensitive libraries
              $candidate = strtolower($candidate);
          }
          if (isset($this->valid_values[$candidate])) {
              return $candidate;
          }
          return false;
      }

      /**
       * @param string $string In form of comma-delimited list of case-insensitive
       *      valid values. Example: "foo,bar,baz". Prepend "s:" to make
       *      case sensitive
       * @return HTMLPurifier_AttrDef_Enum
       */
      public function make($string)
      {
          $sensitive = strlen($string) > 2 && $string[0] == 's' && $string[1] == ':';
          if ($sensitive) {
              // drop the "s:" marker
              $string = substr($string, 2);
          }
          return new HTMLPurifier_AttrDef_Enum(explode(',', $string), $sensitive);
      }
  }
  9377. /**
  9378. * Validates an integer.
  9379. * @note While this class was modeled off the CSS definition, no currently
  9380. * allowed CSS uses this type. The properties that do are: widows,
  9381. * orphans, z-index, counter-increment, counter-reset. Some of the
  9382. * HTML attributes, however, find use for a non-negative version of this.
  9383. */
  class HTMLPurifier_AttrDef_Integer extends HTMLPurifier_AttrDef
  {

      /**
       * Whether or not negative values are allowed.
       * @type bool
       */
      protected $negative = true;

      /**
       * Whether or not zero is allowed.
       * @type bool
       */
      protected $zero = true;

      /**
       * Whether or not positive values are allowed.
       * @type bool
       */
      protected $positive = true;

      /**
       * @param bool $negative Whether negative values are allowed
       * @param bool $zero Whether zero is allowed
       * @param bool $positive Whether positive values are allowed
       */
      public function __construct($negative = true, $zero = true, $positive = true)
      {
          $this->negative = $negative;
          $this->zero = $zero;
          $this->positive = $positive;
      }

      /**
       * @param string $integer
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string The integer as a string, or false if rejected
       */
      public function validate($integer, $config, $context)
      {
          $integer = $this->parseCDATA($integer);
          if ($integer === '') {
              return false;
          }

          // Separate an optional sign from the digits. A sign is only
          // recognised when values of that sign are allowed; otherwise it
          // stays in $digits and fails the digit test below.
          $lead = $integer[0];
          if ($lead === '-' && $this->negative) {
              $digits = substr($integer, 1);
              if ($digits === '0') {
                  // "-0" is reported as plain "0"
                  $integer = '0';
              }
          } elseif ($lead === '+' && $this->positive) {
              // the plus sign is redundant and is dropped from the result
              $integer = substr($integer, 1);
              $digits = $integer;
          } else {
              $digits = $integer;
          }

          if (!ctype_digit($digits)) {
              return false;
          }

          // range restrictions
          $rejected = (!$this->zero && $integer == 0)
              || (!$this->positive && $integer > 0)
              || (!$this->negative && $integer < 0);

          return $rejected ? false : $integer;
      }
  }
  9454. /**
  9455. * Validates the HTML attribute lang, effectively a language code.
  9456. * @note Built according to RFC 3066, which obsoleted RFC 1766
  9457. */
  class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
  {

      /**
       * Validates a language tag and returns it in lower case.
       *
       * The primary subtag must be 'x', 'i', or two to three letters;
       * otherwise the whole tag is rejected. Each following subtag must be
       * one to eight alphanumeric characters (a one-character second
       * subtag must be 'x'). At the first subtag that breaks these rules
       * the tag is cut off and the part accepted so far is returned.
       *
       * Subtags are compared case-insensitively, so 'X-Foo' is accepted
       * and returned as 'x-foo'.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          $string = trim($string);
          if (!$string) {
              return false;
          }

          // explode() on a non-empty string always yields >= 1 element
          $subtags = explode('-', $string);
          $num_subtags = count($subtags);

          // process primary subtag : $subtags[0]
          $length = strlen($subtags[0]);
          switch ($length) {
              case 0:
                  return false;
              case 1:
                  // lower-case first so that 'X' and 'I' are recognised
                  $subtags[0] = strtolower($subtags[0]);
                  if (!($subtags[0] === 'x' || $subtags[0] === 'i')) {
                      return false;
                  }
                  break;
              case 2:
              case 3:
                  if (!ctype_alpha($subtags[0])) {
                      return false;
                  } elseif (!ctype_lower($subtags[0])) {
                      $subtags[0] = strtolower($subtags[0]);
                  }
                  break;
              default:
                  return false;
          }

          $new_string = $subtags[0];

          // process the remaining subtags, index 1 and up
          for ($i = 1; $i < $num_subtags; $i++) {
              $length = strlen($subtags[$i]);
              if ($length == 0 || $length > 8 || !ctype_alnum($subtags[$i])) {
                  return $new_string;
              }
              if (!ctype_lower($subtags[$i])) {
                  $subtags[$i] = strtolower($subtags[$i]);
              }
              // a one-character second subtag is only accepted as 'x'
              if ($i == 1 && $length == 1 && $subtags[$i] !== 'x') {
                  return $new_string;
              }
              $new_string .= '-' . $subtags[$i];
          }
          return $new_string;
      }
  }
  9528. /**
  9529. * Decorator that, depending on a token, switches between two definitions.
  9530. */
  class HTMLPurifier_AttrDef_Switch
  {

      /**
       * Tag name that selects $withTag.
       * @type string
       */
      protected $tag;

      /**
       * Definition used when the current token's name equals $tag.
       * @type HTMLPurifier_AttrDef
       */
      protected $withTag;

      /**
       * Definition used in every other case.
       * @type HTMLPurifier_AttrDef
       */
      protected $withoutTag;

      /**
       * @param string $tag Tag name to switch upon
       * @param HTMLPurifier_AttrDef $with_tag Call if token matches tag
       * @param HTMLPurifier_AttrDef $without_tag Call if token doesn't match, or there is no token
       */
      public function __construct($tag, $with_tag, $without_tag)
      {
          $this->tag = $tag;
          $this->withTag = $with_tag;
          $this->withoutTag = $without_tag;
      }

      /**
       * Picks one of the two definitions based on the 'CurrentToken' entry
       * of the context and delegates validation to it.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          $token = $context->get('CurrentToken', true);
          $matches = $token && $token->name === $this->tag;
          $delegate = $matches ? $this->withTag : $this->withoutTag;
          return $delegate->validate($string, $config, $context);
      }
  }
  9572. /**
  9573. * Validates arbitrary text according to the HTML spec.
  9574. */
  class HTMLPurifier_AttrDef_Text extends HTMLPurifier_AttrDef
  {

      /**
       * Returns the input after CDATA processing; nothing is rejected here.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          $text = $this->parseCDATA($string);
          return $text;
      }
  }
  9588. /**
  9589. * Validates a URI as defined by RFC 3986.
  9590. * @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
  9591. */
  class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
  {

      /**
       * @type HTMLPurifier_URIParser
       */
      protected $parser;

      /**
       * @type bool
       */
      protected $embedsResource;

      /**
       * @param bool $embeds_resource Does the URI here result in an extra HTTP request?
       */
      public function __construct($embeds_resource = false)
      {
          $this->parser = new HTMLPurifier_URIParser();
          $this->embedsResource = (bool)$embeds_resource;
      }

      /**
       * @param string $string The literal 'embedded' yields a definition
       *        for embedded resources; anything else a regular one
       * @return HTMLPurifier_AttrDef_URI
       */
      public function make($string)
      {
          return new HTMLPurifier_AttrDef_URI($string === 'embedded');
      }

      /**
       * Parses the URI and runs it through generic validation, the URI
       * definition's filters and scheme-specific validation, in that order.
       *
       * @param string $uri
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string The URI as a string, or false if any step failed
       */
      public function validate($uri, $config, $context)
      {
          if ($config->get('URI.Disable')) {
              return false;
          }

          // from here on $uri holds the parsed URI, not a string
          $uri = $this->parser->parse($this->parseCDATA($uri));
          if ($uri === false) {
              return false;
          }

          // add embedded flag to context for validators
          $context->register('EmbeddedURI', $this->embedsResource);

          $ok = false;
          // generic validation
          if ($uri->validate($config, $context)) {
              // chained filtering
              $uri_def = $config->getDefinition('URI');
              if ($uri_def->filter($uri, $config, $context)) {
                  // scheme-specific validation; an embedded resource
                  // additionally needs a browsable scheme
                  $scheme_obj = $uri->getSchemeObj($config, $context);
                  $usable = $scheme_obj
                      && (!$this->embedsResource || $scheme_obj->browsable);
                  // the last step is post chained filtering
                  $ok = $usable
                      && $scheme_obj->validate($uri, $config, $context)
                      && $uri_def->postFilter($uri, $config, $context);
              }
          }

          $context->destroy('EmbeddedURI');

          // back to string
          return $ok ? $uri->toString() : false;
      }
  }
  9679. /**
  9680. * Validates a number as defined by the CSS spec.
  9681. */
  class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
  {

      /**
       * When true, values with a leading minus sign are rejected.
       * @type bool
       */
      protected $non_negative = false;

      /**
       * @param bool $non_negative indicates whether negatives are forbidden
       */
      public function __construct($non_negative = false)
      {
          $this->non_negative = $non_negative;
      }

      /**
       * Validates a number and returns it in a compact form: no plus
       * sign, no leading zeros, no trailing zeros after the period.
       *
       * @param string $number
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return string|bool
       * @warning Some contexts do not pass $config, $context. These
       *          variables should not be used without checking HTMLPurifier_Length
       */
      public function validate($number, $config, $context)
      {
          $number = $this->parseCDATA($number);

          if ($number === '') {
              return false;
          }
          if ($number === '0') {
              return '0';
          }

          // strip a leading sign; only a minus is remembered
          $sign = '';
          $first = $number[0];
          if ($first === '-') {
              if ($this->non_negative) {
                  return false;
              }
              $sign = '-';
          }
          if ($first === '-' || $first === '+') {
              $number = substr($number, 1);
          }

          // plain integer
          if (ctype_digit($number)) {
              $number = ltrim($number, '0');
              return $number ? $sign . $number : '0';
          }

          // Period is the only non-numeric character allowed
          if (strpos($number, '.') === false) {
              return false;
          }

          $halves = explode('.', $number, 2);
          $left = $halves[0];
          $right = $halves[1];

          // a lone period is not a number
          if ($left === '' && $right === '') {
              return false;
          }
          if ($left !== '' && !ctype_digit($left)) {
              return false;
          }

          $left = ltrim($left, '0');
          $right = rtrim($right, '0');

          if ($right === '') {
              // nothing but zeros after the period: an integer after all
              return $left ? $sign . $left : '0';
          }
          if (!ctype_digit($right)) {
              return false;
          }
          return $sign . $left . '.' . $right;
      }
  }
  class HTMLPurifier_AttrDef_CSS_AlphaValue extends HTMLPurifier_AttrDef_CSS_Number
  {

      public function __construct()
      {
          // negatives are let through the number check so that they can
          // be clamped to 0 below instead of being rejected
          parent::__construct(false);
      }

      /**
       * Validates a number and clamps it to the range 0..1.
       *
       * @param string $number
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return string|bool false when the input is not a valid number
       */
      public function validate($number, $config, $context)
      {
          $result = parent::validate($number, $config, $context);
          if ($result === false) {
              return $result;
          }
          $value = (float)$result;
          if ($value > 1.0) {
              return '1';
          }
          if ($value < 0.0) {
              return '0';
          }
          return $result;
      }
  }
  9776. /**
  9777. * Validates shorthand CSS property background.
  9778. * @warning Does not support url tokens that have internal spaces.
  9779. */
  class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
  {

      /**
       * Validators of the individual background-* properties, keyed by
       * property name.
       * @type HTMLPurifier_AttrDef[]
       */
      protected $info;

      /**
       * Copies the component validators out of the CSS definition.
       * @param HTMLPurifier_Config $config
       */
      public function __construct($config)
      {
          $def = $config->getCSSDefinition();
          $components = array(
              'background-color',
              'background-image',
              'background-repeat',
              'background-attachment',
              'background-position',
          );
          foreach ($components as $name) {
              $this->info[$name] = $def->info[$name];
          }
      }

      /**
       * Validates the space-separated parts of a background shorthand.
       * Each part is offered to the color, image, repeat and attachment
       * validators in turn; parts none of them accept are collected and
       * validated together as the position.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          // regular pre-processing
          $string = $this->parseCDATA($string);
          if ($string === '') {
              return false;
          }

          // munge rgb() decl if necessary
          $string = $this->mungeRgb($string);

          // false = component not seen yet; 'position' must stay last
          // because it takes whatever the others refuse
          $caught = array(
              'color' => false,
              'image' => false,
              'repeat' => false,
              'attachment' => false,
              'position' => false,
          );
          $hits = 0; // number of parts assigned to a component

          // assumes URI doesn't have spaces in it
          foreach (explode(' ', $string) as $bit) {
              if ($bit === '') {
                  continue;
              }
              foreach ($caught as $key => $status) {
                  if ($key == 'position') {
                      // accumulate, with a trailing space after each part
                      if ($caught['position'] === false) {
                          $caught['position'] = '';
                      }
                      $caught['position'] .= $bit . ' ';
                  } else {
                      if ($status !== false) {
                          // component already filled
                          continue;
                      }
                      $r = $this->info['background-' . $key]->validate($bit, $config, $context);
                      if ($r === false) {
                          continue;
                      }
                      $caught[$key] = $r;
                  }
                  $hits++;
                  break;
              }
          }

          if (!$hits) {
              return false;
          }

          if ($caught['position'] !== false) {
              $caught['position'] = $this->info['background-position']->
                  validate($caught['position'], $config, $context);
          }

          // keep only the components that ended up with a value
          $ret = array();
          foreach ($caught as $value) {
              if ($value !== false) {
                  $ret[] = $value;
              }
          }

          if (empty($ret)) {
              return false;
          }
          return implode(' ', $ret);
      }
  }
  9872. /* W3C says:
  9873. [ // adjective and number must be in correct order, even if
  9874. // you could switch them without introducing ambiguity.
  9875. // some browsers support that syntax
  9876. [
  9877. <percentage> | <length> | left | center | right
  9878. ]
  9879. [
  9880. <percentage> | <length> | top | center | bottom
  9881. ]?
  9882. ] |
  9883. [ // this signifies that the vertical and horizontal adjectives
  9884. // can be arbitrarily ordered, however, there can only be two,
  9885. // one of each, or none at all
  9886. [
  9887. left | center | right
  9888. ] ||
  9889. [
  9890. top | center | bottom
  9891. ]
  9892. ]
  9893. top, left = 0%
  9894. center, (none) = 50%
  9895. bottom, right = 100%
  9896. */
  9897. /* QuirksMode says:
  9898. keyword + length/percentage must be ordered correctly, as per W3C
  9899. Internet Explorer and Opera, however, support arbitrary ordering. We
  9900. should fix it up.
  9901. Minor issue though, not strictly necessary.
  9902. */
  9903. // control freaks may appreciate the ability to convert these to
  9904. // percentages or something, but it's not necessary
  9905. /**
  9906. * Validates the value of background-position.
  9907. */
  class HTMLPurifier_AttrDef_CSS_BackgroundPosition extends HTMLPurifier_AttrDef
  {
      /**
       * Validator used to recognise length tokens.
       * @type HTMLPurifier_AttrDef_CSS_Length
       */
      protected $length;

      /**
       * Validator used to recognise percentage tokens.
       * @type HTMLPurifier_AttrDef_CSS_Percentage
       */
      protected $percentage;

      public function __construct()
      {
          $this->length = new HTMLPurifier_AttrDef_CSS_Length();
          $this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage();
      }

      /**
       * Normalises a background-position value to at most two components,
       * horizontal first and vertical second.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string Space-joined components, or false if nothing
       *                     usable was found.
       */
      public function validate($string, $config, $context)
      {
          // keyword => axis ('c' is resolved to 'ch' or 'cv' below)
          $axis_of = array(
              'top' => 'v',
              'bottom' => 'v',
              'left' => 'h',
              'right' => 'h',
              'center' => 'c'
          );
          $found = array(
              'h' => false,  // left, right
              'v' => false,  // top, bottom
              'ch' => false, // center seen as the first recognised token
              'cv' => false  // center seen after another recognised token
          );
          $measures = array();
          $hits = 0; // number of recognised tokens so far

          foreach (explode(' ', $this->parseCDATA($string)) as $token) {
              if ($token === '') {
                  continue;
              }

              // keyword check (case-insensitive)
              $word = strtolower($token);
              if (isset($axis_of[$word])) {
                  $slot = $axis_of[$word];
                  if ($slot == 'c') {
                      $slot = ($hits == 0) ? 'ch' : 'cv';
                  }
                  $found[$slot] = $word;
                  $hits++;
              }

              // length check first, then percentage check
              foreach (array($this->length, $this->percentage) as $validator) {
                  $measure = $validator->validate($token, $config, $context);
                  if ($measure !== false) {
                      $measures[] = $measure;
                      $hits++;
                  }
              }
          }

          if ($hits === 0) {
              return false; // no valid values were caught
          }

          $ret = array();

          // first component: horizontal keyword, leading center, or a measure
          if ($found['h'] !== false) {
              $ret[] = $found['h'];
          } elseif ($found['ch'] !== false) {
              $ret[] = $found['ch'];
              $found['cv'] = false; // prevent re-use: center = center center
          } elseif ($measures) {
              $ret[] = array_shift($measures);
          }

          // second component: vertical keyword, trailing center, or a measure
          if ($found['v'] !== false) {
              $ret[] = $found['v'];
          } elseif ($found['cv'] !== false) {
              $ret[] = $found['cv'];
          } elseif ($measures) {
              $ret[] = array_shift($measures);
          }

          if (count($ret) === 0) {
              return false;
          }
          return implode(' ', $ret);
      }
  }
  10004. /**
  10005. * Validates the border property as defined by CSS.
  10006. */
  class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
  {
      /**
       * Local copy of properties this property is shorthand for.
       * @type HTMLPurifier_AttrDef[]
       */
      protected $info = array();

      /**
       * @param HTMLPurifier_Config $config
       */
      public function __construct($config)
      {
          $def = $config->getCSSDefinition();
          $this->info['border-width'] = $def->info['border-width'];
          $this->info['border-style'] = $def->info['border-style'];
          $this->info['border-top-color'] = $def->info['border-top-color'];
      }

      /**
       * Validates a border shorthand value. Each space-separated token is
       * offered to the width, style and colour validators in turn; every
       * sub-property is accepted at most once.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string Space-joined validated tokens, or false when no
       *                     token validated.
       */
      public function validate($string, $config, $context)
      {
          $string = $this->parseCDATA($string);
          $string = $this->mungeRgb($string);
          $bits = explode(' ', $string);
          $done = array(); // segments we've finished
          $ret = ''; // return value
          foreach ($bits as $bit) {
              if ($bit === '') {
                  // empty token, nothing to validate
                  continue;
              }
              foreach ($this->info as $propname => $validator) {
                  if (isset($done[$propname])) {
                      continue;
                  }
                  $r = $validator->validate($bit, $config, $context);
                  if ($r !== false) {
                      $ret .= $r . ' ';
                      $done[$propname] = true;
                      break;
                  }
              }
          }
          $ret = rtrim($ret);
          // Report failure explicitly instead of handing back an empty
          // string, which is not identical to false and so would not read
          // as a rejection to a strict === false check.
          if ($ret === '') {
              return false;
          }
          return $ret;
      }
  }
  10053. /**
  10054. * Validates Color as defined by CSS.
  10055. */
  class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
  {
      /**
       * Validator for the alpha component of rgba()/hsla().
       * @type HTMLPurifier_AttrDef_CSS_AlphaValue
       */
      protected $alpha;

      public function __construct()
      {
          $this->alpha = new HTMLPurifier_AttrDef_CSS_AlphaValue();
      }

      /**
       * Validates a colour given as a keyword, a functional notation
       * (rgb, rgba, hsl, hsla) or a 3/6-digit hexadecimal value.
       *
       * @param string $color
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string Normalised colour, or false if invalid.
       */
      public function validate($color, $config, $context)
      {
          // keyword table is fetched once and kept for later calls
          static $colors = null;
          if ($colors === null) {
              $colors = $config->get('Core.ColorKeywords');
          }

          $color = trim($color);
          if ($color === '') {
              return false;
          }

          $keyword = strtolower($color);
          if (isset($colors[$keyword])) {
              return $colors[$keyword];
          }

          if (preg_match('#(rgb|rgba|hsl|hsla)\(#', $color, $matches) !== 1) {
              // hexadecimal handling; the leading '#' is optional on input
              $has_hash = ($color[0] === '#');
              $hex = $has_hash ? substr($color, 1) : $color;
              $digits = strlen($hex);
              if ($digits !== 3 && $digits !== 6) {
                  return false;
              }
              if (!ctype_xdigit($hex)) {
                  return false;
              }
              return $has_hash ? $color : '#' . $color;
          }

          // functional notation: the first ')' must be the final character
          if (strpos($color, ')') !== strlen($color) - 1) {
              return false;
          }

          // which function was used: rgb, rgba, hsl or hsla
          $function = $matches[1];
          $alpha_channel = (substr($function, -1) === 'a');
          $parameters_size = $alpha_channel ? 4 : 3;

          /*
           * Allowed types for values :
           * parameter_position => [type => max_value]
           */
          if (strpos($function, 'hsl') !== false) {
              $allowed_types = array(
                  1 => array('integer' => 360),
                  2 => array('percentage' => 100),
                  3 => array('percentage' => 100),
              );
              $allow_different_types = true;
          } else {
              $allowed_types = array(
                  1 => array('percentage' => 100, 'integer' => 255),
                  2 => array('percentage' => 100, 'integer' => 255),
                  3 => array('percentage' => 100, 'integer' => 255),
              );
              $allow_different_types = false;
          }

          $values = trim(str_replace($function, '', $color), ' ()');
          $parts = explode(',', $values);
          if (count($parts) !== $parameters_size) {
              return false;
          }

          $type = false; // type of the first non-alpha parameter
          $new_parts = array();
          foreach ($parts as $index => $part) {
              $position = $index + 1; // parameters are numbered from 1
              $part = trim($part);
              if ($part === '') {
                  return false;
              }

              // the last parameter of rgba()/hsla() is the alpha channel
              if ($alpha_channel === true && $position === $parameters_size) {
                  $result = $this->alpha->validate($part, $config, $context);
                  if ($result === false) {
                      return false;
                  }
                  $new_parts[] = (string)$result;
                  continue;
              }

              $current_type = (substr($part, -1) === '%') ? 'percentage' : 'integer';
              if (!isset($allowed_types[$position][$current_type])) {
                  return false;
              }
              if ($type === false) {
                  $type = $current_type;
              }
              if ($allow_different_types === false && $type != $current_type) {
                  return false;
              }

              $max_value = $allowed_types[$position][$current_type];
              if ($current_type == 'integer') {
                  // Return value between range 0 -> $max_value
                  $new_parts[] = (int)max(min($part, $max_value), 0);
              } else {
                  $new_parts[] = (float)max(min(rtrim($part, '%'), $max_value), 0) . '%';
              }
          }

          return $function . '(' . implode(',', $new_parts) . ')';
      }
  }
  10183. /**
  10184. * Allows multiple validators to attempt to validate attribute.
  10185. *
  10186. * Composite is just what it sounds like: a composite of many validators.
  10187. * This means that multiple HTMLPurifier_AttrDef objects will have a whack
  10188. * at the string. If one of them passes, that's what is returned. This is
  10189. * especially useful for CSS values, which often are a choice between
  10190. * an enumerated set of predefined values or a flexible data type.
  10191. */
  class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef
  {
      /**
       * Validators that are tried, in order, against the input.
       * @type HTMLPurifier_AttrDef[]
       * @todo Make protected
       */
      public $defs;

      /**
       * @param HTMLPurifier_AttrDef[] $defs List of HTMLPurifier_AttrDef objects
       */
      public function __construct($defs)
      {
          $this->defs = $defs;
      }

      /**
       * Returns the result of the first validator that accepts the string.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string First non-false result, or false if every
       *                     validator rejects the string.
       */
      public function validate($string, $config, $context)
      {
          foreach ($this->defs as $candidate) {
              $outcome = $candidate->validate($string, $config, $context);
              if ($outcome === false) {
                  continue;
              }
              return $outcome;
          }
          return false;
      }
  }
  10224. /**
  10225. * Decorator which enables CSS properties to be disabled for specific elements.
  10226. */
  class HTMLPurifier_AttrDef_CSS_DenyElementDecorator extends HTMLPurifier_AttrDef
  {
      /**
       * Wrapped definition that performs the actual validation.
       * @type HTMLPurifier_AttrDef
       */
      public $def;

      /**
       * Name of the element for which the property is rejected.
       * @type string
       */
      public $element;

      /**
       * @param HTMLPurifier_AttrDef $def Definition to wrap
       * @param string $element Element to deny
       */
      public function __construct($def, $element)
      {
          $this->def = $def;
          $this->element = $element;
      }

      /**
       * Rejects the value when the context's CurrentToken is the denied
       * element; otherwise delegates to the wrapped definition.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          $current = $context->get('CurrentToken', true);
          $denied = $current && $current->name == $this->element;
          if (!$denied) {
              return $this->def->validate($string, $config, $context);
          }
          return false;
      }
  }
  10262. /**
  10263. * Microsoft's proprietary filter: CSS property
  10264. * @note Currently supports the alpha filter. In the future, this will
  10265. * probably need an extensible framework
  10266. */
  class HTMLPurifier_AttrDef_CSS_Filter extends HTMLPurifier_AttrDef
  {
      /**
       * Validator applied to the opacity parameter.
       * @type HTMLPurifier_AttrDef_Integer
       */
      protected $intValidator;

      public function __construct()
      {
          $this->intValidator = new HTMLPurifier_AttrDef_Integer();
      }

      /**
       * Validates a filter value. Only 'none' and the alpha filter (spelled
       * alpha, Alpha or progid:DXImageTransform.Microsoft.Alpha) with an
       * opacity parameter are kept; opacity is clamped to 0..100.
       *
       * @param string $value
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string Rebuilt "function(params)" string, 'none', or
       *                     false for an unsupported function.
       */
      public function validate($value, $config, $context)
      {
          $value = $this->parseCDATA($value);
          if ($value === 'none') {
              return $value;
          }
          // if we looped this we could support multiple filters
          $function_length = strcspn($value, '(');
          $function = trim(substr($value, 0, $function_length));
          if ($function !== 'alpha' &&
              $function !== 'Alpha' &&
              $function !== 'progid:DXImageTransform.Microsoft.Alpha'
          ) {
              return false;
          }

          // Extract the text between '(' and the first ')'. Guard the cursor
          // so it is never used as an offset past the end of the string
          // (happens when the opening parenthesis is missing or last).
          $cursor = $function_length + 1;
          $parameters = '';
          if ($cursor < strlen($value)) {
              $parameters_length = strcspn($value, ')', $cursor);
              $parameters = substr($value, $cursor, $parameters_length);
          }

          $ret_params = array();
          $lookup = array(); // keys already emitted
          foreach (explode(',', $parameters) as $param) {
              $pair = explode('=', $param);
              if (count($pair) < 2) {
                  // no '=' in this parameter (e.g. "alpha()"): there is no
                  // value to validate, so skip it instead of reading a
                  // missing array offset
                  continue;
              }
              $key = trim($pair[0]);
              $param_value = trim($pair[1]);
              if (isset($lookup[$key])) {
                  continue;
              }
              if ($key !== 'opacity') {
                  continue;
              }
              $param_value = $this->intValidator->validate($param_value, $config, $context);
              if ($param_value === false) {
                  continue;
              }
              // clamp opacity to the 0..100 range
              $int = (int)$param_value;
              if ($int > 100) {
                  $param_value = '100';
              }
              if ($int < 0) {
                  $param_value = '0';
              }
              $ret_params[] = "$key=$param_value";
              $lookup[$key] = true;
          }
          $ret_parameters = implode(',', $ret_params);
          return "$function($ret_parameters)";
      }
  }
  10333. /**
  10334. * Validates shorthand CSS property font.
  10335. */
  class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef
  {
      /**
       * Local copy of validators
       * @type HTMLPurifier_AttrDef[]
       * @note If we moved specific CSS property definitions to their own
       *       classes instead of having them be assembled at run time by
       *       CSSDefinition, this wouldn't be necessary. We'd instantiate
       *       our own copies.
       */
      protected $info = array();

      /**
       * @param HTMLPurifier_Config $config
       */
      public function __construct($config)
      {
          $def = $config->getCSSDefinition();
          $names = array(
              'font-style',
              'font-variant',
              'font-weight',
              'font-size',
              'line-height',
              'font-family'
          );
          foreach ($names as $name) {
              $this->info[$name] = $def->info[$name];
          }
      }

      /**
       * Validates the font shorthand. Tokens are consumed in three phases:
       * optional style/variant/weight, then font-size with an optional
       * "/line-height", then everything that remains as font-family.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          static $system_fonts = array(
              'caption' => true,
              'icon' => true,
              'menu' => true,
              'message-box' => true,
              'small-caption' => true,
              'status-bar' => true
          );

          // regular pre-processing
          $string = $this->parseCDATA($string);
          if ($string === '') {
              return false;
          }

          // a lone system-font keyword is returned lowercased
          $lowered = strtolower($string);
          if (isset($system_fonts[$lowered])) {
              return $lowered;
          }

          $bits = explode(' ', $string);
          $size = count($bits);
          $leading = array('font-style', 'font-variant', 'font-weight');
          $caught = array(); // which leading properties have been matched
          $stage = 0;        // 0: leading props, 1: size only, 2: family
          $final = '';       // output

          for ($i = 0; $i < $size; $i++) {
              if ($bits[$i] === '') {
                  continue;
              }

              if ($stage === 0) {
                  // try font-style, font-variant, font-weight (each once)
                  $matched = false;
                  foreach ($leading as $name) {
                      if (isset($caught[$name])) {
                          continue;
                      }
                      $r = $this->info[$name]->validate($bits[$i], $config, $context);
                      if ($r !== false) {
                          $final .= $r . ' ';
                          $caught[$name] = true;
                          $matched = true;
                          break;
                      }
                  }
                  // all three caught, continue on
                  if (count($caught) >= 3) {
                      $stage = 1;
                  }
                  if ($matched) {
                      continue;
                  }
                  // not a leading property: treat this token as font-size
              }

              if ($stage !== 2) {
                  // font-size, optionally followed by /line-height
                  $found_slash = false;
                  if (strpos($bits[$i], '/') !== false) {
                      $pieces = explode('/', $bits[$i]);
                      $font_size = $pieces[0];
                      $line_height = $pieces[1];
                      if ($line_height === '') {
                          // ooh, there's a space after the slash!
                          $line_height = false;
                          $found_slash = true;
                      }
                  } else {
                      $font_size = $bits[$i];
                      $line_height = false;
                  }

                  $r = $this->info['font-size']->validate($font_size, $config, $context);
                  if ($r === false) {
                      return false;
                  }
                  $final .= $r;

                  if ($line_height === false) {
                      // we need to scroll forward
                      for ($j = $i + 1; $j < $size; $j++) {
                          if ($bits[$j] === '') {
                              continue;
                          }
                          if ($bits[$j] === '/') {
                              if ($found_slash) {
                                  return false;
                              }
                              $found_slash = true;
                              continue;
                          }
                          $line_height = $bits[$j];
                          break;
                      }
                  } else {
                      // slash already found
                      $found_slash = true;
                      $j = $i;
                  }

                  if ($found_slash) {
                      // skip past the tokens consumed while scrolling
                      $i = $j;
                      $r = $this->info['line-height']->validate($line_height, $config, $context);
                      if ($r !== false) {
                          $final .= '/' . $r;
                      }
                  }
                  $final .= ' ';
                  $stage = 2;
                  continue;
              }

              // everything that is left is the font-family list
              $font_family = implode(' ', array_slice($bits, $i, $size - $i));
              $r = $this->info['font-family']->validate($font_family, $config, $context);
              if ($r === false) {
                  return false;
              }
              // processing completed successfully
              return rtrim($final . $r . ' ');
          }
          return false;
      }
  }
  10498. /**
  10499. * Validates a font family list according to CSS spec
  10500. */
  class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
  {
      /**
       * Characters allowed in a font name that is not purely alphanumeric;
       * consumed by strspn() in validate().
       * @type string
       */
      protected $mask = null;

      public function __construct()
      {
          // Built with range() rather than "for ($c = 'a'; $c <= 'z'; $c++)":
          // PHP's string increment turns 'z' into 'aa', and 'aa' <= 'z' is
          // true, so such a loop keeps going up to 'yz' and fills the mask
          // with hundreds of duplicate characters.
          $this->mask = '_- '
              . implode('', range('a', 'z'))
              . implode('', range('A', 'Z'))
              . implode('', range('0', '9'));
          // special bytes used by UTF-8
          for ($i = 0x80; $i <= 0xFF; $i++) {
              // We don't bother excluding invalid bytes in this range,
              // because our restriction to well-formed UTF-8 will
              // prevent these from ever occurring.
              $this->mask .= chr($i);
          }
          // PHP's internal strcspn implementation is
          // O(length of string * length of mask), making it inefficient
          // for large masks; it is still faster than preg_match, though.
          // possible optimization: invert the mask.
      }

      /**
       * Validates a comma-separated font-family list. Generic family names
       * are passed through as-is, plain alphanumeric names are emitted
       * unquoted, and other names are single-quoted provided they consist
       * only of mask characters. Entries that fail are dropped.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string Cleaned list, or false if no entry survived.
       */
      public function validate($string, $config, $context)
      {
          static $generic_names = array(
              'serif' => true,
              'sans-serif' => true,
              'monospace' => true,
              'fantasy' => true,
              'cursive' => true
          );
          $allowed_fonts = $config->get('CSS.AllowedFonts');

          // assume that no font names contain commas in them
          $fonts = explode(',', $string);
          $final = '';
          foreach ($fonts as $font) {
              $font = trim($font);
              if ($font === '') {
                  continue;
              }
              // match a generic name
              if (isset($generic_names[$font])) {
                  if ($allowed_fonts === null || isset($allowed_fonts[$font])) {
                      $final .= $font . ', ';
                  }
                  continue;
              }
              // match a quoted name
              if ($font[0] === '"' || $font[0] === "'") {
                  $length = strlen($font);
                  if ($length <= 2) {
                      continue;
                  }
                  $quote = $font[0];
                  if ($font[$length - 1] !== $quote) {
                      continue;
                  }
                  $font = substr($font, 1, $length - 2);
              }

              $font = $this->expandCSSEscape($font);

              // $font is a pure representation of the font name

              if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {
                  continue;
              }

              if (ctype_alnum($font) && $font !== '') {
                  // very simple font, allow it in unharmed
                  $final .= $font . ', ';
                  continue;
              }

              // bugger out on whitespace. form feed (0C) really
              // shouldn't show up regardless
              $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);

              // Here, there are various classes of characters which need
              // to be treated differently:
              //  - Alphanumeric characters are essentially safe. We
              //    handled these above.
              //  - Spaces require quoting, though most parsers will do
              //    the right thing if there aren't any characters that
              //    can be misinterpreted.
              //  - Dashes rarely occur, but they are fairly unproblematic
              //    for parsing/rendering purposes.
              //  The above characters cover the majority of Western font
              //  names.
              //  - Arbitrary Unicode characters not in ASCII. Because
              //    most parsers give little thought to Unicode, treatment
              //    of these codepoints is basically uniform, even for
              //    punctuation-like codepoints. These characters can
              //    show up in non-Western pages and are supported by most
              //    major browsers, for example: "MS 明朝" is a
              //    legitimate font-name
              //    <http://ja.wikipedia.org/wiki/MS_明朝>. See
              //    the CSS3 spec for more examples:
              //    <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
              //    However, most of these fonts have ASCII equivalents:
              //    for example, 'MS Mincho', and it's considered
              //    professional to use ASCII font names instead of
              //    Unicode font names. Thanks Takeshi Terada for
              //    providing this information.
              //  The following characters, to my knowledge, have not been
              //  used to name font names.
              //  - Single quote. While theoretically you might find a
              //    font name that has a single quote in its name (serving
              //    as an apostrophe, e.g. Dave's Scribble), I haven't
              //    been able to find any actual examples of this.
              //    Internet Explorer's cssText translation (which I
              //    believe is invoked by innerHTML) normalizes any
              //    quoting to single quotes, and fails to escape single
              //    quotes. (Note that this is not IE's behavior for all
              //    CSS properties, just some sort of special casing for
              //    font-family). So a single quote *cannot* be used
              //    safely in the font-family context if there will be an
              //    innerHTML/cssText translation. Note that Firefox 3.x
              //    does this too.
              //  - Double quote. In IE, these get normalized to
              //    single-quotes, no matter what the encoding. (Fun
              //    fact, in IE8, the 'content' CSS property gained
              //    support, where they special cased to preserve encoded
              //    double quotes, but still translate unadorned double
              //    quotes into single quotes.) So, because their
              //    fixpoint behavior is identical to single quotes, they
              //    cannot be allowed either. Firefox 3.x displays
              //    single-quote style behavior.
              //  - Backslashes are reduced by one (so \\ -> \) every
              //    iteration, so they cannot be used safely. This shows
              //    up in IE7, IE8 and FF3.
              //  - Semicolons, commas and backticks are handled properly.
              //  - The rest of the ASCII punctuation is handled properly.
              // We haven't checked what browsers do to unadorned
              // versions, but this is not important as long as the
              // browser doesn't /remove/ surrounding quotes (as IE does
              // for HTML).
              //
              // With these results in hand, we conclude that there are
              // various levels of safety:
              //  - Paranoid: alphanumeric, spaces and dashes(?)
              //  - International: Paranoid + non-ASCII Unicode
              //  - Edgy: Everything except quotes, backslashes
              //  - NoJS: Standards compliance, e.g. sod IE. Note that
              //    with some judicious character escaping (since certain
              //    types of escaping don't work) this is theoretically
              //    OK as long as innerHTML/cssText is not called.
              // We believe that international is a reasonable default
              // (that we will implement now), and once we do more
              // extensive research, we may feel comfortable with dropping
              // it down to edgy.

              // Allowed here: alphanumeric, spaces, dashes, underscores and
              // non-ASCII bytes. Use of str(c)spn assumes that the string
              // was already well formed Unicode (which of course it is).
              if (strspn($font, $this->mask) !== strlen($font)) {
                  continue;
              }

              // Historical:
              // In the absence of innerHTML/cssText, these ugly
              // transforms don't pose a security risk (as \\ and \"
              // might--these escapes are not supported by most browsers).
              // We could try to be clever and use single-quote wrapping
              // when there is a double quote present, but I have chosen
              // not to implement that. (NOTE: you can reduce the amount
              // of escapes by one depending on what quoting style you use)
              // $font = str_replace('\\', '\\5C ', $font);
              // $font = str_replace('"',  '\\22 ', $font);
              // $font = str_replace("'",  '\\27 ', $font);

              // font possibly with spaces, requires quoting
              $final .= "'$font', ";
          }
          $final = rtrim($final, ', ');
          if ($final === '') {
              return false;
          }
          return $final;
      }
  }
  10698. /**
  10699. * Validates based on {ident} CSS grammar production
  10700. */
  class HTMLPurifier_AttrDef_CSS_Ident extends HTMLPurifier_AttrDef
  {
      /**
       * Accepts a trimmed string made of an optional leading dash, then a
       * letter or underscore, then any mix of letters, digits, underscores
       * and dashes.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string The trimmed identifier, or false if invalid.
       */
      public function validate($string, $config, $context)
      {
          $ident = trim($string);

          // early abort: '' and '0' (strings that convert to false) are invalid
          if ($ident === '' || $ident === '0') {
              return false;
          }

          $is_ident = preg_match('/^(-?[A-Za-z_][A-Za-z_\-0-9]*)$/', $ident);
          return $is_ident ? $ident : false;
      }
  }
  10723. /**
  10724. * Decorator which enables !important to be used in CSS values.
  10725. */
  class HTMLPurifier_AttrDef_CSS_ImportantDecorator extends HTMLPurifier_AttrDef
  {
      /**
       * Wrapped definition that validates the value without !important.
       * @type HTMLPurifier_AttrDef
       */
      public $def;

      /**
       * Whether a detected !important is re-appended to the result.
       * @type bool
       */
      public $allow;

      /**
       * @param HTMLPurifier_AttrDef $def Definition to wrap
       * @param bool $allow Whether or not to allow !important
       */
      public function __construct($def, $allow = false)
      {
          $this->def = $def;
          $this->allow = $allow;
      }

      /**
       * Intercepts and removes !important if necessary
       *
       * A trailing "!important" (whitespace after the '!' is tolerated) is
       * stripped before the wrapped definition sees the value, and is added
       * back only when it is allowed AND the remaining value validated.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          // test for ! and important tokens
          $string = trim($string);
          $is_important = false;
          // :TODO: optimization: test directly for !important and ! important
          if (strlen($string) >= 9 && substr($string, -9) === 'important') {
              $temp = rtrim(substr($string, 0, -9));
              // use a temp, because we might want to restore important
              if (strlen($temp) >= 1 && substr($temp, -1) === '!') {
                  $string = rtrim(substr($temp, 0, -1));
                  $is_important = true;
              }
          }
          $result = $this->def->validate($string, $config, $context);
          // Never decorate a failure: false . ' !important' would become the
          // non-false string ' !important' and make an invalid value look
          // valid to the caller.
          if ($result !== false && $this->allow && $is_important) {
              $result .= ' !important';
          }
          return $result;
      }
  }
  10773. /**
  10774. * Represents a Length as defined by CSS.
  10775. */
  class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef
  {
      /**
       * Lower bound, or null when unbounded.
       * @type HTMLPurifier_Length|string
       */
      protected $min;

      /**
       * Upper bound, or null when unbounded.
       * @type HTMLPurifier_Length|string
       */
      protected $max;

      /**
       * @param HTMLPurifier_Length|string $min Minimum length, or null for no bound. String is also acceptable.
       * @param HTMLPurifier_Length|string $max Maximum length, or null for no bound. String is also acceptable.
       */
      public function __construct($min = null, $max = null)
      {
          $this->min = null;
          $this->max = null;
          if ($min !== null) {
              $this->min = HTMLPurifier_Length::make($min);
          }
          if ($max !== null) {
              $this->max = HTMLPurifier_Length::make($max);
          }
      }

      /**
       * Validates a CSS length and checks it against the configured bounds.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string String form of the parsed length, or false if
       *                     invalid, not comparable, or out of bounds.
       */
      public function validate($string, $config, $context)
      {
          $string = $this->parseCDATA($string);

          // Optimizations: settle the trivial inputs without parsing
          if ($string === '') {
              return false;
          }
          if ($string === '0') {
              return '0';
          }
          if (strlen($string) === 1) {
              // any other single character is rejected outright
              return false;
          }

          $parsed = HTMLPurifier_Length::make($string);
          if (!$parsed->isValid()) {
              return false;
          }

          if ($this->min) {
              $order = $parsed->compareTo($this->min);
              // false means the comparison could not be made
              if ($order === false || $order < 0) {
                  return false;
              }
          }
          if ($this->max) {
              $order = $parsed->compareTo($this->max);
              if ($order === false || $order > 0) {
                  return false;
              }
          }
          return $parsed->toString();
      }
  }
  /**
   * Validates shorthand CSS property list-style.
   * @warning Does not support url tokens that have internal spaces.
   */
  class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef
  {

      /**
       * Local copy of validators, keyed by the long-hand property name
       * ('list-style-type', 'list-style-position', 'list-style-image').
       * @type HTMLPurifier_AttrDef[]
       * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
       */
      protected $info;

      /**
       * Copies the three long-hand validators out of the CSS definition.
       *
       * @param HTMLPurifier_Config $config
       */
      public function __construct($config)
      {
          $def = $config->getCSSDefinition();
          $this->info['list-style-type'] = $def->info['list-style-type'];
          $this->info['list-style-position'] = $def->info['list-style-position'];
          $this->info['list-style-image'] = $def->info['list-style-image'];
      }

      /**
       * Splits the value on spaces and offers each token to the type,
       * position and image validators in that order; the first validator
       * that accepts a token claims it. Tokens nobody accepts are dropped.
       * The result is emitted in the fixed order "type image position".
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string Normalised shorthand, or false if nothing matched
       */
      public function validate($string, $config, $context)
      {
          // regular pre-processing
          $string = $this->parseCDATA($string);
          if ($string === '') {
              return false;
          }

          // assumes URI doesn't have spaces in it
          $bits = explode(' ', strtolower($string)); // bits to process

          // Iteration order matters: it decides which validator is tried
          // first for every token.
          $caught = array(
              'type' => false,
              'position' => false,
              'image' => false,
          );

          $i = 0; // number of catches
          $none = false; // whether a literal 'none' has been consumed already

          foreach ($bits as $bit) {
              if ($i >= 3) {
                  // All three slots are filled, so the remaining tokens
                  // cannot contribute anything. This used to be a bare
                  // "return;", which handed null to the caller and threw
                  // away the values collected so far; stop scanning and
                  // fall through to the result construction instead.
                  break;
              }
              if ($bit === '') {
                  // produced by consecutive spaces
                  continue;
              }
              foreach ($caught as $key => $status) {
                  if ($status !== false) {
                      // slot already taken
                      continue;
                  }
                  $r = $this->info['list-style-' . $key]->validate($bit, $config, $context);
                  if ($r === false) {
                      continue;
                  }
                  if ($r === 'none') {
                      // Only the first 'none' is honoured, and it is never
                      // recorded against the image slot.
                      if ($none) {
                          continue;
                      }
                      $none = true;
                      if ($key === 'image') {
                          continue;
                      }
                  }
                  $caught[$key] = $r;
                  $i++;
                  break;
              }
          }

          if (!$i) {
              return false;
          }

          $ret = array();

          // construct type
          if ($caught['type']) {
              $ret[] = $caught['type'];
          }

          // construct image
          if ($caught['image']) {
              $ret[] = $caught['image'];
          }

          // construct position
          if ($caught['position']) {
              $ret[] = $caught['position'];
          }

          if (empty($ret)) {
              return false;
          }
          return implode(' ', $ret);
      }
  }
  /**
   * Framework class for strings that involve multiple values.
   *
   * Certain CSS properties such as border-width and margin allow multiple
   * lengths to be specified. This class wraps a validator for one such value
   * and applies it to each space-separated component, keeping at most a
   * configured number of them (usually four).
   *
   * @note Even though the CSS specification isn't clear about it, inherit
   *       can only be used alone: it will never manifest as part of a multi
   *       shorthand declaration. Thus, this class does not allow inherit.
   */
  class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef
  {
      /**
       * Validator that every individual component is delegated to.
       * @type HTMLPurifier_AttrDef
       * @todo Make protected
       */
      public $single;

      /**
       * Maximum number of accepted components.
       * @type int
       * @todo Make protected
       */
      public $max;

      /**
       * @param HTMLPurifier_AttrDef $single HTMLPurifier_AttrDef to multiply
       * @param int $max Max number of values allowed (usually four)
       */
      public function __construct($single, $max = 4)
      {
          $this->single = $single;
          $this->max = $max;
      }

      /**
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string Accepted components joined by single spaces,
       *                     or false when none was accepted
       */
      public function validate($string, $config, $context)
      {
          $string = $this->mungeRgb($this->parseCDATA($string));
          if ($string === '') {
              return false;
          }

          $accepted = 0;
          $output = '';
          // parseCDATA replaced \r, \t and \n, so a plain space split suffices
          foreach (explode(' ', $string) as $piece) {
              if ($accepted >= $this->max) {
                  // quota reached; anything further is discarded
                  break;
              }
              if (ctype_space($piece)) {
                  continue;
              }
              $validated = $this->single->validate($piece, $config, $context);
              if ($validated === false) {
                  continue;
              }
              $output .= $validated . ' ';
              $accepted++;
          }

          return $output === '' ? false : rtrim($output);
      }
  }
  /**
   * Validates a Percentage as defined by the CSS spec.
   */
  class HTMLPurifier_AttrDef_CSS_Percentage extends HTMLPurifier_AttrDef
  {

      /**
       * Validator used for the numeric part in front of the percent sign.
       * @type HTMLPurifier_AttrDef_CSS_Number
       */
      protected $number_def;

      /**
       * @param bool $non_negative Whether to forbid negative values
       */
      public function __construct($non_negative = false)
      {
          $this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
      }

      /**
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string Validated number followed by '%', or false
       */
      public function validate($string, $config, $context)
      {
          $string = $this->parseCDATA($string);

          // A percentage needs at least one character of number plus the
          // trailing sign, so anything shorter than two characters fails.
          if (strlen($string) < 2) {
              return false;
          }
          if (substr($string, -1) !== '%') {
              return false;
          }

          $validated = $this->number_def->validate(
              substr($string, 0, -1),
              $config,
              $context
          );
          if ($validated === false) {
              return false;
          }
          return $validated . '%';
      }
  }
  /**
   * Validates the value for the CSS property text-decoration
   * @note This class could be generalized into a version that acts sort of
   *       like Enum except you can compound the allowed values.
   */
  class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
  {

      /**
       * Accepts the single keyword 'none', or any space-separated mix of
       * 'line-through', 'overline' and 'underline'. Unknown tokens are
       * silently dropped; matching is case-insensitive and the result is
       * lower-case.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          static $keywords = array(
              'line-through' => true,
              'overline' => true,
              'underline' => true,
          );

          $string = strtolower($this->parseCDATA($string));
          if ($string === 'none') {
              return 'none';
          }

          $kept = array();
          foreach (explode(' ', $string) as $token) {
              if (isset($keywords[$token])) {
                  $kept[] = $token;
              }
          }

          if (!$kept) {
              return false;
          }
          return implode(' ', $kept);
      }
  }
  /**
   * Validates a URI in CSS syntax, which uses url('http://example.com')
   * @note While theoretically speaking a URI in a CSS document could
   *       be non-embedded, as of CSS2 there is no such usage so we're
   *       generalizing it. This may need to be changed in the future.
   * @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as
   *          the separator, you cannot put a literal semicolon in
   *          the URI. Try percent encoding it, in that case.
   */
  class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
  {

      public function __construct()
      {
          parent::__construct(true); // always embedded
      }

      /**
       * Unwraps the url(...) notation, validates the enclosed URI with the
       * parent validator and re-wraps the result as url("...").
       *
       * @param string $uri_string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($uri_string, $config, $context)
      {
          $css = $this->parseCDATA($uri_string);

          // must open with the functional notation...
          if (strpos($css, 'url(') !== 0) {
              return false;
          }
          $inner = substr($css, 4);
          if (strlen($inner) == 0) {
              return false;
          }

          // ...and close with the matching parenthesis
          $last = strlen($inner) - 1;
          if ($inner[$last] != ')') {
              return false;
          }
          $uri = trim(substr($inner, 0, $last));

          // Strip one layer of matching quotes, if the URI is quoted.
          $is_quoted = !empty($uri) && ($uri[0] == "'" || $uri[0] == '"');
          if ($is_quoted) {
              $last = strlen($uri) - 1;
              if ($uri[$last] !== $uri[0]) {
                  return false;
              }
              $uri = substr($uri, 1, $last - 1);
          }

          $uri = $this->expandCSSEscape($uri);

          $result = parent::validate($uri, $config, $context);
          if ($result === false) {
              return false;
          }

          // extra sanity check; should have been done by URI
          $stripped = array('"', "\\", "\n", "\x0c", "\r");
          $result = str_replace($stripped, "", $result);

          // suspicious characters are ()'; we're going to percent encode
          // them for safety.
          $result = str_replace(
              array('(', ')', "'"),
              array('%28', '%29', '%27'),
              $result
          );

          // there's an extra bug where ampersands lose their escaping on
          // an innerHTML cycle, so a very unlucky query parameter could
          // then change the meaning of the URL. Unfortunately, there's
          // not much we can do about that...
          return 'url("' . $result . '")';
      }
  }
  /**
   * Validates a boolean attribute
   */
  class HTMLPurifier_AttrDef_HTML_Bool extends HTMLPurifier_AttrDef
  {

      /**
       * Value handed back by validate(); make() fills it with the attribute
       * name, and it defaults to false.
       * @type string|bool
       */
      protected $name;

      /**
       * @type bool
       */
      public $minimized = true;

      /**
       * @param string|bool $name Value validate() should return
       */
      public function __construct($name = false)
      {
          $this->name = $name;
      }

      /**
       * Ignores the supplied value entirely and always answers with the
       * value stored at construction time.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          return $this->name;
      }

      /**
       * Builds a validator bound to the given attribute name.
       *
       * @param string $string Name of attribute
       * @return HTMLPurifier_AttrDef_HTML_Bool
       */
      public function make($string)
      {
          $bound = new self($string);
          return $bound;
      }
  }
  /**
   * Validates contents based on NMTOKENS attribute type.
   */
  class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
  {

      /**
       * Splits the value into tokens, passes them through filter() and
       * joins the survivors with single spaces.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          $string = trim($string);

          // early abort: '' and '0' (strings that convert to false) are invalid
          if ($string === '' || $string === '0') {
              return false;
          }

          $kept = $this->filter(
              $this->split($string, $config, $context),
              $config,
              $context
          );

          return empty($kept) ? false : implode(' ', $kept);
      }

      /**
       * Splits a space separated list of tokens into its constituent parts.
       * Only whitespace-delimited tokens matching the pattern below are
       * returned; everything else is discarded.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      protected function split($string, $config, $context)
      {
          // OPTIMIZABLE!
          // do the preg_match, capture all subpatterns for reformulation

          // we don't support U+00A1 and up codepoints or
          // escaping because I don't know how to do that with regexps
          // and plus it would complicate optimization efforts (you never
          // see that anyway).
          $token_pattern =
              '/(?:(?<=\s)|\A)' . // preceded by whitespace or start of string
              '((?:--|-?[A-Za-z_])[A-Za-z_\-0-9]*)' .
              '(?:(?=\s)|\z)/'; // followed by whitespace or end of string
          $found = array();
          preg_match_all($token_pattern, $string, $found);
          return $found[1];
      }

      /**
       * Template method for removing certain tokens based on arbitrary criteria.
       * This base implementation keeps every token.
       * @note If we wanted to be really functional, we'd do an array_filter
       *       with a callback. But... we're not.
       * @param array $tokens
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      protected function filter($tokens, $config, $context)
      {
          return $tokens;
      }
  }
  /**
   * Implements special behavior for class attribute (normally NMTOKENS)
   */
  class HTMLPurifier_AttrDef_HTML_Class extends HTMLPurifier_AttrDef_HTML_Nmtokens
  {
      /**
       * Uses the parent's pattern-based splitting for the XHTML 1.1 and
       * XHTML 2.0 doctypes and a plain whitespace split for every other
       * doctype.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      protected function split($string, $config, $context)
      {
          // really, this twiddle should be lazy loaded
          $doctype = $config->getDefinition('HTML')->doctype->name;
          $uses_nmtokens = ($doctype == "XHTML 1.1" || $doctype == "XHTML 2.0");
          if (!$uses_nmtokens) {
              return preg_split('/\s+/', $string);
          }
          return parent::split($string, $config, $context);
      }

      /**
       * Applies %Attr.AllowedClasses and %Attr.ForbiddenClasses and removes
       * duplicate class names, preserving first-seen order.
       *
       * @param array $tokens
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      protected function filter($tokens, $config, $context)
      {
          $allowed = $config->get('Attr.AllowedClasses');
          $forbidden = $config->get('Attr.ForbiddenClasses');

          $unique = array();
          foreach ($tokens as $class_name) {
              // a null whitelist means "no whitelist configured"
              if ($allowed !== null && !isset($allowed[$class_name])) {
                  continue;
              }
              if (isset($forbidden[$class_name])) {
                  continue;
              }
              // We need this O(n) check because of PHP's array
              // implementation that casts -0 to 0.
              if (in_array($class_name, $unique, true)) {
                  continue;
              }
              $unique[] = $class_name;
          }
          return $unique;
      }
  }
  /**
   * Validates a color according to the HTML spec.
   */
  class HTMLPurifier_AttrDef_HTML_Color extends HTMLPurifier_AttrDef
  {

      /**
       * Resolves a colour keyword through %Core.ColorKeywords, or
       * normalises a 3- or 6-digit hexadecimal colour (leading '#'
       * optional) to the six-digit '#rrggbb' form.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          // The keyword table is fetched once and then kept in a static,
          // so it is taken from whichever config reaches this method first.
          static $keywords = null;
          if ($keywords === null) {
              $keywords = $config->get('Core.ColorKeywords');
          }

          $string = trim($string);
          if (empty($string)) {
              return false;
          }

          $needle = strtolower($string);
          if (isset($keywords[$needle])) {
              return $keywords[$needle];
          }

          // the hash sign is optional
          $hex = ($string[0] === '#') ? substr($string, 1) : $string;

          $digits = strlen($hex);
          if ($digits !== 3 && $digits !== 6) {
              return false;
          }
          if (!ctype_xdigit($hex)) {
              return false;
          }

          if ($digits === 3) {
              // expand the shorthand by doubling every digit
              $hex = str_repeat($hex[0], 2) . str_repeat($hex[1], 2) . str_repeat($hex[2], 2);
          }
          return '#' . $hex;
      }
  }
  /**
   * Special-case enum attribute definition that lazy loads allowed frame targets
   */
  class HTMLPurifier_AttrDef_HTML_FrameTarget extends HTMLPurifier_AttrDef_Enum
  {

      /**
       * Allowed targets; stays false until the first validate() call
       * loads them from %Attr.AllowedFrameTargets.
       * @type array|bool
       */
      public $valid_values = false; // uninitialized value

      /**
       * @type bool
       */
      protected $case_sensitive = false;

      /**
       * Intentionally empty: the parent constructor is not invoked and the
       * value list is populated lazily instead.
       */
      public function __construct()
      {
      }

      /**
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          $needs_loading = ($this->valid_values === false);
          if ($needs_loading) {
              $this->valid_values = $config->get('Attr.AllowedFrameTargets');
          }
          return parent::validate($string, $config, $context);
      }
  }
  /**
   * Validates the HTML attribute ID.
   * @warning Even though this is the id processor, it
   *          will ignore the directive Attr:IDBlacklist, since it will only
   *          go according to the ID accumulator. Since the accumulator is
   *          automatically generated, it will have already absorbed the
   *          blacklist. If you're hacking around, make sure you use load()!
   */
  class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
  {

      // selector is NOT a valid thing to use for IDREFs, because IDREFs
      // *must* target IDs that exist, whereas selector #ids do not.

      /**
       * Determines whether or not we're validating an ID in a CSS
       * selector context. In that mode %Attr.EnableID is not consulted and
       * the ID accumulator is neither read nor updated.
       * @type bool
       */
      protected $selector;

      /**
       * @param bool $selector
       */
      public function __construct($selector = false)
      {
          $this->selector = $selector;
      }

      /**
       * @param string $id
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string The trimmed and possibly prefixed ID, or false
       */
      public function validate($id, $config, $context)
      {
          if (!($this->selector || $config->get('Attr.EnableID'))) {
              return false;
          }

          $id = trim($id); // trim it first
          if ($id === '') {
              return false;
          }

          $prefix = $config->get('Attr.IDPrefix');
          if ($prefix !== '') {
              $prefix .= $config->get('Attr.IDPrefixLocal');
              // prevent re-appending the prefix
              if (strpos($id, $prefix) !== 0) {
                  $id = $prefix . $id;
              }
          } elseif ($config->get('Attr.IDPrefixLocal') !== '') {
              trigger_error(
                  '%Attr.IDPrefixLocal cannot be used unless ' .
                  '%Attr.IDPrefix is set',
                  E_USER_WARNING
              );
          }

          // Outside selector mode an ID that was already registered is a
          // duplicate and gets rejected.
          if (!$this->selector) {
              $accumulator =& $context->get('IDAccumulator');
              if (isset($accumulator->ids[$id])) {
                  return false;
              }
          }

          // we purposely avoid using regex, hopefully this is faster
          if ($config->get('Attr.ID.HTML5') === true) {
              // HTML5 mode: only these whitespace characters are refused
              if (preg_match('/[\t\n\x0b\x0c ]/', $id)) {
                  return false;
              }
          } elseif (!ctype_alpha($id)) {
              // Not purely alphabetic: the first character must still be a
              // letter...
              if (!ctype_alpha(@$id[0])) {
                  return false;
              }
              // ...and stripping the permitted characters from both ends
              // has to consume the whole string (primitive style of
              // regexps, I suppose).
              $leftover = trim($id, 'A..Za..z0..9:-._');
              if ($leftover !== '') {
                  return false;
              }
          }

          $blacklist = $config->get('Attr.IDBlacklistRegexp');
          if ($blacklist && preg_match($blacklist, $id)) {
              return false;
          }

          // register the ID so that later duplicates are caught
          if (!$this->selector) {
              $accumulator->add($id);
          }

          return $id;
      }
  }
  /**
   * Validates an integer representation of pixels according to the HTML spec.
   */
  class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
  {

      /**
       * Largest value that will be emitted, or null for no cap.
       * @type int|null
       */
      protected $max;

      /**
       * @param int $max
       */
      public function __construct($max = null)
      {
          $this->max = $max;
      }

      /**
       * Accepts a numeric string with an optional 'px' suffix and returns
       * it as an integer string clamped to the range 0..max.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          $string = trim($string);
          if ($string === '') {
              return false;
          }
          if ($string === '0') {
              return $string;
          }

          // tolerate a trailing unit
          if (substr($string, -2) === 'px') {
              $string = substr($string, 0, strlen($string) - 2);
          }
          if (!is_numeric($string)) {
              return false;
          }

          $pixels = (int)$string;

          // negative values are clamped to zero rather than rejected
          if ($pixels < 0) {
              return '0';
          }

          // upper-bound value, extremely high values can
          // crash operating systems, see <http://ha.ckers.org/imagecrash.html>
          // WARNING, above link WILL crash you if you're using Windows
          $capped = ($this->max !== null && $pixels > $this->max);
          return (string)($capped ? $this->max : $pixels);
      }

      /**
       * Creates a validator of the same class as this one, using the
       * string as the maximum; an empty string means no maximum.
       *
       * @param string $string
       * @return HTMLPurifier_AttrDef
       */
      public function make($string)
      {
          $max = ($string === '') ? null : (int)$string;
          $class = get_class($this);
          return new $class($max);
      }
  }
  /**
   * Validates the HTML type length (not to be confused with CSS's length).
   *
   * This accepts integer pixels or percentages as lengths for certain
   * HTML attributes.
   */
  class HTMLPurifier_AttrDef_HTML_Length extends HTMLPurifier_AttrDef_HTML_Pixels
  {

      /**
       * Tries the pixel validator first; if that rejects the value it is
       * interpreted as a percentage and clamped to the range 0%..100%.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          $string = trim($string);
          if ($string === '') {
              return false;
          }

          $as_pixels = parent::validate($string, $config, $context);
          if ($as_pixels !== false) {
              return $as_pixels;
          }

          // not pixels, so it has to end in a percent sign
          $body_length = strlen($string) - 1;
          if ($string[$body_length] !== '%') {
              return false;
          }

          $percent = substr($string, 0, $body_length);
          if (!is_numeric($percent)) {
              return false;
          }

          // out-of-range values are clamped rather than rejected
          $percent = max(0, min(100, (int)$percent));
          return $percent . '%';
      }
  }
  /**
   * Validates a rel/rev link attribute against a directive of allowed values
   * @note We cannot use Enum because link types allow multiple
   *       values.
   * @note Assumes link types are ASCII text
   */
  class HTMLPurifier_AttrDef_HTML_LinkTypes extends HTMLPurifier_AttrDef
  {

      /**
       * Name of the config directive (without the 'Attr.' namespace) that
       * holds the allowed values: 'AllowedRel' or 'AllowedRev'.
       * @type string
       */
      protected $name;

      /**
       * @param string $name Attribute being validated, 'rel' or 'rev';
       *                     anything else raises an E_USER_ERROR
       */
      public function __construct($name)
      {
          $directives = array(
              'rel' => 'AllowedRel',
              'rev' => 'AllowedRev'
          );
          if (isset($directives[$name])) {
              $this->name = $directives[$name];
              return;
          }
          trigger_error(
              'Unrecognized attribute name for link ' .
              'relationship.',
              E_USER_ERROR
          );
      }

      /**
       * Keeps the lower-cased link types that appear in the allowed set,
       * dropping duplicates and unknown values.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          $allowed = $config->get('Attr.' . $this->name);
          if (empty($allowed)) {
              return false;
          }

          // keyed by link type so that repeated values collapse into one
          $seen = array();
          foreach (explode(' ', $this->parseCDATA($string)) as $candidate) {
              $candidate = strtolower(trim($candidate));
              if (isset($allowed[$candidate])) {
                  $seen[$candidate] = true;
              }
          }

          if (empty($seen)) {
              return false;
          }
          return implode(' ', array_keys($seen));
      }
  }
  /**
   * Validates a MultiLength as defined by the HTML spec.
   *
   * A multilength is either a integer (pixel count), a percentage, or
   * a relative number.
   */
  class HTMLPurifier_AttrDef_HTML_MultiLength extends HTMLPurifier_AttrDef_HTML_Length
  {

      /**
       * Tries the pixel/percentage validator first; if that rejects the
       * value it is interpreted as a relative length of the form "n*".
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string
       */
      public function validate($string, $config, $context)
      {
          $string = trim($string);
          if ($string === '') {
              return false;
          }

          $as_length = parent::validate($string, $config, $context);
          if ($as_length !== false) {
              return $as_length;
          }

          // relative lengths end in an asterisk
          $body_length = strlen($string) - 1;
          if ($string[$body_length] !== '*') {
              return false;
          }

          $weight = substr($string, 0, $body_length);
          if ($weight == '') {
              // a lone asterisk
              return '*';
          }
          if (!is_numeric($weight)) {
              return false;
          }

          $weight = (int)$weight;
          if ($weight < 0) {
              return false;
          }
          if ($weight == 0) {
              // a zero weight is emitted as a plain zero
              return '0';
          }
          if ($weight == 1) {
              // a weight of one is emitted as the bare asterisk
              return '*';
          }
          return $weight . '*';
      }
  }
  /**
   * Abstract base for e-mail address validators.
   */
  abstract class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
  {

      /**
       * Unpacks a mailbox into its display-name and address.
       *
       * Not implemented yet: the body is empty, so calling it currently
       * yields null.
       *
       * @param string $string
       * @return mixed
       */
      public function unpack($string)
      {
          // needs to be implemented
      }
  }
  11686. // sub-implementations
  /**
   * Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
   */
  class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
  {

      /**
       * IPv4 sub-validator.
       * @type HTMLPurifier_AttrDef_URI_IPv4
       */
      protected $ipv4;

      /**
       * IPv6 sub-validator.
       * @type HTMLPurifier_AttrDef_URI_IPv6
       */
      protected $ipv6;

      public function __construct()
      {
          $this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
          $this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
      }

      /**
       * Accepts, in this order: the empty host, a bracketed IPv6 literal,
       * an IPv4 address, an ASCII domain name, and finally a name that
       * becomes a valid ASCII domain name after IDNA (Punycode) conversion.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string Validated host, or false when rejected
       */
      public function validate($string, $config, $context)
      {
          $length = strlen($string);
          // empty hostname is OK; it's usually semantically equivalent:
          // the default host as defined by a URI scheme is used:
          //
          //      If the URI scheme defines a default for host, then that
          //      default applies when the host subcomponent is undefined
          //      or when the registered name is empty (zero length).
          if ($string === '') {
              return '';
          }
          if ($length > 1 && $string[0] === '[' && $string[$length - 1] === ']') {
              //IPv6
              $ip = substr($string, 1, $length - 2);
              $valid = $this->ipv6->validate($ip, $config, $context);
              if ($valid === false) {
                  return false;
              }
              return '[' . $valid . ']';
          }

          // need to do checks on unusual encodings too
          $ipv4 = $this->ipv4->validate($string, $config, $context);
          if ($ipv4 !== false) {
              return $ipv4;
          }

          // A regular domain name.

          // This doesn't match I18N domain names, but we don't have proper
          // IRI support, so force users to insert Punycode.

          // Underscores are technically not allowed in host names, and if
          // we went as far as allowing everything the DNS spec permits,
          // that would be literally everything, modulo some size limits
          // for the components and the overall name (which, by the way,
          // we are NOT checking!). So we (arbitrarily) decide this:
          // when enabled, underscores are allowed wherever we would have
          // allowed hyphens. This is a pretty good match for browser
          // behavior: for example, a large number of browsers cannot
          // handle foo_.example.com, but foo_bar.example.com is fairly
          // well supported.
          $underscore = $config->get('Core.AllowHostnameUnderscore') ? '_' : '';

          // Based off of RFC 1738, but amended so that
          // as per RFC 3696, the top label need only not be all numeric.
          // The productions describing this are:
          $an = '[a-z0-9]'; // alphanum
          $and = "[a-z0-9-$underscore]"; // alphanum | "-"
          // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
          $domainlabel = "$an(?:$and*$an)?";
          // AMENDED as per RFC 3696
          // toplabel    = alphanum | alphanum *( alphanum | "-" ) alphanum
          //      side condition: not all numeric
          $toplabel = "$an(?:$and*$an)?";
          // hostname    = *( domainlabel "." ) toplabel [ "." ]
          // The D modifier makes "$" match only at the very end of the
          // subject; without it a host followed by a newline would match
          // and be returned with the newline still attached.
          if (preg_match("/^(?:$domainlabel\.)*($toplabel)\.?$/iD", $string, $matches)) {
              if (!ctype_digit($matches[1])) {
                  return $string;
              }
          }

          // PHP 5.3 and later support this functionality natively
          if (function_exists('idn_to_ascii')) {
              // Ask for UTS #46 processing whenever the extension offers
              // it: relying on the default variant raises a deprecation
              // notice on PHP 7.2 and 7.3.
              if (defined('IDNA_NONTRANSITIONAL_TO_ASCII') && defined('INTL_IDNA_VARIANT_UTS46')) {
                  $string = idn_to_ascii($string, IDNA_NONTRANSITIONAL_TO_ASCII, INTL_IDNA_VARIANT_UTS46);
              } else {
                  $string = idn_to_ascii($string);
              }
              if ($string === false) {
                  // conversion failed, so this cannot be a usable host
                  return false;
              }

              // If we have Net_IDNA2 support, we can support IRIs by
              // punycoding them. (This is the most portable thing to do,
              // since otherwise we have to assume browsers support
          } elseif ($config->get('Core.EnableIDNA')) {
              $idna = new Net_IDNA2(array('encoding' => 'utf8', 'overlong' => false, 'strict' => true));
              // we need to encode each period separately
              $parts = explode('.', $string);
              try {
                  $new_parts = array();
                  foreach ($parts as $part) {
                      // only labels containing a byte above 0x7a are encoded
                      $encodable = false;
                      for ($i = 0, $c = strlen($part); $i < $c; $i++) {
                          if (ord($part[$i]) > 0x7a) {
                              $encodable = true;
                              break;
                          }
                      }
                      if (!$encodable) {
                          $new_parts[] = $part;
                      } else {
                          $new_parts[] = $idna->encode($part);
                      }
                  }
                  $string = implode('.', $new_parts);
              } catch (Exception $e) {
                  // XXX error reporting
              }
          }

          // Try again
          // NOTE(review): this retry does not re-apply the "not all
          // numeric" side condition on the top label, so an all-numeric
          // name skipped above is still accepted here - confirm whether
          // that is intended.
          if (preg_match("/^($domainlabel\.)*$toplabel\.?$/iD", $string)) {
              return $string;
          }
          return false;
      }
  }
  /**
   * Validates an IPv4 address
   * @author Feyd @ forums.devnetwork.net (public domain)
   */
  class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef
  {
      /**
       * IPv4 regex, protected so that IPv6 can reuse it.
       * Built lazily by _loadRegex(); unset until the first validation.
       * @type string
       */
      protected $ip4;

      /**
       * Accepts the input only when the whole string is a dotted-quad
       * IPv4 address with every octet in the range 0-255.
       *
       * @param string $aIP
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string The input unchanged on success, false otherwise.
       */
      public function validate($aIP, $config, $context)
      {
          if (!$this->ip4) {
              $this->_loadRegex();
          }
          // The D modifier pins "$" to the very end of the subject. Without
          // it PCRE also lets "$" match just before a final newline, so
          // "1.2.3.4\n" would be accepted and returned with the newline.
          if (preg_match('#^' . $this->ip4 . '$#sD', $aIP)) {
              return $aIP;
          }
          return false;
      }

      /**
       * Lazy load function to prevent regex from being stuffed in
       * cache.
       */
      protected function _loadRegex()
      {
          // One octet: alternatives ordered from 250-255 down to a single digit.
          $oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255
          $this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})";
      }
  }
  /**
   * Validates an IPv6 address.
   * @author Feyd @ forums.devnetwork.net (public domain)
   * @note This function requires brackets to have been removed from address
   *       in URI.
   */
  class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4
  {
      /**
       * Validates the textual form of an IPv6 address, optionally followed
       * by a "/0" .. "/128" prefix length and optionally ending in an
       * embedded dotted-quad IPv4 address.
       *
       * @param string $aIP
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string The original input on success, false otherwise.
       */
      public function validate($aIP, $config, $context)
      {
          if (!$this->ip4) {
              $this->_loadRegex();
          }

          $original = $aIP;
          $pre = '(?:/(?:12[0-8]|1[0-1][0-9]|[1-9][0-9]|[0-9]))'; // /0 - /128

          // All regexes below carry the D modifier so that "$" cannot match
          // in front of a trailing newline; otherwise an address followed by
          // "\n" would validate and be returned with the newline intact.

          // prefix check
          if (strpos($aIP, '/') !== false) {
              if (!preg_match('#' . $pre . '$#sD', $aIP, $find)) {
                  return false;
              }
              $aIP = substr($aIP, 0, 0 - strlen($find[0]));
              unset($find);
          }

          // IPv4-compatibility check: a trailing dotted quad (preceded by a
          // colon) is rewritten as two hex groups. The digits are simply
          // concatenated, which is sufficient for validation because only
          // the group count and hex-ness are inspected afterwards.
          if (preg_match('#(?<=:' . ')' . $this->ip4 . '$#sD', $aIP, $find)) {
              $aIP = substr($aIP, 0, 0 - strlen($find[0]));
              $ip = explode('.', $find[0]);
              $ip = array_map('dechex', $ip);
              $aIP .= $ip[0] . $ip[1] . ':' . $ip[2] . $ip[3];
              unset($find, $ip);
          }

          // compression check
          $aIP = explode('::', $aIP);
          $c = count($aIP);
          if ($c > 2) {
              // more than one "::" is never valid
              return false;
          } elseif ($c == 2) {
              list($first, $second) = $aIP;
              $first = explode(':', $first);
              $second = explode(':', $second);
              // A side of the "::" may be entirely empty (a single '' piece),
              // but a side with several pieces must not contain an empty one:
              // that means a stray single colon, e.g. ":1::2" or ":::".
              if ((count($first) > 1 && in_array('', $first, true))
                  || (count($second) > 1 && in_array('', $second, true))
              ) {
                  return false;
              }
              if (count($first) + count($second) > 8) {
                  return false;
              }
              // Expand the compressed run: pad the left side to 8 groups,
              // then overwrite the tail with the right side.
              while (count($first) < 8) {
                  array_push($first, '0');
              }
              array_splice($first, 8 - count($second), 8, $second);
              $aIP = $first;
              unset($first, $second);
          } else {
              $aIP = explode(':', $aIP[0]);
              // Without "::" every group must be present; an empty piece
              // comes from a leading or trailing single colon.
              if (in_array('', $aIP, true)) {
                  return false;
              }
          }

          if (count($aIP) != 8) {
              return false;
          }

          // All the pieces should be 16-bit hex strings. Are they?
          // Pieces are left-padded with zeros to four characters; longer
          // pieces are not truncated and therefore fail the {4} match.
          foreach ($aIP as $piece) {
              if (!preg_match('#^[0-9a-fA-F]{4}$#sD', sprintf('%04s', $piece))) {
                  return false;
              }
          }
          return $original;
      }
  }
  /**
   * Primitive email validation class based on the regexp found at
   * http://www.regular-expressions.info/email.html
   */
  class HTMLPurifier_AttrDef_URI_Email_SimpleCheck extends HTMLPurifier_AttrDef_URI_Email
  {
      /**
       * Checks a bare "local@domain.tld" address against a simple pattern.
       *
       * @param string $string
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool|string The trimmed address on success, false otherwise.
       */
      public function validate($string, $config, $context)
      {
          // no support for named mailboxes i.e. "Bob <bob@example.com>"
          // that needs more percent encoding to be done
          if ($string == '') {
              return false;
          }
          $string = trim($string);
          // The top-level label was previously limited to 2-4 letters, which
          // rejects valid addresses under longer TLDs such as ".museum" or
          // ".email". A DNS label may be up to 63 characters long, so that is
          // used as the upper bound. The TLD remains letters-only, so
          // punycode ("xn--") TLDs are still not accepted by this check.
          $result = preg_match('/^[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,63}$/i', $string);
          return $result ? $string : false;
      }
  }
  /**
   * Pre-transform that moves the proprietary background attribute into the
   * style attribute as a CSS background-image declaration.
   */
  class HTMLPurifier_AttrTransform_Background extends HTMLPurifier_AttrTransform
  {
      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array Attributes, with background converted when it was set.
       */
      public function transform($attr, $config, $context)
      {
          if (isset($attr['background'])) {
              // The value is taken as-is through the inherited
              // confiscateAttr() helper; no validation is done here.
              $url = $this->confiscateAttr($attr, 'background');
              $this->prependCSS($attr, 'background-image:url(' . $url . ');');
          }
          return $attr;
      }
  }
  // this MUST be placed in post, as it assumes that any value in dir is valid
  /**
   * Post-transform that ensures that bdo tags have the dir attribute set.
   */
  class HTMLPurifier_AttrTransform_BdoDir extends HTMLPurifier_AttrTransform
  {
      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array Attributes, with dir filled in from
       *               Attr.DefaultTextDir when it was missing.
       */
      public function transform($attr, $config, $context)
      {
          if (!isset($attr['dir'])) {
              $attr['dir'] = $config->get('Attr.DefaultTextDir');
          }
          return $attr;
      }
  }
  /**
   * Pre-transform that moves the deprecated bgcolor attribute into the
   * style attribute as a CSS background-color declaration.
   */
  class HTMLPurifier_AttrTransform_BgColor extends HTMLPurifier_AttrTransform
  {
      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array Attributes, with bgcolor converted when it was set.
       */
      public function transform($attr, $config, $context)
      {
          if (isset($attr['bgcolor'])) {
              // The colour value is passed through untouched; no validation
              // is done here.
              $colour = $this->confiscateAttr($attr, 'bgcolor');
              $this->prependCSS($attr, 'background-color:' . $colour . ';');
          }
          return $attr;
      }
  }
  /**
   * Pre-transform that converts a boolean attribute to fixed CSS.
   */
  class HTMLPurifier_AttrTransform_BoolToCSS extends HTMLPurifier_AttrTransform
  {
      /**
       * Name of the boolean attribute whose presence triggers the transform.
       * @type string
       */
      protected $attr;

      /**
       * CSS declarations to add to style; needs a trailing semicolon.
       * @type string
       */
      protected $css;

      /**
       * @param string $attr attribute name to convert from
       * @param string $css CSS declarations to add to style (needs semicolon)
       */
      public function __construct($attr, $css)
      {
          $this->css = $css;
          $this->attr = $attr;
      }

      /**
       * Drops the trigger attribute and prepends the configured CSS.
       *
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          $trigger = $this->attr;
          if (isset($attr[$trigger])) {
              unset($attr[$trigger]);
              $this->prependCSS($attr, $this->css);
          }
          return $attr;
      }
  }
  /**
   * Pre-transform that moves the deprecated border attribute into the
   * style attribute as a solid CSS border with a pixel width.
   */
  class HTMLPurifier_AttrTransform_Border extends HTMLPurifier_AttrTransform
  {
      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array Attributes, with border converted when it was set.
       */
      public function transform($attr, $config, $context)
      {
          if (isset($attr['border'])) {
              // The width is used verbatim; no validation is done here.
              $width = $this->confiscateAttr($attr, 'border');
              $this->prependCSS($attr, 'border:' . $width . 'px solid;');
          }
          return $attr;
      }
  }
  /**
   * Generic pre-transform that converts an attribute with a fixed number of
   * values (enumerated) to CSS.
   */
  class HTMLPurifier_AttrTransform_EnumToCSS extends HTMLPurifier_AttrTransform
  {
      /**
       * Name of attribute to transform from.
       * @type string
       */
      protected $attr;

      /**
       * Lookup array of attribute values to CSS.
       * @type array
       */
      protected $enumToCSS = array();

      /**
       * Case sensitivity of the matching.
       * @type bool
       * @warning Currently can only be guaranteed to work with ASCII
       *          values.
       */
      protected $caseSensitive = false;

      /**
       * @param string $attr Attribute name to transform from
       * @param array $enum_to_css Lookup array of attribute values to CSS
       * @param bool $case_sensitive Case sensitivity indicator, default false
       */
      public function __construct($attr, $enum_to_css, $case_sensitive = false)
      {
          $this->caseSensitive = (bool)$case_sensitive;
          $this->enumToCSS = $enum_to_css;
          $this->attr = $attr;
      }

      /**
       * Removes the source attribute and, when its trimmed value is a known
       * key of the lookup table, prepends the mapped CSS.
       *
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          $source = $this->attr;
          if (!isset($attr[$source])) {
              return $attr;
          }

          $key = trim($attr[$source]);
          // The attribute is removed whether or not its value is recognised.
          unset($attr[$source]);

          if (!$this->caseSensitive) {
              $key = strtolower($key);
          }

          if (isset($this->enumToCSS[$key])) {
              $this->prependCSS($attr, $this->enumToCSS[$key]);
          }
          return $attr;
      }
  }
  // must be called POST validation
  /**
   * Transform that supplies default values for the src and alt attributes
   * in img tags, as well as prevents the img tag from being removed
   * because of a missing alt tag. This needs to be registered as both
   * a pre and post attribute transform.
   */
  class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
  {
      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          // Remember whether the image arrived with its own src; the alt
          // fallback below depends on it.
          $hadSrc = isset($attr['src']);

          if (!$hadSrc) {
              if ($config->get('Core.RemoveInvalidImg')) {
                  // Leave the attributes untouched in this configuration.
                  return $attr;
              }
              $attr['src'] = $config->get('Attr.DefaultInvalidImage');
          }

          if (isset($attr['alt'])) {
              return $attr;
          }

          if (!$hadSrc) {
              $attr['alt'] = $config->get('Attr.DefaultInvalidImageAlt');
              return $attr;
          }

          // Genuine src without alt: use the configured default, or fall
          // back to the basename of src when that default is null.
          $defaultAlt = $config->get('Attr.DefaultImageAlt');
          $attr['alt'] = ($defaultAlt === null) ? basename($attr['src']) : $defaultAlt;
          return $attr;
      }
  }
  /**
   * Pre-transform that changes deprecated hspace and vspace attributes to CSS
   * margins.
   */
  class HTMLPurifier_AttrTransform_ImgSpace extends HTMLPurifier_AttrTransform
  {
      /**
       * Name of the attribute this instance converts.
       * @type string
       */
      protected $attr;

      /**
       * Margin sides affected by each supported attribute.
       * @type array
       */
      protected $css = array(
          'hspace' => array('left', 'right'),
          'vspace' => array('top', 'bottom')
      );

      /**
       * Triggers an error (but still constructs) when the attribute name is
       * not one of the keys of $css.
       *
       * @param string $attr
       */
      public function __construct($attr)
      {
          $this->attr = $attr;
          $supported = isset($this->css[$attr]);
          if (!$supported) {
              trigger_error(htmlspecialchars($attr) . ' is not valid space attribute');
          }
      }

      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          $source = $this->attr;
          if (!isset($attr[$source])) {
              return $attr;
          }

          // The attribute is removed even when it turns out to be
          // unsupported; its value is not validated here.
          $width = $this->confiscateAttr($attr, $source);
          if (!isset($this->css[$source])) {
              return $attr;
          }

          // One "margin-<side>:<width>px;" declaration per affected side.
          $declarations = array();
          foreach ($this->css[$source] as $side) {
              $declarations[] = 'margin-' . $side . ':' . $width . 'px;';
          }
          $this->prependCSS($attr, implode('', $declarations));
          return $attr;
      }
  }
  /**
   * Performs miscellaneous cross attribute validation and filtering for
   * input elements. This is meant to be a post-transform.
   */
  class HTMLPurifier_AttrTransform_Input extends HTMLPurifier_AttrTransform
  {
      /**
       * Validator applied to size on inputs that are not text/password.
       * @type HTMLPurifier_AttrDef_HTML_Pixels
       */
      protected $pixels;

      public function __construct()
      {
          $this->pixels = new HTMLPurifier_AttrDef_HTML_Pixels();
      }

      /**
       * Removes or normalises attributes that do not apply to the input's
       * type. A missing type is treated as "text".
       *
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          $type = isset($attr['type']) ? strtolower($attr['type']) : 'text';
          $isTextual = ($type === 'text' || $type === 'password');
          $isToggle = ($type === 'radio' || $type === 'checkbox');

          // checked is only kept on radio buttons and checkboxes
          if (!$isToggle && isset($attr['checked'])) {
              unset($attr['checked']);
          }

          // maxlength is only kept on text and password fields
          if (!$isTextual && isset($attr['maxlength'])) {
              unset($attr['maxlength']);
          }

          // on every other type, size is run through the pixels validator
          if (!$isTextual && isset($attr['size'])) {
              $validated = $this->pixels->validate($attr['size'], $config, $context);
              if ($validated === false) {
                  unset($attr['size']);
              } else {
                  $attr['size'] = $validated;
              }
          }

          // src is only kept on image inputs
          if ($type !== 'image' && isset($attr['src'])) {
              unset($attr['src']);
          }

          // radio buttons and checkboxes always get a value attribute
          if ($isToggle && !isset($attr['value'])) {
              $attr['value'] = '';
          }

          return $attr;
      }
  }
  /**
   * Post-transform that copies lang's value to xml:lang (and vice-versa)
   * @note Theoretically speaking, this could be a pre-transform, but putting
   *       post is more efficient.
   */
  class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform
  {
      /**
       * Keeps lang and xml:lang in sync; when both are present xml:lang wins.
       *
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          // false marks "not provided"
          $plain = isset($attr['lang']) ? $attr['lang'] : false;
          $xml = isset($attr['xml:lang']) ? $attr['xml:lang'] : false;

          if ($xml !== false) {
              $attr['lang'] = $xml;
          } elseif ($plain !== false) {
              $attr['xml:lang'] = $plain;
          }
          return $attr;
      }
  }
  /**
   * Class for handling width/height length attribute transformations to CSS
   */
  class HTMLPurifier_AttrTransform_Length extends HTMLPurifier_AttrTransform
  {
      /**
       * Name of the attribute to convert.
       * @type string
       */
      protected $name;

      /**
       * Name of the CSS property to emit.
       * @type string
       */
      protected $cssName;

      /**
       * @param string $name Attribute name to transform from
       * @param string|null $css_name CSS property name; a falsy value means
       *                              the attribute name is reused
       */
      public function __construct($name, $css_name = null)
      {
          $this->name = $name;
          if ($css_name) {
              $this->cssName = $css_name;
          } else {
              $this->cssName = $name;
          }
      }

      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          $source = $this->name;
          if (!isset($attr[$source])) {
              return $attr;
          }

          $value = $this->confiscateAttr($attr, $source);
          // Values made up solely of digits get a px unit appended;
          // anything else is emitted unchanged.
          $unit = ctype_digit($value) ? 'px' : '';
          $this->prependCSS($attr, $this->cssName . ':' . $value . $unit . ';');
          return $attr;
      }
  }
  /**
   * Pre-transform that changes deprecated name attribute to ID if necessary
   */
  class HTMLPurifier_AttrTransform_Name extends HTMLPurifier_AttrTransform
  {
      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          // Nothing to do under the relaxed definition of name, or when
          // there is no name at all.
          if ($config->get('HTML.Attr.Name.UseCDATA') || !isset($attr['name'])) {
              return $attr;
          }

          // name is always removed; it only becomes the id when no id
          // is already present.
          $candidate = $this->confiscateAttr($attr, 'name');
          if (!isset($attr['id'])) {
              $attr['id'] = $candidate;
          }
          return $attr;
      }
  }
  /**
   * Post-transform that performs validation to the name attribute; if
   * it is present with an equivalent id attribute, it is passed through;
   * otherwise validation is performed.
   */
  class HTMLPurifier_AttrTransform_NameSync extends HTMLPurifier_AttrTransform
  {
      /**
       * Validator applied to name when it does not match id.
       *
       * Declared explicitly: it used to be created implicitly by the
       * constructor as a dynamic property, which is deprecated as of
       * PHP 8.2. It stays public, matching the visibility the dynamic
       * property had.
       *
       * @type HTMLPurifier_AttrDef_HTML_ID
       */
      public $idDef;

      public function __construct()
      {
          $this->idDef = new HTMLPurifier_AttrDef_HTML_ID();
      }

      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          if (!isset($attr['name'])) {
              return $attr;
          }
          $name = $attr['name'];
          // An id identical to name is taken as proof that name is fine.
          if (isset($attr['id']) && $attr['id'] === $name) {
              return $attr;
          }
          // Otherwise name must pass the ID validator, or it is dropped.
          $result = $this->idDef->validate($name, $config, $context);
          if ($result === false) {
              unset($attr['name']);
          } else {
              $attr['name'] = $result;
          }
          return $attr;
      }
  }
  // must be called POST validation
  /**
   * Adds rel="nofollow" to all outbound links. This transform is
   * only attached if Attr.Nofollow is TRUE.
   */
  class HTMLPurifier_AttrTransform_Nofollow extends HTMLPurifier_AttrTransform
  {
      /**
       * Parser used to inspect the href of each link.
       * @type HTMLPurifier_URIParser
       */
      private $parser;

      public function __construct()
      {
          $this->parser = new HTMLPurifier_URIParser();
      }

      /**
       * Appends the nofollow token to rel for links whose scheme is
       * browsable and whose URI is not local.
       *
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          if (!isset($attr['href'])) {
              return $attr;
          }

          // XXX Kind of inefficient
          // NOTE(review): assumes parse() and getSchemeObj() both return
          // objects for an already-validated href - confirm against callers.
          $uri = $this->parser->parse($attr['href']);
          $schemeObj = $uri->getSchemeObj($config, $context);

          if (!$schemeObj->browsable || $uri->isLocal($config, $context)) {
              return $attr;
          }

          if (!isset($attr['rel'])) {
              $attr['rel'] = 'nofollow';
              return $attr;
          }

          // Keep whatever tokens were already there; add ours once.
          $tokens = explode(' ', $attr['rel']);
          if (!in_array('nofollow', $tokens)) {
              $tokens[] = 'nofollow';
          }
          $attr['rel'] = implode(' ', $tokens);
          return $attr;
      }
  }
  /**
   * Forces fixed values for the allowscriptaccess, allownetworking and
   * type attributes, overwriting whatever was supplied.
   */
  class HTMLPurifier_AttrTransform_SafeEmbed extends HTMLPurifier_AttrTransform
  {
      /**
       * @type string
       */
      public $name = "SafeEmbed";

      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          $forced = array(
              'allowscriptaccess' => 'never',
              'allownetworking' => 'internal',
              'type' => 'application/x-shockwave-flash',
          );
          foreach ($forced as $key => $value) {
              $attr[$key] = $value;
          }
          return $attr;
      }
  }
  /**
   * Writes default type for all objects. Currently only supports flash.
   */
  class HTMLPurifier_AttrTransform_SafeObject extends HTMLPurifier_AttrTransform
  {
      /**
       * @type string
       */
      public $name = "SafeObject";

      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array Attributes, with type defaulted to the Flash MIME
       *               type when it was missing.
       */
      public function transform($attr, $config, $context)
      {
          if (isset($attr['type'])) {
              return $attr;
          }
          $attr['type'] = 'application/x-shockwave-flash';
          return $attr;
      }
  }
  /**
   * Validates name/value pairs in param tags to be used in safe objects. This
   * will only allow name values it recognizes, and pre-fill certain attributes
   * with required values.
   *
   * @note
   *      This class only supports Flash. In the future, Quicktime support
   *      may be added.
   *
   * @warning
   *      This class expects an injector to add the necessary parameters tags.
   */
  class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform
  {
      /**
       * @type string
       */
      public $name = "SafeParam";

      /**
       * Validator for the movie/src value (constructed in embedded mode).
       * @type HTMLPurifier_AttrDef_URI
       */
      private $uri;

      /**
       * Validator for the wmode value.
       *
       * Declared explicitly: it used to be created implicitly by the
       * constructor as a dynamic property, which is deprecated as of
       * PHP 8.2. It stays public, matching the visibility the dynamic
       * property had.
       *
       * @type HTMLPurifier_AttrDef_Enum
       */
      public $wmode;

      public function __construct()
      {
          $this->uri = new HTMLPurifier_AttrDef_URI(true); // embedded
          $this->wmode = new HTMLPurifier_AttrDef_Enum(array('window', 'opaque', 'transparent'));
      }

      /**
       * Forces, validates or discards a param's value according to its name.
       * Unrecognised names have both name and value set to null.
       *
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          // Read both keys defensively: a param without name or value used
          // to raise undefined-index notices. A missing key is treated as
          // null, which is what the notice path evaluated to as well.
          $name = isset($attr['name']) ? $attr['name'] : null;
          $value = isset($attr['value']) ? $attr['value'] : null;

          // If we add support for other objects, we'll need to alter the
          // transforms.
          switch ($name) {
              // application/x-shockwave-flash
              // Keep this synchronized with Injector/SafeObject.php
              case 'allowScriptAccess':
                  $attr['value'] = 'never';
                  break;
              case 'allowNetworking':
                  $attr['value'] = 'internal';
                  break;
              case 'allowFullScreen':
                  if ($config->get('HTML.FlashAllowFullScreen')) {
                      $attr['value'] = ($value == 'true') ? 'true' : 'false';
                  } else {
                      $attr['value'] = 'false';
                  }
                  break;
              case 'wmode':
                  $attr['value'] = $this->wmode->validate($value, $config, $context);
                  break;
              case 'movie':
              case 'src':
                  // both spellings are normalised to "movie"
                  $attr['name'] = "movie";
                  $attr['value'] = $this->uri->validate($value, $config, $context);
                  break;
              case 'flashvars':
                  // we're going to allow arbitrary inputs to the SWF, on
                  // the reasoning that it could only hack the SWF, not us.
                  break;
              // add other cases to support other param name/value pairs
              default:
                  $attr['name'] = $attr['value'] = null;
          }
          return $attr;
      }
  }
  /**
   * Implements required attribute stipulation for <script>
   */
  class HTMLPurifier_AttrTransform_ScriptRequired extends HTMLPurifier_AttrTransform
  {
      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array Attributes, with type defaulted to text/javascript
       *               when it was missing.
       */
      public function transform($attr, $config, $context)
      {
          if (isset($attr['type'])) {
              return $attr;
          }
          $attr['type'] = 'text/javascript';
          return $attr;
      }
  }
  // must be called POST validation
  /**
   * Adds target="_blank" to all outbound links. This transform is
   * only attached if Attr.TargetBlank is TRUE. According to the original
   * note, this works regardless of the Attr.AllowedFrameTargets setting.
   */
  class HTMLPurifier_AttrTransform_TargetBlank extends HTMLPurifier_AttrTransform
  {
      /**
       * Parser used to inspect the href of each link.
       * @type HTMLPurifier_URIParser
       */
      private $parser;

      public function __construct()
      {
          $this->parser = new HTMLPurifier_URIParser();
      }

      /**
       * Sets target to "_blank" for links whose scheme is browsable and
       * whose URI is not reported as benign.
       *
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          if (!isset($attr['href'])) {
              return $attr;
          }

          // XXX Kind of inefficient
          // NOTE(review): assumes parse() and getSchemeObj() both return
          // objects for an already-validated href - confirm against callers.
          $uri = $this->parser->parse($attr['href']);
          $schemeObj = $uri->getSchemeObj($config, $context);

          if (!$schemeObj->browsable || $uri->isBenign($config, $context)) {
              return $attr;
          }

          $attr['target'] = '_blank';
          return $attr;
      }
  }
  // must be called POST validation
  /**
   * Adds rel="noopener" to any links which target a different window
   * than the current one. This is used to prevent malicious websites
   * from silently replacing the original window, which could be used
   * to do phishing.
   * This transform is controlled by %HTML.TargetNoopener.
   */
  class HTMLPurifier_AttrTransform_TargetNoopener extends HTMLPurifier_AttrTransform
  {
      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          $hadRel = isset($attr['rel']);
          $tokens = $hadRel ? explode(' ', $attr['rel']) : array();

          // Any link carrying a target attribute gets the token, once.
          if (isset($attr['target']) && !in_array('noopener', $tokens)) {
              $tokens[] = 'noopener';
          }

          // rel is written back when it existed before or now has tokens.
          if ($hadRel || !empty($tokens)) {
              $attr['rel'] = implode(' ', $tokens);
          }
          return $attr;
      }
  }
  // must be called POST validation
  /**
   * Adds rel="noreferrer" to any link that carries a target attribute,
   * keeping the rel tokens that were already present.
   * This transform is controlled by %HTML.TargetNoreferrer.
   */
  class HTMLPurifier_AttrTransform_TargetNoreferrer extends HTMLPurifier_AttrTransform
  {
      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array
       */
      public function transform($attr, $config, $context)
      {
          $hadRel = isset($attr['rel']);
          $tokens = $hadRel ? explode(' ', $attr['rel']) : array();

          // Any link carrying a target attribute gets the token, once.
          if (isset($attr['target']) && !in_array('noreferrer', $tokens)) {
              $tokens[] = 'noreferrer';
          }

          // rel is written back when it existed before or now has tokens.
          if ($hadRel || !empty($tokens)) {
              $attr['rel'] = implode(' ', $tokens);
          }
          return $attr;
      }
  }
  /**
   * Sets height/width defaults for <textarea>
   */
  class HTMLPurifier_AttrTransform_Textarea extends HTMLPurifier_AttrTransform
  {
      /**
       * @param array $attr
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array Attributes, with cols and rows filled in when missing.
       */
      public function transform($attr, $config, $context)
      {
          // Calculated from Firefox
          $defaults = array('cols' => '22', 'rows' => '3');
          foreach ($defaults as $key => $fallback) {
              if (!isset($attr[$key])) {
                  $attr[$key] = $fallback;
              }
          }
          return $attr;
      }
  }
  /**
   * Definition that uses different definitions depending on context.
   *
   * The del and ins tags are notable because they allow different types of
   * elements depending on whether or not they're in a block or inline context.
   * Chameleon allows this behavior to happen by using two different
   * definitions depending on context. While this is somewhat generalized,
   * it is specifically intended for those two tags.
   */
  class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
  {
      /**
       * Instance of the definition object to use when inline. Usually stricter.
       * @type HTMLPurifier_ChildDef_Optional
       */
      public $inline;

      /**
       * Instance of the definition object to use when block.
       * @type HTMLPurifier_ChildDef_Optional
       */
      public $block;

      /**
       * @type string
       */
      public $type = 'chameleon';

      /**
       * @param array $inline List of elements to allow when inline.
       * @param array $block List of elements to allow when block.
       */
      public function __construct($inline, $block)
      {
          $this->inline = new HTMLPurifier_ChildDef_Optional($inline);
          $this->block = new HTMLPurifier_ChildDef_Optional($block);
          // The advertised element set is taken from the block definition.
          $this->elements = $this->block->elements;
      }

      /**
       * Delegates to the block definition only when the context's IsInline
       * value is exactly false; every other value selects the inline one.
       *
       * @param HTMLPurifier_Node[] $children
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return bool
       */
      public function validateChildren($children, $config, $context)
      {
          $delegate = ($context->get('IsInline') === false) ? $this->block : $this->inline;
          return $delegate->validateChildren($children, $config, $context);
      }
  }
  12753. /**
  12754. * Custom validation class, accepts DTD child definitions
  12755. *
  12756. * @warning Currently this class is an all or nothing proposition, that is,
  12757. * it will only give a bool return value.
  12758. */
  12759. class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
  12760. {
  12761. /**
  12762. * @type string
  12763. */
  12764. public $type = 'custom';
  12765. /**
  12766. * @type bool
  12767. */
  12768. public $allow_empty = false;
  12769. /**
  12770. * Allowed child pattern as defined by the DTD.
  12771. * @type string
  12772. */
  12773. public $dtd_regex;
  12774. /**
  12775. * PCRE regex derived from $dtd_regex.
  12776. * @type string
  12777. */
  12778. private $_pcre_regex;
  12779. /**
  12780. * @param $dtd_regex Allowed child pattern from the DTD
  12781. */
  12782. public function __construct($dtd_regex)
  12783. {
  12784. $this->dtd_regex = $dtd_regex;
  12785. $this->_compileRegex();
  12786. }
  12787. /**
  12788. * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
  12789. */
  12790. protected function _compileRegex()
  12791. {
  12792. $raw = str_replace(' ', '', $this->dtd_regex);
  12793. if ($raw{0} != '(') {
  12794. $raw = "($raw)";
  12795. }
  12796. $el = '[#a-zA-Z0-9_.-]+';
  12797. $reg = $raw;
  12798. // COMPLICATED! AND MIGHT BE BUGGY! I HAVE NO CLUE WHAT I'M
  12799. // DOING! Seriously: if there's problems, please report them.
  12800. // collect all elements into the $elements array
  12801. preg_match_all("/$el/", $reg, $matches);
  12802. foreach ($matches[0] as $match) {
  12803. $this->elements[$match] = true;
  12804. }
  12805. // setup all elements as parentheticals with leading commas
  12806. $reg = preg_replace("/$el/", '(,\\0)', $reg);
  12807. // remove commas when they were not solicited
  12808. $reg = preg_replace("/([^,(|]\(+),/", '\\1', $reg);
  12809. // remove all non-paranthetical commas: they are handled by first regex
  12810. $reg = preg_replace("/,\(/", '(', $reg);
  12811. $this->_pcre_regex = $reg;
  12812. }
  12813. /**
  12814. * @param HTMLPurifier_Node[] $children
  12815. * @param HTMLPurifier_Config $config
  12816. * @param HTMLPurifier_Context $context
  12817. * @return bool
  12818. */
  12819. public function validateChildren($children, $config, $context)
  12820. {
  12821. $list_of_children = '';
  12822. $nesting = 0; // depth into the nest
  12823. foreach ($children as $node) {
  12824. if (!empty($node->is_whitespace)) {
  12825. continue;
  12826. }
  12827. $list_of_children .= $node->name . ',';
  12828. }
  12829. // add leading comma to deal with stray comma declarations
  12830. $list_of_children = ',' . rtrim($list_of_children, ',');
  12831. $okay =
  12832. preg_match(
  12833. '/^,?' . $this->_pcre_regex . '$/',
  12834. $list_of_children
  12835. );
  12836. return (bool)$okay;
  12837. }
  12838. }
  12839. /**
  12840. * Definition that disallows all elements.
  12841. * @warning validateChildren() in this class is actually never called, because
  12842. * empty elements are corrected in HTMLPurifier_Strategy_MakeWellFormed
  12843. * before child definitions are parsed in earnest by
  12844. * HTMLPurifier_Strategy_FixNesting.
  12845. */
  12846. class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
  12847. {
  12848. /**
  12849. * @type bool
  12850. */
  12851. public $allow_empty = true;
  12852. /**
  12853. * @type string
  12854. */
  12855. public $type = 'empty';
  12856. public function __construct()
  12857. {
  12858. }
  12859. /**
  12860. * @param HTMLPurifier_Node[] $children
  12861. * @param HTMLPurifier_Config $config
  12862. * @param HTMLPurifier_Context $context
  12863. * @return array
  12864. */
  12865. public function validateChildren($children, $config, $context)
  12866. {
  12867. return array();
  12868. }
  12869. }
  12870. /**
  12871. * Definition for list containers ul and ol.
  12872. *
  12873. * What does this do? The big thing is to handle ol/ul at the top
  12874. * level of list nodes, which should be handled specially by /folding/
  12875. * them into the previous list node. We generally shouldn't ever
  12876. * see other disallowed elements, because the autoclose behavior
  12877. * in MakeWellFormed handles it.
  12878. */
  12879. class HTMLPurifier_ChildDef_List extends HTMLPurifier_ChildDef
  12880. {
  12881. /**
  12882. * @type string
  12883. */
  12884. public $type = 'list';
  12885. /**
  12886. * @type array
  12887. */
  12888. // lying a little bit, so that we can handle ul and ol ourselves
  12889. // XXX: This whole business with 'wrap' is all a bit unsatisfactory
  12890. public $elements = array('li' => true, 'ul' => true, 'ol' => true);
  12891. /**
  12892. * @param array $children
  12893. * @param HTMLPurifier_Config $config
  12894. * @param HTMLPurifier_Context $context
  12895. * @return array
  12896. */
  12897. public function validateChildren($children, $config, $context)
  12898. {
  12899. // Flag for subclasses
  12900. $this->whitespace = false;
  12901. // if there are no tokens, delete parent node
  12902. if (empty($children)) {
  12903. return false;
  12904. }
  12905. // if li is not allowed, delete parent node
  12906. if (!isset($config->getHTMLDefinition()->info['li'])) {
  12907. trigger_error("Cannot allow ul/ol without allowing li", E_USER_WARNING);
  12908. return false;
  12909. }
  12910. // the new set of children
  12911. $result = array();
  12912. // a little sanity check to make sure it's not ALL whitespace
  12913. $all_whitespace = true;
  12914. $current_li = null;
  12915. foreach ($children as $node) {
  12916. if (!empty($node->is_whitespace)) {
  12917. $result[] = $node;
  12918. continue;
  12919. }
  12920. $all_whitespace = false; // phew, we're not talking about whitespace
  12921. if ($node->name === 'li') {
  12922. // good
  12923. $current_li = $node;
  12924. $result[] = $node;
  12925. } else {
  12926. // we want to tuck this into the previous li
  12927. // Invariant: we expect the node to be ol/ul
  12928. // ToDo: Make this more robust in the case of not ol/ul
  12929. // by distinguishing between existing li and li created
  12930. // to handle non-list elements; non-list elements should
  12931. // not be appended to an existing li; only li created
  12932. // for non-list. This distinction is not currently made.
  12933. if ($current_li === null) {
  12934. $current_li = new HTMLPurifier_Node_Element('li');
  12935. $result[] = $current_li;
  12936. }
  12937. $current_li->children[] = $node;
  12938. $current_li->empty = false; // XXX fascinating! Check for this error elsewhere ToDo
  12939. }
  12940. }
  12941. if (empty($result)) {
  12942. return false;
  12943. }
  12944. if ($all_whitespace) {
  12945. return false;
  12946. }
  12947. return $result;
  12948. }
  12949. }
  12950. /**
  12951. * Definition that allows a set of elements, but disallows empty children.
  12952. */
  12953. class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
  12954. {
  12955. /**
  12956. * Lookup table of allowed elements.
  12957. * @type array
  12958. */
  12959. public $elements = array();
  12960. /**
  12961. * Whether or not the last passed node was all whitespace.
  12962. * @type bool
  12963. */
  12964. protected $whitespace = false;
  12965. /**
  12966. * @param array|string $elements List of allowed element names (lowercase).
  12967. */
  12968. public function __construct($elements)
  12969. {
  12970. if (is_string($elements)) {
  12971. $elements = str_replace(' ', '', $elements);
  12972. $elements = explode('|', $elements);
  12973. }
  12974. $keys = array_keys($elements);
  12975. if ($keys == array_keys($keys)) {
  12976. $elements = array_flip($elements);
  12977. foreach ($elements as $i => $x) {
  12978. $elements[$i] = true;
  12979. if (empty($i)) {
  12980. unset($elements[$i]);
  12981. } // remove blank
  12982. }
  12983. }
  12984. $this->elements = $elements;
  12985. }
  12986. /**
  12987. * @type bool
  12988. */
  12989. public $allow_empty = false;
  12990. /**
  12991. * @type string
  12992. */
  12993. public $type = 'required';
  12994. /**
  12995. * @param array $children
  12996. * @param HTMLPurifier_Config $config
  12997. * @param HTMLPurifier_Context $context
  12998. * @return array
  12999. */
  13000. public function validateChildren($children, $config, $context)
  13001. {
  13002. // Flag for subclasses
  13003. $this->whitespace = false;
  13004. // if there are no tokens, delete parent node
  13005. if (empty($children)) {
  13006. return false;
  13007. }
  13008. // the new set of children
  13009. $result = array();
  13010. // whether or not parsed character data is allowed
  13011. // this controls whether or not we silently drop a tag
  13012. // or generate escaped HTML from it
  13013. $pcdata_allowed = isset($this->elements['#PCDATA']);
  13014. // a little sanity check to make sure it's not ALL whitespace
  13015. $all_whitespace = true;
  13016. $stack = array_reverse($children);
  13017. while (!empty($stack)) {
  13018. $node = array_pop($stack);
  13019. if (!empty($node->is_whitespace)) {
  13020. $result[] = $node;
  13021. continue;
  13022. }
  13023. $all_whitespace = false; // phew, we're not talking about whitespace
  13024. if (!isset($this->elements[$node->name])) {
  13025. // special case text
  13026. // XXX One of these ought to be redundant or something
  13027. if ($pcdata_allowed && $node instanceof HTMLPurifier_Node_Text) {
  13028. $result[] = $node;
  13029. continue;
  13030. }
  13031. // spill the child contents in
  13032. // ToDo: Make configurable
  13033. if ($node instanceof HTMLPurifier_Node_Element) {
  13034. for ($i = count($node->children) - 1; $i >= 0; $i--) {
  13035. $stack[] = $node->children[$i];
  13036. }
  13037. continue;
  13038. }
  13039. continue;
  13040. }
  13041. $result[] = $node;
  13042. }
  13043. if (empty($result)) {
  13044. return false;
  13045. }
  13046. if ($all_whitespace) {
  13047. $this->whitespace = true;
  13048. return false;
  13049. }
  13050. return $result;
  13051. }
  13052. }
  13053. /**
  13054. * Definition that allows a set of elements, and allows no children.
  13055. * @note This is a hack to reuse code from HTMLPurifier_ChildDef_Required,
  13056. * really, one shouldn't inherit from the other. Only altered behavior
  13057. * is to overload a returned false with an array. Thus, it will never
  13058. * return false.
  13059. */
  13060. class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
  13061. {
  13062. /**
  13063. * @type bool
  13064. */
  13065. public $allow_empty = true;
  13066. /**
  13067. * @type string
  13068. */
  13069. public $type = 'optional';
  13070. /**
  13071. * @param array $children
  13072. * @param HTMLPurifier_Config $config
  13073. * @param HTMLPurifier_Context $context
  13074. * @return array
  13075. */
  13076. public function validateChildren($children, $config, $context)
  13077. {
  13078. $result = parent::validateChildren($children, $config, $context);
  13079. // we assume that $children is not modified
  13080. if ($result === false) {
  13081. if (empty($children)) {
  13082. return true;
  13083. } elseif ($this->whitespace) {
  13084. return $children;
  13085. } else {
  13086. return array();
  13087. }
  13088. }
  13089. return $result;
  13090. }
  13091. }
  13092. /**
  13093. * Takes the contents of blockquote when in strict and reformats for validation.
  13094. */
  13095. class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Required
  13096. {
  13097. /**
  13098. * @type array
  13099. */
  13100. protected $real_elements;
  13101. /**
  13102. * @type array
  13103. */
  13104. protected $fake_elements;
  13105. /**
  13106. * @type bool
  13107. */
  13108. public $allow_empty = true;
  13109. /**
  13110. * @type string
  13111. */
  13112. public $type = 'strictblockquote';
  13113. /**
  13114. * @type bool
  13115. */
  13116. protected $init = false;
  13117. /**
  13118. * @param HTMLPurifier_Config $config
  13119. * @return array
  13120. * @note We don't want MakeWellFormed to auto-close inline elements since
  13121. * they might be allowed.
  13122. */
  13123. public function getAllowedElements($config)
  13124. {
  13125. $this->init($config);
  13126. return $this->fake_elements;
  13127. }
  13128. /**
  13129. * @param array $children
  13130. * @param HTMLPurifier_Config $config
  13131. * @param HTMLPurifier_Context $context
  13132. * @return array
  13133. */
  13134. public function validateChildren($children, $config, $context)
  13135. {
  13136. $this->init($config);
  13137. // trick the parent class into thinking it allows more
  13138. $this->elements = $this->fake_elements;
  13139. $result = parent::validateChildren($children, $config, $context);
  13140. $this->elements = $this->real_elements;
  13141. if ($result === false) {
  13142. return array();
  13143. }
  13144. if ($result === true) {
  13145. $result = $children;
  13146. }
  13147. $def = $config->getHTMLDefinition();
  13148. $block_wrap_name = $def->info_block_wrapper;
  13149. $block_wrap = false;
  13150. $ret = array();
  13151. foreach ($result as $node) {
  13152. if ($block_wrap === false) {
  13153. if (($node instanceof HTMLPurifier_Node_Text && !$node->is_whitespace) ||
  13154. ($node instanceof HTMLPurifier_Node_Element && !isset($this->elements[$node->name]))) {
  13155. $block_wrap = new HTMLPurifier_Node_Element($def->info_block_wrapper);
  13156. $ret[] = $block_wrap;
  13157. }
  13158. } else {
  13159. if ($node instanceof HTMLPurifier_Node_Element && isset($this->elements[$node->name])) {
  13160. $block_wrap = false;
  13161. }
  13162. }
  13163. if ($block_wrap) {
  13164. $block_wrap->children[] = $node;
  13165. } else {
  13166. $ret[] = $node;
  13167. }
  13168. }
  13169. return $ret;
  13170. }
  13171. /**
  13172. * @param HTMLPurifier_Config $config
  13173. */
  13174. private function init($config)
  13175. {
  13176. if (!$this->init) {
  13177. $def = $config->getHTMLDefinition();
  13178. // allow all inline elements
  13179. $this->real_elements = $this->elements;
  13180. $this->fake_elements = $def->info_content_sets['Flow'];
  13181. $this->fake_elements['#PCDATA'] = true;
  13182. $this->init = true;
  13183. }
  13184. }
  13185. }
  13186. /**
  13187. * Definition for tables. The general idea is to extract out all of the
  13188. * essential bits, and then reconstruct it later.
  13189. *
  13190. * This is a bit confusing, because the DTDs and the W3C
  13191. * validators seem to disagree on the appropriate definition. The
  13192. * DTD claims:
  13193. *
  13194. * (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+)
  13195. *
  13196. * But actually, the HTML4 spec then has this to say:
  13197. *
  13198. * The TBODY start tag is always required except when the table
  13199. * contains only one table body and no table head or foot sections.
  13200. * The TBODY end tag may always be safely omitted.
  13201. *
  13202. * So the DTD is kind of wrong. The validator is, unfortunately, kind
  13203. * of on crack.
  13204. *
  13205. * The definition changed again in XHTML1.1; and in my opinion, this
  13206. * formulation makes the most sense.
  13207. *
  13208. * caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))
  13209. *
  13210. * Essentially, we have two modes: thead/tfoot/tbody mode, and tr mode.
  13211. * If we encounter a thead, tfoot or tbody, we are placed in the former
  13212. * mode, and we *must* wrap any stray tr segments with a tbody. But if
  13213. * we don't run into any of them, just have tr tags is OK.
  13214. */
  13215. class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
  13216. {
  13217. /**
  13218. * @type bool
  13219. */
  13220. public $allow_empty = false;
  13221. /**
  13222. * @type string
  13223. */
  13224. public $type = 'table';
  13225. /**
  13226. * @type array
  13227. */
  13228. public $elements = array(
  13229. 'tr' => true,
  13230. 'tbody' => true,
  13231. 'thead' => true,
  13232. 'tfoot' => true,
  13233. 'caption' => true,
  13234. 'colgroup' => true,
  13235. 'col' => true
  13236. );
  13237. public function __construct()
  13238. {
  13239. }
  13240. /**
  13241. * @param array $children
  13242. * @param HTMLPurifier_Config $config
  13243. * @param HTMLPurifier_Context $context
  13244. * @return array
  13245. */
  13246. public function validateChildren($children, $config, $context)
  13247. {
  13248. if (empty($children)) {
  13249. return false;
  13250. }
  13251. // only one of these elements is allowed in a table
  13252. $caption = false;
  13253. $thead = false;
  13254. $tfoot = false;
  13255. // whitespace
  13256. $initial_ws = array();
  13257. $after_caption_ws = array();
  13258. $after_thead_ws = array();
  13259. $after_tfoot_ws = array();
  13260. // as many of these as you want
  13261. $cols = array();
  13262. $content = array();
  13263. $tbody_mode = false; // if true, then we need to wrap any stray
  13264. // <tr>s with a <tbody>.
  13265. $ws_accum =& $initial_ws;
  13266. foreach ($children as $node) {
  13267. if ($node instanceof HTMLPurifier_Node_Comment) {
  13268. $ws_accum[] = $node;
  13269. continue;
  13270. }
  13271. switch ($node->name) {
  13272. case 'tbody':
  13273. $tbody_mode = true;
  13274. // fall through
  13275. case 'tr':
  13276. $content[] = $node;
  13277. $ws_accum =& $content;
  13278. break;
  13279. case 'caption':
  13280. // there can only be one caption!
  13281. if ($caption !== false) break;
  13282. $caption = $node;
  13283. $ws_accum =& $after_caption_ws;
  13284. break;
  13285. case 'thead':
  13286. $tbody_mode = true;
  13287. // XXX This breaks rendering properties with
  13288. // Firefox, which never floats a <thead> to
  13289. // the top. Ever. (Our scheme will float the
  13290. // first <thead> to the top.) So maybe
  13291. // <thead>s that are not first should be
  13292. // turned into <tbody>? Very tricky, indeed.
  13293. if ($thead === false) {
  13294. $thead = $node;
  13295. $ws_accum =& $after_thead_ws;
  13296. } else {
  13297. // Oops, there's a second one! What
  13298. // should we do? Current behavior is to
  13299. // transmutate the first and last entries into
  13300. // tbody tags, and then put into content.
  13301. // Maybe a better idea is to *attach
  13302. // it* to the existing thead or tfoot?
  13303. // We don't do this, because Firefox
  13304. // doesn't float an extra tfoot to the
  13305. // bottom like it does for the first one.
  13306. $node->name = 'tbody';
  13307. $content[] = $node;
  13308. $ws_accum =& $content;
  13309. }
  13310. break;
  13311. case 'tfoot':
  13312. // see above for some aveats
  13313. $tbody_mode = true;
  13314. if ($tfoot === false) {
  13315. $tfoot = $node;
  13316. $ws_accum =& $after_tfoot_ws;
  13317. } else {
  13318. $node->name = 'tbody';
  13319. $content[] = $node;
  13320. $ws_accum =& $content;
  13321. }
  13322. break;
  13323. case 'colgroup':
  13324. case 'col':
  13325. $cols[] = $node;
  13326. $ws_accum =& $cols;
  13327. break;
  13328. case '#PCDATA':
  13329. // How is whitespace handled? We treat is as sticky to
  13330. // the *end* of the previous element. So all of the
  13331. // nonsense we have worked on is to keep things
  13332. // together.
  13333. if (!empty($node->is_whitespace)) {
  13334. $ws_accum[] = $node;
  13335. }
  13336. break;
  13337. }
  13338. }
  13339. if (empty($content)) {
  13340. return false;
  13341. }
  13342. $ret = $initial_ws;
  13343. if ($caption !== false) {
  13344. $ret[] = $caption;
  13345. $ret = array_merge($ret, $after_caption_ws);
  13346. }
  13347. if ($cols !== false) {
  13348. $ret = array_merge($ret, $cols);
  13349. }
  13350. if ($thead !== false) {
  13351. $ret[] = $thead;
  13352. $ret = array_merge($ret, $after_thead_ws);
  13353. }
  13354. if ($tfoot !== false) {
  13355. $ret[] = $tfoot;
  13356. $ret = array_merge($ret, $after_tfoot_ws);
  13357. }
  13358. if ($tbody_mode) {
  13359. // we have to shuffle tr into tbody
  13360. $current_tr_tbody = null;
  13361. foreach($content as $node) {
  13362. switch ($node->name) {
  13363. case 'tbody':
  13364. $current_tr_tbody = null;
  13365. $ret[] = $node;
  13366. break;
  13367. case 'tr':
  13368. if ($current_tr_tbody === null) {
  13369. $current_tr_tbody = new HTMLPurifier_Node_Element('tbody');
  13370. $ret[] = $current_tr_tbody;
  13371. }
  13372. $current_tr_tbody->children[] = $node;
  13373. break;
  13374. case '#PCDATA':
  13375. //assert($node->is_whitespace);
  13376. if ($current_tr_tbody === null) {
  13377. $ret[] = $node;
  13378. } else {
  13379. $current_tr_tbody->children[] = $node;
  13380. }
  13381. break;
  13382. }
  13383. }
  13384. } else {
  13385. $ret = array_merge($ret, $content);
  13386. }
  13387. return $ret;
  13388. }
  13389. }
  13390. class HTMLPurifier_DefinitionCache_Decorator extends HTMLPurifier_DefinitionCache
  13391. {
  13392. /**
  13393. * Cache object we are decorating
  13394. * @type HTMLPurifier_DefinitionCache
  13395. */
  13396. public $cache;
  13397. /**
  13398. * The name of the decorator
  13399. * @var string
  13400. */
  13401. public $name;
  13402. public function __construct()
  13403. {
  13404. }
  13405. /**
  13406. * Lazy decorator function
  13407. * @param HTMLPurifier_DefinitionCache $cache Reference to cache object to decorate
  13408. * @return HTMLPurifier_DefinitionCache_Decorator
  13409. */
  13410. public function decorate(&$cache)
  13411. {
  13412. $decorator = $this->copy();
  13413. // reference is necessary for mocks in PHP 4
  13414. $decorator->cache =& $cache;
  13415. $decorator->type = $cache->type;
  13416. return $decorator;
  13417. }
  13418. /**
  13419. * Cross-compatible clone substitute
  13420. * @return HTMLPurifier_DefinitionCache_Decorator
  13421. */
  13422. public function copy()
  13423. {
  13424. return new HTMLPurifier_DefinitionCache_Decorator();
  13425. }
  13426. /**
  13427. * @param HTMLPurifier_Definition $def
  13428. * @param HTMLPurifier_Config $config
  13429. * @return mixed
  13430. */
  13431. public function add($def, $config)
  13432. {
  13433. return $this->cache->add($def, $config);
  13434. }
  13435. /**
  13436. * @param HTMLPurifier_Definition $def
  13437. * @param HTMLPurifier_Config $config
  13438. * @return mixed
  13439. */
  13440. public function set($def, $config)
  13441. {
  13442. return $this->cache->set($def, $config);
  13443. }
  13444. /**
  13445. * @param HTMLPurifier_Definition $def
  13446. * @param HTMLPurifier_Config $config
  13447. * @return mixed
  13448. */
  13449. public function replace($def, $config)
  13450. {
  13451. return $this->cache->replace($def, $config);
  13452. }
  13453. /**
  13454. * @param HTMLPurifier_Config $config
  13455. * @return mixed
  13456. */
  13457. public function get($config)
  13458. {
  13459. return $this->cache->get($config);
  13460. }
  13461. /**
  13462. * @param HTMLPurifier_Config $config
  13463. * @return mixed
  13464. */
  13465. public function remove($config)
  13466. {
  13467. return $this->cache->remove($config);
  13468. }
  13469. /**
  13470. * @param HTMLPurifier_Config $config
  13471. * @return mixed
  13472. */
  13473. public function flush($config)
  13474. {
  13475. return $this->cache->flush($config);
  13476. }
  13477. /**
  13478. * @param HTMLPurifier_Config $config
  13479. * @return mixed
  13480. */
  13481. public function cleanup($config)
  13482. {
  13483. return $this->cache->cleanup($config);
  13484. }
  13485. }
  13486. /**
  13487. * Null cache object to use when no caching is on.
  13488. */
  13489. class HTMLPurifier_DefinitionCache_Null extends HTMLPurifier_DefinitionCache
  13490. {
  13491. /**
  13492. * @param HTMLPurifier_Definition $def
  13493. * @param HTMLPurifier_Config $config
  13494. * @return bool
  13495. */
  13496. public function add($def, $config)
  13497. {
  13498. return false;
  13499. }
  13500. /**
  13501. * @param HTMLPurifier_Definition $def
  13502. * @param HTMLPurifier_Config $config
  13503. * @return bool
  13504. */
  13505. public function set($def, $config)
  13506. {
  13507. return false;
  13508. }
  13509. /**
  13510. * @param HTMLPurifier_Definition $def
  13511. * @param HTMLPurifier_Config $config
  13512. * @return bool
  13513. */
  13514. public function replace($def, $config)
  13515. {
  13516. return false;
  13517. }
  13518. /**
  13519. * @param HTMLPurifier_Config $config
  13520. * @return bool
  13521. */
  13522. public function remove($config)
  13523. {
  13524. return false;
  13525. }
  13526. /**
  13527. * @param HTMLPurifier_Config $config
  13528. * @return bool
  13529. */
  13530. public function get($config)
  13531. {
  13532. return false;
  13533. }
  13534. /**
  13535. * @param HTMLPurifier_Config $config
  13536. * @return bool
  13537. */
  13538. public function flush($config)
  13539. {
  13540. return false;
  13541. }
  13542. /**
  13543. * @param HTMLPurifier_Config $config
  13544. * @return bool
  13545. */
  13546. public function cleanup($config)
  13547. {
  13548. return false;
  13549. }
  13550. }
  13551. class HTMLPurifier_DefinitionCache_Serializer extends HTMLPurifier_DefinitionCache
  13552. {
  13553. /**
  13554. * @param HTMLPurifier_Definition $def
  13555. * @param HTMLPurifier_Config $config
  13556. * @return int|bool
  13557. */
  13558. public function add($def, $config)
  13559. {
  13560. if (!$this->checkDefType($def)) {
  13561. return;
  13562. }
  13563. $file = $this->generateFilePath($config);
  13564. if (file_exists($file)) {
  13565. return false;
  13566. }
  13567. if (!$this->_prepareDir($config)) {
  13568. return false;
  13569. }
  13570. return $this->_write($file, serialize($def), $config);
  13571. }
  13572. /**
  13573. * @param HTMLPurifier_Definition $def
  13574. * @param HTMLPurifier_Config $config
  13575. * @return int|bool
  13576. */
  13577. public function set($def, $config)
  13578. {
  13579. if (!$this->checkDefType($def)) {
  13580. return;
  13581. }
  13582. $file = $this->generateFilePath($config);
  13583. if (!$this->_prepareDir($config)) {
  13584. return false;
  13585. }
  13586. return $this->_write($file, serialize($def), $config);
  13587. }
  13588. /**
  13589. * @param HTMLPurifier_Definition $def
  13590. * @param HTMLPurifier_Config $config
  13591. * @return int|bool
  13592. */
  13593. public function replace($def, $config)
  13594. {
  13595. if (!$this->checkDefType($def)) {
  13596. return;
  13597. }
  13598. $file = $this->generateFilePath($config);
  13599. if (!file_exists($file)) {
  13600. return false;
  13601. }
  13602. if (!$this->_prepareDir($config)) {
  13603. return false;
  13604. }
  13605. return $this->_write($file, serialize($def), $config);
  13606. }
  13607. /**
  13608. * @param HTMLPurifier_Config $config
  13609. * @return bool|HTMLPurifier_Config
  13610. */
  13611. public function get($config)
  13612. {
  13613. $file = $this->generateFilePath($config);
  13614. if (!file_exists($file)) {
  13615. return false;
  13616. }
  13617. return unserialize(file_get_contents($file));
  13618. }
  13619. /**
  13620. * @param HTMLPurifier_Config $config
  13621. * @return bool
  13622. */
  13623. public function remove($config)
  13624. {
  13625. $file = $this->generateFilePath($config);
  13626. if (!file_exists($file)) {
  13627. return false;
  13628. }
  13629. return unlink($file);
  13630. }
  13631. /**
  13632. * @param HTMLPurifier_Config $config
  13633. * @return bool
  13634. */
  13635. public function flush($config)
  13636. {
  13637. if (!$this->_prepareDir($config)) {
  13638. return false;
  13639. }
  13640. $dir = $this->generateDirectoryPath($config);
  13641. $dh = opendir($dir);
  13642. // Apparently, on some versions of PHP, readdir will return
  13643. // an empty string if you pass an invalid argument to readdir.
  13644. // So you need this test. See #49.
  13645. if (false === $dh) {
  13646. return false;
  13647. }
  13648. while (false !== ($filename = readdir($dh))) {
  13649. if (empty($filename)) {
  13650. continue;
  13651. }
  13652. if ($filename[0] === '.') {
  13653. continue;
  13654. }
  13655. unlink($dir . '/' . $filename);
  13656. }
  13657. closedir($dh);
  13658. return true;
  13659. }
  13660. /**
  13661. * @param HTMLPurifier_Config $config
  13662. * @return bool
  13663. */
  13664. public function cleanup($config)
  13665. {
  13666. if (!$this->_prepareDir($config)) {
  13667. return false;
  13668. }
  13669. $dir = $this->generateDirectoryPath($config);
  13670. $dh = opendir($dir);
  13671. // See #49 (and above).
  13672. if (false === $dh) {
  13673. return false;
  13674. }
  13675. while (false !== ($filename = readdir($dh))) {
  13676. if (empty($filename)) {
  13677. continue;
  13678. }
  13679. if ($filename[0] === '.') {
  13680. continue;
  13681. }
  13682. $key = substr($filename, 0, strlen($filename) - 4);
  13683. if ($this->isOld($key, $config)) {
  13684. unlink($dir . '/' . $filename);
  13685. }
  13686. }
  13687. closedir($dh);
  13688. return true;
  13689. }
  13690. /**
  13691. * Generates the file path to the serial file corresponding to
  13692. * the configuration and definition name
  13693. * @param HTMLPurifier_Config $config
  13694. * @return string
  13695. * @todo Make protected
  13696. */
  13697. public function generateFilePath($config)
  13698. {
  13699. $key = $this->generateKey($config);
  13700. return $this->generateDirectoryPath($config) . '/' . $key . '.ser';
  13701. }
  13702. /**
  13703. * Generates the path to the directory contain this cache's serial files
  13704. * @param HTMLPurifier_Config $config
  13705. * @return string
  13706. * @note No trailing slash
  13707. * @todo Make protected
  13708. */
  13709. public function generateDirectoryPath($config)
  13710. {
  13711. $base = $this->generateBaseDirectoryPath($config);
  13712. return $base . '/' . $this->type;
  13713. }
  13714. /**
  13715. * Generates path to base directory that contains all definition type
  13716. * serials
  13717. * @param HTMLPurifier_Config $config
  13718. * @return mixed|string
  13719. * @todo Make protected
  13720. */
  13721. public function generateBaseDirectoryPath($config)
  13722. {
  13723. $base = $config->get('Cache.SerializerPath');
  13724. $base = is_null($base) ? HTMLPURIFIER_PREFIX . '/HTMLPurifier/DefinitionCache/Serializer' : $base;
  13725. return $base;
  13726. }
  13727. /**
  13728. * Convenience wrapper function for file_put_contents
  13729. * @param string $file File name to write to
  13730. * @param string $data Data to write into file
  13731. * @param HTMLPurifier_Config $config
  13732. * @return int|bool Number of bytes written if success, or false if failure.
  13733. */
  13734. private function _write($file, $data, $config)
  13735. {
  13736. $result = file_put_contents($file, $data);
  13737. if ($result !== false) {
  13738. // set permissions of the new file (no execute)
  13739. $chmod = $config->get('Cache.SerializerPermissions');
  13740. if ($chmod !== null) {
  13741. chmod($file, $chmod & 0666);
  13742. }
  13743. }
  13744. return $result;
  13745. }
  13746. /**
  13747. * Prepares the directory that this type stores the serials in
  13748. * @param HTMLPurifier_Config $config
  13749. * @return bool True if successful
  13750. */
  private function _prepareDir($config)
  {
      // Ensures the per-type cache directory exists (creating it when
      // necessary) and, when %Cache.SerializerPermissions is set, that
      // it is writable. Returns true on success, false otherwise; most
      // failures are reported through an E_USER_WARNING.
      $directory = $this->generateDirectoryPath($config);
      $chmod = $config->get('Cache.SerializerPermissions');

      if ($chmod === null) {
          // No explicit permissions requested: create with PHP's default mode.
          if (is_dir($directory)) {
              return true;
          }
          // Another request may create the directory between the is_dir()
          // check above and this mkdir(). The mkdir warning is suppressed
          // because losing that race is not an error; is_dir() is consulted
          // again so only a genuine failure is reported.
          if (!@mkdir($directory) && !is_dir($directory)) {
              trigger_error(
                  'Could not create directory ' . $directory . '',
                  E_USER_WARNING
              );
              return false;
          }
          return true;
      }

      if (is_dir($directory)) {
          // Already present: only its permissions need checking.
          return $this->_testPermissions($directory, $chmod);
      }

      // The type directory is missing, so the base directory must exist
      // and be usable before the type directory can be created inside it.
      $base = $this->generateBaseDirectoryPath($config);
      if (!is_dir($base)) {
          // Message kept on a single line so the emitted warning does not
          // contain a raw line break and source indentation.
          trigger_error(
              'Base directory ' . $base . ' does not exist, ' .
              'please create or change using %Cache.SerializerPath',
              E_USER_WARNING
          );
          return false;
      }
      if (!$this->_testPermissions($base, $chmod)) {
          return false;
      }

      // Same race as above: tolerate a concurrent creation of $directory.
      if (!@mkdir($directory, $chmod) && !is_dir($directory)) {
          trigger_error(
              'Could not create directory ' . $directory . '',
              E_USER_WARNING
          );
          return false;
      }

      return $this->_testPermissions($directory, $chmod);
  }
  13787. /**
  13788. * Tests permissions on a directory and throws out friendly
  13789. * error messages and attempts to chmod it itself if possible
  13790. * @param string $dir Directory path
  13791. * @param int $chmod Permissions
  13792. * @return bool True if directory is writable
  13793. */
  private function _testPermissions($dir, $chmod)
  {
      // A writable directory needs no further inspection.
      if (is_writable($dir)) {
          return true;
      }

      // Callers are expected to have checked existence already so that
      // they can give a more specific message; this is the fallback.
      if (!is_dir($dir)) {
          trigger_error(
              'Directory ' . $dir . ' does not exist',
              E_USER_WARNING
          );
          return false;
      }

      // Without the POSIX functions, or without a requested mode, only
      // a generic hint can be given.
      if (!function_exists('posix_getuid') || $chmod === null) {
          trigger_error(
              'Directory ' . $dir . ' not writable, ' .
              'please alter file permissions',
              E_USER_WARNING
          );
          return false;
      }

      // POSIX system: work out which permission bits would be needed.
      if (fileowner($dir) === posix_getuid()) {
          // The current user owns the directory, so try to fix it directly.
          $chmod = $chmod | 0700;
          if (chmod($dir, $chmod)) {
              return true;
          }
      } elseif (filegroup($dir) === posix_getgid()) {
          $chmod = $chmod | 0070;
      } else {
          // PHP's probably running as nobody, so global permissions
          // would be required.
          $chmod = $chmod | 0777;
      }

      // Reached when the directory could not be fixed here; tell the
      // user which mode to apply.
      trigger_error(
          'Directory ' . $dir . ' not writable, ' .
          'please chmod to ' . decoct($chmod),
          E_USER_WARNING
      );
      return false;
  }
  13839. }
  13840. /**
  13841. * Definition cache decorator class that cleans up the cache
  13842. * whenever there is a cache miss.
  13843. */
  class HTMLPurifier_DefinitionCache_Decorator_Cleanup extends HTMLPurifier_DefinitionCache_Decorator
  {
      /**
       * @type string
       */
      public $name = 'Cleanup';

      /**
       * Returns a new instance of this decorator class.
       * @return HTMLPurifier_DefinitionCache_Decorator_Cleanup
       */
      public function copy()
      {
          return new HTMLPurifier_DefinitionCache_Decorator_Cleanup();
      }

      /**
       * Delegates to the parent add() and runs a cleanup when the parent
       * reports a falsy result.
       * @param HTMLPurifier_Definition $def
       * @param HTMLPurifier_Config $config
       * @return mixed Whatever the parent add() returned
       */
      public function add($def, $config)
      {
          return $this->cleanupOnFailure(parent::add($def, $config), $config);
      }

      /**
       * Delegates to the parent set() and runs a cleanup when the parent
       * reports a falsy result.
       * @param HTMLPurifier_Definition $def
       * @param HTMLPurifier_Config $config
       * @return mixed Whatever the parent set() returned
       */
      public function set($def, $config)
      {
          return $this->cleanupOnFailure(parent::set($def, $config), $config);
      }

      /**
       * Delegates to the parent replace() and runs a cleanup when the
       * parent reports a falsy result.
       * @param HTMLPurifier_Definition $def
       * @param HTMLPurifier_Config $config
       * @return mixed Whatever the parent replace() returned
       */
      public function replace($def, $config)
      {
          return $this->cleanupOnFailure(parent::replace($def, $config), $config);
      }

      /**
       * Delegates to the parent get() and runs a cleanup when nothing
       * truthy came back.
       * @param HTMLPurifier_Config $config
       * @return mixed Whatever the parent get() returned
       */
      public function get($config)
      {
          return $this->cleanupOnFailure(parent::get($config), $config);
      }

      /**
       * Invokes the parent cleanup() when $outcome is falsy and hands
       * $outcome back unchanged.
       * @param mixed $outcome Result of the delegated cache operation
       * @param HTMLPurifier_Config $config
       * @return mixed The unmodified $outcome
       */
      private function cleanupOnFailure($outcome, $config)
      {
          if (!$outcome) {
              parent::cleanup($config);
          }
          return $outcome;
      }
  }
  13909. /**
  13910. * Definition cache decorator class that saves all cache retrievals
  13911. * to PHP's memory; good for unit tests or circumstances where
  13912. * there are lots of configuration objects floating around.
  13913. */
  class HTMLPurifier_DefinitionCache_Decorator_Memory extends HTMLPurifier_DefinitionCache_Decorator
  {
      /**
       * In-memory store, indexed by the key generateKey() yields for a
       * configuration.
       * @type array
       */
      protected $definitions;

      /**
       * @type string
       */
      public $name = 'Memory';

      /**
       * Returns a new instance of this decorator class; the stored
       * definitions are not carried over.
       * @return HTMLPurifier_DefinitionCache_Decorator_Memory
       */
      public function copy()
      {
          return new HTMLPurifier_DefinitionCache_Decorator_Memory();
      }

      /**
       * Delegates to the parent add(); on a truthy result the definition
       * is also remembered in memory.
       * @param HTMLPurifier_Definition $def
       * @param HTMLPurifier_Config $config
       * @return mixed Whatever the parent add() returned
       */
      public function add($def, $config)
      {
          $outcome = parent::add($def, $config);
          if (!$outcome) {
              return $outcome;
          }
          $this->definitions[$this->generateKey($config)] = $def;
          return $outcome;
      }

      /**
       * Delegates to the parent set(); on a truthy result the definition
       * is also remembered in memory.
       * @param HTMLPurifier_Definition $def
       * @param HTMLPurifier_Config $config
       * @return mixed Whatever the parent set() returned
       */
      public function set($def, $config)
      {
          $outcome = parent::set($def, $config);
          if (!$outcome) {
              return $outcome;
          }
          $this->definitions[$this->generateKey($config)] = $def;
          return $outcome;
      }

      /**
       * Delegates to the parent replace(); on a truthy result the
       * definition is also remembered in memory.
       * @param HTMLPurifier_Definition $def
       * @param HTMLPurifier_Config $config
       * @return mixed Whatever the parent replace() returned
       */
      public function replace($def, $config)
      {
          $outcome = parent::replace($def, $config);
          if (!$outcome) {
              return $outcome;
          }
          $this->definitions[$this->generateKey($config)] = $def;
          return $outcome;
      }

      /**
       * Returns the remembered value for this configuration's key, asking
       * the parent get() (and remembering its answer) only when no
       * non-null value is stored yet.
       * @param HTMLPurifier_Config $config
       * @return mixed
       */
      public function get($config)
      {
          $key = $this->generateKey($config);
          if (!isset($this->definitions[$key])) {
              $this->definitions[$key] = parent::get($config);
          }
          return $this->definitions[$key];
      }
  }
  13984. /**
  13985. * XHTML 1.1 Bi-directional Text Module, defines elements that
  13986. * declare directionality of content. Text Extension Module.
  13987. */
  class HTMLPurifier_HTMLModule_Bdo extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Bdo';

      /**
       * The I18N collection starts with 'dir' set to false; setup()
       * replaces that entry with the real definition.
       * @type array
       */
      public $attr_collections = array(
          'I18N' => array('dir' => false)
      );

      /**
       * Registers the bdo element and the dir attribute.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          $direction = 'Enum#ltr,rtl';

          // The Abstract Module specification has the attribute
          // inclusions wrong for bdo: bdo allows Lang
          $collections = array('Core', 'Lang');

          // dir is required on bdo
          $element = $this->addElement(
              'bdo',
              'Inline',
              'Inline',
              $collections,
              array('dir' => $direction)
          );
          $element->attr_transform_post[] = new HTMLPurifier_AttrTransform_BdoDir();

          $this->attr_collections['I18N']['dir'] = $direction;
      }
  }
  class HTMLPurifier_HTMLModule_CommonAttributes extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'CommonAttributes';

      /**
       * Declares the Core, Lang, I18N and Common attribute collections.
       * In each collection the entry under key 0 lists other collection
       * names (e.g. Common lists Core and I18N).
       * @type array
       */
      public $attr_collections = array(
          'Core' => array(
              0 => array('Style'),
              // 'xml:space' => false,
              'class' => 'Class',
              'id' => 'ID',
              'title' => 'CDATA',
          ),
          // Declared empty here.
          'Lang' => array(),
          'I18N' => array(
              // proprietary, for xml:lang/lang
              0 => array('Lang'),
          ),
          'Common' => array(
              0 => array('Core', 'I18N')
          )
      );
  }
  14046. /**
  14047. * XHTML 1.1 Edit Module, defines editing-related elements. Text Extension
  14048. * Module.
  14049. */
  class HTMLPurifier_HTMLModule_Edit extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Edit';

      /**
       * Set to true; this module provides getChildDef() below.
       * @type bool
       */
      public $defines_child_def = true;

      /**
       * Registers del and ins with a shared chameleon content model.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          // HTML 4.01 specifies that ins/del must not contain block
          // elements when used in an inline context; chameleon is
          // a complicated workaround to achieve this effect.
          // Format: inline context ! block context (the exclamation mark
          // is the separator, see getChildDef for parsing).
          $model = 'Chameleon: #PCDATA | Inline ! #PCDATA | Flow';
          $attributes = array(
              'cite' => 'URI',
              // 'datetime' => 'Datetime', // not implemented
          );
          foreach (array('del', 'ins') as $tag) {
              $this->addElement($tag, 'Inline', $model, 'Common', $attributes);
          }
      }

      /**
       * Builds the chameleon child definition from a content model of the
       * form "inline ! block".
       * @param HTMLPurifier_ElementDef $def
       * @return HTMLPurifier_ChildDef_Chameleon|bool False when the
       *         element's content model type is not 'chameleon'
       */
      public function getChildDef($def)
      {
          if ($def->content_model_type == 'chameleon') {
              // Text before the '!' is the inline model, text after it
              // the block model.
              $models = explode('!', $def->content_model);
              return new HTMLPurifier_ChildDef_Chameleon($models[0], $models[1]);
          }
          return false;
      }
  }
  14091. /**
  14092. * XHTML 1.1 Forms module, defines all form-related elements found in HTML 4.
  14093. */
  class HTMLPurifier_HTMLModule_Forms extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Forms';

      /**
       * This module is declared not safe.
       * @type bool
       */
      public $safe = false;

      /**
       * Adds Form to the Block content set and Formctrl to the Inline
       * content set.
       * @type array
       */
      public $content_sets = array(
          'Block' => 'Form',
          'Inline' => 'Formctrl',
      );

      /**
       * Registers form, input, select, option, textarea, button,
       * fieldset, label, legend and optgroup, in that order.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          // ----- form -----
          $formAttr = array(
              'accept' => 'ContentTypes',
              'accept-charset' => 'Charsets',
              'action*' => 'URI',
              'method' => 'Enum#get,post',
              // really ContentType, but these two are the only ones used today
              'enctype' => 'Enum#application/x-www-form-urlencoded,multipart/form-data',
          );
          $form = $this->addElement(
              'form',
              'Form',
              'Required: Heading | List | Block | fieldset',
              'Common',
              $formAttr
          );
          // A form may not be nested inside another form.
          $form->excludes = array('form' => true);

          // ----- input -----
          $inputAttr = array(
              'accept' => 'ContentTypes',
              'accesskey' => 'Character',
              'alt' => 'Text',
              'checked' => 'Bool#checked',
              'disabled' => 'Bool#disabled',
              'maxlength' => 'Number',
              'name' => 'CDATA',
              'readonly' => 'Bool#readonly',
              'size' => 'Number',
              'src' => 'URI#embedded',
              'tabindex' => 'Number',
              'type' => 'Enum#text,password,checkbox,button,radio,submit,reset,file,hidden,image',
              'value' => 'CDATA',
          );
          $input = $this->addElement('input', 'Formctrl', 'Empty', 'Common', $inputAttr);
          $input->attr_transform_post[] = new HTMLPurifier_AttrTransform_Input();

          // ----- select / option -----
          $selectAttr = array(
              'disabled' => 'Bool#disabled',
              'multiple' => 'Bool#multiple',
              'name' => 'CDATA',
              'size' => 'Number',
              'tabindex' => 'Number',
          );
          $this->addElement(
              'select',
              'Formctrl',
              'Required: optgroup | option',
              'Common',
              $selectAttr
          );

          $optionAttr = array(
              'disabled' => 'Bool#disabled',
              'label' => 'Text',
              'selected' => 'Bool#selected',
              'value' => 'CDATA',
          );
          $this->addElement('option', false, 'Optional: #PCDATA', 'Common', $optionAttr);

          // It's illegal for there to be more than one selected, but not
          // be multiple. Also, no selected means undefined behavior. This might
          // be difficult to implement; perhaps an injector, or a context variable.

          // ----- textarea -----
          $textareaAttr = array(
              'accesskey' => 'Character',
              'cols*' => 'Number',
              'disabled' => 'Bool#disabled',
              'name' => 'CDATA',
              'readonly' => 'Bool#readonly',
              'rows*' => 'Number',
              'tabindex' => 'Number',
          );
          $textarea = $this->addElement(
              'textarea',
              'Formctrl',
              'Optional: #PCDATA',
              'Common',
              $textareaAttr
          );
          $textarea->attr_transform_pre[] = new HTMLPurifier_AttrTransform_Textarea();

          // ----- button -----
          $buttonAttr = array(
              'accesskey' => 'Character',
              'disabled' => 'Bool#disabled',
              'name' => 'CDATA',
              'tabindex' => 'Number',
              'type' => 'Enum#button,submit,reset',
              'value' => 'CDATA',
          );
          $button = $this->addElement(
              'button',
              'Formctrl',
              'Optional: #PCDATA | Heading | List | Block | Inline',
              'Common',
              $buttonAttr
          );

          // For exclusions, ideally we'd specify content sets, not literal elements
          $button->excludes = $this->makeLookup(
              // Form
              'form',
              'fieldset',
              // Formctrl
              'input',
              'select',
              'textarea',
              'label',
              'button',
              // as per HTML 4.01 spec, this is omitted by modularization
              'a',
              // legacy items
              'isindex',
              'iframe'
          );

          // Extra exclusion: img usemap="" is not permitted within this element.
          // We'll omit this for now, since we don't have any good way of
          // indicating it yet.

          // ----- fieldset -----
          // This is HIGHLY user-unfriendly; we need a custom child-def for this
          $this->addElement(
              'fieldset',
              'Form',
              'Custom: (#WS?,legend,(Flow|#PCDATA)*)',
              'Common'
          );

          // ----- label -----
          $labelAttr = array(
              'accesskey' => 'Character',
              // 'for' => 'IDREF', // IDREF not implemented, cannot allow
          );
          $label = $this->addElement(
              'label',
              'Formctrl',
              'Optional: #PCDATA | Inline',
              'Common',
              $labelAttr
          );
          // A label may not be nested inside another label.
          $label->excludes = array('label' => true);

          // ----- legend -----
          $this->addElement(
              'legend',
              false,
              'Optional: #PCDATA | Inline',
              'Common',
              array('accesskey' => 'Character')
          );

          // ----- optgroup -----
          $optgroupAttr = array(
              'disabled' => 'Bool#disabled',
              'label*' => 'Text',
          );
          $this->addElement('optgroup', false, 'Required: option', 'Common', $optgroupAttr);

          // Don't forget an injector for <isindex>. This one's a little complex
          // because it maps to multiple elements.
      }
  }
  14263. /**
  14264. * XHTML 1.1 Hypertext Module, defines hypertext links. Core Module.
  14265. */
  class HTMLPurifier_HTMLModule_Hypertext extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Hypertext';

      /**
       * Registers the a element with href, rel and rev.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          // Commented-out entries are attributes this module does not
          // currently register.
          $linkAttr = array(
              // 'accesskey' => 'Character',
              // 'charset' => 'Charset',
              'href' => 'URI',
              // 'hreflang' => 'LanguageCode',
              'rel' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rel'),
              'rev' => new HTMLPurifier_AttrDef_HTML_LinkTypes('rev'),
              // 'tabindex' => 'Number',
              // 'type' => 'ContentType',
          );
          $anchor = $this->addElement('a', 'Inline', 'Inline', 'Common', $linkAttr);
          $anchor->formatting = true;
          // An anchor may not be nested inside another anchor.
          $anchor->excludes = array('a' => true);
      }
  }
  14297. /**
  14298. * XHTML 1.1 Iframe Module provides inline frames.
  14299. *
  14300. * @note This module is not considered safe unless an Iframe
  14301. * whitelisting mechanism is specified. Currently, the only
   * such mechanism is %URI.SafeIframeRegexp
  14303. */
  class HTMLPurifier_HTMLModule_Iframe extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Iframe';

      /**
       * Not safe by default; setup() flips this when %HTML.SafeIframe
       * is enabled.
       * @type bool
       */
      public $safe = false;

      /**
       * Registers the iframe element.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          $optedIn = $config->get('HTML.SafeIframe');
          if ($optedIn) {
              $this->safe = true;
          }

          $frameAttr = array(
              'src' => 'URI#embedded',
              'width' => 'Length',
              'height' => 'Length',
              'name' => 'ID',
              'scrolling' => 'Enum#yes,no,auto',
              'frameborder' => 'Enum#0,1',
              'longdesc' => 'URI',
              'marginheight' => 'Pixels',
              'marginwidth' => 'Pixels',
          );
          $this->addElement('iframe', 'Inline', 'Flow', 'Common', $frameAttr);
      }
  }
  14341. /**
  14342. * XHTML 1.1 Image Module provides basic image embedding.
  14343. * @note There is specialized code for removing empty images in
  14344. * HTMLPurifier_Strategy_RemoveForeignElements
  14345. */
  class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Image';

      /**
       * Registers the img element; how width/height are validated depends
       * on %HTML.MaxImgLength and %HTML.Trusted.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          $max = $config->get('HTML.MaxImgLength');

          // According to the spec, it's Length, but percents can
          // be abused, so we allow only Pixels.
          $dimension = 'Pixels#' . $max;

          $image = $this->addElement(
              'img',
              'Inline',
              'Empty',
              'Common',
              array(
                  'alt*' => 'Text',
                  'height' => $dimension,
                  'width' => $dimension,
                  'longdesc' => 'URI',
                  'src*' => new HTMLPurifier_AttrDef_URI(true), // embedded
              )
          );

          // With no maximum configured, or for trusted input, fall back
          // to the more permissive Length for both dimensions.
          if ($max === null || $config->get('HTML.Trusted')) {
              $image->attr['width'] = 'Length';
              $image->attr['height'] = 'Length';
          }

          // kind of strange, but splitting things up would be inefficient:
          // one transform instance is shared by the post and pre lists.
          $required = new HTMLPurifier_AttrTransform_ImgRequired();
          $image->attr_transform_post[] = $required;
          $image->attr_transform_pre[] = $required;
      }
  }
  14383. /**
  14384. * XHTML 1.1 Legacy module defines elements that were previously
  14385. * deprecated.
  14386. *
  14387. * @note Not all legacy elements have been implemented yet, which
  14388. * is a bit of a reverse problem as compared to browsers! In
  14389. * addition, this legacy module may implement a bit more than
  14390. * mandated by XHTML 1.1.
  14391. *
  14392. * This module can be used in combination with TransformToStrict in order
  14393. * to transform as many deprecated elements as possible, but retain
  14394. * questionably deprecated elements that do not have good alternatives
  14395. * as well as transform elements that don't have an implementation.
  14396. * See docs/ref-strictness.txt for more details.
  14397. */
  class HTMLPurifier_HTMLModule_Legacy extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Legacy';

      /**
       * Registers the legacy elements, then adds legacy attributes and
       * content models to elements registered as blank elements here.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          // Attribute definitions used by more than one element below.
          $align = 'Enum#left,right,center,justify';
          $compact = array('compact' => 'Bool#compact');

          // ---- elements added by this module ----
          $this->addElement(
              'basefont',
              'Inline',
              'Empty',
              null,
              array(
                  'color' => 'Color',
                  'face' => 'Text', // extremely broad, we should
                  'size' => 'Text', // tighten it
                  'id' => 'ID'
              )
          );
          $this->addElement('center', 'Block', 'Flow', 'Common');
          $this->addElement('dir', 'Block', 'Required: li', 'Common', $compact);
          $this->addElement(
              'font',
              'Inline',
              'Inline',
              array('Core', 'I18N'),
              array(
                  'color' => 'Color',
                  'face' => 'Text', // extremely broad, we should
                  'size' => 'Text', // tighten it
              )
          );
          $this->addElement('menu', 'Block', 'Required: li', 'Common', $compact);

          // s, strike and u are all inline formatting elements.
          foreach (array('s', 'strike', 'u') as $tag) {
              $formatting = $this->addElement($tag, 'Inline', 'Inline', 'Common');
              $formatting->formatting = true;
          }

          // ---- setup modifications to old elements ----
          $address = $this->addBlankElement('address');
          $address->content_model = 'Inline | #PCDATA | p';
          $address->content_model_type = 'optional';
          $address->child = false;

          $blockquote = $this->addBlankElement('blockquote');
          $blockquote->content_model = 'Flow | #PCDATA';
          $blockquote->content_model_type = 'optional';
          $blockquote->child = false;

          $br = $this->addBlankElement('br');
          $br->attr['clear'] = 'Enum#left,all,right,none';

          $caption = $this->addBlankElement('caption');
          $caption->attr['align'] = 'Enum#top,bottom,left,right';

          $div = $this->addBlankElement('div');
          $div->attr['align'] = $align;

          $dl = $this->addBlankElement('dl');
          $dl->attr['compact'] = 'Bool#compact';

          // h1 through h6
          foreach (range(1, 6) as $level) {
              $heading = $this->addBlankElement('h' . $level);
              $heading->attr['align'] = $align;
          }

          $hr = $this->addBlankElement('hr');
          $hr->attr['align'] = $align;
          $hr->attr['noshade'] = 'Bool#noshade';
          $hr->attr['size'] = 'Pixels';
          $hr->attr['width'] = 'Length';

          $img = $this->addBlankElement('img');
          $img->attr['align'] = 'IAlign';
          $img->attr['border'] = 'Pixels';
          $img->attr['hspace'] = 'Pixels';
          $img->attr['vspace'] = 'Pixels';

          // figure out this integer business
          $li = $this->addBlankElement('li');
          $li->attr['value'] = new HTMLPurifier_AttrDef_Integer();
          $li->attr['type'] = 'Enum#s:1,i,I,a,A,disc,square,circle';

          $ol = $this->addBlankElement('ol');
          $ol->attr['compact'] = 'Bool#compact';
          $ol->attr['start'] = new HTMLPurifier_AttrDef_Integer();
          $ol->attr['type'] = 'Enum#s:1,i,I,a,A';

          $p = $this->addBlankElement('p');
          $p->attr['align'] = $align;

          $pre = $this->addBlankElement('pre');
          $pre->attr['width'] = 'Number';

          // script omitted

          $table = $this->addBlankElement('table');
          $table->attr['align'] = 'Enum#left,center,right';
          $table->attr['bgcolor'] = 'Color';

          $tr = $this->addBlankElement('tr');
          $tr->attr['bgcolor'] = 'Color';

          // th and td receive the same four legacy attributes.
          foreach (array('th', 'td') as $cellTag) {
              $cell = $this->addBlankElement($cellTag);
              $cell->attr['bgcolor'] = 'Color';
              $cell->attr['height'] = 'Length';
              $cell->attr['nowrap'] = 'Bool#nowrap';
              $cell->attr['width'] = 'Length';
          }

          $ul = $this->addBlankElement('ul');
          $ul->attr['compact'] = 'Bool#compact';
          $ul->attr['type'] = 'Enum#square,disc,circle';

          // ---- "safe" modifications to "unsafe" elements ----
          // WARNING: If you want to add support for an unsafe, legacy
          // attribute, make a new TrustedLegacy module with the trusted
          // bit set appropriately
          $form = $this->addBlankElement('form');
          $form->content_model = 'Flow | #PCDATA';
          $form->content_model_type = 'optional';
          $form->attr['target'] = 'FrameTarget';

          $input = $this->addBlankElement('input');
          $input->attr['align'] = 'IAlign';

          $legend = $this->addBlankElement('legend');
          $legend->attr['align'] = 'LAlign';
      }
  }
  14534. /**
  14535. * XHTML 1.1 List Module, defines list-oriented elements. Core Module.
  14536. */
  class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'List';

      // According to the abstract schema, the List content set is a fully formed
      // one or more expr, but it invariably occurs in an optional declaration
      // so we're not going to do that subtlety. It might cause trouble
      // if a user defines "List" and expects that multiple lists are
      // allowed to be specified, but then again, that's not very intuitive.
      // Furthermore, the actual XML Schema may disagree. Regardless,
      // we don't have support for such nested expressions without using
      // the incredibly inefficient and draconic Custom ChildDef.

      /**
       * Adds List to the Flow content set.
       * @type array
       */
      public $content_sets = array('Flow' => 'List');

      /**
       * Registers ol, ul, dl, li, dd and dt.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          // XXX The wrap attribute is handled by MakeWellFormed. This is all
          // quite unsatisfactory, because we generated this
          // *specifically* for lists, and now a big chunk of the handling
          // is done properly by the List ChildDef. So actually, we just
          // want enough information to make autoclosing work properly,
          // and then hand off the tricky stuff to the ChildDef.
          foreach (array('ol', 'ul') as $listTag) {
              $list = $this->addElement(
                  $listTag,
                  'List',
                  new HTMLPurifier_ChildDef_List(),
                  'Common'
              );
              $list->wrap = 'li';
          }

          $this->addElement('dl', 'List', 'Required: dt | dd', 'Common');

          // Item elements are not added to any content set (false).
          $this->addElement('li', false, 'Flow', 'Common');
          $this->addElement('dd', false, 'Flow', 'Common');
          $this->addElement('dt', false, 'Inline', 'Common');
      }
  }
  class HTMLPurifier_HTMLModule_Name extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Name';

      /**
       * Adds a CDATA name attribute to a, applet, form, frame, iframe,
       * img and map. Unless %HTML.Attr.Name.UseCDATA is enabled, each of
       * those elements also gets a NameSync post-transform.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          // The directive does not change while the loop runs, so it is
          // read once instead of once per element.
          $useCdata = $config->get('HTML.Attr.Name.UseCDATA');

          $elements = array('a', 'applet', 'form', 'frame', 'iframe', 'img', 'map');
          foreach ($elements as $name) {
              $element = $this->addBlankElement($name);
              $element->attr['name'] = 'CDATA';
              if (!$useCdata) {
                  // Each element receives its own transform instance.
                  $element->attr_transform_post[] = new HTMLPurifier_AttrTransform_NameSync();
              }
          }
      }
  }
  14597. /**
  14598. * Module adds the nofollow attribute transformation to a tags. It
  14599. * is enabled by HTML.Nofollow
  14600. */
  class HTMLPurifier_HTMLModule_Nofollow extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Nofollow';

      /**
       * Appends the Nofollow attribute transform to the post-transforms
       * of the a element.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          $anchor = $this->addBlankElement('a');
          $anchor->attr_transform_post[] = new HTMLPurifier_AttrTransform_Nofollow();
      }
  }
  class HTMLPurifier_HTMLModule_NonXMLCommonAttributes extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'NonXMLCommonAttributes';

      /**
       * Adds the lang attribute, validated as a LanguageCode, to the
       * Lang attribute collection.
       * @type array
       */
      public $attr_collections = array(
          'Lang' => array(
              'lang' => 'LanguageCode',
          )
      );
  }
  14631. /**
  14632. * XHTML 1.1 Object Module, defines elements for generic object inclusion
  14633. * @warning Users will commonly use <embed> to cater to legacy browsers: this
  14634. * module does not allow this sort of behavior
  14635. */
  class HTMLPurifier_HTMLModule_Object extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Object';

      /**
       * This module is declared not safe.
       * @type bool
       */
      public $safe = false;

      /**
       * Registers the object and param elements.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          $objectAttr = array(
              'archive' => 'URI',
              'classid' => 'URI',
              'codebase' => 'URI',
              'codetype' => 'Text',
              'data' => 'URI',
              'declare' => 'Bool#declare',
              'height' => 'Length',
              'name' => 'CDATA',
              'standby' => 'Text',
              'tabindex' => 'Number',
              'type' => 'ContentType',
              'width' => 'Length'
          );
          $this->addElement(
              'object',
              'Inline',
              'Optional: #PCDATA | Flow | param',
              'Common',
              $objectAttr
          );

          // param is not added to any content set (false) and is passed
          // null for its attribute collections.
          $paramAttr = array(
              'id' => 'ID',
              'name*' => 'Text',
              'type' => 'Text',
              'value' => 'Text',
              'valuetype' => 'Enum#data,ref,object'
          );
          $this->addElement('param', false, 'Empty', null, $paramAttr);
      }
  }
  14686. /**
  14687. * XHTML 1.1 Presentation Module, defines simple presentation-related
  14688. * markup. Text Extension Module.
  14689. * @note The official XML Schema and DTD specs further divide this into
  14690. * two modules:
  14691. * - Block Presentation (hr)
  14692. * - Inline Presentation (b, big, i, small, sub, sup, tt)
  14693. * We have chosen not to heed this distinction, as content_sets
  14694. * provides satisfactory disambiguation.
  14695. */
  class HTMLPurifier_HTMLModule_Presentation extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Presentation';

      /**
       * Registers hr, sub, sup and the inline formatting elements
       * b, big, i, small and tt.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          $this->addElement('hr', 'Block', 'Empty', 'Common');
          $this->addElement('sub', 'Inline', 'Inline', 'Common');
          $this->addElement('sup', 'Inline', 'Inline', 'Common');

          // These five are additionally flagged as formatting elements.
          foreach (array('b', 'big', 'i', 'small', 'tt') as $tag) {
              $element = $this->addElement($tag, 'Inline', 'Inline', 'Common');
              $element->formatting = true;
          }
      }
  }
  14722. /**
  14723. * Module defines proprietary tags and attributes in HTML.
  14724. * @warning If this module is enabled, standards-compliance is off!
  14725. */
  class HTMLPurifier_HTMLModule_Proprietary extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Proprietary';

      /**
       * Registers the marquee element.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          $marqueeAttr = array(
              'direction' => 'Enum#left,right,up,down',
              'behavior' => 'Enum#alternate',
              'width' => 'Length',
              'height' => 'Length',
              'scrolldelay' => 'Number',
              'scrollamount' => 'Number',
              'loop' => 'Number',
              'bgcolor' => 'Color',
              'hspace' => 'Pixels',
              'vspace' => 'Pixels',
          );
          $this->addElement('marquee', 'Inline', 'Flow', 'Common', $marqueeAttr);
      }
  }
  /**
   * XHTML 1.1 Ruby Annotation Module, defines elements that indicate
   * short runs of text alongside base text for annotation or pronunciation.
   */
  class HTMLPurifier_HTMLModule_Ruby extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Ruby';

      /**
       * Registers ruby and its child elements (rbc, rtc, rb, rt, rp).
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          // simple form: rb followed by rt (optionally wrapped in rp);
          // complex form: rbc followed by one or two rtc
          $ruby_model = 'Custom: ((rb, (rt | (rp, rt, rp))) | (rbc, rtc, rtc?))';
          $this->addElement('ruby', 'Inline', $ruby_model, 'Common');

          $this->addElement('rbc', false, 'Required: rb', 'Common');
          $this->addElement('rtc', false, 'Required: rt', 'Common');

          // rb and rt must not contain a nested ruby
          $no_ruby = array('ruby' => true);

          $base = $this->addElement('rb', false, 'Inline', 'Common');
          $base->excludes = $no_ruby;

          $text = $this->addElement('rt', false, 'Inline', 'Common', array('rbspan' => 'Number'));
          $text->excludes = $no_ruby;

          $this->addElement('rp', false, 'Optional: #PCDATA', 'Common');
      }
  }
  /**
   * A "safe" embed module. See SafeObject. This is a proprietary element.
   */
  class HTMLPurifier_HTMLModule_SafeEmbed extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'SafeEmbed';

      /**
       * Registers the embed element with a restricted attribute set and
       * attaches the SafeEmbed post-transform to it.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          // width and height share the limit configured in HTML.MaxImgLength
          $pixels = 'Pixels#' . $config->get('HTML.MaxImgLength');

          $embed_attr = array(
              'src*' => 'URI#embedded',
              'type' => 'Enum#application/x-shockwave-flash',
              'width' => $pixels,
              'height' => $pixels,
              'allowscriptaccess' => 'Enum#never',
              'allownetworking' => 'Enum#internal',
              'flashvars' => 'Text',
              'wmode' => 'Enum#window,transparent,opaque',
              'name' => 'ID',
          );

          $embed = $this->addElement('embed', 'Inline', 'Empty', 'Common', $embed_attr);
          $embed->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeEmbed();
      }
  }
  /**
   * A "safe" object module. In theory, objects permitted by this module will
   * be safe, and untrusted users can be allowed to embed arbitrary flash objects
   * (maybe other types too, but only Flash is supported as of right now).
   * Highly experimental.
   */
  class HTMLPurifier_HTMLModule_SafeObject extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'SafeObject';

      /**
       * Registers object and param with restricted attributes, attaches the
       * SafeObject / SafeParam post-transforms and requests the SafeObject
       * injector.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          // These definitions are not intrinsically safe: the attribute
          // transforms are a vital part of ensuring safety.

          // width and height share the limit configured in HTML.MaxImgLength
          $pixels = 'Pixels#' . $config->get('HTML.MaxImgLength');

          $object_attr = array(
              // While technically not required by the spec, we're forcing
              // it to this value.
              'type' => 'Enum#application/x-shockwave-flash',
              'width' => $pixels,
              'height' => $pixels,
              'data' => 'URI#embedded',
              // the only codebase value that is accepted
              'codebase' => new HTMLPurifier_AttrDef_Enum(
                  array(
                      'http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,40,0'
                  )
              ),
          );
          $object = $this->addElement(
              'object',
              'Inline',
              'Optional: param | Flow | #PCDATA',
              'Common',
              $object_attr
          );
          $object->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeObject();

          $param_attr = array(
              'id' => 'ID',
              'name*' => 'Text',
              'value' => 'Text'
          );
          $param = $this->addElement('param', false, 'Empty', false, $param_attr);
          $param->attr_transform_post[] = new HTMLPurifier_AttrTransform_SafeParam();

          $this->info_injector[] = 'SafeObject';
      }
  }
  /**
   * A "safe" script module. No inline JS is allowed, and pointed to JS
   * files must match whitelist.
   */
  class HTMLPurifier_HTMLModule_SafeScripting extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'SafeScripting';

      /**
       * Registers an empty script element whose src must be one of the keys
       * of the HTML.SafeScripting configuration value.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          // These definitions are not intrinsically safe: the attribute
          // transforms are a vital part of ensuring safety.
          $whitelist = array_keys($config->get('HTML.SafeScripting'));

          $script_attr = array(
              // While technically not required by the spec, we're forcing
              // it to this value.
              'type' => 'Enum#text/javascript',
              'src*' => new HTMLPurifier_AttrDef_Enum($whitelist)
          );
          $script = $this->addElement('script', 'Inline', 'Empty', null, $script_attr);

          // one transform instance is registered for both the pre and post phase
          $required = new HTMLPurifier_AttrTransform_ScriptRequired();
          $script->attr_transform_post[] = $required;
          $script->attr_transform_pre[] = $required;
      }
  }
  /*
  WARNING: THIS MODULE IS EXTREMELY DANGEROUS AS IT ENABLES INLINE SCRIPTING
  INSIDE HTML PURIFIER DOCUMENTS. USE ONLY WITH TRUSTED USER INPUT!!!
  */
  /**
   * XHTML 1.1 Scripting module, defines elements that are used to contain
   * information pertaining to executable scripts or the lack of support
   * for executable scripts.
   * @note The script element declared here accepts optional #PCDATA
   *       content, and the module is flagged as unsafe ($safe = false).
   */
  class HTMLPurifier_HTMLModule_Scripting extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Scripting';

      /**
       * @type array
       */
      public $elements = array('script', 'noscript');

      /**
       * @type array
       */
      public $content_sets = array('Block' => 'script | noscript', 'Inline' => 'script | noscript');

      /**
       * @type bool
       */
      public $safe = false;

      /**
       * Builds the noscript and script element definitions by hand.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          // TODO: create custom child-definition for noscript that
          // auto-wraps stray #PCDATA in a similar manner to
          // blockquote's custom definition (we would use it but
          // blockquote's contents are optional while noscript's contents
          // are required)

          // TODO: convert this to new syntax, main problem is getting
          // both content sets working

          // In theory, this could be safe, but I don't see any reason to
          // allow it.
          $noscript = new HTMLPurifier_ElementDef();
          $noscript->attr = array(0 => array('Common'));
          $noscript->content_model = 'Heading | List | Block';
          $noscript->content_model_type = 'required';
          $this->info['noscript'] = $noscript;

          $script = new HTMLPurifier_ElementDef();
          $script->attr = array(
              'defer' => new HTMLPurifier_AttrDef_Enum(array('defer')),
              'src' => new HTMLPurifier_AttrDef_URI(true),
              'type' => new HTMLPurifier_AttrDef_Enum(array('text/javascript'))
          );
          $script->content_model = '#PCDATA';
          $script->content_model_type = 'optional';

          // one transform instance is registered for both the pre and post phase
          $required = new HTMLPurifier_AttrTransform_ScriptRequired();
          $script->attr_transform_post[] = $required;
          $script->attr_transform_pre[] = $required;
          $this->info['script'] = $script;
      }
  }
  /**
   * Style Attribute Module: declares a 'Style' attribute collection holding
   * the style attribute and includes that collection in 'Core'.
   */
  class HTMLPurifier_HTMLModule_StyleAttribute extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'StyleAttribute';

      /**
       * @type array
       */
      public $attr_collections = array(
          // The inclusion routine differs from the Abstract Modules but
          // is in line with the DTD and XML Schemas.
          'Style' => array('style' => false), // placeholder, filled in by setup()
          'Core' => array(0 => array('Style'))
      );

      /**
       * Replaces the placeholder for the style attribute with a CSS
       * attribute definition.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          $css = new HTMLPurifier_AttrDef_CSS();
          $this->attr_collections['Style']['style'] = $css;
      }
  }
  /**
   * XHTML 1.1 Tables Module, fully defines accessible table elements.
   */
  class HTMLPurifier_HTMLModule_Tables extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Tables';

      /**
       * Registers caption, table, the cell/row elements, the column elements
       * and the row groups.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          $this->addElement('caption', false, 'Inline', 'Common');

          $table_attr = array(
              'border' => 'Pixels',
              'cellpadding' => 'Length',
              'cellspacing' => 'Length',
              'frame' => 'Enum#void,above,below,hsides,lhs,rhs,vsides,box,border',
              'rules' => 'Enum#none,groups,rows,cols,all',
              'summary' => 'Text',
              'width' => 'Length'
          );
          $this->addElement('table', 'Block', new HTMLPurifier_ChildDef_Table(), 'Common', $table_attr);

          // alignment attributes shared by cells, rows, columns and row groups
          $align_attr = array(
              'align' => 'Enum#left,center,right,justify,char',
              'charoff' => 'Length',
              'valign' => 'Enum#top,middle,bottom,baseline',
          );

          // the key sets are disjoint, so array union yields the same
          // result as merging them
          $cell_attr = array(
              'abbr' => 'Text',
              'colspan' => 'Number',
              'rowspan' => 'Number',
              // Apparently, as of HTML5 this attribute only applies
              // to 'th' elements.
              'scope' => 'Enum#row,col,rowgroup,colgroup',
          ) + $align_attr;
          foreach (array('td', 'th') as $cell) {
              $this->addElement($cell, false, 'Flow', 'Common', $cell_attr);
          }
          $this->addElement('tr', false, 'Required: td | th', 'Common', $align_attr);

          $col_attr = array(
              'span' => 'Number',
              'width' => 'MultiLength',
          ) + $align_attr;
          $this->addElement('col', false, 'Empty', 'Common', $col_attr);
          $this->addElement('colgroup', false, 'Optional: col', 'Common', $col_attr);

          foreach (array('tbody', 'thead', 'tfoot') as $row_group) {
              $this->addElement($row_group, false, 'Required: tr', 'Common', $align_attr);
          }
      }
  }
  /**
   * XHTML 1.1 Target Module, defines target attribute in link elements.
   */
  class HTMLPurifier_HTMLModule_Target extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Target';

      /**
       * Gives every listed element a target attribute validated as a
       * frame target.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          // elements that receive the target attribute
          $targetable = array('a');
          foreach ($targetable as $tag) {
              $def = $this->addBlankElement($tag);
              $def->attr = array(
                  'target' => new HTMLPurifier_AttrDef_HTML_FrameTarget()
              );
          }
      }
  }
  /**
   * Module adds the target=blank attribute transformation to a tags. It
   * is enabled by HTML.TargetBlank
   */
  class HTMLPurifier_HTMLModule_TargetBlank extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'TargetBlank';

      /**
       * Appends the TargetBlank transform to the post-transforms of a.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          $transform = new HTMLPurifier_AttrTransform_TargetBlank();
          $anchor = $this->addBlankElement('a');
          $anchor->attr_transform_post[] = $transform;
      }
  }
  /**
   * Module adds the target-based noopener attribute transformation to a tags. It
   * is enabled by HTML.TargetNoopener
   */
  class HTMLPurifier_HTMLModule_TargetNoopener extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'TargetNoopener';

      /**
       * Appends the TargetNoopener transform to the post-transforms of a.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          $transform = new HTMLPurifier_AttrTransform_TargetNoopener();
          $anchor = $this->addBlankElement('a');
          $anchor->attr_transform_post[] = $transform;
      }
  }
  /**
   * Module adds the target-based noreferrer attribute transformation to a tags. It
   * is enabled by HTML.TargetNoreferrer
   */
  class HTMLPurifier_HTMLModule_TargetNoreferrer extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'TargetNoreferrer';

      /**
       * Appends the TargetNoreferrer transform to the post-transforms of a.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          $transform = new HTMLPurifier_AttrTransform_TargetNoreferrer();
          $anchor = $this->addBlankElement('a');
          $anchor->attr_transform_post[] = $transform;
      }
  }
  /**
   * XHTML 1.1 Text Module, defines basic text containers. Core Module.
   * @note In the normative XML Schema specification, this module
   *       is further abstracted into the following modules:
   *          - Block Phrasal (address, blockquote, pre, h1, h2, h3, h4, h5, h6)
   *          - Block Structural (div, p)
   *          - Inline Phrasal (abbr, acronym, cite, code, dfn, em, kbd, q, samp, strong, var)
   *          - Inline Structural (br, span)
   *       This module, functionally, does not distinguish between these
   *       sub-modules, but the code is internally structured to reflect
   *       these distinctions.
   */
  class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'Text';

      /**
       * @type array
       */
      public $content_sets = array(
          'Flow' => 'Heading | Block | Inline'
      );

      /**
       * Registers the text elements, grouped by sub-module.
       * @param HTMLPurifier_Config $config
       */
      public function setup($config)
      {
          // Inline Phrasal -------------------------------------------------
          foreach (array('abbr', 'acronym', 'cite', 'dfn', 'kbd') as $tag) {
              $this->addElement($tag, 'Inline', 'Inline', 'Common');
          }
          $this->addElement('q', 'Inline', 'Inline', 'Common', array('cite' => 'URI'));
          foreach (array('samp', 'var') as $tag) {
              $this->addElement($tag, 'Inline', 'Inline', 'Common');
          }
          // these elements carry the "formatting" flag on their definition
          foreach (array('em', 'strong', 'code') as $tag) {
              $def = $this->addElement($tag, 'Inline', 'Inline', 'Common');
              $def->formatting = true;
          }

          // Inline Structural ----------------------------------------------
          $this->addElement('span', 'Inline', 'Inline', 'Common');
          $this->addElement('br', 'Inline', 'Empty', 'Core');

          // Block Phrasal --------------------------------------------------
          $this->addElement('address', 'Block', 'Inline', 'Common');
          $this->addElement(
              'blockquote',
              'Block',
              'Optional: Heading | Block | List',
              'Common',
              array('cite' => 'URI')
          );
          $pre = $this->addElement('pre', 'Block', 'Inline', 'Common');
          // elements that pre excludes
          $pre->excludes = $this->makeLookup('img', 'big', 'small', 'object', 'applet', 'font', 'basefont');
          for ($rank = 1; $rank <= 6; $rank++) {
              $this->addElement('h' . $rank, 'Heading', 'Inline', 'Common');
          }

          // Block Structural -----------------------------------------------
          $paragraph = $this->addElement('p', 'Block', 'Inline', 'Common');
          // array_flip() turns the list into a lookup keyed by element name
          $closers = array("address", "blockquote", "center", "dir", "div", "dl", "fieldset", "ol", "p", "ul");
          $paragraph->autoclose = array_flip($closers);
          $this->addElement('div', 'Block', 'Flow', 'Common');
      }
  }
  /**
   * Abstract class for a set of proprietary modules that clean up (tidy)
   * poorly written HTML.
   *
   * Subclasses describe their fixes by overriding makeFixes(); setup()
   * then filters those fixes by the configured tidy level and the
   * HTML.TidyAdd / HTML.TidyRemove lists before installing them.
   * @todo Figure out how to protect some of these methods/properties
   */
  class HTMLPurifier_HTMLModule_Tidy extends HTMLPurifier_HTMLModule
  {
      /**
       * List of supported levels.
       * Index zero is a special case "no fixes" level.
       * @type array
       */
      public $levels = array(0 => 'none', 'light', 'medium', 'heavy');

      /**
       * Default level to place all fixes in.
       * Disabled by default.
       * @type string
       */
      public $defaultLevel = null;

      /**
       * Lists of fixes used by getFixesForLevel().
       * Format is:
       *      HTMLModule_Tidy->fixesForLevel[$level] = array('fix-1', 'fix-2');
       * @type array
       */
      public $fixesForLevel = array(
          'light' => array(),
          'medium' => array(),
          'heavy' => array()
      );

      /**
       * Lazy load constructs the module by determining the necessary
       * fixes to create and then delegating to the populate() function.
       * @param HTMLPurifier_Config $config
       * @todo Wildcard matching and error reporting when an added or
       *       subtracted fix has no effect.
       */
      public function setup($config)
      {
          // create fixes, initialize fixesForLevel
          $fixes = $this->makeFixes();
          if (!is_array($fixes)) {
              // a makeFixes() implementation that returns nothing is treated
              // as "no fixes" instead of failing in the loops below
              $fixes = array();
          }
          $this->makeFixesForLevel($fixes);

          // figure out which fixes to use
          $level = $config->get('HTML.TidyLevel');
          $fixes_lookup = $this->getFixesForLevel($level);

          // get custom fix declarations: these need namespace processing
          $add_fixes = $config->get('HTML.TidyAdd');
          $remove_fixes = $config->get('HTML.TidyRemove');

          foreach ($fixes as $name => $fix) {
              // needs to be refactored a little to implement globbing
              // a fix survives only if it is not explicitly removed and is
              // either explicitly added or enabled by the level
              if (isset($remove_fixes[$name]) ||
                  (!isset($add_fixes[$name]) && !isset($fixes_lookup[$name]))) {
                  unset($fixes[$name]);
              }
          }

          // populate this module with necessary fixes
          $this->populate($fixes);
      }

      /**
       * Retrieves all fixes per a level, returning fixes for that specific
       * level as well as all levels below it.
       * An unrecognized level triggers an E_USER_WARNING and yields no fixes.
       * @param string $level level identifier, see $levels for valid values
       * @return array Lookup up table of fixes
       */
      public function getFixesForLevel($level)
      {
          if ($level == $this->levels[0]) {
              return array();
          }
          $activated_levels = array();
          for ($i = 1, $c = count($this->levels); $i < $c; $i++) {
              $activated_levels[] = $this->levels[$i];
              if ($this->levels[$i] == $level) {
                  break;
              }
          }
          // the loop ran to completion without a match
          if ($i == $c) {
              trigger_error(
                  'Tidy level ' . htmlspecialchars($level) . ' not recognized',
                  E_USER_WARNING
              );
              return array();
          }
          $ret = array();
          foreach ($activated_levels as $activated) {
              foreach ($this->fixesForLevel[$activated] as $fix) {
                  $ret[$fix] = true;
              }
          }
          return $ret;
      }

      /**
       * Dynamically populates the $fixesForLevel member variable using
       * the fixes array. It may be custom overloaded, used in conjunction
       * with $defaultLevel, or not used at all.
       * When $defaultLevel is set, every fix name is placed in that level.
       * @param array $fixes
       */
      public function makeFixesForLevel($fixes)
      {
          if (!isset($this->defaultLevel)) {
              return;
          }
          if (!isset($this->fixesForLevel[$this->defaultLevel])) {
              trigger_error(
                  'Default level ' . $this->defaultLevel . ' does not exist',
                  E_USER_ERROR
              );
              return;
          }
          $this->fixesForLevel[$this->defaultLevel] = array_keys($fixes);
      }

      /**
       * Populates the module with transforms and other special-case code
       * based on a list of fixes passed to it
       * @param array $fixes Lookup table of fixes to activate
       */
      public function populate($fixes)
      {
          foreach ($fixes as $name => $fix) {
              // determine what the fix is for
              list($type, $params) = $this->getFixType($name);
              switch ($type) {
                  case 'attr_transform_pre':
                  case 'attr_transform_post':
                      $attr = $params['attr'];
                      if (isset($params['element'])) {
                          // element-specific transform: stored on the
                          // element definition, created blank if missing
                          $element = $params['element'];
                          if (empty($this->info[$element])) {
                              $e = $this->addBlankElement($element);
                          } else {
                              $e = $this->info[$element];
                          }
                      } else {
                          // no element given: stored on the module itself
                          // under info_attr_transform_pre / _post
                          $type = "info_$type";
                          $e = $this;
                      }
                      // PHP does some weird parsing when I do
                      // $e->$type[$attr], so I have to assign a ref.
                      $f =& $e->$type;
                      $f[$attr] = $fix;
                      break;
                  case 'tag_transform':
                      $this->info_tag_transform[$params['element']] = $fix;
                      break;
                  case 'child':
                  case 'content_model_type':
                      $element = $params['element'];
                      if (empty($this->info[$element])) {
                          $e = $this->addBlankElement($element);
                      } else {
                          $e = $this->info[$element];
                      }
                      $e->$type = $fix;
                      break;
                  default:
                      trigger_error("Fix type $type not supported", E_USER_ERROR);
                      break;
              }
          }
      }

      /**
       * Parses a fix name and determines what kind of fix it is, as well
       * as other information defined by the fix.
       * The name has the shape "element@attr#property", where every part
       * is optional.
       * @param $name String name of fix
       * @return array(string $fix_type, array $fix_parameters)
       * @note $fix_parameters is type dependant, see populate() for usage
       *       of these parameters
       */
      public function getFixType($name)
      {
          // parse it
          $property = $attr = null;
          if (strpos($name, '#') !== false) {
              list($name, $property) = explode('#', $name);
          }
          if (strpos($name, '@') !== false) {
              list($name, $attr) = explode('@', $name);
          }

          // figure out the parameters
          $params = array();
          if ($name !== '') {
              $params['element'] = $name;
          }
          if (!is_null($attr)) {
              $params['attr'] = $attr;
          }

          // special case: attribute transform
          if (!is_null($attr)) {
              if (is_null($property)) {
                  // attribute transforms default to the "pre" phase
                  $property = 'pre';
              }
              $type = 'attr_transform_' . $property;
              return array($type, $params);
          }

          // special case: tag transform
          if (is_null($property)) {
              return array('tag_transform', $params);
          }

          return array($property, $params);
      }

      /**
       * Defines all fixes the module will perform in a compact
       * associative array of fix name to fix implementation.
       * The base implementation defines no fixes; subclasses override it.
       * @return array
       */
      public function makeFixes()
      {
          return array();
      }
  }
  /**
   * Module that adds the xml:lang attribute, validated as a language code,
   * to the 'Lang' attribute collection.
   */
  class HTMLPurifier_HTMLModule_XMLCommonAttributes extends HTMLPurifier_HTMLModule
  {
      /**
       * @type string
       */
      public $name = 'XMLCommonAttributes';

      /**
       * @type array
       */
      public $attr_collections = array(
          'Lang' => array('xml:lang' => 'LanguageCode')
      );
  }
  /**
   * Name is deprecated, but allowed in strict doctypes, so its fixes are
   * only placed in the 'heavy' tidy level by default.
   */
  class HTMLPurifier_HTMLModule_Tidy_Name extends HTMLPurifier_HTMLModule_Tidy
  {
      /**
       * @type string
       */
      public $name = 'Tidy_Name';

      /**
       * @type string
       */
      public $defaultLevel = 'heavy';

      /**
       * Returns the name-attribute fixes for img and a; both entries share
       * a single transform instance.
       * @return array
       */
      public function makeFixes()
      {
          // @name for img, a -----------------------------------------------
          // Technically, it's allowed even on strict, so we allow authors to use
          // it. However, it's deprecated in future versions of XHTML.
          $name_transform = new HTMLPurifier_AttrTransform_Name();

          $fixes = array();
          $fixes['img@name'] =
          $fixes['a@name'] = $name_transform;
          return $fixes;
      }
  }
  /**
   * Tidy module holding fixes for the background attribute of table
   * elements and for the height attribute of table. All fixes are placed
   * in the 'light' level by default.
   */
  class HTMLPurifier_HTMLModule_Tidy_Proprietary extends HTMLPurifier_HTMLModule_Tidy
  {
      /**
       * @type string
       */
      public $name = 'Tidy_Proprietary';

      /**
       * @type string
       */
      public $defaultLevel = 'light';

      /**
       * @return array
       */
      public function makeFixes()
      {
          $fixes = array();

          // @background: each element gets its own transform instance
          $with_background = array('table', 'td', 'th', 'tr', 'thead', 'tfoot', 'tbody');
          foreach ($with_background as $element) {
              $fixes[$element . '@background'] = new HTMLPurifier_AttrTransform_Background();
          }

          // @height for table
          $fixes['table@height'] = new HTMLPurifier_AttrTransform_Length('height');

          return $fixes;
      }
  }
  /**
   * Tidy module that defines the fixes for deprecated tags and deprecated
   * attributes.
   */
  class HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 extends HTMLPurifier_HTMLModule_Tidy
  {
      /**
       * Builds the table of fixes. Keys are fix names ("element" for tag
       * transforms, "element@attr" for attribute transforms), values are
       * the transform objects.
       * @return array
       */
      public function makeFixes()
      {
          $fixes = array();

          // == deprecated tag transforms ===================================

          $line_through = 'text-decoration:line-through;';

          $fixes['font'] = new HTMLPurifier_TagTransform_Font();
          $fixes['menu'] = new HTMLPurifier_TagTransform_Simple('ul');
          $fixes['dir'] = new HTMLPurifier_TagTransform_Simple('ul');
          $fixes['center'] = new HTMLPurifier_TagTransform_Simple('div', 'text-align:center;');
          $fixes['u'] = new HTMLPurifier_TagTransform_Simple('span', 'text-decoration:underline;');
          $fixes['s'] = new HTMLPurifier_TagTransform_Simple('span', $line_through);
          $fixes['strike'] = new HTMLPurifier_TagTransform_Simple('span', $line_through);

          // == deprecated attribute transforms =============================

          // @align for caption ---------------------------------------------
          $caption_align = array(
              // we're following IE's behavior, not Firefox's, due
              // to the fact that no one supports caption-side:right,
              // W3C included (with CSS 2.1). This is a slightly
              // unreasonable attribute!
              'left' => 'text-align:left;',
              'right' => 'text-align:right;',
              'top' => 'caption-side:top;',
              'bottom' => 'caption-side:bottom;' // not supported by IE
          );
          $fixes['caption@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $caption_align);

          // @align for img -------------------------------------------------
          $img_align = array(
              'left' => 'float:left;',
              'right' => 'float:right;',
              'top' => 'vertical-align:top;',
              'middle' => 'vertical-align:middle;',
              'bottom' => 'vertical-align:baseline;',
          );
          $fixes['img@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $img_align);

          // @align for table -----------------------------------------------
          $table_align = array(
              'left' => 'float:left;',
              'center' => 'margin-left:auto;margin-right:auto;',
              'right' => 'float:right;'
          );
          $fixes['table@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $table_align);

          // @align for hr --------------------------------------------------
          $hr_align = array(
              // we use both text-align and margin because these work
              // for different browsers (IE and Firefox, respectively)
              // and the melange makes for a pretty cross-compatible
              // solution
              'left' => 'margin-left:0;margin-right:auto;text-align:left;',
              'center' => 'margin-left:auto;margin-right:auto;text-align:center;',
              'right' => 'margin-left:auto;margin-right:0;text-align:right;'
          );
          $fixes['hr@align'] = new HTMLPurifier_AttrTransform_EnumToCSS('align', $hr_align);

          // @align for h1, h2, h3, h4, h5, h6, p, div ----------------------
          $text_align = array(
              'left' => 'text-align:left;',
              'right' => 'text-align:right;',
              'center' => 'text-align:center;',
              'justify' => 'text-align:justify;',
          );
          // all eight entries share one transform instance
          $block_align = new HTMLPurifier_AttrTransform_EnumToCSS('align', $text_align);
          $fixes['h1@align'] =
          $fixes['h2@align'] =
          $fixes['h3@align'] =
          $fixes['h4@align'] =
          $fixes['h5@align'] =
          $fixes['h6@align'] =
          $fixes['p@align'] =
          $fixes['div@align'] = $block_align;

          // @bgcolor for table, td, th -------------------------------------
          $bgcolor = new HTMLPurifier_AttrTransform_BgColor();
          $fixes['table@bgcolor'] =
          $fixes['td@bgcolor'] =
          $fixes['th@bgcolor'] = $bgcolor;

          // @border for img ------------------------------------------------
          $fixes['img@border'] = new HTMLPurifier_AttrTransform_Border();

          // @clear for br --------------------------------------------------
          $br_clear = array(
              'left' => 'clear:left;',
              'right' => 'clear:right;',
              'all' => 'clear:both;',
              'none' => 'clear:none;',
          );
          $fixes['br@clear'] = new HTMLPurifier_AttrTransform_EnumToCSS('clear', $br_clear);

          // @height for td, th ---------------------------------------------
          $cell_height = new HTMLPurifier_AttrTransform_Length('height');
          $fixes['td@height'] =
          $fixes['th@height'] = $cell_height;

          // @hspace for img ------------------------------------------------
          $fixes['img@hspace'] = new HTMLPurifier_AttrTransform_ImgSpace('hspace');

          // @noshade for hr ------------------------------------------------
          // this transformation is not precise but often good enough.
          // different browsers use different styles to designate noshade
          $noshade_css = 'color:#808080;background-color:#808080;border:0;';
          $fixes['hr@noshade'] = new HTMLPurifier_AttrTransform_BoolToCSS('noshade', $noshade_css);

          // @nowrap for td, th ---------------------------------------------
          $cell_nowrap = new HTMLPurifier_AttrTransform_BoolToCSS('nowrap', 'white-space:nowrap;');
          $fixes['td@nowrap'] =
          $fixes['th@nowrap'] = $cell_nowrap;

          // @size for hr ---------------------------------------------------
          $fixes['hr@size'] = new HTMLPurifier_AttrTransform_Length('size', 'height');

          // @type for li, ol, ul -------------------------------------------
          $ul_types = array(
              'disc' => 'list-style-type:disc;',
              'square' => 'list-style-type:square;',
              'circle' => 'list-style-type:circle;'
          );
          $ol_types = array(
              '1' => 'list-style-type:decimal;',
              'i' => 'list-style-type:lower-roman;',
              'I' => 'list-style-type:upper-roman;',
              'a' => 'list-style-type:lower-alpha;',
              'A' => 'list-style-type:upper-alpha;'
          );
          // array union keeps the '1' key as is; array_merge() would renumber it
          $li_types = $ul_types + $ol_types;

          $fixes['ul@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ul_types);
          $fixes['ol@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $ol_types, true);
          $fixes['li@type'] = new HTMLPurifier_AttrTransform_EnumToCSS('type', $li_types, true);

          // @vspace for img ------------------------------------------------
          $fixes['img@vspace'] = new HTMLPurifier_AttrTransform_ImgSpace('vspace');

          // @width for hr, td, th ------------------------------------------
          $width = new HTMLPurifier_AttrTransform_Length('width');
          $fixes['td@width'] =
          $fixes['th@width'] =
          $fixes['hr@width'] = $width;

          return $fixes;
      }
  }
  /**
   * Tidy module variant that starts from the fixes of
   * HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 and additionally tags
   * blockquote with the 'strictblockquote' content model type, for which
   * this module supplies the child definition itself.
   */
  class HTMLPurifier_HTMLModule_Tidy_Strict extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
  {
      /**
       * @type string
       */
      public $name = 'Tidy_Strict';

      /**
       * @type string
       */
      public $defaultLevel = 'light';

      /**
       * Signals that getChildDef() below should be consulted.
       * @type bool
       */
      public $defines_child_def = true;

      /**
       * Returns the parent's fixes plus the blockquote content model override.
       * @return array
       */
      public function makeFixes()
      {
          $fixes = parent::makeFixes();
          $fixes['blockquote#content_model_type'] = 'strictblockquote';
          return $fixes;
      }

      /**
       * Builds the child definition for 'strictblockquote' elements and
       * defers to the parent for every other content model type.
       * @param HTMLPurifier_ElementDef $def
       * @return HTMLPurifier_ChildDef_StrictBlockquote
       */
      public function getChildDef($def)
      {
          if ($def->content_model_type == 'strictblockquote') {
              return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model);
          }
          return parent::getChildDef($def);
      }
  }
  /**
   * Tidy module variant that reuses the HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
   * fixes unchanged; it only overrides the module name and the default level.
   */
  class HTMLPurifier_HTMLModule_Tidy_Transitional extends HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4
  {
      /**
       * Module name.
       * @type string
       */
      public $name = 'Tidy_Transitional';

      /**
       * Level used when none is configured.
       * @type string
       */
      public $defaultLevel = 'heavy';
  }
  /**
   * Tidy module whose only fix is a global attribute transform for @lang.
   */
  class HTMLPurifier_HTMLModule_Tidy_XHTML extends HTMLPurifier_HTMLModule_Tidy
  {
      /**
       * @type string
       */
      public $name = 'Tidy_XHTML';

      /**
       * @type string
       */
      public $defaultLevel = 'medium';

      /**
       * Returns the single fix registered by this module, keyed '@lang'.
       * @return array
       */
      public function makeFixes()
      {
          return array(
              '@lang' => new HTMLPurifier_AttrTransform_Lang(),
          );
      }
  }
  /**
   * Injector that auto paragraphs text in the root node based on
   * double-spacing.
   *
   * Text containing a blank line ("\n\n") is split into <p> elements, and
   * inline content is wrapped in <p> when it sits in the root node or when a
   * later sibling text contains a blank line.
   *
   * @todo Ensure all states are unit tested, including variations as well.
   * @todo Make a graph of the flow control for this Injector.
   */
  class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
  {
      /**
       * @type string
       */
      public $name = 'AutoParagraph';

      /**
       * @type array
       */
      public $needed = array('p');

      /**
       * Creates a <p> start token carrying the
       * 'MakeWellFormed_TagClosedError' armor flag.
       * @return HTMLPurifier_Token_Start
       */
      private function _pStart()
      {
          $start = new HTMLPurifier_Token_Start('p');
          $start->armor['MakeWellFormed_TagClosedError'] = true;
          return $start;
      }

      /**
       * Decides whether a text token must be wrapped in, or split into,
       * paragraphs. The token is replaced in place when a rewrite is needed.
       * @param HTMLPurifier_Token_Text $token
       */
      public function handleText(&$token)
      {
          $text = $token->data;

          if (!$this->allowsElement('p')) {
              // The parent does not accept <p>. Only a parent that *is* a
              // <p> is interesting; anything else is left alone.
              // State 4.1: ...<b>PAR1
              // State 4.2: ...<b>PAR1\n\nPAR2
              if (empty($this->currentNesting)) {
                  return;
              }
              $parent = $this->currentNesting[count($this->currentNesting) - 1];
              if ($parent->name == 'p') {
                  // State 3.1: ...<p>PAR1
                  // State 3.2: ...<p>PAR1\n\nPAR2
                  $token = array();
                  $this->_splitText($text, $token);
              }
              return;
          }

          if (!empty($this->currentNesting) && strpos($text, "\n\n") === false) {
              // State 2: <div>PAR1... (similar to 1.4)
              // We are inside an element that allows paragraphs, but this
              // text has no blank line of its own, so look ahead.
              if ($this->_pLookAhead()) {
                  // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                  // Note: this is always the first child, since an earlier
                  // inline element would have triggered this same routine
                  // and found the double newline (a comment being one
                  // possible exception).
                  $token = array($this->_pStart(), $token);
              }
              // State 2.2.1: <div>PAR1<div>
              // State 2.2.2: <div>PAR1<b>PAR1</b></div>
              return;
          }

          // From here on: text in the anonymous root node, or text with a
          // double newline anywhere in the document (same treatment).
          $i = $nesting = null;
          $hasFollower = $this->forwardUntilEndToken($i, $current, $nesting);

          if (!$hasFollower && $token->is_whitespace) {
              // State 1.1: ... ^ (whitespace, then document end)
              // Degenerate case, nothing to do.
              return;
          }

          if (!$token->is_whitespace || $this->_isInline($current)) {
              // State 1.2: PAR1
              // State 1.3: PAR1\n\nPAR2
              // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
              $token = array($this->_pStart());
              $this->_splitText($text, $token);
          }
          // State 1.5: \n<hr /> -- whitespace before a block element is kept.
      }

      /**
       * Decides whether a <p> start token (and, in the root node, a
       * separating "\n\n" text token) must be inserted before an element.
       * @param HTMLPurifier_Token $token
       */
      public function handleElement(&$token)
      {
          // We don't have to check if we're already in a <p> tag for block
          // tokens, because the tag would have been autoclosed by MakeWellFormed.
          if (!$this->allowsElement('p')) {
              // State 2.2: <ul><li>
              // State 2.4: <p><b>
              return;
          }

          $inline = $this->_isInline($token);

          if (!empty($this->currentNesting)) {
              if (!$inline) {
                  // State 2.3: ...<div>
                  return;
              }
              // State 1: <div>...<b>
              // Check if this token is adjacent to the parent token
              // (seek backwards until token isn't whitespace).
              $i = null;
              $this->backward($i, $prev);
              if ($prev instanceof HTMLPurifier_Token_Start) {
                  // State 1.2.1: <div><b>
                  // State 1.3.1: <div><b>PAR1\n\nPAR2   -> needs <p>
                  // State 1.3.2: <div><b>PAR1</b></div>
                  // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
                  $wrap = $this->_pLookAhead();
              } else {
                  // State 1.1.4: <div><p>PAR1</p>\n\n<b>   -> needs <p>
                  // (quite frankly, this should be handled by splitText)
                  // State 1.1.1: <div><p>PAR1</p><b>
                  // State 1.1.2: <div><br /><b>
                  // State 1.1.3: <div>PAR<b>
                  $wrap = $prev instanceof HTMLPurifier_Token_Text
                      && substr($prev->data, -2) === "\n\n";
              }
              if ($wrap) {
                  $token = array($this->_pStart(), $token);
              }
              return;
          }

          // Root node.
          if ($inline) {
              // State 3.1: <b>
              // This is where the {p} tag is inserted, not reflected in
              // inputTokens yet, however.
              $token = array($this->_pStart(), $token);
          }
          // State 3.2: <div> -- block elements are not wrapped.

          $i = null;
          if ($this->backward($i, $prev) && !$prev instanceof HTMLPurifier_Token_Text) {
              // State 3.1.1: ...</p>{p}<b>
              // State 3.2.1: ...</p><div>
              // Separate from the previous element with a blank line.
              $replacement = is_array($token) ? $token : array($token);
              array_unshift($replacement, new HTMLPurifier_Token_Text("\n\n"));
              $token = $replacement;
          }
          // State 3.1.2: ...</p>\n\n{p}<b>
          // State 3.2.2: ...</p>\n\n<div>
          // Note: PAR<ELEM> cannot occur because PAR would have been
          // wrapped in <p> tags.
      }

      /**
       * Splits a text on double newlines into paragraph tokens and appends
       * them to the token list that will replace the original text token.
       * @param string $data String text data that will be processed
       *          into paragraphs
       * @param HTMLPurifier_Token[] $result Reference to array of tokens that the
       *          tags will be appended onto
       */
      private function _splitText($data, &$result)
      {
          $chunks = explode("\n\n", $data);
          $total = count($chunks);

          if ($total == 1) {
              // There were no double-newlines, abort quickly. In theory this
              // should never happen.
              $result[] = new HTMLPurifier_Token_Text($data);
              return;
          }

          // Double newline at the front.
          $reopenParagraph = false;
          if (trim($chunks[0]) === '') {
              if (empty($result)) {
                  // The injector added no start paragraph token, so we have
                  // been inside a paragraph for a while and the blank line
                  // ends it. A new start token is only emitted further down
                  // if real paragraphs follow; otherwise the next call to
                  // the injector takes care of it.
                  $result[] = new HTMLPurifier_Token_End('p');
                  $result[] = new HTMLPurifier_Token_Text("\n\n");
                  $reopenParagraph = true;
              } else {
                  // We just started a new paragraph. Reinstate the
                  // double-newline for presentation's sake, since it was in
                  // the source code.
                  array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
              }
          }

          // Double newline at the end: the last paragraph gets a closing tag.
          $closeFinal = trim($chunks[$total - 1]) === '';

          // Keep only chunks with visible content.
          $paragraphs = array();
          foreach ($chunks as $chunk) {
              if (trim($chunk) !== '') {
                  $paragraphs[] = $chunk;
              }
          }

          // Just a giant blob of whitespace.
          if (empty($paragraphs)) {
              return;
          }

          // Start tag indicated by "\n\n" at the beginning of $data.
          if ($reopenParagraph) {
              $result[] = $this->_pStart();
          }

          // Append the paragraphs. Every paragraph but the first is preceded
          // by a start tag; every paragraph but the last is followed by
          // </p> and a blank line. The last one is only closed when the
          // data ended in a blank line -- otherwise MakeWellFormed closes it.
          $finalIndex = count($paragraphs) - 1;
          foreach ($paragraphs as $index => $paragraph) {
              if ($index > 0) {
                  $result[] = $this->_pStart();
              }
              $result[] = new HTMLPurifier_Token_Text($paragraph);
              if ($closeFinal || $index < $finalIndex) {
                  $result[] = new HTMLPurifier_Token_End('p');
                  $result[] = new HTMLPurifier_Token_Text("\n\n");
              }
          }
      }

      /**
       * Returns true if passed token is inline (and, ergo, allowed in
       * paragraph tags), i.e. its name is among the child elements of <p>.
       * @param HTMLPurifier_Token $token
       * @return bool
       */
      private function _isInline($token)
      {
          return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
      }

      /**
       * Looks ahead in the token list and determines whether or not we need
       * to insert a <p> tag.
       * @return bool
       */
      private function _pLookAhead()
      {
          // A start token as the current token counts as one open level.
          $nesting = ($this->currentToken instanceof HTMLPurifier_Token_Start) ? 1 : 0;
          $i = null;
          while ($this->forwardUntilEndToken($i, $current, $nesting)) {
              $verdict = $this->_checkNeedsP($current);
              if ($verdict !== null) {
                  return $verdict;
              }
          }
          return false;
      }

      /**
       * Determines if a particular token requires an earlier inline token
       * to get a paragraph. This should be used with _forwardUntilEndToken
       * @param HTMLPurifier_Token $current
       * @return bool|null true: paragraph needed; false: stop looking, none
       *         needed; null: undecided, keep scanning
       */
      private function _checkNeedsP($current)
      {
          if ($current instanceof HTMLPurifier_Token_Start) {
              // <div>PAR1<div> -- a block element terminates the search early.
              return $this->_isInline($current) ? null : false;
          }
          if ($current instanceof HTMLPurifier_Token_Text
              && strpos($current->data, "\n\n") !== false
          ) {
              // <div>PAR1<b>PAR1\n\nPAR2
              return true;
          }
          // <div>PAR1<b>PAR1...
          return null;
      }
  }
  /**
   * Injector that displays the URL of an anchor instead of linking to it,
   * in addition to showing the text of the link: the href attribute is
   * removed from the anchor and " (URL)" is appended as text after it.
   */
  class HTMLPurifier_Injector_DisplayLinkURI extends HTMLPurifier_Injector
  {
      /**
       * @type string
       */
      public $name = 'DisplayLinkURI';

      /**
       * @type array
       */
      public $needed = array('a');

      /**
       * Intentionally empty: all work happens when the end tag is seen.
       * @param $token
       */
      public function handleElement(&$token)
      {
      }

      /**
       * On a closing </a> whose start token carries an href, strips the href
       * from the start token and replaces the end token with the end token
       * followed by a text token " (URL)".
       *
       * Only anchors are processed. Without the name check, any element
       * carrying an href attribute on its start token would have the
       * attribute removed and URL text appended as well.
       *
       * @param HTMLPurifier_Token $token
       */
      public function handleEnd(&$token)
      {
          if ($token->name !== 'a' || !isset($token->start->attr['href'])) {
              // not an anchor, or nothing to display
              return;
          }
          $url = $token->start->attr['href'];
          unset($token->start->attr['href']);
          $token = array($token, new HTMLPurifier_Token_Text(" ($url)"));
      }
  }
  /**
   * Injector that converts URL-like text into actual links.
   *
   * A text node is only examined when it contains "://". Within such a node
   * the pattern below accepts anything starting with a "scheme:" prefix,
   * "www." (optionally with digits), or "domain.tld/", so it is broader
   * than just http, https and ftp URLs.
   */
  class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector
  {
      /**
       * @type string
       */
      public $name = 'Linkify';

      /**
       * @type array
       */
      public $needed = array('a' => array('href'));

      /**
       * Replaces a text token with alternating text tokens and
       * <a href="URL">URL</a> token triples. The token is left untouched
       * when links are not allowed here, when the text contains no "://",
       * or when the regular expression engine fails.
       *
       * @param HTMLPurifier_Token $token
       */
      public function handleText(&$token)
      {
          if (!$this->allowsElement('a')) {
              return;
          }

          if (strpos($token->data, '://') === false) {
              // our really quick heuristic failed, abort
              // this may not work so well if we want to match things like
              // "google.com", but then again, most people don't
              return;
          }

          // there is/are URL(s). Let's split the string.
          // We use this regex:
          // https://gist.github.com/gruber/249502
          // but with @cscott's backtracking fix and also
          // the Unicode characters un-Unicodified.
          $bits = preg_split(
              '/\\b((?:[a-z][\\w\\-]+:(?:\\/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}\\/)(?:[^\\s()<>]|\\((?:[^\\s()<>]|(?:\\([^\\s()<>]+\\)))*\\))+(?:\\((?:[^\\s()<>]|(?:\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\'".,<>?\x{00ab}\x{00bb}\x{201c}\x{201d}\x{2018}\x{2019}]))/iu',
              $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);

          if ($bits === false) {
              // preg_split() reports engine failures (for instance an
              // exhausted backtrack limit, or malformed UTF-8 under /u) by
              // returning false. Keep the original text token rather than
              // replacing it with garbage.
              return;
          }

          $token = array();

          // With PREG_SPLIT_DELIM_CAPTURE and a single capturing group the
          // pieces alternate: plain text, link, plain text, link, ...
          // $i = index
          // $c = count
          // $l = is link
          for ($i = 0, $c = count($bits), $l = false; $i < $c; $i++, $l = !$l) {
              if (!$l) {
                  if ($bits[$i] === '') {
                      // empty filler between/around adjacent matches
                      continue;
                  }
                  $token[] = new HTMLPurifier_Token_Text($bits[$i]);
              } else {
                  $token[] = new HTMLPurifier_Token_Start('a', array('href' => $bits[$i]));
                  $token[] = new HTMLPurifier_Token_Text($bits[$i]);
                  $token[] = new HTMLPurifier_Token_End('a');
              }
          }
      }
  }
  /**
   * Injector that converts configuration directive syntax %Namespace.Directive
   * to links
   */
  class HTMLPurifier_Injector_PurifierLinkify extends HTMLPurifier_Injector
  {
      /**
       * @type string
       */
      public $name = 'PurifierLinkify';

      /**
       * URL template read from %AutoFormat.PurifierLinkify.DocURL; every
       * '%s' in it is replaced by the matched "Namespace.Directive".
       * @type string
       */
      public $docURL;

      /**
       * @type array
       */
      public $needed = array('a' => array('href'));

      /**
       * Caches the documentation URL template, then defers to the parent.
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return string whatever parent::prepare() returns
       */
      public function prepare($config, $context)
      {
          $this->docURL = $config->get('AutoFormat.PurifierLinkify.DocURL');
          return parent::prepare($config, $context);
      }

      /**
       * Replaces a text token with alternating text tokens and
       * <a href="...">%Namespace.Directive</a> token triples. The token is
       * left untouched when links are not allowed here, when the text
       * contains no '%', or when the regular expression engine fails.
       *
       * @param HTMLPurifier_Token $token
       */
      public function handleText(&$token)
      {
          if (!$this->allowsElement('a')) {
              return;
          }
          if (strpos($token->data, '%') === false) {
              // cheap pre-check: no percent sign, no directive
              return;
          }

          $bits = preg_split('#%([a-z0-9]+\.[a-z0-9]+)#Si', $token->data, -1, PREG_SPLIT_DELIM_CAPTURE);

          if ($bits === false) {
              // preg_split() returns false when the regex engine fails;
              // keep the original text token in that case.
              return;
          }

          $token = array();

          // Pieces alternate between plain text and captured directive name.
          // $i = index
          // $c = count
          // $l = is link
          for ($i = 0, $c = count($bits), $l = false; $i < $c; $i++, $l = !$l) {
              if (!$l) {
                  if ($bits[$i] === '') {
                      continue;
                  }
                  $token[] = new HTMLPurifier_Token_Text($bits[$i]);
              } else {
                  $token[] = new HTMLPurifier_Token_Start(
                      'a',
                      array('href' => str_replace('%s', $bits[$i], $this->docURL))
                  );
                  // the capture excludes the leading '%', so put it back
                  $token[] = new HTMLPurifier_Token_Text('%' . $bits[$i]);
                  $token[] = new HTMLPurifier_Token_End('a');
              }
          }
      }
  }
  /**
   * Injector that removes start/end tag pairs that enclose nothing but
   * whitespace (and, optionally, non-breaking spaces), unless the element
   * is protected by %AutoFormat.RemoveEmpty.Predicate or carries an id or
   * name attribute.
   */
  class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector
  {
      /**
       * @type HTMLPurifier_Context
       */
      private $context;

      /**
       * @type HTMLPurifier_Config
       */
      private $config;

      /**
       * @type HTMLPurifier_AttrValidator
       */
      private $attrValidator;

      /**
       * Cached %AutoFormat.RemoveEmpty.RemoveNbsp
       * @type bool
       */
      private $removeNbsp;

      /**
       * Cached %AutoFormat.RemoveEmpty.RemoveNbsp.Exceptions; looked up by
       * element name.
       * @type bool
       */
      private $removeNbspExceptions;

      /**
       * Cached contents of %AutoFormat.RemoveEmpty.Predicate
       * @type array
       */
      private $exclude;

      /**
       * Caches config, context and the relevant directives.
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return void
       */
      public function prepare($config, $context)
      {
          parent::prepare($config, $context);

          $this->config = $config;
          $this->context = $context;
          $this->removeNbsp = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp');
          $this->removeNbspExceptions = $config->get('AutoFormat.RemoveEmpty.RemoveNbsp.Exceptions');

          $predicate = $config->get('AutoFormat.RemoveEmpty.Predicate');
          $this->exclude = $predicate;
          foreach ($predicate as $element => $attributes) {
              if (is_array($attributes)) {
                  continue;
              }
              // HACK, see HTMLPurifier/Printer/ConfigForm.php
              $this->exclude[$element] = explode(';', $attributes);
          }

          $this->attrValidator = new HTMLPurifier_AttrValidator();
      }

      /**
       * Removes a start tag together with its matching end tag when only
       * ignorable text lies between them. In that case $token is replaced
       * by the number of tokens to delete and the stream is rewound.
       * @param HTMLPurifier_Token $token
       */
      public function handleElement(&$token)
      {
          if (!$token instanceof HTMLPurifier_Token_Start) {
              return;
          }

          // Are non-breaking spaces treated like whitespace for this element?
          $nbspIsBlank = $this->removeNbsp && !isset($this->removeNbspExceptions[$token->name]);

          // Walk the upcoming tokens (stored in reverse in the zipper's
          // back list), skipping ignorable text, until something else shows up.
          $next = false;
          $deleted = 1; // the current tag
          $upcoming = $this->inputZipper->back;
          $i = count($upcoming) - 1;
          while ($i >= 0) {
              $next = $upcoming[$i];
              $ignorable = false;
              if ($next instanceof HTMLPurifier_Token_Text) {
                  if ($next->is_whitespace) {
                      $ignorable = true;
                  } elseif ($nbspIsBlank) {
                      $plain = str_replace("\xC2\xA0", "", $next->data);
                      $ignorable = ($plain === '' || ctype_space($plain));
                  }
              }
              if (!$ignorable) {
                  break;
              }
              $i--;
              $deleted++;
          }

          // Proceed only if there was nothing ahead at all, or the first
          // non-ignorable token is this element's own end tag.
          $isOwnEnd = $next instanceof HTMLPurifier_Token_End && $next->name == $token->name;
          if ($next && !$isOwnEnd) {
              return;
          }

          $this->attrValidator->validateToken($token, $this->config, $this->context);
          $token->armor['ValidateAttributes'] = true;

          // Keep the element when every attribute listed for it in the
          // predicate is present.
          if (isset($this->exclude[$token->name])) {
              $hasAll = true;
              foreach ($this->exclude[$token->name] as $attribute) {
                  if (!isset($token->attr[$attribute])) {
                      $hasAll = false;
                      break;
                  }
              }
              if ($hasAll) {
                  return;
              }
          }

          // Keep elements that carry an id or name attribute.
          if (isset($token->attr['id']) || isset($token->attr['name'])) {
              return;
          }

          $token = $deleted + 1;

          // Count the whitespace text tokens at the beginning of the
          // zipper's front list.
          $front = $this->inputZipper->front;
          $total = count($front);
          $b = 0;
          while ($b < $total
              && $front[$b] instanceof HTMLPurifier_Token_Text
              && $front[$b]->is_whitespace
          ) {
              $b++;
          }

          // This is safe because we removed the token that triggered this.
          $this->rewindOffset($b + $deleted);
      }
  }
  /**
   * Injector that removes spans with no attributes
   */
  class HTMLPurifier_Injector_RemoveSpansWithoutAttributes extends HTMLPurifier_Injector
  {
      /**
       * @type string
       */
      public $name = 'RemoveSpansWithoutAttributes';

      /**
       * @type array
       */
      public $needed = array('span');

      /**
       * @type HTMLPurifier_AttrValidator
       */
      private $attrValidator;

      /**
       * Used by AttrValidator.
       * @type HTMLPurifier_Config
       */
      private $config;

      /**
       * Used by AttrValidator.
       * @type HTMLPurifier_Context
       */
      private $context;

      /**
       * Creates the attribute validator, remembers config and context, and
       * hands over to the parent implementation.
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       */
      public function prepare($config, $context)
      {
          $this->attrValidator = new HTMLPurifier_AttrValidator();
          $this->config = $config;
          $this->context = $context;
          return parent::prepare($config, $context);
      }

      /**
       * Drops a <span> start tag that has no attributes left after
       * validation and flags its end tag so handleEnd() drops that too.
       * @param HTMLPurifier_Token $token
       */
      public function handleElement(&$token)
      {
          $isSpanStart = $token instanceof HTMLPurifier_Token_Start && $token->name === 'span';
          if (!$isSpanStart) {
              return;
          }

          // We need to validate the attributes now since this doesn't normally
          // happen until after MakeWellFormed. If all the attributes are removed
          // the span needs to be removed too.
          $this->attrValidator->validateToken($token, $this->config, $this->context);
          $token->armor['ValidateAttributes'] = true;

          if (!empty($token->attr)) {
              return;
          }

          // Run forward until the iterator stops; $current then holds the
          // token it stopped on.
          $i = null;
          $nesting = 0;
          do {
              $more = $this->forwardUntilEndToken($i, $current, $nesting);
          } while ($more);

          if ($current instanceof HTMLPurifier_Token_End && $current->name === 'span') {
              // Mark closing span tag for deletion
              $current->markForDeletion = true;
              // Delete open span tag
              $token = false;
          }
      }

      /**
       * Deletes end tags that handleElement() flagged.
       * @param HTMLPurifier_Token $token
       */
      public function handleEnd(&$token)
      {
          if (!$token->markForDeletion) {
              return;
          }
          $token = false;
      }
  }
  /**
   * Adds important param elements to inside of object in order to make
   * things safe, and removes param elements that are neither whitelisted
   * nor directly inside an object.
   */
  class HTMLPurifier_Injector_SafeObject extends HTMLPurifier_Injector
  {
      /**
       * @type string
       */
      public $name = 'SafeObject';

      /**
       * @type array
       */
      public $needed = array('object', 'param');

      /**
       * Object start tokens currently open, innermost last.
       * @type array
       */
      protected $objectStack = array();

      /**
       * One array per open object (parallel to $objectStack), keyed by
       * param names that were recorded for it.
       * @type array
       */
      protected $paramStack = array();

      /**
       * Params (name => value) injected right after every object start tag.
       * Keep this synchronized with AttrTransform/SafeParam.php.
       * @type array
       */
      protected $addParam = array(
          'allowScriptAccess' => 'never',
          'allowNetworking' => 'internal',
      );

      /**
       * Param names that input may supply. These are all lower-case keys.
       * @type array
       */
      protected $allowedParam = array(
          'wmode' => true,
          'movie' => true,
          'flashvars' => true,
          'src' => true,
          'allowfullscreen' => true, // if omitted, assume to be 'false'
      );

      /**
       * Delegates to the parent. Note that the parent's return value is
       * not passed on; this method returns nothing.
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return void
       */
      public function prepare($config, $context)
      {
          parent::prepare($config, $context);
      }

      /**
       * For object: pushes it on the stacks and appends the forced params.
       * For param: keeps it only when it sits directly inside an object,
       * has a name, and that name is whitelisted; otherwise deletes it.
       * Other elements are ignored.
       *
       * @param HTMLPurifier_Token $token
       */
      public function handleElement(&$token)
      {
          if ($token->name == 'object') {
              $this->objectStack[] = $token;
              $this->paramStack[] = array();
              $new = array($token);
              foreach ($this->addParam as $name => $value) {
                  $new[] = new HTMLPurifier_Token_Empty('param', array('name' => $name, 'value' => $value));
              }
              $token = $new;
              return;
          }

          if ($token->name != 'param') {
              return;
          }

          $nest = count($this->currentNesting) - 1;
          if ($nest < 0 || $this->currentNesting[$nest]->name !== 'object') {
              // not directly inside an object, DENY!
              $token = false;
              return;
          }

          if (!isset($token->attr['name'])) {
              $token = false;
              return;
          }

          $i = count($this->objectStack) - 1;
          $n = $token->attr['name'];

          // We need this fix because YouTube doesn't supply a data
          // attribute, which we need if a type is specified. This is
          // *very* Flash specific.
          // The value is only copied when the param actually has one:
          // attributes have not been validated at this point, so a param
          // without a value attribute would otherwise raise an
          // undefined-index warning and store null as the data attribute.
          if (!isset($this->objectStack[$i]->attr['data']) &&
              ($n == 'movie' || $n == 'src') &&
              isset($token->attr['value'])
          ) {
              $this->objectStack[$i]->attr['data'] = $token->attr['value'];
          }

          // Check if the parameter is the correct value but has not
          // already been added
          // NOTE(review): this compares the param *name* with the required
          // *value* from $addParam. With the table declared above no key
          // equals its value, so this branch is not taken and input copies
          // of the forced params fall through to the checks below. Left
          // unchanged: comparing attr['value'] instead would alter the
          // output, since the forced params are already emitted when the
          // object opens -- confirm the intended behaviour before touching.
          if (!isset($this->paramStack[$i][$n]) &&
              isset($this->addParam[$n]) &&
              $token->attr['name'] === $this->addParam[$n]) {
              // keep token, and add to param stack
              $this->paramStack[$i][$n] = true;
          } elseif (isset($this->allowedParam[strtolower($n)])) {
              // keep token, don't do anything to it
              // (could possibly check for duplicates here)
              // Note: In principle, parameters should be case sensitive.
              // But it seems they are not really; so accept any case.
          } else {
              $token = false;
          }
      }

      /**
       * Pops the bookkeeping stacks when an object is closed.
       * @param HTMLPurifier_Token $token
       */
      public function handleEnd(&$token)
      {
          // This is the WRONG way of handling the object and param stacks;
          // we should be inserting them directly on the relevant object tokens
          // so that the global stack handling handles it.
          if ($token->name == 'object') {
              array_pop($this->objectStack);
              array_pop($this->paramStack);
          }
      }
  }
  16467. /**
  16468. * Parser that uses PHP 5's DOM extension (part of the core).
  16469. *
  16470. * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
  16471. * It gives us a forgiving HTML parser, which we use to transform the HTML
  16472. * into a DOM, and then into the tokens. It is blazingly fast (for large
  16473. * documents, it performs twenty times faster than
  16474. * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
  16475. *
  16476. * @note Any empty elements will have empty tokens associated with them, even if
  16477. * this is prohibited by the spec. This cannot be fixed until the spec
  16478. * comes into play.
  16479. *
  16480. * @note PHP's DOM extension does not actually parse any entities, we use
  16481. * our own function to do that.
  16482. *
  16483. * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
  16484. * If this is a huge problem, due to the fact that HTML is hand
  16485. * edited and you are unable to get a parser cache that caches the
  16486. * the output of HTML Purifier while keeping the original HTML lying
  16487. * around, you may want to run Tidy on the resulting output or use
  16488. * HTMLPurifier_Lexer_DirectLex.
  16489. */
  16490. class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
  16491. {
  16492. /**
  16493. * @type HTMLPurifier_TokenFactory
  16494. */
  16495. private $factory;
  /**
   * Runs the parent constructor, then creates the token factory used by
   * this lexer.
   */
  public function __construct()
  {
      parent::__construct();

      // setup the factory
      $this->factory = new HTMLPurifier_TokenFactory();
  }
  /**
   * Normalizes and wraps the HTML, loads it into a DOMDocument and turns
   * the wrapper div's DOM tree into a flat list of tokens.
   * @param string $html
   * @param HTMLPurifier_Config $config
   * @param HTMLPurifier_Context $context
   * @return HTMLPurifier_Token[]
   */
  public function tokenizeHTML($html, $config, $context)
  {
      $html = $this->normalize($html, $config, $context);

      // attempt to armor stray angled brackets that cannot possibly
      // form tags and thus are probably being used as emoticons
      if ($config->get('Core.AggressivelyFixLt')) {
          $char = '[^a-z!\/]';
          $comment = "/<!--(.*?)(-->|\z)/is";
          $strayLt = "/<($char)/i";

          // armor comment contents before escaping the stray brackets
          $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);

          // repeat until a pass changes nothing
          do {
              $old = $html;
              $html = preg_replace($strayLt, '&lt;\\1', $html);
          } while ($html !== $old);

          // fix comments
          $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html);
      }

      // preprocess html, essential for UTF-8
      $html = $this->wrapHTML($html, $config, $context);

      $doc = new DOMDocument();
      $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

      // errors raised while loading go to muteErrorHandler
      set_error_handler(array($this, 'muteErrorHandler'));
      $doc->loadHTML($html);
      restore_error_handler();

      // html > body > first div
      $root = $doc->getElementsByTagName('html')->item(0);
      $body = $root->getElementsByTagName('body')->item(0);
      $div = $body->getElementsByTagName('div')->item(0);

      $tokens = array();
      $this->tokenizeDOM($div, $tokens, $config);

      // If the div has a sibling, that means we tripped across
      // a premature </div> tag. So remove the div we parsed,
      // and then tokenize the rest of body. We can't tokenize
      // the sibling directly as we'll lose the tags in that case.
      if ($div->nextSibling) {
          $body->removeChild($div);
          $this->tokenizeDOM($body, $tokens, $config);
      }

      return $tokens;
  }
  16545. /**
  16546. * Iterative function that tokenizes a node, putting it into an accumulator.
  16547. * To iterate is human, to recurse divine - L. Peter Deutsch
  16548. * @param DOMNode $node DOMNode to be tokenized.
  16549. * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
  16550. * @return HTMLPurifier_Token of node appended to previously passed tokens.
  16551. */
  16552. protected function tokenizeDOM($node, &$tokens, $config)
  16553. {
  16554. $level = 0;
  16555. $nodes = array($level => new HTMLPurifier_Queue(array($node)));
  16556. $closingNodes = array();
  16557. do {
  16558. while (!$nodes[$level]->isEmpty()) {
  16559. $node = $nodes[$level]->shift(); // FIFO
  16560. $collect = $level > 0 ? true : false;
  16561. $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
  16562. if ($needEndingTag) {
  16563. $closingNodes[$level][] = $node;
  16564. }
  16565. if ($node->childNodes && $node->childNodes->length) {
  16566. $level++;
  16567. $nodes[$level] = new HTMLPurifier_Queue();
  16568. foreach ($node->childNodes as $childNode) {
  16569. $nodes[$level]->push($childNode);
  16570. }
  16571. }
  16572. }
  16573. $level--;
  16574. if ($level && isset($closingNodes[$level])) {
  16575. while ($node = array_pop($closingNodes[$level])) {
  16576. $this->createEndNode($node, $tokens);
  16577. }
  16578. }
  16579. } while ($level > 0);
  16580. }
  16581. /**
  16582. * @param DOMNode $node DOMNode to be tokenized.
  16583. * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
  16584. * @param bool $collect Says whether or start and close are collected, set to
  16585. * false at first recursion because it's the implicit DIV
  16586. * tag you're dealing with.
  16587. * @return bool if the token needs an endtoken
  16588. * @todo data and tagName properties don't seem to exist in DOMNode?
  16589. */
  16590. protected function createStartNode($node, &$tokens, $collect, $config)
  16591. {
  16592. // intercept non element nodes. WE MUST catch all of them,
  16593. // but we're not getting the character reference nodes because
  16594. // those should have been preprocessed
  16595. if ($node->nodeType === XML_TEXT_NODE) {
  16596. $tokens[] = $this->factory->createText($node->data);
  16597. return false;
  16598. } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
  16599. // undo libxml's special treatment of <script> and <style> tags
  16600. $last = end($tokens);
  16601. $data = $node->data;
  16602. // (note $node->tagname is already normalized)
  16603. if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
  16604. $new_data = trim($data);
  16605. if (substr($new_data, 0, 4) === '<!--') {
  16606. $data = substr($new_data, 4);
  16607. if (substr($data, -3) === '-->') {
  16608. $data = substr($data, 0, -3);
  16609. } else {
  16610. // Highly suspicious! Not sure what to do...
  16611. }
  16612. }
  16613. }
  16614. $tokens[] = $this->factory->createText($this->parseText($data, $config));
  16615. return false;
  16616. } elseif ($node->nodeType === XML_COMMENT_NODE) {
  16617. // this is code is only invoked for comments in script/style in versions
  16618. // of libxml pre-2.6.28 (regular comments, of course, are still
  16619. // handled regularly)
  16620. $tokens[] = $this->factory->createComment($node->data);
  16621. return false;
  16622. } elseif ($node->nodeType !== XML_ELEMENT_NODE) {
  16623. // not-well tested: there may be other nodes we have to grab
  16624. return false;
  16625. }
  16626. $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
  16627. // We still have to make sure that the element actually IS empty
  16628. if (!$node->childNodes->length) {
  16629. if ($collect) {
  16630. $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
  16631. }
  16632. return false;
  16633. } else {
  16634. if ($collect) {
  16635. $tokens[] = $this->factory->createStart(
  16636. $tag_name = $node->tagName, // somehow, it get's dropped
  16637. $attr
  16638. );
  16639. }
  16640. return true;
  16641. }
  16642. }
  16643. /**
  16644. * @param DOMNode $node
  16645. * @param HTMLPurifier_Token[] $tokens
  16646. */
  16647. protected function createEndNode($node, &$tokens)
  16648. {
  16649. $tokens[] = $this->factory->createEnd($node->tagName);
  16650. }
  16651. /**
  16652. * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
  16653. *
  16654. * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
  16655. * @return array Associative array of attributes.
  16656. */
  16657. protected function transformAttrToAssoc($node_map)
  16658. {
  16659. // NamedNodeMap is documented very well, so we're using undocumented
  16660. // features, namely, the fact that it implements Iterator and
  16661. // has a ->length attribute
  16662. if ($node_map->length === 0) {
  16663. return array();
  16664. }
  16665. $array = array();
  16666. foreach ($node_map as $attr) {
  16667. $array[$attr->name] = $attr->value;
  16668. }
  16669. return $array;
  16670. }
  16671. /**
  16672. * An error handler that mutes all errors
  16673. * @param int $errno
  16674. * @param string $errstr
  16675. */
  16676. public function muteErrorHandler($errno, $errstr)
  16677. {
  16678. }
  16679. /**
  16680. * Callback function for undoing escaping of stray angled brackets
  16681. * in comments
  16682. * @param array $matches
  16683. * @return string
  16684. */
  16685. public function callbackUndoCommentSubst($matches)
  16686. {
  16687. return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
  16688. }
  16689. /**
  16690. * Callback function that entity-izes ampersands in comments so that
  16691. * callbackUndoCommentSubst doesn't clobber them
  16692. * @param array $matches
  16693. * @return string
  16694. */
  16695. public function callbackArmorCommentEntities($matches)
  16696. {
  16697. return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
  16698. }
  16699. /**
  16700. * Wraps an HTML fragment in the necessary HTML
  16701. * @param string $html
  16702. * @param HTMLPurifier_Config $config
  16703. * @param HTMLPurifier_Context $context
  16704. * @return string
  16705. */
  16706. protected function wrapHTML($html, $config, $context, $use_div = true)
  16707. {
  16708. $def = $config->getDefinition('HTML');
  16709. $ret = '';
  16710. if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
  16711. $ret .= '<!DOCTYPE html ';
  16712. if (!empty($def->doctype->dtdPublic)) {
  16713. $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
  16714. }
  16715. if (!empty($def->doctype->dtdSystem)) {
  16716. $ret .= '"' . $def->doctype->dtdSystem . '" ';
  16717. }
  16718. $ret .= '>';
  16719. }
  16720. $ret .= '<html><head>';
  16721. $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
  16722. // No protection if $html contains a stray </div>!
  16723. $ret .= '</head><body>';
  16724. if ($use_div) $ret .= '<div>';
  16725. $ret .= $html;
  16726. if ($use_div) $ret .= '</div>';
  16727. $ret .= '</body></html>';
  16728. return $ret;
  16729. }
  16730. }
  /**
   * Our in-house implementation of a parser.
   *
   * A pure PHP parser, DirectLex has absolutely no dependencies, making
   * it a reasonably good default for PHP4.  Written with efficiency in mind,
   * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
   * pales in comparison to HTMLPurifier_Lexer_DOMLex.
   *
   * @todo Reread XML spec and document differences.
   */
  class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
  {
      /**
       * @type bool
       */
      public $tracksLineNumbers = true;

      /**
       * Whitespace characters for str(c)spn.
       * @type string
       */
      protected $_whitespace = "\x20\x09\x0D\x0A";

      /**
       * Callback function for script CDATA fudge: HTML-escapes the script
       * contents and leaves the surrounding tags untouched.
       * @param array $matches, in form of array(opening tag, contents, closing tag)
       * @return string
       */
      protected function scriptCallback($matches)
      {
          return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
      }

      /**
       * Splits an HTML string into text, comment, start, end and empty tokens.
       *
       * When line numbers are maintained, each token is stamped with the line
       * and column at which it starts, and 'CurrentLine'/'CurrentCol' are
       * registered in the context for the duration of the call.
       *
       * @param String $html
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array|HTMLPurifier_Token[]
       */
      public function tokenizeHTML($html, $config, $context)
      {
          // special normalization for script tags without any armor
          // our "armor" heuristic is a < sign any number of whitespaces after
          // the first script tag
          if ($config->get('HTML.Trusted')) {
              $html = preg_replace_callback(
                  '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
                  array($this, 'scriptCallback'),
                  $html
              );
          }

          $html = $this->normalize($html, $config, $context);

          $cursor = 0; // our location in the text
          $inside_tag = false; // whether or not we're parsing the inside of a tag
          $array = array(); // result array

          // This is also treated to mean maintain *column* numbers too
          $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

          if ($maintain_line_numbers === null) {
              // automatically determine line numbering by checking
              // if error collection is on
              $maintain_line_numbers = $config->get('Core.CollectErrors');
          }

          if ($maintain_line_numbers) {
              $current_line = 1;
              $current_col = 0;
              $length = strlen($html);
          } else {
              $current_line = false;
              $current_col = false;
              $length = false;
          }
          $context->register('CurrentLine', $current_line);
          $context->register('CurrentCol', $current_col);
          $nl = "\n";
          // how often to manually recalculate. This will ALWAYS be right,
          // but it's pretty wasteful. Set to 0 to turn off
          $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

          $e = false;
          if ($config->get('Core.CollectErrors')) {
              $e =& $context->get('ErrorCollector');
          }

          // for testing synchronization
          $loops = 0;

          while (++$loops) {
              // $cursor is either at the start of a token, or inside of
              // a tag (i.e. there was a < immediately before it), as indicated
              // by $inside_tag

              if ($maintain_line_numbers) {
                  // $rcursor, however, is always at the start of a token.
                  $rcursor = $cursor - (int)$inside_tag;

                  // Column number is cheap, so we calculate it every round.
                  // We're interested at the *end* of the newline string, so
                  // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
                  // from our "rcursor" position.
                  $nl_pos = strrpos($html, $nl, $rcursor - $length);
                  $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

                  // recalculate lines
                  if ($synchronize_interval && // synchronization is on
                      $cursor > 0 && // cursor is further than zero
                      $loops % $synchronize_interval === 0) { // time to synchronize!
                      $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
                  }
              }

              $position_next_lt = strpos($html, '<', $cursor);
              $position_next_gt = strpos($html, '>', $cursor);

              // triggers on "<b>asdf</b>" but not "asdf <b></b>"
              // special case to set up context
              if ($position_next_lt === $cursor) {
                  $inside_tag = true;
                  $cursor++;
              }

              if (!$inside_tag && $position_next_lt !== false) {
                  // We are not inside tag and there still is another tag to parse
                  $token = new
                  HTMLPurifier_Token_Text(
                      $this->parseText(
                          substr(
                              $html,
                              $cursor,
                              $position_next_lt - $cursor
                          ), $config
                      )
                  );
                  if ($maintain_line_numbers) {
                      $token->rawPosition($current_line, $current_col);
                      $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
                  }
                  $array[] = $token;
                  $cursor = $position_next_lt + 1;
                  $inside_tag = true;
                  continue;
              } elseif (!$inside_tag) {
                  // We are not inside tag but there are no more tags
                  // If we're already at the end, break
                  if ($cursor === strlen($html)) {
                      break;
                  }
                  // Create Text of rest of string
                  $token = new
                  HTMLPurifier_Token_Text(
                      $this->parseText(
                          substr(
                              $html,
                              $cursor
                          ), $config
                      )
                  );
                  if ($maintain_line_numbers) {
                      $token->rawPosition($current_line, $current_col);
                  }
                  $array[] = $token;
                  break;
              } elseif ($inside_tag && $position_next_gt !== false) {
                  // We are in tag and it is well formed
                  // Grab the internals of the tag
                  $strlen_segment = $position_next_gt - $cursor;

                  if ($strlen_segment < 1) {
                      // there's nothing to process! No token is emitted here;
                      // we just step over the ">" and stay inside the tag.
                      $cursor++;
                      continue;
                  }

                  $segment = substr($html, $cursor, $strlen_segment);

                  if ($segment === false) {
                      // somehow, we attempted to access beyond the end of
                      // the string, defense-in-depth, reported by Nate Abele
                      break;
                  }

                  // Check if it's a comment
                  if (substr($segment, 0, 3) === '!--') {
                      // re-determine segment length, looking for -->
                      $position_comment_end = strpos($html, '-->', $cursor);
                      if ($position_comment_end === false) {
                          // uh oh, we have a comment that extends to
                          // infinity. Can't be helped: set comment
                          // end position to end of string
                          if ($e) {
                              $e->send(E_WARNING, 'Lexer: Unclosed comment');
                          }
                          $position_comment_end = strlen($html);
                          $end = true;
                      } else {
                          $end = false;
                      }
                      $strlen_segment = $position_comment_end - $cursor;
                      $segment = substr($html, $cursor, $strlen_segment);
                      $token = new
                      HTMLPurifier_Token_Comment(
                          substr(
                              $segment,
                              3,
                              $strlen_segment - 3
                          )
                      );
                      if ($maintain_line_numbers) {
                          $token->rawPosition($current_line, $current_col);
                          $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                      }
                      $array[] = $token;
                      $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                      $inside_tag = false;
                      continue;
                  }

                  // Check if it's an end tag
                  $is_end_tag = (strpos($segment, '/') === 0);
                  if ($is_end_tag) {
                      $type = substr($segment, 1);
                      $token = new HTMLPurifier_Token_End($type);
                      if ($maintain_line_numbers) {
                          $token->rawPosition($current_line, $current_col);
                          $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                      }
                      $array[] = $token;
                      $inside_tag = false;
                      $cursor = $position_next_gt + 1;
                      continue;
                  }

                  // Check leading character is alphabetic, if not, we may
                  // have accidentally grabbed an emoticon. Translate into
                  // text and go our merry way
                  if (!ctype_alpha($segment[0])) {
                      // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                      if ($e) {
                          $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                      }
                      $token = new HTMLPurifier_Token_Text('<');
                      if ($maintain_line_numbers) {
                          // Only stamp the position. $cursor is not advanced,
                          // so the newlines in this segment are counted by
                          // the token(s) that consume it on the next rounds;
                          // adding them here too would count them twice.
                          $token->rawPosition($current_line, $current_col);
                      }
                      $array[] = $token;
                      $inside_tag = false;
                      continue;
                  }

                  // Check if it is explicitly self closing, if so, remove
                  // trailing slash. Remember, we could have a tag like <br>, so
                  // any later token processing scripts must convert improperly
                  // classified EmptyTags from StartTags.
                  $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
                  if ($is_self_closing) {
                      $strlen_segment--;
                      $segment = substr($segment, 0, $strlen_segment);
                  }

                  // Check if there are any attributes
                  $position_first_space = strcspn($segment, $this->_whitespace);

                  if ($position_first_space >= $strlen_segment) {
                      if ($is_self_closing) {
                          $token = new HTMLPurifier_Token_Empty($segment);
                      } else {
                          $token = new HTMLPurifier_Token_Start($segment);
                      }
                      if ($maintain_line_numbers) {
                          $token->rawPosition($current_line, $current_col);
                          $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                      }
                      $array[] = $token;
                      $inside_tag = false;
                      $cursor = $position_next_gt + 1;
                      continue;
                  }

                  // Grab out all the data
                  $type = substr($segment, 0, $position_first_space);
                  $attribute_string =
                      trim(
                          substr(
                              $segment,
                              $position_first_space
                          )
                      );
                  if ($attribute_string) {
                      $attr = $this->parseAttributeString(
                          $attribute_string,
                          $config,
                          $context
                      );
                  } else {
                      $attr = array();
                  }

                  if ($is_self_closing) {
                      $token = new HTMLPurifier_Token_Empty($type, $attr);
                  } else {
                      $token = new HTMLPurifier_Token_Start($type, $attr);
                  }
                  if ($maintain_line_numbers) {
                      $token->rawPosition($current_line, $current_col);
                      $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                  }
                  $array[] = $token;
                  $cursor = $position_next_gt + 1;
                  $inside_tag = false;
                  continue;
              } else {
                  // inside tag, but there's no ending > sign
                  if ($e) {
                      $e->send(E_WARNING, 'Lexer: Missing gt');
                  }
                  $token = new
                  HTMLPurifier_Token_Text(
                      '<' .
                      $this->parseText(
                          substr($html, $cursor), $config
                      )
                  );
                  if ($maintain_line_numbers) {
                      $token->rawPosition($current_line, $current_col);
                  }
                  // no cursor scroll? Hmm...
                  $array[] = $token;
                  break;
              }
              break;
          }

          $context->destroy('CurrentLine');
          $context->destroy('CurrentCol');
          return $array;
      }

      /**
       * PHP 5.0.x compatible substr_count that implements offset and length
       * @param string $haystack
       * @param string $needle
       * @param int $offset
       * @param int $length
       * @return int
       */
      protected function substrCount($haystack, $needle, $offset, $length)
      {
          static $oldVersion;
          if ($oldVersion === null) {
              $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
          }
          if ($oldVersion) {
              $haystack = substr($haystack, $offset, $length);
              return substr_count($haystack, $needle);
          } else {
              return substr_count($haystack, $needle, $offset, $length);
          }
      }

      /**
       * Takes the inside of an HTML tag and makes an assoc array of attributes.
       *
       * @param string $string Inside of tag excluding name.
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array Assoc array of attributes.
       * @throws Exception If the parsing loop fails to advance its cursor.
       */
      public function parseAttributeString($string, $config, $context)
      {
          $string = (string)$string; // quick typecast

          if ($string == '') {
              return array();
          } // no attributes

          $e = false;
          if ($config->get('Core.CollectErrors')) {
              $e =& $context->get('ErrorCollector');
          }

          // let's see if we can abort as quickly as possible
          // one equal sign, no spaces => one attribute
          $num_equal = substr_count($string, '=');
          $has_space = strpos($string, ' ');
          if ($num_equal === 0 && !$has_space) {
              // bool attribute
              return array($string => $string);
          } elseif ($num_equal === 1 && !$has_space) {
              // only one attribute
              list($key, $quoted_value) = explode('=', $string);
              $quoted_value = trim($quoted_value);
              if (!$key) {
                  if ($e) {
                      $e->send(E_ERROR, 'Lexer: Missing attribute key');
                  }
                  return array();
              }
              // Strict comparison: a bare "0" (as in border=0) is a real
              // value and must not be mistaken for an empty one.
              if ($quoted_value === '') {
                  return array($key => '');
              }
              $first_char = @$quoted_value[0];
              $last_char = @$quoted_value[strlen($quoted_value) - 1];

              $same_quote = ($first_char == $last_char);
              $open_quote = ($first_char == '"' || $first_char == "'");

              if ($same_quote && $open_quote) {
                  // well behaved
                  $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
              } else {
                  // not well behaved
                  if ($open_quote) {
                      if ($e) {
                          $e->send(E_ERROR, 'Lexer: Missing end quote');
                      }
                      $value = substr($quoted_value, 1);
                  } else {
                      $value = $quoted_value;
                  }
              }
              if ($value === false) {
                  $value = '';
              }
              return array($key => $this->parseAttr($value, $config));
          }

          // setup loop environment
          $array = array(); // return assoc array of attributes
          $cursor = 0; // current position in string (moves forward)
          $size = strlen($string); // size of the string (stays the same)

          // if we have unquoted attributes, the parser expects a terminating
          // space, so let's guarantee that there's always a terminating space.
          $string .= ' ';

          $old_cursor = -1;
          while ($cursor < $size) {
              if ($old_cursor >= $cursor) {
                  throw new Exception("Infinite loop detected");
              }
              $old_cursor = $cursor;

              // skip leading whitespace
              $cursor += strspn($string, $this->_whitespace, $cursor);
              // grab the key

              $key_begin = $cursor; //we're currently at the start of the key

              // scroll past all characters that are the key (not whitespace or =)
              $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

              $key_end = $cursor; // now at the end of the key

              $key = substr($string, $key_begin, $key_end - $key_begin);

              if (!$key) {
                  if ($e) {
                      $e->send(E_ERROR, 'Lexer: Missing attribute key');
                  }
                  $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
                  continue; // empty key
              }

              // scroll past all whitespace
              $cursor += strspn($string, $this->_whitespace, $cursor);

              if ($cursor >= $size) {
                  $array[$key] = $key;
                  break;
              }

              // if the next character is an equal sign, we've got a regular
              // pair, otherwise, it's a bool attribute
              $first_char = @$string[$cursor];

              if ($first_char == '=') {
                  // key="value"

                  $cursor++;
                  $cursor += strspn($string, $this->_whitespace, $cursor);

                  // we might be in front of a quote right now

                  $char = @$string[$cursor];

                  if ($char == '"' || $char == "'") {
                      // it's quoted, end bound is $char
                      $cursor++;
                      $value_begin = $cursor;
                      $cursor = strpos($string, $char, $cursor);
                      $value_end = $cursor;
                  } else {
                      // it's not quoted, end bound is whitespace
                      $value_begin = $cursor;
                      $cursor += strcspn($string, $this->_whitespace, $cursor);
                      $value_end = $cursor;
                  }

                  // we reached a premature end (strpos found no closing quote)
                  if ($cursor === false) {
                      $cursor = $size;
                      $value_end = $cursor;
                  }

                  $value = substr($string, $value_begin, $value_end - $value_begin);
                  if ($value === false) {
                      $value = '';
                  }
                  $array[$key] = $this->parseAttr($value, $config);
                  $cursor++;
              } else {
                  // boolattr
                  if ($key !== '') {
                      $array[$key] = $key;
                  } else {
                      // purely theoretical
                      if ($e) {
                          $e->send(E_ERROR, 'Lexer: Missing attribute key');
                      }
                  }
              }
          }
          return $array;
      }
  }
  /**
   * Concrete comment node class.
   */
  class HTMLPurifier_Node_Comment extends HTMLPurifier_Node
  {
      /**
       * Character data within comment.
       * @type string
       */
      public $data;

      /**
       * Whitespace flag; true by default for comment nodes.
       * @type bool
       */
      public $is_whitespace = true;

      /**
       * Transparent constructor: stores its arguments unchanged.
       *
       * @param string $data String comment data.
       * @param int $line
       * @param int $col
       */
      public function __construct($data, $line = null, $col = null)
      {
          $this->line = $line;
          $this->col = $col;
          $this->data = $data;
      }

      /**
       * Converts this node into a (start, end) token pair.
       *
       * @return array array(HTMLPurifier_Token_Comment, null); there is
       *         never an end token for a comment.
       */
      public function toTokenPair()
      {
          $comment = new HTMLPurifier_Token_Comment($this->data, $this->line, $this->col);
          return array($comment, null);
      }
  }
  /**
   * Concrete element node class.
   */
  class HTMLPurifier_Node_Element extends HTMLPurifier_Node
  {
      /**
       * The lower-case name of the tag, like 'a', 'b' or 'blockquote'.
       *
       * @note Strictly speaking, XML tags are case sensitive, so we shouldn't
       * be lower-casing them, but these tokens cater to HTML tags, which are
       * insensitive.
       * @type string
       */
      public $name;

      /**
       * Associative array of the node's attributes.
       * @type array
       */
      public $attr = array();

      /**
       * List of child elements.
       * @type array
       */
      public $children = array();

      /**
       * Does this use the <a></a> form or the <a /> form, i.e.
       * is it a pair of start/end tokens or an empty token.
       * @type bool
       */
      public $empty = false;

      /**
       * Position and armor handed to the end token built by toTokenPair().
       */
      public $endCol = null, $endLine = null, $endArmor = array();

      /**
       * Stores the element's name, attributes, position and armor.
       *
       * @param string $name
       * @param array $attr
       * @param int $line
       * @param int $col
       * @param array $armor
       */
      public function __construct($name, $attr = array(), $line = null, $col = null, $armor = array())
      {
          $this->name = $name;
          $this->attr = $attr;
          $this->armor = $armor;
          $this->line = $line;
          $this->col = $col;
      }

      /**
       * Converts this node into a (start, end) token pair.
       *
       * @return array array(HTMLPurifier_Token_Empty, null) when $empty is
       *         set, otherwise array(HTMLPurifier_Token_Start, HTMLPurifier_Token_End).
       */
      public function toTokenPair()
      {
          // XXX inefficiency here, normalization is not necessary
          if ($this->empty) {
              $single = new HTMLPurifier_Token_Empty(
                  $this->name,
                  $this->attr,
                  $this->line,
                  $this->col,
                  $this->armor
              );
              return array($single, null);
          }

          $opening = new HTMLPurifier_Token_Start(
              $this->name,
              $this->attr,
              $this->line,
              $this->col,
              $this->armor
          );
          $closing = new HTMLPurifier_Token_End(
              $this->name,
              array(),
              $this->endLine,
              $this->endCol,
              $this->endArmor
          );
          return array($opening, $closing);
      }
  }
  /**
   * Concrete text node class.
   *
   * Text nodes comprise of regular parsed character data (PCDATA) and raw
   * character data (from the CDATA sections). Internally, their
   * data is parsed with all entities expanded. Surprisingly, the text node
   * does have a "tag name" called #PCDATA, which is how the DTD represents it
   * in permissible child nodes.
   */
  class HTMLPurifier_Node_Text extends HTMLPurifier_Node
  {
      /**
       * PCDATA tag name compatible with DTD, see
       * HTMLPurifier_ChildDef_Custom for details.
       * @type string
       */
      public $name = '#PCDATA';

      /**
       * Parsed character data of text.
       * @type string
       */
      public $data;

      /**
       * Bool indicating if node is whitespace.
       * @type bool
       */
      public $is_whitespace;

      /**
       * Constructor; stores the data together with the caller-supplied
       * whitespace flag.
       * @param string $data String parsed character data.
       * @param bool $is_whitespace Whether the data counts as whitespace.
       * @param int $line
       * @param int $col
       */
      public function __construct($data, $is_whitespace, $line = null, $col = null)
      {
          $this->line = $line;
          $this->col = $col;
          $this->is_whitespace = $is_whitespace;
          $this->data = $data;
      }

      /**
       * Converts this node into a (start, end) token pair.
       *
       * @return array array(HTMLPurifier_Token_Text, null); there is never
       *         an end token for text.
       */
      public function toTokenPair()
      {
          $text = new HTMLPurifier_Token_Text($this->data, $this->line, $this->col);
          return array($text, null);
      }
  }
  /**
   * Composite strategy that runs multiple strategies on tokens.
   */
  abstract class HTMLPurifier_Strategy_Composite extends HTMLPurifier_Strategy
  {
      /**
       * List of strategies to run tokens through, in execution order.
       * @type HTMLPurifier_Strategy[]
       */
      protected $strategies = array();

      /**
       * Feeds the tokens through every registered strategy in turn; each
       * strategy receives the output of the one before it.
       *
       * @param HTMLPurifier_Token[] $tokens
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return HTMLPurifier_Token[]
       */
      public function execute($tokens, $config, $context)
      {
          $result = $tokens;
          foreach ($this->strategies as $stage) {
              $result = $stage->execute($result, $config, $context);
          }
          return $result;
      }
  }
  /**
   * Core strategy composed of the big four strategies.
   */
  class HTMLPurifier_Strategy_Core extends HTMLPurifier_Strategy_Composite
  {
      /**
       * Registers the four core strategies in the order they must run.
       */
      public function __construct()
      {
          $pipeline = array(
              'HTMLPurifier_Strategy_RemoveForeignElements',
              'HTMLPurifier_Strategy_MakeWellFormed',
              'HTMLPurifier_Strategy_FixNesting',
              'HTMLPurifier_Strategy_ValidateAttributes',
          );
          foreach ($pipeline as $strategyClass) {
              $this->strategies[] = new $strategyClass();
          }
      }
  }
  /**
   * Takes a well formed list of tokens and fixes their nesting.
   *
   * HTML elements dictate which elements are allowed to be their children,
   * for example, you can't have a p tag in a span tag.  Other elements have
   * much more rigorous definitions: tables, for instance, require a specific
   * order for their elements.  There are also constraints not expressible by
   * document type definitions, such as the chameleon nature of ins/del
   * tags and global child exclusions.
   *
   * The first major objective of this strategy is to iterate through all
   * the nodes and determine whether or not their children conform to the
   * element's definition.  If they do not, the child definition may
   * optionally supply an amended list of elements that is valid or
   * require that the entire node be deleted (and the previous node
   * rescanned).
   *
   * The second objective is to ensure that explicitly excluded elements of
   * an element do not appear in its children.  Code that accomplishes this
   * task is pervasive through the strategy, though the two are distinct tasks
   * and could, theoretically, be separated (although it's not recommended).
   *
   * @note Whether or not unrecognized children are silently dropped or
   *       translated into text depends on the child definitions.
   *
   * @todo Enable nodes to be bubbled out of the structure.  This is
   *       easier with our new algorithm.
   */
  class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
  {
      /**
       * Arborizes the tokens, validates every node's children bottom-up
       * (marking excluded or invalid nodes as dead) and flattens the tree
       * back into a token list.
       * @param HTMLPurifier_Token[] $tokens
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array|HTMLPurifier_Token[]
       */
      public function execute($tokens, $config, $context)
      {
          //####################################################################//
          // Pre-processing

          // O(n) pass to convert to a tree, so that we can efficiently
          // refer to substrings
          $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);

          // get a copy of the HTML definition
          $definition = $config->getHTMLDefinition();

          $excludes_enabled = !$config->get('Core.DisableExcludes');

          // setup the context variable 'IsInline', for chameleon processing
          // is 'false' when we are not inline, 'true' when it must always
          // be inline, and an integer when it is inline for a certain
          // branch of the document tree
          $is_inline = $definition->info_parent_def->descendants_are_inline;
          $context->register('IsInline', $is_inline);

          // setup error collector
          $e =& $context->get('ErrorCollector', true);

          //####################################################################//
          // Loop initialization

          // variable that contains the start token while we are processing
          // nodes. This enables error reporting to do its job
          $node = $top_node;
          // dummy token
          list($token) = $node->toTokenPair();
          $context->register('CurrentNode', $node);
          $context->register('CurrentToken', $token);

          //####################################################################//
          // Loop

          // We need to implement a post-order traversal iteratively, to
          // avoid running into stack space limits.  This is pretty tricky
          // to reason about, so we just manually stack-ify the recursive
          // variant:
          //
          //  function f($node) {
          //      foreach ($node->children as $child) {
          //          f($child);
          //      }
          //      validate($node);
          //  }
          //
          // Thus, we will represent a stack frame as
          //      array($node, $is_inline, $excludes, $ix)
          // where $excludes is the set of element names excluded at this
          // point of the tree and $ix is the index of the next child of
          // $node that still has to be visited.

          $parent_def = $definition->info_parent_def;
          $stack = array(
              array($top_node,
                    $parent_def->descendants_are_inline,
                    $parent_def->excludes, // exclusions
                    0)
              );

          while (!empty($stack)) {
              // NB: $node and $is_inline are registered in the context by
              // reference, so this assignment also updates 'CurrentNode'
              // and 'IsInline'.
              list($node, $is_inline, $excludes, $ix) = array_pop($stack);
              // recursive call
              $go = false;
              // an empty stack after the pop means we are looking at the root
              $def = empty($stack) ? $definition->info_parent_def : $definition->info[$node->name];
              while (isset($node->children[$ix])) {
                  $child = $node->children[$ix++];
                  if ($child instanceof HTMLPurifier_Node_Element) {
                      $go = true;
                      // resume this node later, after the child is done
                      $stack[] = array($node, $is_inline, $excludes, $ix);
                      $stack[] = array($child,
                          // ToDo: I don't think it matters if it's def or
                          // child_def, but double check this...
                          $is_inline || $def->descendants_are_inline,
                          empty($def->excludes) ? $excludes
                                                : array_merge($excludes, $def->excludes),
                          0);
                      break;
                  }
              }
              if ($go) continue;
              // keeps the 'CurrentToken' context variable in sync for error reporting
              list($token) = $node->toTokenPair();
              // base case
              if ($excludes_enabled && isset($excludes[$node->name])) {
                  $node->dead = true;
                  if ($e) $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
              } else {
                  // XXX I suppose it would be slightly more efficient to
                  // avoid the allocation here and have children
                  // strategies handle it
                  $children = array();
                  foreach ($node->children as $child) {
                      if (!$child->dead) $children[] = $child;
                  }
                  $result = $def->child->validateChildren($children, $config, $context);
                  if ($result === true) {
                      // children are valid as-is; only the dead ones are dropped
                      $node->children = $children;
                  } elseif ($result === false) {
                      $node->dead = true;
                      if ($e) $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                  } else {
                      // the child definition supplied an amended child list
                      $node->children = $result;
                      if ($e) {
                          // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators
                          if (empty($result) && !empty($children)) {
                              $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                          } else if ($result != $children) {
                              $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                          }
                      }
                  }
              }
          }

          //####################################################################//
          // Post-processing

          // remove context variables
          $context->destroy('IsInline');
          $context->destroy('CurrentNode');
          $context->destroy('CurrentToken');

          //####################################################################//
          // Return

          // the last frame popped is always the root, so $node is the top node here
          return HTMLPurifier_Arborize::flatten($node, $config, $context);
      }
  }
  /**
   * Takes tokens makes them well-formed (balance end tags, etc.)
   *
   * Specification of the armor attributes this strategy uses:
   *
   *      - MakeWellFormed_TagClosedError: This armor field is used to
   *        suppress tag closed errors for certain tokens [TagClosedSuppress],
   *        in particular, if a tag was generated automatically by HTML
   *        Purifier, we may rely on our infrastructure to close it for us
   *        and shouldn't report an error to the user [TagClosedAuto].
   */
  class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
  {
      /**
       * Array stream of tokens being processed.
       * @type HTMLPurifier_Token[]
       */
      protected $tokens;

      /**
       * Current token.
       * @type HTMLPurifier_Token
       */
      protected $token;

      /**
       * Zipper managing the true state.
       * @type HTMLPurifier_Zipper
       */
      protected $zipper;

      /**
       * Current nesting of elements.
       * @type array
       */
      protected $stack;

      /**
       * Injectors active in this stream processing.
       * @type HTMLPurifier_Injector[]
       */
      protected $injectors;

      /**
       * Current instance of HTMLPurifier_Config.
       * @type HTMLPurifier_Config
       */
      protected $config;

      /**
       * Current instance of HTMLPurifier_Context.
       * @type HTMLPurifier_Context
       */
      protected $context;

      /**
       * Balances the token stream: converts mis-declared empty/start tags,
       * auto-closes or wraps elements that are not allowed in their parent,
       * closes tags left open at the end of the document, removes (or
       * escapes) stray end tags, and runs the configured injectors over
       * every token.
       * @param HTMLPurifier_Token[] $tokens
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return HTMLPurifier_Token[]
       * @throws HTMLPurifier_Exception
       */
      public function execute($tokens, $config, $context)
      {
          $definition = $config->getHTMLDefinition();

          // local variables
          $generator = new HTMLPurifier_Generator($config, $context);
          $escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
          // used for autoclose early abortion
          $global_parent_allowed_elements = $definition->info_parent_def->child->getAllowedElements($config);
          $e = $context->get('ErrorCollector', true);
          $i = false; // injector index
          list($zipper, $token) = HTMLPurifier_Zipper::fromArray($tokens);
          if ($token === null) {
              return array();
          }
          $reprocess = false; // whether or not to reprocess the same token
          $stack = array();

          // member variables
          $this->stack =& $stack;
          $this->tokens =& $tokens;
          $this->token =& $token;
          $this->zipper =& $zipper;
          $this->config = $config;
          $this->context = $context;

          // context variables
          $context->register('CurrentNesting', $stack);
          $context->register('InputZipper', $zipper);
          $context->register('CurrentToken', $token);

          // -- begin INJECTOR --

          $this->injectors = array();

          $injectors = $config->getBatch('AutoFormat');
          $def_injectors = $definition->info_injector;
          $custom_injectors = $injectors['Custom'];
          unset($injectors['Custom']); // special case
          foreach ($injectors as $injector => $b) {
              // XXX: Fix with a legitimate lookup table of enabled filters
              if (strpos($injector, '.') !== false) {
                  continue;
              }
              $injector = "HTMLPurifier_Injector_$injector";
              if (!$b) {
                  continue;
              }
              $this->injectors[] = new $injector;
          }
          foreach ($def_injectors as $injector) {
              // assumed to be objects
              $this->injectors[] = $injector;
          }
          foreach ($custom_injectors as $injector) {
              if (!$injector) {
                  continue;
              }
              if (is_string($injector)) {
                  $injector = "HTMLPurifier_Injector_$injector";
                  $injector = new $injector;
              }
              $this->injectors[] = $injector;
          }

          // give the injectors references to the definition and context
          // variables for performance reasons.
          // Injectors whose prepare() reports an error are dropped.  The
          // surviving ones are collected into a fresh list instead of being
          // spliced out of $this->injectors while iterating: splicing
          // renumbers the array, so the index of a second failing injector
          // would no longer point at it and the wrong one would be removed.
          $prepared = array();
          foreach ($this->injectors as $injector) {
              $error = $injector->prepare($config, $context);
              if (!$error) {
                  $prepared[] = $injector;
                  continue;
              }
              trigger_error("Cannot enable {$injector->name} injector because $error is not allowed", E_USER_WARNING);
          }
          $this->injectors = $prepared;

          // -- end INJECTOR --

          // a note on reprocessing:
          //      In order to reduce code duplication, whenever some code needs
          //      to make HTML changes in order to make things "correct", the
          //      new HTML gets sent through the purifier, regardless of its
          //      status. This means that if we add a start token, because it
          //      was totally necessary, we don't have to update nesting; we just
          //      punt ($reprocess = true; continue;) and it does that for us.

          // isset is in loop because $tokens size changes during loop exec
          for (;;
               // only increment if we don't need to reprocess
               $reprocess ? $reprocess = false : $token = $zipper->next($token)) {

              // check for a rewind
              if (is_int($i)) {
                  // possibility: disable rewinding if the current token has a
                  // rewind set on it already. This would offer protection from
                  // infinite loop, but might hinder some advanced rewinding.
                  $rewind_offset = $this->injectors[$i]->getRewindOffset();
                  if (is_int($rewind_offset)) {
                      for ($j = 0; $j < $rewind_offset; $j++) {
                          if (empty($zipper->front)) break;
                          $token = $zipper->prev($token);
                          // indicate that other injectors should not process this token,
                          // but we need to reprocess it.  See Note [Injector skips]
                          unset($token->skip[$i]);
                          $token->rewind = $i;
                          // undo the nesting bookkeeping of the tokens we step back over
                          if ($token instanceof HTMLPurifier_Token_Start) {
                              array_pop($this->stack);
                          } elseif ($token instanceof HTMLPurifier_Token_End) {
                              $this->stack[] = $token->start;
                          }
                      }
                  }
                  $i = false;
              }

              // handle case of document end
              if ($token === null) {
                  // kill processing if stack is empty
                  if (empty($this->stack)) {
                      break;
                  }

                  // peek
                  $top_nesting = array_pop($this->stack);
                  $this->stack[] = $top_nesting;

                  // send error [TagClosedSuppress]
                  if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) {
                      $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting);
                  }

                  // append, don't splice, since this is the end
                  $token = new HTMLPurifier_Token_End($top_nesting->name);

                  // punt!
                  $reprocess = true;
                  continue;
              }

              // quick-check: if it's not a tag, no need to process
              if (empty($token->is_tag)) {
                  if ($token instanceof HTMLPurifier_Token_Text) {
                      foreach ($this->injectors as $i => $injector) {
                          if (isset($token->skip[$i])) {
                              // See Note [Injector skips]
                              continue;
                          }
                          if ($token->rewind !== null && $token->rewind !== $i) {
                              continue;
                          }
                          // XXX $r is a scratch copy handed to the injector;
                          // presumably handleText() receives it by reference and
                          // may replace it with a substitution value (verify
                          // against HTMLPurifier_Injector).
                          $r = $token;
                          $injector->handleText($r);
                          $token = $this->processToken($r, $i);
                          $reprocess = true;
                          break;
                      }
                  }
                  // another possibility is a comment
                  continue;
              }

              if (isset($definition->info[$token->name])) {
                  $type = $definition->info[$token->name]->child->type;
              } else {
                  $type = false; // Type is unknown, treat accordingly
              }

              // quick tag checks: anything that's *not* an end tag
              $ok = false;
              if ($type === 'empty' && $token instanceof HTMLPurifier_Token_Start) {
                  // claims to be a start tag but is empty
                  $token = new HTMLPurifier_Token_Empty(
                      $token->name,
                      $token->attr,
                      $token->line,
                      $token->col,
                      $token->armor
                  );
                  $ok = true;
              } elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) {
                  // claims to be empty but really is a start tag
                  // NB: this assignment is required
                  $old_token = $token;
                  $token = new HTMLPurifier_Token_End($token->name);
                  $token = $this->insertBefore(
                      new HTMLPurifier_Token_Start($old_token->name, $old_token->attr, $old_token->line, $old_token->col, $old_token->armor)
                  );
                  // punt (since we had to modify the input stream in a non-trivial way)
                  $reprocess = true;
                  continue;
              } elseif ($token instanceof HTMLPurifier_Token_Empty) {
                  // real empty token
                  $ok = true;
              } elseif ($token instanceof HTMLPurifier_Token_Start) {
                  // start tag

                  // ...unless they also have to close their parent
                  if (!empty($this->stack)) {

                      // Performance note: you might think that it's rather
                      // inefficient, recalculating the autoclose information
                      // for every tag that a token closes (since when we
                      // do an autoclose, we push a new token into the
                      // stream and then /process/ that, before
                      // re-processing this token.)  But this is
                      // necessary, because an injector can make
                      // arbitrary transformations to the autoclosing
                      // tokens we introduce, so things may have changed
                      // in the meantime.  Also, doing the inefficient thing is
                      // "easy" to reason about (for certain perverse definitions
                      // of "easy")

                      $parent = array_pop($this->stack);
                      $this->stack[] = $parent;

                      $parent_def = null;
                      $parent_elements = null;
                      $autoclose = false;
                      if (isset($definition->info[$parent->name])) {
                          $parent_def = $definition->info[$parent->name];
                          $parent_elements = $parent_def->child->getAllowedElements($config);
                          $autoclose = !isset($parent_elements[$token->name]);
                      }

                      if ($autoclose && $definition->info[$token->name]->wrap) {
                          // Check if an element can be wrapped by another
                          // element to make it valid in a context (for
                          // example, <ul><ul> needs a <li> in between)
                          $wrapname = $definition->info[$token->name]->wrap;
                          $wrapdef = $definition->info[$wrapname];
                          $elements = $wrapdef->child->getAllowedElements($config);
                          if (isset($elements[$token->name]) && isset($parent_elements[$wrapname])) {
                              $newtoken = new HTMLPurifier_Token_Start($wrapname);
                              $token = $this->insertBefore($newtoken);
                              $reprocess = true;
                              continue;
                          }
                      }

                      $carryover = false;
                      if ($autoclose && $parent_def->formatting) {
                          $carryover = true;
                      }

                      if ($autoclose) {
                          // check if this autoclose is doomed to fail
                          // (this rechecks $parent, which is harmless)
                          $autoclose_ok = isset($global_parent_allowed_elements[$token->name]);
                          if (!$autoclose_ok) {
                              foreach ($this->stack as $ancestor) {
                                  $elements = $definition->info[$ancestor->name]->child->getAllowedElements($config);
                                  if (isset($elements[$token->name])) {
                                      $autoclose_ok = true;
                                      break;
                                  }
                                  if ($definition->info[$token->name]->wrap) {
                                      $wrapname = $definition->info[$token->name]->wrap;
                                      $wrapdef = $definition->info[$wrapname];
                                      $wrap_elements = $wrapdef->child->getAllowedElements($config);
                                      if (isset($wrap_elements[$token->name]) && isset($elements[$wrapname])) {
                                          $autoclose_ok = true;
                                          break;
                                      }
                                  }
                              }
                          }
                          if ($autoclose_ok) {
                              // errors need to be updated
                              $new_token = new HTMLPurifier_Token_End($parent->name);
                              $new_token->start = $parent;
                              // [TagClosedSuppress]
                              if ($e && !isset($parent->armor['MakeWellFormed_TagClosedError'])) {
                                  if (!$carryover) {
                                      $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
                                  } else {
                                      $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag carryover', $parent);
                                  }
                              }
                              if ($carryover) {
                                  $element = clone $parent;
                                  // [TagClosedAuto]
                                  $element->armor['MakeWellFormed_TagClosedError'] = true;
                                  $element->carryover = true;
                                  $token = $this->processToken(array($new_token, $token, $element));
                              } else {
                                  $token = $this->insertBefore($new_token);
                              }
                          } else {
                              $token = $this->remove();
                          }
                          $reprocess = true;
                          continue;
                      }

                  }
                  $ok = true;
              }

              if ($ok) {
                  foreach ($this->injectors as $i => $injector) {
                      if (isset($token->skip[$i])) {
                          // See Note [Injector skips]
                          continue;
                      }
                      if ($token->rewind !== null && $token->rewind !== $i) {
                          continue;
                      }
                      $r = $token;
                      $injector->handleElement($r);
                      $token = $this->processToken($r, $i);
                      $reprocess = true;
                      break;
                  }
                  if (!$reprocess) {
                      // ah, nothing interesting happened; do normal processing
                      if ($token instanceof HTMLPurifier_Token_Start) {
                          $this->stack[] = $token;
                      } elseif ($token instanceof HTMLPurifier_Token_End) {
                          throw new HTMLPurifier_Exception(
                              'Improper handling of end tag in start code; possible error in MakeWellFormed'
                          );
                      }
                  }
                  continue;
              }

              // sanity check: we should be dealing with a closing tag
              if (!$token instanceof HTMLPurifier_Token_End) {
                  throw new HTMLPurifier_Exception('Unaccounted for tag token in input stream, bug in HTML Purifier');
              }

              // make sure that we have something open
              if (empty($this->stack)) {
                  if ($escape_invalid_tags) {
                      if ($e) {
                          $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
                      }
                      $token = new HTMLPurifier_Token_Text($generator->generateFromToken($token));
                  } else {
                      if ($e) {
                          $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
                      }
                      $token = $this->remove();
                  }
                  $reprocess = true;
                  continue;
              }

              // first, check for the simplest case: everything closes neatly.
              // Eventually, everything passes through here; if there are problems
              // we modify the input stream accordingly and then punt, so that
              // the tokens get processed again.
              $current_parent = array_pop($this->stack);
              if ($current_parent->name == $token->name) {
                  $token->start = $current_parent;
                  foreach ($this->injectors as $i => $injector) {
                      if (isset($token->skip[$i])) {
                          // See Note [Injector skips]
                          continue;
                      }
                      if ($token->rewind !== null && $token->rewind !== $i) {
                          continue;
                      }
                      $r = $token;
                      $injector->handleEnd($r);
                      $token = $this->processToken($r, $i);
                      // the end tag will be reprocessed, so its start tag
                      // goes back onto the nesting stack
                      $this->stack[] = $current_parent;
                      $reprocess = true;
                      break;
                  }
                  continue;
              }

              // okay, so we're trying to close the wrong tag

              // undo the previous pop
              $this->stack[] = $current_parent;

              // scroll back the entire nest, trying to find our tag.
              // (feature could be to specify how far you'd like to go)
              $size = count($this->stack);
              // -2 because -1 is the last element, but we already checked that
              $skipped_tags = false;
              for ($j = $size - 2; $j >= 0; $j--) {
                  if ($this->stack[$j]->name == $token->name) {
                      $skipped_tags = array_slice($this->stack, $j);
                      break;
                  }
              }

              // we didn't find the tag, so remove
              if ($skipped_tags === false) {
                  if ($escape_invalid_tags) {
                      if ($e) {
                          $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text');
                      }
                      $token = new HTMLPurifier_Token_Text($generator->generateFromToken($token));
                  } else {
                      if ($e) {
                          $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed');
                      }
                      $token = $this->remove();
                  }
                  $reprocess = true;
                  continue;
              }

              // do errors, in REVERSE $j order: a,b,c with </a></b></c>
              $c = count($skipped_tags);
              if ($e) {
                  for ($j = $c - 1; $j > 0; $j--) {
                      // notice we exclude $j == 0, i.e. the current ending tag, from
                      // the errors... [TagClosedSuppress]
                      if (!isset($skipped_tags[$j]->armor['MakeWellFormed_TagClosedError'])) {
                          $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$j]);
                      }
                  }
              }

              // insert tags, in FORWARD $j order: c,b,a with </a></b></c>
              $replace = array($token);
              for ($j = 1; $j < $c; $j++) {
                  // ...as well as from the insertions
                  $new_token = new HTMLPurifier_Token_End($skipped_tags[$j]->name);
                  $new_token->start = $skipped_tags[$j];
                  array_unshift($replace, $new_token);
                  if (isset($definition->info[$new_token->name]) && $definition->info[$new_token->name]->formatting) {
                      // [TagClosedAuto]
                      $element = clone $skipped_tags[$j];
                      $element->carryover = true;
                      $element->armor['MakeWellFormed_TagClosedError'] = true;
                      $replace[] = $element;
                  }
              }
              $token = $this->processToken($replace);
              $reprocess = true;
              continue;
          }

          $context->destroy('CurrentToken');
          $context->destroy('CurrentNesting');
          $context->destroy('InputZipper');

          unset($this->injectors, $this->stack, $this->tokens);
          return $zipper->toArray($token);
      }

      /**
       * Processes arbitrary token values for complicated substitution patterns.
       * In general:
       *
       * If $token is an array, it is a list of tokens to substitute for the
       * current token. These tokens then get individually processed. If there
       * is a leading integer in the list, that integer determines how many
       * tokens from the stream should be removed.
       *
       * If $token is a regular token, it is swapped with the current token.
       *
       * If $token is false, the current token is deleted.
       *
       * If $token is an integer, that number of tokens (with the first token
       * being the current one) will be deleted.
       *
       * @param HTMLPurifier_Token|array|int|bool $token Token substitution value
       * @param int $injector Index (into $this->injectors) of the injector that
       *        performed the substitution; the default of -1 is used when this
       *        is not an injector related operation.
       * @return mixed Second element of the pair returned by the zipper splice;
       *         execute() assigns it as the new current token.
       * @throws HTMLPurifier_Exception
       */
      protected function processToken($token, $injector = -1)
      {
          // Zend OpCache miscompiles $token = array($token), so
          // avoid this pattern.  See: https://github.com/ezyang/htmlpurifier/issues/108

          // normalize forms of token
          if (is_object($token)) {
              $tmp = $token;
              $token = array(1, $tmp);
          }
          if (is_int($token)) {
              $tmp = $token;
              $token = array($tmp);
          }
          if ($token === false) {
              $token = array(1);
          }
          if (!is_array($token)) {
              throw new HTMLPurifier_Exception('Invalid token type from injector');
          }
          // isset() guard: an empty replacement list has no element 0, and
          // reading it would raise an undefined-offset notice.  Such a list
          // is treated like any list without a leading count: delete one token.
          if (!isset($token[0]) || !is_int($token[0])) {
              array_unshift($token, 1);
          }
          if ($token[0] === 0) {
              throw new HTMLPurifier_Exception('Deleting zero tokens is not valid');
          }

          // $token is now an array with the following form:
          //      array(number nodes to delete, new node 1, new node 2, ...)

          $delete = array_shift($token);
          list($old, $r) = $this->zipper->splice($this->token, $delete, $token);

          if ($injector > -1) {
              // See Note [Injector skips]
              // Determine appropriate skips.  Here's what the code does:
              //  *If* we deleted one or more tokens, copy the skips
              //  of those tokens into the skips of the new tokens (in $token).
              //  Also, mark the newly inserted tokens as having come from
              //  $injector.
              $oldskip = isset($old[0]) ? $old[0]->skip : array();
              foreach ($token as $object) {
                  $object->skip = $oldskip;
                  $object->skip[$injector] = true;
              }
          }

          return $r;
      }

      /**
       * Inserts a token before the current token. Cursor now points to
       * this token.  You must reprocess after this.
       * @param HTMLPurifier_Token $token
       * @return mixed Second element of the pair returned by the zipper
       *         splice; callers assign it as the new current token.
       */
      private function insertBefore($token)
      {
          // NB not $this->zipper->insertBefore(), due to positioning
          // differences
          $splice = $this->zipper->splice($this->token, 0, array($token));

          return $splice[1];
      }

      /**
       * Removes current token. Cursor now points to new token occupying previously
       * occupied space.  You must reprocess after this.
       * @return mixed Whatever the zipper's delete() returns; callers assign
       *         it as the new current token.
       */
      private function remove()
      {
          return $this->zipper->delete();
      }
  }
  18081. // Note [Injector skips]
  18082. // ~~~~~~~~~~~~~~~~~~~~~
  18083. // When I originally designed this class, the idea behind the 'skip'
  18084. // property of HTMLPurifier_Token was to help avoid infinite loops
  18085. // in injector processing. For example, suppose you wrote an injector
  18086. // that bolded swear words. Naively, you might write it so that
  18087. // whenever you saw ****, you replaced it with <strong>****</strong>.
  18088. //
  18089. // When this happens, we will reprocess all of the tokens with the
  18090. // other injectors. Now there is an opportunity for infinite loop:
  18091. // if we rerun the swear-word injector on these tokens, we might
  18092. // see **** and then reprocess again to get
  18093. // <strong><strong>****</strong></strong> ad infinitum.
  18094. //
  18095. // Thus, the idea of a skip is that once we process a token with
  18096. // an injector, we mark all of those tokens as having "come from"
  18097. // the injector, and we never run the injector again on these
  18098. // tokens.
  18099. //
  18100. // There were two more complications, however:
  18101. //
  18102. // - With HTMLPurifier_Injector_RemoveEmpty, we noticed that if
  18103. // you had <b><i></i></b>, after you removed the <i></i>, you
  18104. // really would like this injector to go back and reprocess
  18105. // the <b> tag, discovering that it is now empty and can be
  18106. // removed. So we reintroduced the possibility of infinite looping
  18107. // by adding a "rewind" function, which let you go back to an
  18108. // earlier point in the token stream and reprocess it with injectors.
  18109. // Needless to say, we need to UN-skip the token so it gets
  18110. // reprocessed.
  18111. //
  18112. // - Suppose that you successfully process a token, replace it with
  18113. // one with your skip mark, but now another injector wants to
  18114. // process the skipped token with another token. Should you continue
  18115. // to skip that new token, or reprocess it? If you reprocess,
  18116. // you can end up with an infinite loop where one injector converts
  18117. // <a> to <b>, and then another injector converts it back. So
  18118. // we inherit the skips, but for some reason, I thought that we
  18119. // should inherit the skip from the first token of the token
  18120. // that we deleted. Why? Well, it seems to work OK.
  18121. //
  18122. // If I were to redesign this functionality, I would absolutely not
  18123. // go about doing it this way: the semantics are just not very well
  18124. // defined, and in any case you probably wanted to operate on trees,
  18125. // not token streams.
  18126. /**
  18127. * Removes all unrecognized tags from the list of tokens.
  18128. *
  18129. * This strategy iterates through all the tokens and removes unrecognized
  18130. * tokens. If a token is not recognized but a TagTransform is defined for
  18131. * that element, the element will be transformed accordingly.
  18132. */
  18133. class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
  18134. {
  18135. /**
  18136. * @param HTMLPurifier_Token[] $tokens
  18137. * @param HTMLPurifier_Config $config
  18138. * @param HTMLPurifier_Context $context
  18139. * @return array|HTMLPurifier_Token[]
  18140. */
  18141. public function execute($tokens, $config, $context)
  18142. {
  18143. $definition = $config->getHTMLDefinition();
  18144. $generator = new HTMLPurifier_Generator($config, $context);
  18145. $result = array();
  18146. $escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
  18147. $remove_invalid_img = $config->get('Core.RemoveInvalidImg');
  18148. // currently only used to determine if comments should be kept
  18149. $trusted = $config->get('HTML.Trusted');
  18150. $comment_lookup = $config->get('HTML.AllowedComments');
  18151. $comment_regexp = $config->get('HTML.AllowedCommentsRegexp');
  18152. $check_comments = $comment_lookup !== array() || $comment_regexp !== null;
  18153. $remove_script_contents = $config->get('Core.RemoveScriptContents');
  18154. $hidden_elements = $config->get('Core.HiddenElements');
  18155. // remove script contents compatibility
  18156. if ($remove_script_contents === true) {
  18157. $hidden_elements['script'] = true;
  18158. } elseif ($remove_script_contents === false && isset($hidden_elements['script'])) {
  18159. unset($hidden_elements['script']);
  18160. }
  18161. $attr_validator = new HTMLPurifier_AttrValidator();
  18162. // removes tokens until it reaches a closing tag with its value
  18163. $remove_until = false;
  18164. // converts comments into text tokens when this is equal to a tag name
  18165. $textify_comments = false;
  18166. $token = false;
  18167. $context->register('CurrentToken', $token);
  18168. $e = false;
  18169. if ($config->get('Core.CollectErrors')) {
  18170. $e =& $context->get('ErrorCollector');
  18171. }
  18172. foreach ($tokens as $token) {
  18173. if ($remove_until) {
  18174. if (empty($token->is_tag) || $token->name !== $remove_until) {
  18175. continue;
  18176. }
  18177. }
  18178. if (!empty($token->is_tag)) {
  18179. // DEFINITION CALL
  18180. // before any processing, try to transform the element
  18181. if (isset($definition->info_tag_transform[$token->name])) {
  18182. $original_name = $token->name;
  18183. // there is a transformation for this tag
  18184. // DEFINITION CALL
  18185. $token = $definition->
  18186. info_tag_transform[$token->name]->transform($token, $config, $context);
  18187. if ($e) {
  18188. $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Tag transform', $original_name);
  18189. }
  18190. }
  18191. if (isset($definition->info[$token->name])) {
  18192. // mostly everything's good, but
  18193. // we need to make sure required attributes are in order
  18194. if (($token instanceof HTMLPurifier_Token_Start || $token instanceof HTMLPurifier_Token_Empty) &&
  18195. $definition->info[$token->name]->required_attr &&
  18196. ($token->name != 'img' || $remove_invalid_img) // ensure config option still works
  18197. ) {
  18198. $attr_validator->validateToken($token, $config, $context);
  18199. $ok = true;
  18200. foreach ($definition->info[$token->name]->required_attr as $name) {
  18201. if (!isset($token->attr[$name])) {
  18202. $ok = false;
  18203. break;
  18204. }
  18205. }
  18206. if (!$ok) {
  18207. if ($e) {
  18208. $e->send(
  18209. E_ERROR,
  18210. 'Strategy_RemoveForeignElements: Missing required attribute',
  18211. $name
  18212. );
  18213. }
  18214. continue;
  18215. }
  18216. $token->armor['ValidateAttributes'] = true;
  18217. }
  18218. if (isset($hidden_elements[$token->name]) && $token instanceof HTMLPurifier_Token_Start) {
  18219. $textify_comments = $token->name;
  18220. } elseif ($token->name === $textify_comments && $token instanceof HTMLPurifier_Token_End) {
  18221. $textify_comments = false;
  18222. }
  18223. } elseif ($escape_invalid_tags) {
  18224. // invalid tag, generate HTML representation and insert in
  18225. if ($e) {
  18226. $e->send(E_WARNING, 'Strategy_RemoveForeignElements: Foreign element to text');
  18227. }
  18228. $token = new HTMLPurifier_Token_Text(
  18229. $generator->generateFromToken($token)
  18230. );
  18231. } else {
  18232. // check if we need to destroy all of the tag's children
  18233. // CAN BE GENERICIZED
  18234. if (isset($hidden_elements[$token->name])) {
  18235. if ($token instanceof HTMLPurifier_Token_Start) {
  18236. $remove_until = $token->name;
  18237. } elseif ($token instanceof HTMLPurifier_Token_Empty) {
  18238. // do nothing: we're still looking
  18239. } else {
  18240. $remove_until = false;
  18241. }
  18242. if ($e) {
  18243. $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign meta element removed');
  18244. }
  18245. } else {
  18246. if ($e) {
  18247. $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed');
  18248. }
  18249. }
  18250. continue;
  18251. }
  18252. } elseif ($token instanceof HTMLPurifier_Token_Comment) {
  18253. // textify comments in script tags when they are allowed
  18254. if ($textify_comments !== false) {
  18255. $data = $token->data;
  18256. $token = new HTMLPurifier_Token_Text($data);
  18257. } elseif ($trusted || $check_comments) {
  18258. // always cleanup comments
  18259. $trailing_hyphen = false;
  18260. if ($e) {
  18261. // perform check whether or not there's a trailing hyphen
  18262. if (substr($token->data, -1) == '-') {
  18263. $trailing_hyphen = true;
  18264. }
  18265. }
  18266. $token->data = rtrim($token->data, '-');
  18267. $found_double_hyphen = false;
  18268. while (strpos($token->data, '--') !== false) {
  18269. $found_double_hyphen = true;
  18270. $token->data = str_replace('--', '-', $token->data);
  18271. }
  18272. if ($trusted || !empty($comment_lookup[trim($token->data)]) ||
  18273. ($comment_regexp !== null && preg_match($comment_regexp, trim($token->data)))) {
  18274. // OK good
  18275. if ($e) {
  18276. if ($trailing_hyphen) {
  18277. $e->send(
  18278. E_NOTICE,
  18279. 'Strategy_RemoveForeignElements: Trailing hyphen in comment removed'
  18280. );
  18281. }
  18282. if ($found_double_hyphen) {
  18283. $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Hyphens in comment collapsed');
  18284. }
  18285. }
  18286. } else {
  18287. if ($e) {
  18288. $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed');
  18289. }
  18290. continue;
  18291. }
  18292. } else {
  18293. // strip comments
  18294. if ($e) {
  18295. $e->send(E_NOTICE, 'Strategy_RemoveForeignElements: Comment removed');
  18296. }
  18297. continue;
  18298. }
  18299. } elseif ($token instanceof HTMLPurifier_Token_Text) {
  18300. } else {
  18301. continue;
  18302. }
  18303. $result[] = $token;
  18304. }
  18305. if ($remove_until && $e) {
  18306. // we removed tokens until the end, throw error
  18307. $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Token removed to end', $remove_until);
  18308. }
  18309. $context->destroy('CurrentToken');
  18310. return $result;
  18311. }
  18312. }
  18313. /**
  18314. * Validate all attributes in the tokens.
  18315. */
  18316. class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
  18317. {
  18318. /**
  18319. * @param HTMLPurifier_Token[] $tokens
  18320. * @param HTMLPurifier_Config $config
  18321. * @param HTMLPurifier_Context $context
  18322. * @return HTMLPurifier_Token[]
  18323. */
  18324. public function execute($tokens, $config, $context)
  18325. {
  18326. // setup validator
  18327. $validator = new HTMLPurifier_AttrValidator();
  18328. $token = false;
  18329. $context->register('CurrentToken', $token);
  18330. foreach ($tokens as $key => $token) {
  18331. // only process tokens that have attributes,
  18332. // namely start and empty tags
  18333. if (!$token instanceof HTMLPurifier_Token_Start && !$token instanceof HTMLPurifier_Token_Empty) {
  18334. continue;
  18335. }
  18336. // skip tokens that are armored
  18337. if (!empty($token->armor['ValidateAttributes'])) {
  18338. continue;
  18339. }
  18340. // note that we have no facilities here for removing tokens
  18341. $validator->validateToken($token, $config, $context);
  18342. }
  18343. $context->destroy('CurrentToken');
  18344. return $tokens;
  18345. }
  18346. }
  18347. /**
  18348. * Transforms FONT tags to the proper form (SPAN with CSS styling)
  18349. *
  18350. * This transformation takes the three proprietary attributes of FONT and
  18351. * transforms them into their corresponding CSS attributes. These are color,
  18352. * face, and size.
  18353. *
  18354. * @note Size is an interesting case because it doesn't map cleanly to CSS.
  18355. * Thanks to
  18356. * http://style.cleverchimp.com/font_size_intervals/altintervals.html
  18357. * for reasonable mappings.
  18358. * @warning This doesn't work completely correctly; specifically, this
  18359. * TagTransform operates before well-formedness is enforced, so
  18360. * the "active formatting elements" algorithm doesn't get applied.
  18361. */
  18362. class HTMLPurifier_TagTransform_Font extends HTMLPurifier_TagTransform
  18363. {
  18364. /**
  18365. * @type string
  18366. */
  18367. public $transform_to = 'span';
  18368. /**
  18369. * @type array
  18370. */
  18371. protected $_size_lookup = array(
  18372. '0' => 'xx-small',
  18373. '1' => 'xx-small',
  18374. '2' => 'small',
  18375. '3' => 'medium',
  18376. '4' => 'large',
  18377. '5' => 'x-large',
  18378. '6' => 'xx-large',
  18379. '7' => '300%',
  18380. '-1' => 'smaller',
  18381. '-2' => '60%',
  18382. '+1' => 'larger',
  18383. '+2' => '150%',
  18384. '+3' => '200%',
  18385. '+4' => '300%'
  18386. );
  18387. /**
  18388. * @param HTMLPurifier_Token_Tag $tag
  18389. * @param HTMLPurifier_Config $config
  18390. * @param HTMLPurifier_Context $context
  18391. * @return HTMLPurifier_Token_End|string
  18392. */
  18393. public function transform($tag, $config, $context)
  18394. {
  18395. if ($tag instanceof HTMLPurifier_Token_End) {
  18396. $new_tag = clone $tag;
  18397. $new_tag->name = $this->transform_to;
  18398. return $new_tag;
  18399. }
  18400. $attr = $tag->attr;
  18401. $prepend_style = '';
  18402. // handle color transform
  18403. if (isset($attr['color'])) {
  18404. $prepend_style .= 'color:' . $attr['color'] . ';';
  18405. unset($attr['color']);
  18406. }
  18407. // handle face transform
  18408. if (isset($attr['face'])) {
  18409. $prepend_style .= 'font-family:' . $attr['face'] . ';';
  18410. unset($attr['face']);
  18411. }
  18412. // handle size transform
  18413. if (isset($attr['size'])) {
  18414. // normalize large numbers
  18415. if ($attr['size'] !== '') {
  18416. if ($attr['size']{0} == '+' || $attr['size']{0} == '-') {
  18417. $size = (int)$attr['size'];
  18418. if ($size < -2) {
  18419. $attr['size'] = '-2';
  18420. }
  18421. if ($size > 4) {
  18422. $attr['size'] = '+4';
  18423. }
  18424. } else {
  18425. $size = (int)$attr['size'];
  18426. if ($size > 7) {
  18427. $attr['size'] = '7';
  18428. }
  18429. }
  18430. }
  18431. if (isset($this->_size_lookup[$attr['size']])) {
  18432. $prepend_style .= 'font-size:' .
  18433. $this->_size_lookup[$attr['size']] . ';';
  18434. }
  18435. unset($attr['size']);
  18436. }
  18437. if ($prepend_style) {
  18438. $attr['style'] = isset($attr['style']) ?
  18439. $prepend_style . $attr['style'] :
  18440. $prepend_style;
  18441. }
  18442. $new_tag = clone $tag;
  18443. $new_tag->name = $this->transform_to;
  18444. $new_tag->attr = $attr;
  18445. return $new_tag;
  18446. }
  18447. }
  18448. /**
  18449. * Simple transformation, just change tag name to something else,
  18450. * and possibly add some styling. This will cover most of the deprecated
  18451. * tag cases.
  18452. */
  18453. class HTMLPurifier_TagTransform_Simple extends HTMLPurifier_TagTransform
  18454. {
  18455. /**
  18456. * @type string
  18457. */
  18458. protected $style;
  18459. /**
  18460. * @param string $transform_to Tag name to transform to.
  18461. * @param string $style CSS style to add to the tag
  18462. */
  18463. public function __construct($transform_to, $style = null)
  18464. {
  18465. $this->transform_to = $transform_to;
  18466. $this->style = $style;
  18467. }
  18468. /**
  18469. * @param HTMLPurifier_Token_Tag $tag
  18470. * @param HTMLPurifier_Config $config
  18471. * @param HTMLPurifier_Context $context
  18472. * @return string
  18473. */
  18474. public function transform($tag, $config, $context)
  18475. {
  18476. $new_tag = clone $tag;
  18477. $new_tag->name = $this->transform_to;
  18478. if (!is_null($this->style) &&
  18479. ($new_tag instanceof HTMLPurifier_Token_Start || $new_tag instanceof HTMLPurifier_Token_Empty)
  18480. ) {
  18481. $this->prependCSS($new_tag->attr, $this->style);
  18482. }
  18483. return $new_tag;
  18484. }
  18485. }
  18486. /**
  18487. * Concrete comment token class. Generally will be ignored.
  18488. */
  18489. class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
  18490. {
  18491. /**
  18492. * Character data within comment.
  18493. * @type string
  18494. */
  18495. public $data;
  18496. /**
  18497. * @type bool
  18498. */
  18499. public $is_whitespace = true;
  18500. /**
  18501. * Transparent constructor.
  18502. *
  18503. * @param string $data String comment data.
  18504. * @param int $line
  18505. * @param int $col
  18506. */
  18507. public function __construct($data, $line = null, $col = null)
  18508. {
  18509. $this->data = $data;
  18510. $this->line = $line;
  18511. $this->col = $col;
  18512. }
  18513. public function toNode() {
  18514. return new HTMLPurifier_Node_Comment($this->data, $this->line, $this->col);
  18515. }
  18516. }
  18517. /**
  18518. * Abstract class of a tag token (start, end or empty), and its behavior.
  18519. */
  18520. abstract class HTMLPurifier_Token_Tag extends HTMLPurifier_Token
  18521. {
  18522. /**
  18523. * Static bool marker that indicates the class is a tag.
  18524. *
  18525. * This allows us to check objects with <tt>!empty($obj->is_tag)</tt>
  18526. * without having to use a function call <tt>is_a()</tt>.
  18527. * @type bool
  18528. */
  18529. public $is_tag = true;
  18530. /**
  18531. * The lower-case name of the tag, like 'a', 'b' or 'blockquote'.
  18532. *
  18533. * @note Strictly speaking, XML tags are case sensitive, so we shouldn't
  18534. * be lower-casing them, but these tokens cater to HTML tags, which are
  18535. * insensitive.
  18536. * @type string
  18537. */
  18538. public $name;
  18539. /**
  18540. * Associative array of the tag's attributes.
  18541. * @type array
  18542. */
  18543. public $attr = array();
  18544. /**
  18545. * Non-overloaded constructor, which lower-cases passed tag name.
  18546. *
  18547. * @param string $name String name.
  18548. * @param array $attr Associative array of attributes.
  18549. * @param int $line
  18550. * @param int $col
  18551. * @param array $armor
  18552. */
  18553. public function __construct($name, $attr = array(), $line = null, $col = null, $armor = array())
  18554. {
  18555. $this->name = ctype_lower($name) ? $name : strtolower($name);
  18556. foreach ($attr as $key => $value) {
  18557. // normalization only necessary when key is not lowercase
  18558. if (!ctype_lower($key)) {
  18559. $new_key = strtolower($key);
  18560. if (!isset($attr[$new_key])) {
  18561. $attr[$new_key] = $attr[$key];
  18562. }
  18563. if ($new_key !== $key) {
  18564. unset($attr[$key]);
  18565. }
  18566. }
  18567. }
  18568. $this->attr = $attr;
  18569. $this->line = $line;
  18570. $this->col = $col;
  18571. $this->armor = $armor;
  18572. }
  18573. public function toNode() {
  18574. return new HTMLPurifier_Node_Element($this->name, $this->attr, $this->line, $this->col, $this->armor);
  18575. }
  18576. }
  18577. /**
  18578. * Concrete empty token class.
  18579. */
  18580. class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
  18581. {
  18582. public function toNode() {
  18583. $n = parent::toNode();
  18584. $n->empty = true;
  18585. return $n;
  18586. }
  18587. }
  18588. /**
  18589. * Concrete end token class.
  18590. *
  18591. * @warning This class accepts attributes even though end tags cannot. This
  18592. * is for optimization reasons, as under normal circumstances, the Lexers
  18593. * do not pass attributes.
  18594. */
  18595. class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
  18596. {
  18597. /**
  18598. * Token that started this node.
  18599. * Added by MakeWellFormed. Please do not edit this!
  18600. * @type HTMLPurifier_Token
  18601. */
  18602. public $start;
  18603. public function toNode() {
  18604. throw new Exception("HTMLPurifier_Token_End->toNode not supported!");
  18605. }
  18606. }
  18607. /**
  18608. * Concrete start token class.
  18609. */
  18610. class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
  18611. {
  18612. }
  18613. /**
  18614. * Concrete text token class.
  18615. *
  18616. * Text tokens comprise of regular parsed character data (PCDATA) and raw
  18617. * character data (from the CDATA sections). Internally, their
  18618. * data is parsed with all entities expanded. Surprisingly, the text token
  18619. * does have a "tag name" called #PCDATA, which is how the DTD represents it
  18620. * in permissible child nodes.
  18621. */
  18622. class HTMLPurifier_Token_Text extends HTMLPurifier_Token
  18623. {
  18624. /**
  18625. * @type string
  18626. */
  18627. public $name = '#PCDATA';
  18628. /**< PCDATA tag name compatible with DTD. */
  18629. /**
  18630. * @type string
  18631. */
  18632. public $data;
  18633. /**< Parsed character data of text. */
  18634. /**
  18635. * @type bool
  18636. */
  18637. public $is_whitespace;
  18638. /**< Bool indicating if node is whitespace. */
  18639. /**
  18640. * Constructor, accepts data and determines if it is whitespace.
  18641. * @param string $data String parsed character data.
  18642. * @param int $line
  18643. * @param int $col
  18644. */
  18645. public function __construct($data, $line = null, $col = null)
  18646. {
  18647. $this->data = $data;
  18648. $this->is_whitespace = ctype_space($data);
  18649. $this->line = $line;
  18650. $this->col = $col;
  18651. }
  18652. public function toNode() {
  18653. return new HTMLPurifier_Node_Text($this->data, $this->is_whitespace, $this->line, $this->col);
  18654. }
  18655. }
  18656. class HTMLPurifier_URIFilter_DisableExternal extends HTMLPurifier_URIFilter
  18657. {
  18658. /**
  18659. * @type string
  18660. */
  18661. public $name = 'DisableExternal';
  18662. /**
  18663. * @type array
  18664. */
  18665. protected $ourHostParts = false;
  18666. /**
  18667. * @param HTMLPurifier_Config $config
  18668. * @return void
  18669. */
  18670. public function prepare($config)
  18671. {
  18672. $our_host = $config->getDefinition('URI')->host;
  18673. if ($our_host !== null) {
  18674. $this->ourHostParts = array_reverse(explode('.', $our_host));
  18675. }
  18676. }
  18677. /**
  18678. * @param HTMLPurifier_URI $uri Reference
  18679. * @param HTMLPurifier_Config $config
  18680. * @param HTMLPurifier_Context $context
  18681. * @return bool
  18682. */
  18683. public function filter(&$uri, $config, $context)
  18684. {
  18685. if (is_null($uri->host)) {
  18686. return true;
  18687. }
  18688. if ($this->ourHostParts === false) {
  18689. return false;
  18690. }
  18691. $host_parts = array_reverse(explode('.', $uri->host));
  18692. foreach ($this->ourHostParts as $i => $x) {
  18693. if (!isset($host_parts[$i])) {
  18694. return false;
  18695. }
  18696. if ($host_parts[$i] != $this->ourHostParts[$i]) {
  18697. return false;
  18698. }
  18699. }
  18700. return true;
  18701. }
  18702. }
  18703. class HTMLPurifier_URIFilter_DisableExternalResources extends HTMLPurifier_URIFilter_DisableExternal
  18704. {
  18705. /**
  18706. * @type string
  18707. */
  18708. public $name = 'DisableExternalResources';
  18709. /**
  18710. * @param HTMLPurifier_URI $uri
  18711. * @param HTMLPurifier_Config $config
  18712. * @param HTMLPurifier_Context $context
  18713. * @return bool
  18714. */
  18715. public function filter(&$uri, $config, $context)
  18716. {
  18717. if (!$context->get('EmbeddedURI', true)) {
  18718. return true;
  18719. }
  18720. return parent::filter($uri, $config, $context);
  18721. }
  18722. }
  18723. class HTMLPurifier_URIFilter_DisableResources extends HTMLPurifier_URIFilter
  18724. {
  18725. /**
  18726. * @type string
  18727. */
  18728. public $name = 'DisableResources';
  18729. /**
  18730. * @param HTMLPurifier_URI $uri
  18731. * @param HTMLPurifier_Config $config
  18732. * @param HTMLPurifier_Context $context
  18733. * @return bool
  18734. */
  18735. public function filter(&$uri, $config, $context)
  18736. {
  18737. return !$context->get('EmbeddedURI', true);
  18738. }
  18739. }
  18740. // It's not clear to me whether or not Punycode means that hostnames
  18741. // do not have canonical forms anymore. As far as I can tell, it's
  18742. // not a problem (punycoding should be identity when no Unicode
  18743. // points are involved), but I'm not 100% sure
  18744. class HTMLPurifier_URIFilter_HostBlacklist extends HTMLPurifier_URIFilter
  18745. {
  18746. /**
  18747. * @type string
  18748. */
  18749. public $name = 'HostBlacklist';
  18750. /**
  18751. * @type array
  18752. */
  18753. protected $blacklist = array();
  18754. /**
  18755. * @param HTMLPurifier_Config $config
  18756. * @return bool
  18757. */
  18758. public function prepare($config)
  18759. {
  18760. $this->blacklist = $config->get('URI.HostBlacklist');
  18761. return true;
  18762. }
  18763. /**
  18764. * @param HTMLPurifier_URI $uri
  18765. * @param HTMLPurifier_Config $config
  18766. * @param HTMLPurifier_Context $context
  18767. * @return bool
  18768. */
  18769. public function filter(&$uri, $config, $context)
  18770. {
  18771. foreach ($this->blacklist as $blacklisted_host_fragment) {
  18772. if (strpos($uri->host, $blacklisted_host_fragment) !== false) {
  18773. return false;
  18774. }
  18775. }
  18776. return true;
  18777. }
  18778. }
  18779. // does not support network paths
  18780. class HTMLPurifier_URIFilter_MakeAbsolute extends HTMLPurifier_URIFilter
  18781. {
  18782. /**
  18783. * @type string
  18784. */
  18785. public $name = 'MakeAbsolute';
  18786. /**
  18787. * @type
  18788. */
  18789. protected $base;
  18790. /**
  18791. * @type array
  18792. */
  18793. protected $basePathStack = array();
  18794. /**
  18795. * @param HTMLPurifier_Config $config
  18796. * @return bool
  18797. */
  18798. public function prepare($config)
  18799. {
  18800. $def = $config->getDefinition('URI');
  18801. $this->base = $def->base;
  18802. if (is_null($this->base)) {
  18803. trigger_error(
  18804. 'URI.MakeAbsolute is being ignored due to lack of ' .
  18805. 'value for URI.Base configuration',
  18806. E_USER_WARNING
  18807. );
  18808. return false;
  18809. }
  18810. $this->base->fragment = null; // fragment is invalid for base URI
  18811. $stack = explode('/', $this->base->path);
  18812. array_pop($stack); // discard last segment
  18813. $stack = $this->_collapseStack($stack); // do pre-parsing
  18814. $this->basePathStack = $stack;
  18815. return true;
  18816. }
  18817. /**
  18818. * @param HTMLPurifier_URI $uri
  18819. * @param HTMLPurifier_Config $config
  18820. * @param HTMLPurifier_Context $context
  18821. * @return bool
  18822. */
  18823. public function filter(&$uri, $config, $context)
  18824. {
  18825. if (is_null($this->base)) {
  18826. return true;
  18827. } // abort early
  18828. if ($uri->path === '' && is_null($uri->scheme) &&
  18829. is_null($uri->host) && is_null($uri->query) && is_null($uri->fragment)) {
  18830. // reference to current document
  18831. $uri = clone $this->base;
  18832. return true;
  18833. }
  18834. if (!is_null($uri->scheme)) {
  18835. // absolute URI already: don't change
  18836. if (!is_null($uri->host)) {
  18837. return true;
  18838. }
  18839. $scheme_obj = $uri->getSchemeObj($config, $context);
  18840. if (!$scheme_obj) {
  18841. // scheme not recognized
  18842. return false;
  18843. }
  18844. if (!$scheme_obj->hierarchical) {
  18845. // non-hierarchal URI with explicit scheme, don't change
  18846. return true;
  18847. }
  18848. // special case: had a scheme but always is hierarchical and had no authority
  18849. }
  18850. if (!is_null($uri->host)) {
  18851. // network path, don't bother
  18852. return true;
  18853. }
  18854. if ($uri->path === '') {
  18855. $uri->path = $this->base->path;
  18856. } elseif ($uri->path[0] !== '/') {
  18857. // relative path, needs more complicated processing
  18858. $stack = explode('/', $uri->path);
  18859. $new_stack = array_merge($this->basePathStack, $stack);
  18860. if ($new_stack[0] !== '' && !is_null($this->base->host)) {
  18861. array_unshift($new_stack, '');
  18862. }
  18863. $new_stack = $this->_collapseStack($new_stack);
  18864. $uri->path = implode('/', $new_stack);
  18865. } else {
  18866. // absolute path, but still we should collapse
  18867. $uri->path = implode('/', $this->_collapseStack(explode('/', $uri->path)));
  18868. }
  18869. // re-combine
  18870. $uri->scheme = $this->base->scheme;
  18871. if (is_null($uri->userinfo)) {
  18872. $uri->userinfo = $this->base->userinfo;
  18873. }
  18874. if (is_null($uri->host)) {
  18875. $uri->host = $this->base->host;
  18876. }
  18877. if (is_null($uri->port)) {
  18878. $uri->port = $this->base->port;
  18879. }
  18880. return true;
  18881. }
  18882. /**
  18883. * Resolve dots and double-dots in a path stack
  18884. * @param array $stack
  18885. * @return array
  18886. */
  18887. private function _collapseStack($stack)
  18888. {
  18889. $result = array();
  18890. $is_folder = false;
  18891. for ($i = 0; isset($stack[$i]); $i++) {
  18892. $is_folder = false;
  18893. // absorb an internally duplicated slash
  18894. if ($stack[$i] == '' && $i && isset($stack[$i + 1])) {
  18895. continue;
  18896. }
  18897. if ($stack[$i] == '..') {
  18898. if (!empty($result)) {
  18899. $segment = array_pop($result);
  18900. if ($segment === '' && empty($result)) {
  18901. // error case: attempted to back out too far:
  18902. // restore the leading slash
  18903. $result[] = '';
  18904. } elseif ($segment === '..') {
  18905. $result[] = '..'; // cannot remove .. with ..
  18906. }
  18907. } else {
  18908. // relative path, preserve the double-dots
  18909. $result[] = '..';
  18910. }
  18911. $is_folder = true;
  18912. continue;
  18913. }
  18914. if ($stack[$i] == '.') {
  18915. // silently absorb
  18916. $is_folder = true;
  18917. continue;
  18918. }
  18919. $result[] = $stack[$i];
  18920. }
  18921. if ($is_folder) {
  18922. $result[] = '';
  18923. }
  18924. return $result;
  18925. }
  18926. }
  18927. class HTMLPurifier_URIFilter_Munge extends HTMLPurifier_URIFilter
  18928. {
  18929. /**
  18930. * @type string
  18931. */
  18932. public $name = 'Munge';
  18933. /**
  18934. * @type bool
  18935. */
  18936. public $post = true;
  18937. /**
  18938. * @type string
  18939. */
  18940. private $target;
  18941. /**
  18942. * @type HTMLPurifier_URIParser
  18943. */
  18944. private $parser;
  18945. /**
  18946. * @type bool
  18947. */
  18948. private $doEmbed;
  18949. /**
  18950. * @type string
  18951. */
  18952. private $secretKey;
  18953. /**
  18954. * @type array
  18955. */
  18956. protected $replace = array();
  18957. /**
  18958. * @param HTMLPurifier_Config $config
  18959. * @return bool
  18960. */
  18961. public function prepare($config)
  18962. {
  18963. $this->target = $config->get('URI.' . $this->name);
  18964. $this->parser = new HTMLPurifier_URIParser();
  18965. $this->doEmbed = $config->get('URI.MungeResources');
  18966. $this->secretKey = $config->get('URI.MungeSecretKey');
  18967. if ($this->secretKey && !function_exists('hash_hmac')) {
  18968. throw new Exception("Cannot use %URI.MungeSecretKey without hash_hmac support.");
  18969. }
  18970. return true;
  18971. }
  18972. /**
  18973. * @param HTMLPurifier_URI $uri
  18974. * @param HTMLPurifier_Config $config
  18975. * @param HTMLPurifier_Context $context
  18976. * @return bool
  18977. */
  18978. public function filter(&$uri, $config, $context)
  18979. {
  18980. if ($context->get('EmbeddedURI', true) && !$this->doEmbed) {
  18981. return true;
  18982. }
  18983. $scheme_obj = $uri->getSchemeObj($config, $context);
  18984. if (!$scheme_obj) {
  18985. return true;
  18986. } // ignore unknown schemes, maybe another postfilter did it
  18987. if (!$scheme_obj->browsable) {
  18988. return true;
  18989. } // ignore non-browseable schemes, since we can't munge those in a reasonable way
  18990. if ($uri->isBenign($config, $context)) {
  18991. return true;
  18992. } // don't redirect if a benign URL
  18993. $this->makeReplace($uri, $config, $context);
  18994. $this->replace = array_map('rawurlencode', $this->replace);
  18995. $new_uri = strtr($this->target, $this->replace);
  18996. $new_uri = $this->parser->parse($new_uri);
  18997. // don't redirect if the target host is the same as the
  18998. // starting host
  18999. if ($uri->host === $new_uri->host) {
  19000. return true;
  19001. }
  19002. $uri = $new_uri; // overwrite
  19003. return true;
  19004. }
  19005. /**
  19006. * @param HTMLPurifier_URI $uri
  19007. * @param HTMLPurifier_Config $config
  19008. * @param HTMLPurifier_Context $context
  19009. */
  19010. protected function makeReplace($uri, $config, $context)
  19011. {
  19012. $string = $uri->toString();
  19013. // always available
  19014. $this->replace['%s'] = $string;
  19015. $this->replace['%r'] = $context->get('EmbeddedURI', true);
  19016. $token = $context->get('CurrentToken', true);
  19017. $this->replace['%n'] = $token ? $token->name : null;
  19018. $this->replace['%m'] = $context->get('CurrentAttr', true);
  19019. $this->replace['%p'] = $context->get('CurrentCSSProperty', true);
  19020. // not always available
  19021. if ($this->secretKey) {
  19022. $this->replace['%t'] = hash_hmac("sha256", $string, $this->secretKey);
  19023. }
  19024. }
  19025. }
  19026. /**
  19027. * Implements safety checks for safe iframes.
  19028. *
  19029. * @warning This filter is *critical* for ensuring that %HTML.SafeIframe
  19030. * works safely.
  19031. */
  19032. class HTMLPurifier_URIFilter_SafeIframe extends HTMLPurifier_URIFilter
  19033. {
  19034. /**
  19035. * @type string
  19036. */
  19037. public $name = 'SafeIframe';
  19038. /**
  19039. * @type bool
  19040. */
  19041. public $always_load = true;
  19042. /**
  19043. * @type string
  19044. */
  19045. protected $regexp = null;
  19046. // XXX: The not so good bit about how this is all set up now is we
  19047. // can't check HTML.SafeIframe in the 'prepare' step: we have to
  19048. // defer till the actual filtering.
  19049. /**
  19050. * @param HTMLPurifier_Config $config
  19051. * @return bool
  19052. */
  19053. public function prepare($config)
  19054. {
  19055. $this->regexp = $config->get('URI.SafeIframeRegexp');
  19056. return true;
  19057. }
  19058. /**
  19059. * @param HTMLPurifier_URI $uri
  19060. * @param HTMLPurifier_Config $config
  19061. * @param HTMLPurifier_Context $context
  19062. * @return bool
  19063. */
  19064. public function filter(&$uri, $config, $context)
  19065. {
  19066. // check if filter not applicable
  19067. if (!$config->get('HTML.SafeIframe')) {
  19068. return true;
  19069. }
  19070. // check if the filter should actually trigger
  19071. if (!$context->get('EmbeddedURI', true)) {
  19072. return true;
  19073. }
  19074. $token = $context->get('CurrentToken', true);
  19075. if (!($token && $token->name == 'iframe')) {
  19076. return true;
  19077. }
  19078. // check if we actually have some whitelists enabled
  19079. if ($this->regexp === null) {
  19080. return false;
  19081. }
  19082. // actually check the whitelists
  19083. return preg_match($this->regexp, $uri->toString());
  19084. }
  19085. }
  19086. /**
  19087. * Implements data: URI for base64 encoded images supported by GD.
  19088. */
  19089. class HTMLPurifier_URIScheme_data extends HTMLPurifier_URIScheme
  19090. {
  19091. /**
  19092. * @type bool
  19093. */
  19094. public $browsable = true;
  19095. /**
  19096. * @type array
  19097. */
  19098. public $allowed_types = array(
  19099. // you better write validation code for other types if you
  19100. // decide to allow them
  19101. 'image/jpeg' => true,
  19102. 'image/gif' => true,
  19103. 'image/png' => true,
  19104. );
  19105. // this is actually irrelevant since we only write out the path
  19106. // component
  19107. /**
  19108. * @type bool
  19109. */
  19110. public $may_omit_host = true;
  19111. /**
  19112. * @param HTMLPurifier_URI $uri
  19113. * @param HTMLPurifier_Config $config
  19114. * @param HTMLPurifier_Context $context
  19115. * @return bool
  19116. */
  19117. public function doValidate(&$uri, $config, $context)
  19118. {
  19119. $result = explode(',', $uri->path, 2);
  19120. $is_base64 = false;
  19121. $charset = null;
  19122. $content_type = null;
  19123. if (count($result) == 2) {
  19124. list($metadata, $data) = $result;
  19125. // do some legwork on the metadata
  19126. $metas = explode(';', $metadata);
  19127. while (!empty($metas)) {
  19128. $cur = array_shift($metas);
  19129. if ($cur == 'base64') {
  19130. $is_base64 = true;
  19131. break;
  19132. }
  19133. if (substr($cur, 0, 8) == 'charset=') {
  19134. // doesn't match if there are arbitrary spaces, but
  19135. // whatever dude
  19136. if ($charset !== null) {
  19137. continue;
  19138. } // garbage
  19139. $charset = substr($cur, 8); // not used
  19140. } else {
  19141. if ($content_type !== null) {
  19142. continue;
  19143. } // garbage
  19144. $content_type = $cur;
  19145. }
  19146. }
  19147. } else {
  19148. $data = $result[0];
  19149. }
  19150. if ($content_type !== null && empty($this->allowed_types[$content_type])) {
  19151. return false;
  19152. }
  19153. if ($charset !== null) {
  19154. // error; we don't allow plaintext stuff
  19155. $charset = null;
  19156. }
  19157. $data = rawurldecode($data);
  19158. if ($is_base64) {
  19159. $raw_data = base64_decode($data);
  19160. } else {
  19161. $raw_data = $data;
  19162. }
  19163. if ( strlen($raw_data) < 12 ) {
  19164. // error; exif_imagetype throws exception with small files,
  19165. // and this likely indicates a corrupt URI/failed parse anyway
  19166. return false;
  19167. }
  19168. // XXX probably want to refactor this into a general mechanism
  19169. // for filtering arbitrary content types
  19170. if (function_exists('sys_get_temp_dir')) {
  19171. $file = tempnam(sys_get_temp_dir(), "");
  19172. } else {
  19173. $file = tempnam("/tmp", "");
  19174. }
  19175. file_put_contents($file, $raw_data);
  19176. if (function_exists('exif_imagetype')) {
  19177. $image_code = exif_imagetype($file);
  19178. unlink($file);
  19179. } elseif (function_exists('getimagesize')) {
  19180. set_error_handler(array($this, 'muteErrorHandler'));
  19181. $info = getimagesize($file);
  19182. restore_error_handler();
  19183. unlink($file);
  19184. if ($info == false) {
  19185. return false;
  19186. }
  19187. $image_code = $info[2];
  19188. } else {
  19189. trigger_error("could not find exif_imagetype or getimagesize functions", E_USER_ERROR);
  19190. }
  19191. $real_content_type = image_type_to_mime_type($image_code);
  19192. if ($real_content_type != $content_type) {
  19193. // we're nice guys; if the content type is something else we
  19194. // support, change it over
  19195. if (empty($this->allowed_types[$real_content_type])) {
  19196. return false;
  19197. }
  19198. $content_type = $real_content_type;
  19199. }
  19200. // ok, it's kosher, rewrite what we need
  19201. $uri->userinfo = null;
  19202. $uri->host = null;
  19203. $uri->port = null;
  19204. $uri->fragment = null;
  19205. $uri->query = null;
  19206. $uri->path = "$content_type;base64," . base64_encode($raw_data);
  19207. return true;
  19208. }
  19209. /**
  19210. * @param int $errno
  19211. * @param string $errstr
  19212. */
  19213. public function muteErrorHandler($errno, $errstr)
  19214. {
  19215. }
  19216. }
  19217. /**
  19218. * Validates file as defined by RFC 1630 and RFC 1738.
  19219. */
  19220. class HTMLPurifier_URIScheme_file extends HTMLPurifier_URIScheme
  19221. {
  19222. /**
  19223. * Generally file:// URLs are not accessible from most
  19224. * machines, so placing them as an img src is incorrect.
  19225. * @type bool
  19226. */
  19227. public $browsable = false;
  19228. /**
  19229. * Basically the *only* URI scheme for which this is true, since
  19230. * accessing files on the local machine is very common. In fact,
  19231. * browsers on some operating systems don't understand the
  19232. * authority, though I hear it is used on Windows to refer to
  19233. * network shares.
  19234. * @type bool
  19235. */
  19236. public $may_omit_host = true;
  19237. /**
  19238. * @param HTMLPurifier_URI $uri
  19239. * @param HTMLPurifier_Config $config
  19240. * @param HTMLPurifier_Context $context
  19241. * @return bool
  19242. */
  19243. public function doValidate(&$uri, $config, $context)
  19244. {
  19245. // Authentication method is not supported
  19246. $uri->userinfo = null;
  19247. // file:// makes no provisions for accessing the resource
  19248. $uri->port = null;
  19249. // While it seems to work on Firefox, the querystring has
  19250. // no possible effect and is thus stripped.
  19251. $uri->query = null;
  19252. return true;
  19253. }
  19254. }
  19255. /**
  19256. * Validates ftp (File Transfer Protocol) URIs as defined by generic RFC 1738.
  19257. */
  19258. class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme
  19259. {
  19260. /**
  19261. * @type int
  19262. */
  19263. public $default_port = 21;
  19264. /**
  19265. * @type bool
  19266. */
  19267. public $browsable = true; // usually
  19268. /**
  19269. * @type bool
  19270. */
  19271. public $hierarchical = true;
  19272. /**
  19273. * @param HTMLPurifier_URI $uri
  19274. * @param HTMLPurifier_Config $config
  19275. * @param HTMLPurifier_Context $context
  19276. * @return bool
  19277. */
  19278. public function doValidate(&$uri, $config, $context)
  19279. {
  19280. $uri->query = null;
  19281. // typecode check
  19282. $semicolon_pos = strrpos($uri->path, ';'); // reverse
  19283. if ($semicolon_pos !== false) {
  19284. $type = substr($uri->path, $semicolon_pos + 1); // no semicolon
  19285. $uri->path = substr($uri->path, 0, $semicolon_pos);
  19286. $type_ret = '';
  19287. if (strpos($type, '=') !== false) {
  19288. // figure out whether or not the declaration is correct
  19289. list($key, $typecode) = explode('=', $type, 2);
  19290. if ($key !== 'type') {
  19291. // invalid key, tack it back on encoded
  19292. $uri->path .= '%3B' . $type;
  19293. } elseif ($typecode === 'a' || $typecode === 'i' || $typecode === 'd') {
  19294. $type_ret = ";type=$typecode";
  19295. }
  19296. } else {
  19297. $uri->path .= '%3B' . $type;
  19298. }
  19299. $uri->path = str_replace(';', '%3B', $uri->path);
  19300. $uri->path .= $type_ret;
  19301. }
  19302. return true;
  19303. }
  19304. }
  19305. /**
  19306. * Validates http (HyperText Transfer Protocol) as defined by RFC 2616
  19307. */
  19308. class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme
  19309. {
  19310. /**
  19311. * @type int
  19312. */
  19313. public $default_port = 80;
  19314. /**
  19315. * @type bool
  19316. */
  19317. public $browsable = true;
  19318. /**
  19319. * @type bool
  19320. */
  19321. public $hierarchical = true;
  19322. /**
  19323. * @param HTMLPurifier_URI $uri
  19324. * @param HTMLPurifier_Config $config
  19325. * @param HTMLPurifier_Context $context
  19326. * @return bool
  19327. */
  19328. public function doValidate(&$uri, $config, $context)
  19329. {
  19330. $uri->userinfo = null;
  19331. return true;
  19332. }
  19333. }
  19334. /**
  19335. * Validates https (Secure HTTP) according to http scheme.
  19336. */
  19337. class HTMLPurifier_URIScheme_https extends HTMLPurifier_URIScheme_http
  19338. {
  19339. /**
  19340. * @type int
  19341. */
  19342. public $default_port = 443;
  19343. /**
  19344. * @type bool
  19345. */
  19346. public $secure = true;
  19347. }
  19348. // VERY RELAXED! Shouldn't cause problems, not even Firefox checks if the
  19349. // email is valid, but be careful!
  19350. /**
  19351. * Validates mailto (for E-mail) according to RFC 2368
  19352. * @todo Validate the email address
  19353. * @todo Filter allowed query parameters
  19354. */
  19355. class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme
  19356. {
  19357. /**
  19358. * @type bool
  19359. */
  19360. public $browsable = false;
  19361. /**
  19362. * @type bool
  19363. */
  19364. public $may_omit_host = true;
  19365. /**
  19366. * @param HTMLPurifier_URI $uri
  19367. * @param HTMLPurifier_Config $config
  19368. * @param HTMLPurifier_Context $context
  19369. * @return bool
  19370. */
  19371. public function doValidate(&$uri, $config, $context)
  19372. {
  19373. $uri->userinfo = null;
  19374. $uri->host = null;
  19375. $uri->port = null;
  19376. // we need to validate path against RFC 2368's addr-spec
  19377. return true;
  19378. }
  19379. }
  19380. /**
  19381. * Validates news (Usenet) as defined by generic RFC 1738
  19382. */
  19383. class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme
  19384. {
  19385. /**
  19386. * @type bool
  19387. */
  19388. public $browsable = false;
  19389. /**
  19390. * @type bool
  19391. */
  19392. public $may_omit_host = true;
  19393. /**
  19394. * @param HTMLPurifier_URI $uri
  19395. * @param HTMLPurifier_Config $config
  19396. * @param HTMLPurifier_Context $context
  19397. * @return bool
  19398. */
  19399. public function doValidate(&$uri, $config, $context)
  19400. {
  19401. $uri->userinfo = null;
  19402. $uri->host = null;
  19403. $uri->port = null;
  19404. $uri->query = null;
  19405. // typecode check needed on path
  19406. return true;
  19407. }
  19408. }
  19409. /**
  19410. * Validates nntp (Network News Transfer Protocol) as defined by generic RFC 1738
  19411. */
  19412. class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme
  19413. {
  19414. /**
  19415. * @type int
  19416. */
  19417. public $default_port = 119;
  19418. /**
  19419. * @type bool
  19420. */
  19421. public $browsable = false;
  19422. /**
  19423. * @param HTMLPurifier_URI $uri
  19424. * @param HTMLPurifier_Config $config
  19425. * @param HTMLPurifier_Context $context
  19426. * @return bool
  19427. */
  19428. public function doValidate(&$uri, $config, $context)
  19429. {
  19430. $uri->userinfo = null;
  19431. $uri->query = null;
  19432. return true;
  19433. }
  19434. }
  19435. /**
  19436. * Validates tel (for phone numbers).
  19437. *
  19438. * The relevant specifications for this protocol are RFC 3966 and RFC 5341,
  19439. * but this class takes a much simpler approach: we normalize phone
  19440. * numbers so that they only include (possibly) a leading plus,
  19441. * and then any number of digits and x'es.
  19442. */
  19443. class HTMLPurifier_URIScheme_tel extends HTMLPurifier_URIScheme
  19444. {
  19445. /**
  19446. * @type bool
  19447. */
  19448. public $browsable = false;
  19449. /**
  19450. * @type bool
  19451. */
  19452. public $may_omit_host = true;
  19453. /**
  19454. * @param HTMLPurifier_URI $uri
  19455. * @param HTMLPurifier_Config $config
  19456. * @param HTMLPurifier_Context $context
  19457. * @return bool
  19458. */
  19459. public function doValidate(&$uri, $config, $context)
  19460. {
  19461. $uri->userinfo = null;
  19462. $uri->host = null;
  19463. $uri->port = null;
  19464. // Delete all non-numeric characters, non-x characters
  19465. // from phone number, EXCEPT for a leading plus sign.
  19466. $uri->path = preg_replace('/(?!^\+)[^\dx]/', '',
  19467. // Normalize e(x)tension to lower-case
  19468. str_replace('X', 'x', $uri->path));
  19469. return true;
  19470. }
  19471. }
  19472. /**
  19473. * Performs safe variable parsing based on types which can be used by
  19474. * users. This may not be able to represent all possible data inputs,
  19475. * however.
  19476. */
  19477. class HTMLPurifier_VarParser_Flexible extends HTMLPurifier_VarParser
  19478. {
  19479. /**
  19480. * @param mixed $var
  19481. * @param int $type
  19482. * @param bool $allow_null
  19483. * @return array|bool|float|int|mixed|null|string
  19484. * @throws HTMLPurifier_VarParserException
  19485. */
  19486. protected function parseImplementation($var, $type, $allow_null)
  19487. {
  19488. if ($allow_null && $var === null) {
  19489. return null;
  19490. }
  19491. switch ($type) {
  19492. // Note: if code "breaks" from the switch, it triggers a generic
  19493. // exception to be thrown. Specific errors can be specifically
  19494. // done here.
  19495. case self::MIXED:
  19496. case self::ISTRING:
  19497. case self::STRING:
  19498. case self::TEXT:
  19499. case self::ITEXT:
  19500. return $var;
  19501. case self::INT:
  19502. if (is_string($var) && ctype_digit($var)) {
  19503. $var = (int)$var;
  19504. }
  19505. return $var;
  19506. case self::FLOAT:
  19507. if ((is_string($var) && is_numeric($var)) || is_int($var)) {
  19508. $var = (float)$var;
  19509. }
  19510. return $var;
  19511. case self::BOOL:
  19512. if (is_int($var) && ($var === 0 || $var === 1)) {
  19513. $var = (bool)$var;
  19514. } elseif (is_string($var)) {
  19515. if ($var == 'on' || $var == 'true' || $var == '1') {
  19516. $var = true;
  19517. } elseif ($var == 'off' || $var == 'false' || $var == '0') {
  19518. $var = false;
  19519. } else {
  19520. throw new HTMLPurifier_VarParserException("Unrecognized value '$var' for $type");
  19521. }
  19522. }
  19523. return $var;
  19524. case self::ALIST:
  19525. case self::HASH:
  19526. case self::LOOKUP:
  19527. if (is_string($var)) {
  19528. // special case: technically, this is an array with
  19529. // a single empty string item, but having an empty
  19530. // array is more intuitive
  19531. if ($var == '') {
  19532. return array();
  19533. }
  19534. if (strpos($var, "\n") === false && strpos($var, "\r") === false) {
  19535. // simplistic string to array method that only works
  19536. // for simple lists of tag names or alphanumeric characters
  19537. $var = explode(',', $var);
  19538. } else {
  19539. $var = preg_split('/(,|[\n\r]+)/', $var);
  19540. }
  19541. // remove spaces
  19542. foreach ($var as $i => $j) {
  19543. $var[$i] = trim($j);
  19544. }
  19545. if ($type === self::HASH) {
  19546. // key:value,key2:value2
  19547. $nvar = array();
  19548. foreach ($var as $keypair) {
  19549. $c = explode(':', $keypair, 2);
  19550. if (!isset($c[1])) {
  19551. continue;
  19552. }
  19553. $nvar[trim($c[0])] = trim($c[1]);
  19554. }
  19555. $var = $nvar;
  19556. }
  19557. }
  19558. if (!is_array($var)) {
  19559. break;
  19560. }
  19561. $keys = array_keys($var);
  19562. if ($keys === array_keys($keys)) {
  19563. if ($type == self::ALIST) {
  19564. return $var;
  19565. } elseif ($type == self::LOOKUP) {
  19566. $new = array();
  19567. foreach ($var as $key) {
  19568. $new[$key] = true;
  19569. }
  19570. return $new;
  19571. } else {
  19572. break;
  19573. }
  19574. }
  19575. if ($type === self::ALIST) {
  19576. trigger_error("Array list did not have consecutive integer indexes", E_USER_WARNING);
  19577. return array_values($var);
  19578. }
  19579. if ($type === self::LOOKUP) {
  19580. foreach ($var as $key => $value) {
  19581. if ($value !== true) {
  19582. trigger_error(
  19583. "Lookup array has non-true value at key '$key'; " .
  19584. "maybe your input array was not indexed numerically",
  19585. E_USER_WARNING
  19586. );
  19587. }
  19588. $var[$key] = true;
  19589. }
  19590. }
  19591. return $var;
  19592. default:
  19593. $this->errorInconsistent(__CLASS__, $type);
  19594. }
  19595. $this->errorGeneric($var, $type);
  19596. }
  19597. }
  19598. /**
  19599. * This variable parser uses PHP's internal code engine. Because it does
  19600. * this, it can represent all inputs; however, it is dangerous and cannot
  19601. * be used by users.
  19602. */
  19603. class HTMLPurifier_VarParser_Native extends HTMLPurifier_VarParser
  19604. {
  19605. /**
  19606. * @param mixed $var
  19607. * @param int $type
  19608. * @param bool $allow_null
  19609. * @return null|string
  19610. */
  19611. protected function parseImplementation($var, $type, $allow_null)
  19612. {
  19613. return $this->evalExpression($var);
  19614. }
  19615. /**
  19616. * @param string $expr
  19617. * @return mixed
  19618. * @throws HTMLPurifier_VarParserException
  19619. */
  19620. protected function evalExpression($expr)
  19621. {
  19622. $var = null;
  19623. $result = eval("\$var = $expr;");
  19624. if ($result === false) {
  19625. throw new HTMLPurifier_VarParserException("Fatal error in evaluated code");
  19626. }
  19627. return $var;
  19628. }
  19629. }