$escaped_new_value = in_array( $comparable_name, wp_kses_uri_attributes(), true ) ? esc_url( $value ) : esc_attr( $value ); // If the escaping functions wiped out the update, reject it and indicate it was rejected. if ( '' === $escaped_new_value && '' !== $value ) { return false; } $updated_attribute = "{$name}=\"{$escaped_new_value}\""; } /* * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive */ $comparable_name = strtolower( $name ); if ( isset( $this->attributes[ $comparable_name ] ) ) { /* * Update an existing attribute. * * Example – set attribute id to "new" in
: * * * ^-------------^ * start end * replacement: `id="new"` * * Result: */ $existing_attribute = $this->attributes[ $comparable_name ]; $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( $existing_attribute->start, $existing_attribute->length, $updated_attribute ); } else { /* * Create a new attribute at the tag's name end. * * Example – add attribute id="new" to : * * * ^ * start and end * replacement: ` id="new"` * * Result: */ $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( $this->tag_name_starts_at + $this->tag_name_length, 0, ' ' . $updated_attribute ); } /* * Any calls to update the `class` attribute directly should wipe out any * enqueued class changes from `add_class` and `remove_class`. */ if ( 'class' === $comparable_name && ! empty( $this->classname_updates ) ) { $this->classname_updates = array(); } return true; } /** * Remove an attribute from the currently-matched tag. * * @since 6.2.0 * * @param string $name The attribute name to remove. * @return bool Whether an attribute was removed. */ public function remove_attribute( $name ): bool { if ( self::STATE_MATCHED_TAG !== $this->parser_state || $this->is_closing_tag ) { return false; } /* * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive */ $name = strtolower( $name ); /* * Any calls to update the `class` attribute directly should wipe out any * enqueued class changes from `add_class` and `remove_class`. */ if ( 'class' === $name && count( $this->classname_updates ) !== 0 ) { $this->classname_updates = array(); } /* * If updating an attribute that didn't exist in the input * document, then remove the enqueued update and move on. * * For example, this might occur when calling `remove_attribute()` * after calling `set_attribute()` for the same attribute * and when that attribute wasn't originally present. */ if ( ! isset( $this->attributes[ $name ] ) ) { if ( isset( $this->lexical_updates[ $name ] ) ) { unset( $this->lexical_updates[ $name ] ); } return false; } /* * Removes an existing tag attribute. * * Example – remove the attribute id from : * * ^-------------^ * start end * replacement: `` * * Result: */ $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( $this->attributes[ $name ]->start, $this->attributes[ $name ]->length, '' ); // Removes any duplicated attributes if they were also present. foreach ( $this->duplicate_attributes[ $name ] ?? array() as $attribute_token ) { $this->lexical_updates[] = new WP_HTML_Text_Replacement( $attribute_token->start, $attribute_token->length, '' ); } return true; } /** * Adds a new class name to the currently matched tag. * * @since 6.2.0 * * @param string $class_name The class name to add. * @return bool Whether the class was set to be added. */ public function add_class( $class_name ): bool { if ( self::STATE_MATCHED_TAG !== $this->parser_state || $this->is_closing_tag ) { return false; } if ( self::QUIRKS_MODE !== $this->compat_mode ) { $this->classname_updates[ $class_name ] = self::ADD_CLASS; return true; } /* * Because class names are matched ASCII-case-insensitively in quirks mode, * this needs to see if a case variant of the given class name is already * enqueued and update that existing entry, if so. This picks the casing of * the first-provided class name for all lexical variations. */ $class_name_length = strlen( $class_name ); foreach ( $this->classname_updates as $updated_name => $action ) { if ( strlen( $updated_name ) === $class_name_length && 0 === substr_compare( $updated_name, $class_name, 0, $class_name_length, true ) ) { $this->classname_updates[ $updated_name ] = self::ADD_CLASS; return true; } } $this->classname_updates[ $class_name ] = self::ADD_CLASS; return true; } /** * Removes a class name from the currently matched tag. * * @since 6.2.0 * * @param string $class_name The class name to remove. * @return bool Whether the class was set to be removed. */ public function remove_class( $class_name ): bool { if ( self::STATE_MATCHED_TAG !== $this->parser_state || $this->is_closing_tag ) { return false; } if ( self::QUIRKS_MODE !== $this->compat_mode ) { $this->classname_updates[ $class_name ] = self::REMOVE_CLASS; return true; } /* * Because class names are matched ASCII-case-insensitively in quirks mode, * this needs to see if a case variant of the given class name is already * enqueued and update that existing entry, if so. This picks the casing of * the first-provided class name for all lexical variations. */ $class_name_length = strlen( $class_name ); foreach ( $this->classname_updates as $updated_name => $action ) { if ( strlen( $updated_name ) === $class_name_length && 0 === substr_compare( $updated_name, $class_name, 0, $class_name_length, true ) ) { $this->classname_updates[ $updated_name ] = self::REMOVE_CLASS; return true; } } $this->classname_updates[ $class_name ] = self::REMOVE_CLASS; return true; } /** * Returns the string representation of the HTML Tag Processor. * * @since 6.2.0 * * @see WP_HTML_Tag_Processor::get_updated_html() * * @return string The processed HTML. */ public function __toString(): string { return $this->get_updated_html(); } /** * Returns the string representation of the HTML Tag Processor. * * @since 6.2.0 * @since 6.2.1 Shifts the internal cursor corresponding to the applied updates. * @since 6.4.0 No longer calls subclass method `next_tag()` after updating HTML. * * @return string The processed HTML. */ public function get_updated_html(): string { $requires_no_updating = 0 === count( $this->classname_updates ) && 0 === count( $this->lexical_updates ); /* * When there is nothing more to update and nothing has already been * updated, return the original document and avoid a string copy. */ if ( $requires_no_updating ) { return $this->html; } /* * Keep track of the position right before the current tag. This will * be necessary for reparsing the current tag after updating the HTML. */ $before_current_tag = $this->token_starts_at ?? 0; /* * 1. Apply the enqueued edits and update all the pointers to reflect those changes. */ $this->class_name_updates_to_attributes_updates(); $before_current_tag += $this->apply_attributes_updates( $before_current_tag ); /* * 2. Rewind to before the current tag and reparse to get updated attributes. * * At this point the internal cursor points to the end of the tag name. * Rewind before the tag name starts so that it's as if the cursor didn't * move; a call to `next_tag()` will reparse the recently-updated attributes * and additional calls to modify the attributes will apply at this same * location, but in order to avoid issues with subclasses that might add * behaviors to `next_tag()`, the internal methods should be called here * instead. * * It's important to note that in this specific place there will be no change * because the processor was already at a tag when this was called and it's * rewinding only to the beginning of this very tag before reprocessing it * and its attributes. * *Previous HTMLMore HTML
* ↑ │ back up by the length of the tag name plus the opening < * └←─┘ back up by strlen("em") + 1 ==> 3 */ $this->bytes_already_parsed = $before_current_tag; $this->base_class_next_token(); return $this->html; } /** * Parses tag query input into internal search criteria. * * @since 6.2.0 * * @param array|string|null $query { * Optional. Which tag name to find, having which class, etc. Default is to find any tag. * * @type string|null $tag_name Which tag to find, or `null` for "any tag." * @type int|null $match_offset Find the Nth tag matching all search criteria. * 1 for "first" tag, 3 for "third," etc. * Defaults to first tag. * @type string|null $class_name Tag must contain this class name to match. * @type string $tag_closers "visit" or "skip": whether to stop on tag closers, e.g. . * } */ private function parse_query( $query ) { if ( null !== $query && $query === $this->last_query ) { return; } $this->last_query = $query; $this->sought_tag_name = null; $this->sought_class_name = null; $this->sought_match_offset = 1; $this->stop_on_tag_closers = false; // A single string value means "find the tag of this name". if ( is_string( $query ) ) { $this->sought_tag_name = $query; return; } // An empty query parameter applies no restrictions on the search. if ( null === $query ) { return; } // If not using the string interface, an associative array is required. if ( ! is_array( $query ) ) { _doing_it_wrong( __METHOD__, __( 'The query argument must be an array or a tag name.' ), '6.2.0' ); return; } if ( isset( $query['tag_name'] ) && is_string( $query['tag_name'] ) ) { $this->sought_tag_name = $query['tag_name']; } if ( isset( $query['class_name'] ) && is_string( $query['class_name'] ) ) { $this->sought_class_name = $query['class_name']; } if ( isset( $query['match_offset'] ) && is_int( $query['match_offset'] ) && 0 < $query['match_offset'] ) { $this->sought_match_offset = $query['match_offset']; } if ( isset( $query['tag_closers'] ) ) { $this->stop_on_tag_closers = 'visit' === $query['tag_closers']; } } /** * Checks whether a given tag and its attributes match the search criteria. * * @since 6.2.0 * * @return bool Whether the given tag and its attribute match the search criteria. */ private function matches(): bool { if ( $this->is_closing_tag && ! $this->stop_on_tag_closers ) { return false; } // Does the tag name match the requested tag name in a case-insensitive manner? if ( isset( $this->sought_tag_name ) && ( strlen( $this->sought_tag_name ) !== $this->tag_name_length || 0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true ) ) ) { return false; } if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) { return false; } return true; } /** * Gets DOCTYPE declaration info from a DOCTYPE token. * * DOCTYPE tokens may appear in many places in an HTML document. In most places, they are * simply ignored. The main parsing functions find the basic shape of DOCTYPE tokens but * do not perform detailed parsing. * * This method can be called to perform a full parse of the DOCTYPE token and retrieve * its information. * * @return WP_HTML_Doctype_Info|null The DOCTYPE declaration information or `null` if not * currently at a DOCTYPE node. */ public function get_doctype_info(): ?WP_HTML_Doctype_Info { if ( self::STATE_DOCTYPE !== $this->parser_state ) { return null; } return WP_HTML_Doctype_Info::from_doctype_token( substr( $this->html, $this->token_starts_at, $this->token_length ) ); } /** * Parser Ready State. * * Indicates that the parser is ready to run and waiting for a state transition. * It may not have started yet, or it may have just finished parsing a token and * is ready to find the next one. * * @since 6.5.0 * * @access private */ const STATE_READY = 'STATE_READY'; /** * Parser Complete State. * * Indicates that the parser has reached the end of the document and there is * nothing left to scan. It finished parsing the last token completely. * * @since 6.5.0 * * @access private */ const STATE_COMPLETE = 'STATE_COMPLETE'; /** * Parser Incomplete Input State. * * Indicates that the parser has reached the end of the document before finishing * a token. It started parsing a token but there is a possibility that the input * HTML document was truncated in the middle of a token. * * The parser is reset at the start of the incomplete token and has paused. There * is nothing more than can be scanned unless provided a more complete document. * * @since 6.5.0 * * @access private */ const STATE_INCOMPLETE_INPUT = 'STATE_INCOMPLETE_INPUT'; /** * Parser Matched Tag State. * * Indicates that the parser has found an HTML tag and it's possible to get * the tag name and read or modify its attributes (if it's not a closing tag). * * @since 6.5.0 * * @access private */ const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG'; /** * Parser Text Node State. * * Indicates that the parser has found a text node and it's possible * to read and modify that text. * * @since 6.5.0 * * @access private */ const STATE_TEXT_NODE = 'STATE_TEXT_NODE'; /** * Parser CDATA Node State. * * Indicates that the parser has found a CDATA node and it's possible * to read and modify its modifiable text. Note that in HTML there are * no CDATA nodes outside of foreign content (SVG and MathML). Outside * of foreign content, they are treated as HTML comments. * * @since 6.5.0 * * @access private */ const STATE_CDATA_NODE = 'STATE_CDATA_NODE'; /** * Indicates that the parser has found an HTML comment and it's * possible to read and modify its modifiable text. * * @since 6.5.0 * * @access private */ const STATE_COMMENT = 'STATE_COMMENT'; /** * Indicates that the parser has found a DOCTYPE node and it's * possible to read its DOCTYPE information via `get_doctype_info()`. * * @since 6.5.0 * * @access private */ const STATE_DOCTYPE = 'STATE_DOCTYPE'; /** * Indicates that the parser has found an empty tag closer `>`. * * Note that in HTML there are no empty tag closers, and they * are ignored. Nonetheless, the Tag Processor still * recognizes them as they appear in the HTML stream. * * These were historically discussed as a "presumptuous tag * closer," which would close the nearest open tag, but were * dismissed in favor of explicitly-closing tags. * * @since 6.5.0 * * @access private */ const STATE_PRESUMPTUOUS_TAG = 'STATE_PRESUMPTUOUS_TAG'; /** * Indicates that the parser has found a "funky comment" * and it's possible to read and modify its modifiable text. * * Example: * * %url> * {"wp-bit":"query/post-author"}> * 2> * * Funky comments are tag closers with invalid tag names. Note * that in HTML these are turn into bogus comments. Nonetheless, * the Tag Processor recognizes them in a stream of HTML and * exposes them for inspection and modification. * * @since 6.5.0 * * @access private */ const STATE_FUNKY_COMMENT = 'STATE_WP_FUNKY'; /** * Indicates that a comment was created when encountering abruptly-closed HTML comment. * * Example: * * * * * @since 6.5.0 */ const COMMENT_AS_ABRUPTLY_CLOSED_COMMENT = 'COMMENT_AS_ABRUPTLY_CLOSED_COMMENT'; /** * Indicates that a comment would be parsed as a CDATA node, * were HTML to allow CDATA nodes outside of foreign content. * * Example: * * * * This is an HTML comment, but it looks like a CDATA node. * * @since 6.5.0 */ const COMMENT_AS_CDATA_LOOKALIKE = 'COMMENT_AS_CDATA_LOOKALIKE'; /** * Indicates that a comment was created when encountering * normative HTML comment syntax. * * Example: * * * * @since 6.5.0 */ const COMMENT_AS_HTML_COMMENT = 'COMMENT_AS_HTML_COMMENT'; /** * Indicates that a comment would be parsed as a Processing * Instruction node, were they to exist within HTML. * * Example: * * * * This is an HTML comment, but it looks like a CDATA node. * * @since 6.5.0 */ const COMMENT_AS_PI_NODE_LOOKALIKE = 'COMMENT_AS_PI_NODE_LOOKALIKE'; /** * Indicates that a comment was created when encountering invalid * HTML input, a so-called "bogus comment." * * Example: * * * * * @since 6.5.0 */ const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML'; /** * No-quirks mode document compatability mode. * * > In no-quirks mode, the behavior is (hopefully) the desired behavior * > described by the modern HTML and CSS specifications. * * @see self::$compat_mode * @see https://developer.mozilla.org/en-US/docs/Web/HTML/Quirks_Mode_and_Standards_Mode * * @since 6.7.0 * * @var string */ const NO_QUIRKS_MODE = 'no-quirks-mode'; /** * Quirks mode document compatability mode. * * > In quirks mode, layout emulates behavior in Navigator 4 and Internet * > Explorer 5. This is essential in order to support websites that were * > built before the widespread adoption of web standards. * * @see self::$compat_mode * @see https://developer.mozilla.org/en-US/docs/Web/HTML/Quirks_Mode_and_Standards_Mode * * @since 6.7.0 * * @var string */ const QUIRKS_MODE = 'quirks-mode'; /** * Indicates that a span of text may contain any combination of significant * kinds of characters: NULL bytes, whitespace, and others. * * @see self::$text_node_classification * @see self::subdivide_text_appropriately * * @since 6.7.0 */ const TEXT_IS_GENERIC = 'TEXT_IS_GENERIC'; /** * Indicates that a span of text comprises a sequence only of NULL bytes. * * @see self::$text_node_classification * @see self::subdivide_text_appropriately * * @since 6.7.0 */ const TEXT_IS_NULL_SEQUENCE = 'TEXT_IS_NULL_SEQUENCE'; /** * Indicates that a span of decoded text comprises only whitespace. * * @see self::$text_node_classification * @see self::subdivide_text_appropriately * * @since 6.7.0 */ const TEXT_IS_WHITESPACE = 'TEXT_IS_WHITESPACE'; }