File: //proc/self/cwd/wp-content/plugins/autodescription/inc/classes/helper/format/html.class.php
<?php
/**
* @package The_SEO_Framework\Classes\Helper\Format\HTML
* @subpackage The_SEO_Framework\Formatting
*/
namespace The_SEO_Framework\Helper\Format;
\defined( 'THE_SEO_FRAMEWORK_PRESENT' ) or die;
use function \The_SEO_Framework\umemo;
use \The_SEO_Framework\{
Data,
Data\Filter\Sanitize,
};
/**
* The SEO Framework plugin
* Copyright (C) 2023 - 2024 Sybre Waaijer, CyberWire B.V. (https://cyberwire.nl/)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 3 as published
* by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* Holds methods for stripping tags and embed-prone URLs from HTML, and for extracting plain content from it.
*
* @since 5.0.0
*
* @access protected
* Use tsf()->format()->html() instead.
*/
class HTML {

	/**
	 * Strips all URLs that are placed on new lines. These are prone to be embeds.
	 *
	 * This might leave stray line feeds. Use `tsf()->sanitize()->newline_to_space()` to fix that.
	 *
	 * @since 3.1.0
	 * @since 5.0.0 Moved from `\The_SEO_Framework\Load`.
	 * @see \WP_Embed::autoembed()
	 *
	 * @param string $content The content to look for embed.
	 * @return string $content Content without single-lined URLs. Empty string when the regex engine fails.
	 */
	public static function strip_newline_urls( $content ) {
		// preg_replace() yields null on PCRE failure; coalesce so a string is returned, as strip_tags_cs() does.
		return preg_replace( '/^(?!\r|\n)\s*?(https?:\/\/[^\s<>"]+)(\s*)$/mi', '', $content ) ?? '';
	}

	/**
	 * Strips all URLs that are placed in paragraphs on their own. These are prone to be embeds.
	 *
	 * This might leave stray line feeds. Use `tsf()->sanitize()->newline_to_space()` to fix that.
	 *
	 * @since 3.1.0
	 * @since 5.0.0 1. Moved from `\The_SEO_Framework\Load`.
	 *              2. Improved regex to reflect absurd HTML.
	 * @see \WP_Embed::autoembed()
	 * @link <https://regex101.com/r/hjHjgp/2>
	 *
	 * @param string $content The content to look for embed.
	 * @return string $content Content without the paragraphs containing solely URLs.
	 *                         Empty string when the regex engine fails.
	 */
	public static function strip_paragraph_urls( $content ) {
		// preg_replace() yields null on PCRE failure; coalesce so a string is returned, as strip_tags_cs() does.
		return preg_replace( '/<p\b[^>]*>\s*https?:\/\/[^\s<>"]+\s*<\/p\s*>/i', '', $content ) ?? '';
	}

	/**
	 * Strips tags with HTML Context-Sensitivity and outputs its breakdown.
	 *
	 * It essentially strips all tags, and replaces block-type tags' endings with spaces.
	 * When done, it performs a sanity-cleanup via `strip_tags()`.
	 *
	 * Tip: You might want to use method `s_dupe_space()` to clear up the duplicated/repeated spaces afterward.
	 *
	 * @since 3.2.4
	 * @since 4.0.0 Now allows emptying the indexes `space` and `clear`.
	 * @since 4.0.5 1. Added the `strip` argument index to the second parameter for clearing leftover tags.
	 *              2. Now also clears `iframe` tags by default.
	 *              3. Now no longer (for example) accidentally takes `link` tags when only `li` tags are set for stripping.
	 *              4. Now performs a separate query for void elements; to prevent regex recursion.
	 * @since 4.1.0 Now detects nested elements and preserves that content correctly--as if we'd pass through scrupulously beyond infinity.
	 * @since 4.1.1 Can now replace void elements with spaces when so inclined via the arguments (space vs clear).
	 * @since 4.2.7 1. Revamped the HTML lookup: it now (more) accurately processes HTML, and is less likely to be fooled by HTML tags
	 *                 in attributes.
	 *              2. The 'space' index no longer has default `fieldset`, `figcaption`, `form`, `main`, `nav`, `pre`, `table`, and `tfoot`.
	 *              3. The space index now has added to default `details`, `hgroup`, and `hr`.
	 *              4. The 'clear' index no longer has default `bdo`, `hr`, `link`, `meta`, `option`, `samp`, `style`, and `var`.
	 *              5. The 'clear' index now has added to default `area`, `audio`, `datalist`, `del`, `dialog`, `fieldset`, `form`, `map`,
	 *                 `menu`, `meter`, `nav`, `object`, `output`, `pre`, `progress`, `s`, `table`, and `template`.
	 *              6. Added the 'passes' index to `$args`. This tells the maximum passes 'space' may process.
	 *                 Read TSF option `auto_description_html_method` to use the user-defined method.
	 *              7. Now replaces all elements passed with spaces. For void elements, or phrasing elements, you'd want to omit
	 *                 those from '$args' so it falls through to `strip_tags()`.
	 *              8. Added preparation memoization using cache delimiters `$args['space']` and `$args['clear']`.
	 * @since 4.2.8 Elements that start with exactly the same text as others won't be preemptively closed.
	 * @since 5.0.0 Moved from `\The_SEO_Framework\Load`.
	 *
	 * @link https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories
	 * @link https://html.spec.whatwg.org/multipage/syntax.html#void-elements
	 *
	 * @param string $input The input text that needs its tags stripped.
	 * @param array  $args  The input arguments. Tags not included are ignored. {
	 *    'space'  : @param ?string[] HTML elements that should be processed for spacing. If the space
	 *                                element is of void element type, it'll be treated as 'clear'.
	 *                                If not set or null, skip check.
	 *                                If empty array, skips stripping; otherwise, use input.
	 *    'clear'  : @param ?string[] HTML elements that should be emptied and replaced with a space.
	 *                                If not set or null, skip check.
	 *                                If empty array, skips stripping; otherwise, use input.
	 *    'strip'  : @param ?bool     If set, strip_tags() is performed before returning the output.
	 *                                Recommended always true, since Regex doesn't understand XML. Default true.
	 *    'passes' : @param ?int      If set, the maximum number of passes 'space' may conduct. More is slower,
	 *                                but more accurate. 'clear' is unaffected. Default 1.
	 * }
	 * NOTE: WARNING The array values are forwarded to a regex without sanitization/quoting.
	 * NOTE: Unlisted, script, and style tags will be stripped via PHP's `strip_tags()`. (togglable via `$args['strip']`)
	 *       This means that their contents are maintained as-is, without added spaces. So, CSS and JS will become text.
	 *       It is why you should always list `style` and `script` in the `clear` array, never in 'space'.
	 * @return string The output string without tags. May have many stray and repeated spaces.
	 *                NOT SECURE for display! Don't trust this method. Always use esc_* functionality.
	 */
	public static function strip_tags_cs( $input, $args = [] ) {

		if ( ! str_contains( $input, '<' ) )
			return $input;

		/**
		 * Find the optimized version in `extract_content()`. The defaults here treat HTML for a11y reading, not description generation.
		 *
		 * Contains HTML5 supported flow content elements only, even though browsers might behave differently.
		 * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#flow_content
		 *
		 * Missing phrasing elements: 'a', 'abbr', 'b', 'bdo', 'bdi', 'cite', 'data', 'dfn', 'em', 'embed', 'i', 'img', 'ins', 'kbd',
		 * 'mark', 'math', 'picture', 'q', 'ruby', 'samp', 'small', 'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', and 'wbr'.
		 * There's no need to add these, for they're cleared plainly by `strip_tags()`.
		 *
		 * Missing flow elements: 'link', 'meta'
		 * There's no need to add these, for they are void content.
		 *
		 * Contains all form elements. Those must be stripped in almost any context.
		 */
		$default_args = [
			'space'  =>
				[ 'address', 'article', 'aside', 'br', 'blockquote', 'details', 'dd', 'div', 'dl', 'dt', 'figure', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'ol', 'p', 'section', 'ul' ],
			'clear'  =>
				[ 'area', 'audio', 'button', 'canvas', 'code', 'datalist', 'del', 'dialog', 'fieldset', 'form', 'iframe', 'input', 'label', 'map', 'menu', 'meter', 'nav', 'noscript', 'object', 'output', 'pre', 'progress', 's', 'script', 'select', 'style', 'svg', 'table', 'template', 'textarea', 'video' ],
			'strip'  => true,
			'passes' => 1,
		];

		if ( ! $args ) {
			$args = $default_args;
		} else {
			// We don't use array_merge() here because we want to default these to [] when $args is given.
			foreach ( [ 'clear', 'space' ] as $type )
				$args[ $type ] = (array) ( $args[ $type ] ?? [] );

			$args['strip']  ??= $default_args['strip'];
			$args['passes'] ??= $default_args['passes'];
		}

		// Fetch the prepared element groups for this exact space/clear combination, if memoized before.
		$parse = umemo( __METHOD__ . '/parse', null, $args['space'], $args['clear'] );

		if ( ! $parse ) {
			// Void elements never have content. 'param', 'source', 'track',
			$void = [ 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'wbr' ];

			// Phrase elements should be replaced without spacing around them. There are more phrasing (54) than block elements (39)...
			// Blocks: address, area, article, aside, audio, blockquote, br, button, canvas, dd, details, dialog, div, dl, dt, fieldset, figure, footer, form, h1, h2, h3, h4, h5, h6, header, hgroup, hr, li, ol, pre, table, td, template, textarea, th, tr, ul, video.
			// Some block elements can be interpreted as phrasing elements, like audio, canvas, button, and video; hence, they're also listed in $phrase.
			// 'br' is a phrase element, but also a struct whitespace -- let's omit it so we can substitute it with a space as block.
			$phrase = [ 'a', 'area', 'abbr', 'audio', 'b', 'bdo', 'bdi', 'button', 'canvas', 'cite', 'code', 'data', 'datalist', 'del', 'dfn', 'em', 'embed', 'i', 'iframe', 'img', 'input', 'ins', 'link', 'kbd', 'label', 'map', 'mark', 'meta', 'math', 'meter', 'noscript', 'object', 'output', 'picture', 'progress', 'q', 'ruby', 's', 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup', 'svg', 'textarea', 'time', 'u', 'var', 'video', 'wbr' ];

			$marked_for_parsing = array_merge( $args['space'], $args['clear'] );

			$void_elements  = array_intersect( $marked_for_parsing, $void );
			$flow_elements  = array_diff( $marked_for_parsing, $void );
			$clear_elements = array_intersect( $flow_elements, $args['clear'] );

			// The order of these groups is the order of processing below: void, clear, then space.
			$parse = umemo(
				__METHOD__ . '/parse',
				[
					// void = element without content.
					'void_query'  => [
						'phrase' => array_intersect( $void_elements, $phrase ),
						'block'  => array_diff( $void_elements, $phrase ),
					],
					// fill = <normal | template | raw text | escapable text | foreign> element.
					'clear_query' => [
						'phrase' => array_intersect( $clear_elements, $phrase ),
						'block'  => array_diff( $clear_elements, $phrase ),
					],
					'space_query' => [
						'phrase' => array_intersect( $flow_elements, $args['space'] ),
					],
				],
				$args['space'],
				$args['clear'],
			);
		}

		foreach ( $parse as $query_type => $handles ) {
			foreach ( $handles as $flow_type => $elements ) {
				// Test $input again as it's overwritten in loop. Without tags left, every next query is moot.
				if ( ! str_contains( $input, '<' ) ) break 2;

				// An empty group has nothing to match. Skip only this group: bailing out of both loops here
				// would prevent all later groups from being processed when custom $args are supplied.
				if ( ! $elements ) continue;

				switch ( $query_type ) {
					case 'void_query':
						$input = preg_replace(
							/**
							 * This one grabs opening tags only, and no content.
							 * Basically, the content and closing tag reader is split from clear_query/space_query's regex.
							 * Akin to https://regex101.com/r/BqUCCG/1.
							 */
							\sprintf(
								'/<(?!\/)(?:%s)\b(?:[^=>\/]*=(?:(?:([\'"])[^$]*?\g{-1})|[\s\/]*))*+[^>]*>/i',
								implode( '|', $elements )
							),
							'phrase' === $flow_type ? '' : ' ', // Add space if block, otherwise clear.
							$input
						) ?? '';
						break;

					case 'space_query':
					case 'clear_query':
						if ( 'space_query' === $query_type ) {
							// Keep the element's content (4th capture group), padded with spaces.
							$passes      = $args['passes'];
							$replacement = ' $4 ';
						} else {
							// Drop the element including its content; 'clear' always gets a single pass.
							$passes      = 1;
							$replacement = 'phrase' === $flow_type ? '' : ' ';
						}

						// Akin to https://regex101.com/r/LR8iem/6. (This might be outdated, copy work!)
						// Ref https://www.w3.org/TR/2011/WD-html5-20110525/syntax.html (specifically end-tags)
						$regex = \sprintf(
							'/<(?!\/)(%s)\b([^=>\/]*=(?:(?:([\'"])[^$]*?\g{-1})|[\s\/]*))*+(?:(?2)++|[^>]*>)((?:[^<]*+(?:<(?!\/?\1\b.*?>)[^<]+)*|(?R))*?)<\/\1\s*>/i', // good enough
							implode( '|', $elements )
						);
						// Work in progress: /(<(?(R)\/?|(?!\/))(%s)\b)([^=>\/]*=(?:(?:([\'"])[^$]*?\g{-1})|[\s\/]*))*+(?:(?-2)*+|(?:.*?))>([^<]*+|(?R)|<\/\2\b\s*>)/i

						$i = 0;
						// To be most accurate, we should parse 'space' $type at least twice, up to 6 times. This is a performance hog.
						// This is because we process the tags from the outer layer to the most inner. Each pass goes deeper.
						while ( $i++ < $passes ) {
							$pre_pass_input = $input;
							$input          = preg_replace( $regex, $replacement, $input ) ?? '';

							// If nothing changed, or no more HTML is present, we're done.
							if ( $pre_pass_input === $input || ! str_contains( $input, '<' ) ) break;
						}
						break;
				}
			}
		}

		// phpcs:ignore, WordPress.WP.AlternativeFunctions.strip_tags_strip_tags -- $args defines stripping of 'script' and 'style'.
		return $args['strip'] ? \strip_tags( $input ) : $input;
	}

	/**
	 * Extracts a usable excerpt from singular content.
	 *
	 * @since 2.8.0
	 * @since 2.8.2 1. Added `$allow_shortcodes` parameter.
	 *              2. Added `$escape` parameter.
	 * @since 3.2.4 Now selectively clears tags.
	 * @since 4.1.0 Moved `figcaption`, `figure`, `footer`, and `tfoot`, from `space` to `clear`.
	 * @since 4.2.7 1. No longer clears `figcaption`, `hr`, `link`, `meta`, `option`, or `tfoot`.
	 *              2. Now clears `area`, `audio`, `datalist`, `del`, `dialog`, `dl`, `hgroup`, `menu`, `meter`, `ol`,
	 *                 `object`, `output`, `progress`, `s`, `template`, and `ul`.
	 *              3. Now adds spaces around `blockquote`, `details`, and `hr`.
	 *              4. Now ignores `dd`, `dl`, `dt`, `li`, `main`, for they are inherently excluded or ignored anyway.
	 *              5. Now processed the `auto_description_html_method` option for stripping tags.
	 * @since 5.0.0 1. The first parameter is now required.
	 *              2. Now returns an empty string when something falsesque is returned.
	 *              3. Removed the third `$escape` parameter.
	 *              4. The second parameter is changed from `$allow_shortcodes` to `$args`.
	 *              5. Moved from `\The_SEO_Framework\Load`.
	 *              6. Renamed from `s_excerpt`.
	 *
	 * @param string $html The HTML to extract content from.
	 * @param array  $args {
	 *     Optional. The extraction parameters.
	 *
	 *     @type bool      $allow_shortcodes Whether to allow shortcodes. Default true.
	 *     @type bool      $sanitize         Whether to sanitize spacing and make the return value single-line.
	 *                                       Default true.
	 *     @type false|int $clamp            Set to int to clamp the sentence intelligently to that number of characters.
	 * }
	 * @return string The extracted html content.
	 */
	public static function extract_content( $html, $args = [] ) {

		// Note that empty() also treats the string '0' as empty.
		if ( empty( $html ) ) return '';

		// Array union: caller-provided keys win, missing keys get these defaults.
		$args += [
			'allow_shortcodes' => true,
			'sanitize'         => true,
			'clamp'            => false,
		];

		// Translate the user's stripping preference to the maximum number of 'space' passes.
		switch ( Data\Plugin::get_option( 'auto_description_html_method' ) ) {
			case 'thorough':
				$passes = 12;
				break;
			case 'accurate':
				$passes = 6;
				break;
			case 'fast':
			default:
				$passes = 2;
		}

		/**
		 * Missing 'th', 'tr', 'tbody', 'thead', 'dd', 'dt', and 'li' -- these are obligatory subelements of what's already cleared.
		 *
		 * NOTE(review): 'blockquote' is listed under both 'space' and 'clear' -- confirm which one is intended.
		 *
		 * @since 5.0.5
		 * @param array $strip_args The content stripping arguments, associative.
		 *                          Refer to the second parameter of `\The_SEO_Framework\Helper\Format\HTML::strip_tags_cs()`.
		 */
		$strip_args = (array) \apply_filters(
			'the_seo_framework_extract_content_strip_args',
			[
				'space'  =>
					[ 'article', 'br', 'blockquote', 'details', 'div', 'hr', 'p', 'section' ],
				'clear'  =>
					[ 'address', 'area', 'aside', 'audio', 'blockquote', 'button', 'canvas', 'code', 'datalist', 'del', 'dialog', 'dl', 'fieldset', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'iframe', 'input', 'label', 'map', 'menu', 'meter', 'nav', 'noscript', 'ol', 'object', 'output', 'pre', 'progress', 's', 'script', 'select', 'style', 'svg', 'table', 'template', 'textarea', 'ul', 'video' ],
				'passes' => $passes,
			]
		);

		/**
		 * Always strip shortcodes unless specifically allowed via the filter.
		 * Always strip shortcodes if not allowed by the arguments, ignoring the filter.
		 *
		 * @since 2.6.6.1
		 * @since 5.0.0 Added the third `$args` parameter.
		 * @param bool  $allow_shortcodes Whether to allow shortcodes.
		 * @param array $args             The extraction parameters.
		 */
		if ( ! $args['allow_shortcodes'] || ! \apply_filters( 'the_seo_framework_allow_excerpt_shortcode_tags', false, $args ) )
			$html = \strip_shortcodes( $html );

		$html = static::strip_tags_cs( $html, $strip_args );

		// Only an integer engages clamping; the default false skips it.
		if ( \is_int( $args['clamp'] ) )
			$html = Strings::clamp_sentence( $html, 1, $args['clamp'] );

		return $args['sanitize'] ? Sanitize::metadata_content( $html ) : $html;
	}
}