File: wp-content/plugins/autodescription/inc/classes/robotstxt/main.class.php
<?php
/**
* @package The_SEO_Framework\Classes\RobotsTXT\Main
* @subpackage The_SEO_Framework\RobotsTXT
*/
namespace The_SEO_Framework\RobotsTXT;
\defined( 'THE_SEO_FRAMEWORK_PRESENT' ) or die;
use \The_SEO_Framework\{
Data,
Helper\Compatibility,
Helper\Query,
Meta,
RobotsTXT, // Yes, it is legal to import the same namespace.
Sitemap,
};
/**
* The SEO Framework plugin
* Copyright (C) 2023 - 2024 Sybre Waaijer, CyberWire B.V. (https://cyberwire.nl/)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 3 as published
* by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* Holds various methods for the robots.txt output.
*
* @since 5.0.0
* @access protected
* Use tsf()->robotstxt() instead.
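*
* Example external call (a sketch; this assumes the facade forwards to the
* static get_robots_txt() method defined below):
*
*     $robots_txt = \tsf()->robotstxt()->get_robots_txt();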
*/
class Main {
/**
* Edits the robots.txt output.
* Requires the site not to have a robots.txt file in the root directory.
*
* This method completely hijacks the default output. This is intentional (see the next paragraph).
* Use a higher filter priority to override TSF's output.
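*
* For example, another plugin could amend the final output like this (a minimal
* sketch; the callback and the appended rule are illustrative only, but the
* `robots_txt` hook and priority 10 are documented below):
*
*     add_filter(
*         'robots_txt',
*         function ( $output ) {
*             // Priority 11 runs after TSF's priority 10, so $output holds TSF's result.
*             return $output . "\nUser-agent: ExampleBot\nDisallow: /example-path/\n";
*         },
*         11
*     );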
*
* The robots.txt file should not be used to block endpoints that are supposed to be hidden.
* This is because the robots.txt file is public; adding endpoints there would expose them.
* Blocking pages via robots.txt is not an effective way to hide them, either; if a direct link to a page
* is found, it can still be indexed even though it is not crawled. Use the robots meta tags (and headers) instead.
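*
* For instance, a page can be kept out of the index with a robots meta tag or an
* X-Robots-Tag header (generic examples, not TSF API calls):
*
*     <meta name="robots" content="noindex, nofollow">
*     X-Robots-Tag: noindex, nofollow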
*
* @hook robots_txt 10
* @since 5.0.0
* @since 5.1.0 1. Refactored to output the directives via a priority system.
* 2. Now supports blocking AI language model trainers and SEO analysis tools.
* @link <https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt>
*
* @return string Robots.txt output.
*/
public static function get_robots_txt() {
$output = '';
// Simple test for invalid directory depth. Even //robots.txt is an invalid location.
// To be fair, though, up to 5 redirects from /robots.txt are allowed. However, nobody has notified us of this usage.
// TODO Should we add a test for `/?robots=1.*`? Eh...
if ( strrpos( rawurldecode( stripslashes( $_SERVER['REQUEST_URI'] ) ), '/' ) > 0 ) {
$correct_location = \esc_url(
\trailingslashit( Meta\URI\Utils::set_preferred_url_scheme(
Meta\URI\Utils::get_site_host()
) ) . 'robots.txt',
);
$output .= "# This is an invalid robots.txt location.\n# Please visit: $correct_location\n\n";
}
$site_path = parse_url( \site_url(), \PHP_URL_PATH ) ?: '';
/**
* @since 2.5.0
* @since 5.1.0 Deprecated.
* @deprecated
* @param bool $disallow Whether to disallow robots queries.
*/
$disallow_queries = \apply_filters_deprecated(
'the_seo_framework_robots_disallow_queries',
[ false ],
'5.1.0 of The SEO Framework',
'the_seo_framework_robots_txt_sections'
) ? '/*?*'
: '';
$sitemaps = [];
// Collect the full sitemap URL(s); they are appended in their own section below.
if ( Data\Plugin::get_option( 'sitemaps_robots' ) ) {
if ( Data\Plugin::get_option( 'sitemaps_output' ) ) {
foreach ( Sitemap\Registry::get_sitemap_endpoint_list() as $id => $data )
if ( ! empty( $data['robots'] ) )
$sitemaps[] = \esc_url( Sitemap\Registry::get_expected_sitemap_endpoint_url( $id ) );
} elseif ( ! Compatibility::get_active_conflicting_plugin_types()['sitemaps'] && Sitemap\Utils::use_core_sitemaps() ) {
$wp_sitemaps_server = \wp_sitemaps_get_server();
if ( method_exists( $wp_sitemaps_server, 'add_robots' ) ) {
// Already escaped.
$sitemaps[] = trim( $wp_sitemaps_server->add_robots( '', Data\Blog::is_public() ), "\n" );
}
}
}
/**
* @since 5.1.0
* @param array $robots_sections {
* The robots directives, associative by key.
* All input is expected to be escaped.
*
* @type array {$key} {
* The default or custom directives.
*
* @type string $raw The raw output to prepend.
* @type string[] $user-agent The user agents to apply the directives to.
* @type string[] $disallow The disallow directives.
* @type string[] $allow The allow directives.
* @type int $priority The priority of the output; a lower priority means earlier output.
* Defaults to 10.
* }
* }
* @param string $site_path The determined site path. Use this path to prefix URLs.
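*
* Example of adding a custom section from another plugin (a sketch; the section
* key, user agent, and path are illustrative only):
*
*     add_filter(
*         'the_seo_framework_robots_txt_sections',
*         function ( $sections, $site_path ) {
*             $sections['example_custom'] = [
*                 'user-agent' => [ 'ExampleBot' ],
*                 'disallow'   => [ "$site_path/example-private/" ],
*                 'priority'   => 20,
*             ];
*             return $sections;
*         },
*         10,
*         2
*     );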
*/
$robots_sections = (array) \apply_filters(
'the_seo_framework_robots_txt_sections',
[
'deprecated_before' => [
/**
* @since 2.5.0
* @since 5.1.0 Deprecated.
* @deprecated
* @param string $pre The output before this plugin's output.
* Don't forget to add line breaks ( "\n" )!
*/
'raw' => (string) \apply_filters_deprecated(
'the_seo_framework_robots_txt_pre',
[ '' ],
'5.1.0 of The SEO Framework',
'the_seo_framework_robots_txt_sections',
),
'priority' => 0,
],
'default' => [
'user-agent' => [ '*' ],
'disallow' => [ "$site_path/wp-admin/", $disallow_queries ],
'allow' => [ "$site_path/wp-admin/admin-ajax.php" ],
],
'block_ai' => Data\Plugin::get_option( 'robotstxt_block_ai' ) ? [
'user-agent' => array_keys( RobotsTXT\Utils::get_blocked_user_agents( 'ai' ) ),
'disallow' => [ '/' ],
] : [],
'block_seo' => Data\Plugin::get_option( 'robotstxt_block_seo' ) ? [
'user-agent' => array_keys( RobotsTXT\Utils::get_blocked_user_agents( 'seo' ) ),
'disallow' => [ '/' ],
] : [],
'deprecated_after' => [
/**
* @since 2.5.0
* @since 5.1.0 Deprecated.
* @deprecated
* @param string $pro The output after this plugin's output.
* Don't forget to add line breaks ( "\n" )!
*/
'raw' => (string) \apply_filters_deprecated(
'the_seo_framework_robots_txt_pro',
[ '' ],
'5.1.0 of The SEO Framework',
'the_seo_framework_robots_txt_sections',
),
'priority' => 500,
],
'sitemaps' => [
'sitemaps' => $sitemaps,
'priority' => 1000,
],
],
$site_path,
);
// uasort() would maintain index association, but we never read the keys after this point, so usort() suffices.
usort( $robots_sections, fn( $a, $b ) => ( $a['priority'] ?? 10 ) <=> ( $b['priority'] ?? 10 ) );
$pieces = [];
$directives = [
'user-agent' => 'User-agent',
'disallow' => 'Disallow',
'allow' => 'Allow',
'sitemaps' => 'Sitemap',
];
foreach ( $robots_sections as $section ) {
$piece = '';
if ( isset( $section['raw'] ) )
$piece .= $section['raw'];
if ( ! empty( $section['user-agent'] ) || ! empty( $section['sitemaps'] ) )
foreach ( $directives as $key => $directive ) // Defines the output order and the allowed keys.
foreach ( $section[ $key ] ?? [] as $value )
$piece .= \strlen( $value ) ? "$directive: $value\n" : '';
if ( \strlen( $piece ) )
$pieces[] = $piece;
}
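/*
* Each piece now reads as a regular robots.txt group. For illustration, the 'default'
* section above renders roughly as follows (assuming an empty $site_path and no
* deprecated query disallow):
*
*     User-agent: *
*     Disallow: /wp-admin/
*     Allow: /wp-admin/admin-ajax.php
*/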
$output .= implode( "\n", $pieces );
/**
* The robots.txt output.
*
* @since 4.0.5
* @param string $output The robots.txt output.
*/
return (string) \apply_filters( 'the_seo_framework_robots_txt', $output );
}
}