File: wp-content/plugins/autodescription/inc/classes/robotstxt/utils.class.php
<?php
/**
* @package The_SEO_Framework\Classes\RobotsTXT\Utils
* @subpackage The_SEO_Framework\RobotsTXT
*/
namespace The_SEO_Framework\RobotsTXT;

\defined( 'THE_SEO_FRAMEWORK_PRESENT' ) or die;

use function \The_SEO_Framework\umemo;

use \The_SEO_Framework\{
    Data,
    Helper\Query,
    Meta,
};
/**
* The SEO Framework plugin
* Copyright (C) 2023 - 2024 Sybre Waaijer, CyberWire B.V. (https://cyberwire.nl/)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 3 as published
* by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * Holds various utility methods for robots.txt.
 *
 * @since 5.0.0
 * @access protected
 *         Use tsf()->robotstxt()->utils() instead.
 */
class Utils {
    /**
     * Returns a list of filterable user-agents that can be blocked.
     *
     * @since 5.1.0
     *
     * @param string $type The type of user-agents to get. Accepts 'ai' or 'seo'.
     * @return array {
     *     A list of user-agents with extra info, keyed by user-agent string.
     *
     *     @type array $user_agent {
     *         The user-agent's information.
     *
     *         @type string $by   The entity behind the user-agent.
     *         @type string $link The link to the user-agent's documentation.
     *     }
     * }
     */
    public static function get_blocked_user_agents( $type ) {

        switch ( $type ) {
            case 'ai':
                // Excerpt from https://github.com/ai-robots-txt/ai.robots.txt
                $agents = [
                    'Amazonbot'          => [
                        'by'   => 'Amazon',
                        'link' => 'https://developer.amazon.com/amazonbot',
                    ],
                    'Applebot-Extended'  => [
                        'by'   => 'Apple',
                        'link' => 'https://support.apple.com/en-us/119829',
                    ],
                    'CCBot'              => [
                        'by'   => 'Common Crawl',
                        'link' => 'https://commoncrawl.org/ccbot',
                    ],
                    'ClaudeBot'          => [
                        'by'   => 'Anthropic',
                        'link' => 'https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler',
                    ],
                    'GPTBot'             => [
                        'by'   => 'OpenAI',
                        'link' => 'https://platform.openai.com/docs/bots',
                    ],
                    'Google-Extended'    => [
                        'by'   => 'Google',
                        'link' => 'https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers',
                    ],
                    'GoogleOther'        => [
                        'by'   => 'Google',
                        'link' => 'https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers',
                    ],
                    'Meta-ExternalAgent' => [ // Meta documents this lowercase as meta-externalagent; robots.txt user-agent matching is case-insensitive.
                        'by'   => 'Meta',
                        'link' => 'https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/',
                    ],
                    'FacebookBot'        => [ // Blocking this should not impede social sharing; link previews use facebookexternalhit instead.
                        'by'   => 'Meta',
                        'link' => 'https://developers.facebook.com/docs/sharing/bot',
                    ],
                ];
                break;
            case 'seo':
                $agents = [
                    'AhrefsBot'       => [
                        'by'   => 'Ahrefs',
                        'link' => 'https://ahrefs.com/robot',
                    ],
                    'AhrefsSiteAudit' => [
                        'by'   => 'Ahrefs',
                        'link' => 'https://ahrefs.com/robot/site-audit',
                    ],
                    'barkrowler'      => [
                        'by'   => 'Babbar',
                        'link' => 'https://www.babbar.tech/crawler',
                    ],
                    'DataForSeoBot'   => [
                        'by'   => 'DataForSEO',
                        'link' => 'https://dataforseo.com/dataforseo-bot',
                    ],
                    'dotbot'          => [
                        'by'   => 'Moz',
                        'link' => 'https://moz.com/help/moz-procedures/crawlers/dotbot',
                    ],
                    'rogerbot'        => [
                        'by'   => 'Moz',
                        'link' => 'https://moz.com/help/moz-procedures/crawlers/rogerbot',
                    ],
                    'SemrushBot'      => [
                        'by'   => 'SEMrush',
                        'link' => 'https://www.semrush.com/bot/',
                    ],
                    'SiteAuditBot'    => [
                        'by'   => 'SEMrush',
                        'link' => 'https://www.semrush.com/bot/',
                    ],
                    'SemrushBot-BA'   => [
                        'by'   => 'SEMrush',
                        'link' => 'https://www.semrush.com/bot/',
                    ],
                ];
        }

        /**
         * @since 5.1.0
         * @param array  $agents The user-agent list for $type.
         * @param string $type   The agent type requested by the method caller.
         */
        return (array) \apply_filters(
            'the_seo_framework_robots_blocked_user_agents',
            $agents ?? [],
            $type,
        );
    }
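
    /*
     * A minimal sketch of how a site could hook the filter above, e.g. from a
     * theme's functions.php or a small plugin. The callback below is a
     * hypothetical example, not part of this class:
     *
     *     add_filter(
     *         'the_seo_framework_robots_blocked_user_agents',
     *         function ( $agents, $type ) {
     *             // Hypothetical: allow Amazonbot through while keeping the rest blocked.
     *             if ( 'ai' === $type )
     *                 unset( $agents['Amazonbot'] );
     *             return $agents;
     *         },
     *         10,
     *         2
     *     );
     */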
    /**
     * Checks if a robots.txt file exists in the root directory of the WordPress installation.
     * Memoizes the return value.
     *
     * @since 5.0.0
     *
     * @return bool Whether the robots.txt file exists.
     */
    public static function has_root_robots_txt() {

        // phpcs:ignore, WordPress.CodeAnalysis.AssignmentInCondition -- I know.
        if ( null !== $memo = umemo( __METHOD__ ) ) return $memo;

        // Ensure get_home_path() is declared.
        if ( ! \function_exists( 'get_home_path' ) )
            require_once \ABSPATH . 'wp-admin/includes/file.php';

        $path = \get_home_path() . 'robots.txt';

        // phpcs:ignore, TSF.Performance.Functions.PHP -- we use path, not URL.
        return umemo( __METHOD__, file_exists( $path ) );
    }
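
    /*
     * Usage sketch (illustrative): in typical setups the web server serves a
     * physical robots.txt directly, so WordPress's dynamic robots.txt output
     * only applies when no such file exists. Callers can therefore bail early:
     *
     *     if ( Utils::has_root_robots_txt() )
     *         return; // A physical robots.txt overrides any dynamic output.
     */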
    /**
     * Returns the robots.txt location URL.
     * Only allows root domains.
     *
     * @since 2.9.2
     * @since 4.0.2 Now uses the preferred URL scheme.
     * @since 5.0.0 Moved from `\The_SEO_Framework\Load`.
     * @since 5.1.0 Now memoizes the return value.
     *
     * @return string URL location of robots.txt. Unescaped.
     */
    public static function get_robots_txt_url() {

        // phpcs:ignore, WordPress.CodeAnalysis.AssignmentInCondition -- I know.
        if ( null !== $memo = umemo( __METHOD__ ) ) return $memo;

        if ( $GLOBALS['wp_rewrite']->using_permalinks() && ! Data\Blog::is_subdirectory_installation() ) {
            $home = \trailingslashit( Meta\URI\Utils::set_preferred_url_scheme( Meta\URI\Utils::get_site_host() ) );
            $path = "{$home}robots.txt";
        } elseif ( static::has_root_robots_txt() ) {
            // TODO: This URL is wrong on subdirectory installations? Use Meta\URI\Utils::get_site_host() instead?
            $home = \trailingslashit( Meta\URI\Utils::set_preferred_url_scheme( \get_option( 'home' ) ) );
            $path = "{$home}robots.txt";
        } else {
            $path = '';
        }

        return umemo( __METHOD__, $path );
    }
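
    /*
     * Usage sketch (illustrative): the return value is unescaped and may be
     * an empty string, so check and escape it before output:
     *
     *     $url = Utils::get_robots_txt_url();
     *     if ( $url )
     *         printf( '<a href="%s">robots.txt</a>', \esc_url( $url ) );
     */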
}