Nenhuma Descrição

sitemaps.php 18KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601
  1. <?php // phpcs:ignore WordPress.Files.FileName.InvalidClassFileName
  2. /**
  3. * Generate sitemap files in base XML as well as some namespace extensions.
  4. *
  5. * This module generates two different base sitemaps.
  6. *
  7. * 1. sitemap.xml
  8. * The basic sitemap is updated regularly by wp-cron. It is stored in the
  9. * database and retrieved when requested. This sitemap aims to include canonical
  10. * URLs for all published content and abide by the sitemap spec. This is the root
  11. * of a tree of sitemap and sitemap index xml files, depending on the number of URLs.
  12. *
  13. * By default the sitemap contains published posts of type 'post' and 'page', as
  14. * well as the home url. To include other post types use the 'jetpack_sitemap_post_types'
  15. * filter.
  16. *
  17. * @link https://www.sitemaps.org/protocol.html Base sitemaps protocol.
  18. * @link https://support.google.com/webmasters/answer/178636 Image sitemap extension.
  19. * @link https://developers.google.com/webmasters/videosearch/sitemaps Video sitemap extension.
  20. *
  21. * 2. news-sitemap.xml
  22. * The news sitemap is generated on the fly when requested. It does not aim for
  23. * completeness, instead including at most 1000 of the most recent published posts
  24. * from the previous 2 days, per the news-sitemap spec.
  25. *
  26. * @link https://support.google.com/webmasters/answer/74288 News sitemap extension.
  27. *
  28. * @package automattic/jetpack
  29. * @since 3.9.0
  30. * @since 4.8.0 Remove 1000 post limit.
  31. * @author Automattic
  32. */
  33. /* Include all of the sitemap subclasses. */
  34. require_once __DIR__ . '/sitemap-constants.php';
  35. require_once __DIR__ . '/sitemap-buffer.php';
  36. require_once __DIR__ . '/sitemap-stylist.php';
  37. require_once __DIR__ . '/sitemap-librarian.php';
  38. require_once __DIR__ . '/sitemap-finder.php';
  39. require_once __DIR__ . '/sitemap-builder.php';
  40. if ( defined( 'WP_DEBUG' ) && WP_DEBUG ) {
  41. require_once __DIR__ . '/sitemap-logger.php';
  42. }
  43. /**
  44. * Governs the generation, storage, and serving of sitemaps.
  45. *
  46. * @since 4.8.0
  47. */
  48. class Jetpack_Sitemap_Manager {
  /**
   * Librarian used to store and retrieve sitemap data.
   *
   * Created in the constructor.
   *
   * @see Jetpack_Sitemap_Librarian
   * @since 4.8.0
   * @var Jetpack_Sitemap_Librarian $librarian Librarian used to store and retrieve sitemap data.
   */
  private $librarian;

  /**
   * Logger used to report debug messages.
   *
   * The constructor only assigns this when WP_DEBUG is defined and strictly
   * `true`; in every other case the property is left unset.
   *
   * @see Jetpack_Sitemap_Logger
   * @since 4.8.0
   * @var Jetpack_Sitemap_Logger $logger Logger used to report debug messages.
   */
  private $logger;

  /**
   * Finder used to recognize and construct sitemap URIs.
   *
   * Created in the constructor.
   *
   * @see Jetpack_Sitemap_Finder
   * @since 4.8.0
   * @var Jetpack_Sitemap_Finder $finder Finder used for handling sitemap URIs.
   */
  private $finder;
  /**
   * Build the manager: create its collaborators and register all of its hooks.
   *
   * @access public
   * @since 4.8.0
   */
  public function __construct() {
      $this->librarian = new Jetpack_Sitemap_Librarian();
      $this->finder    = new Jetpack_Sitemap_Finder();

      // The logger is only created when debugging is explicitly switched on.
      $debug_enabled = defined( 'WP_DEBUG' ) && ( true === WP_DEBUG );
      if ( $debug_enabled ) {
          $this->logger = new Jetpack_Sitemap_Logger();
      }

      // Sitemap URL handler: priority 100 when IS_WPCOM is truthy, 10 otherwise.
      $catch_priority = 10;
      if ( defined( 'IS_WPCOM' ) && IS_WPCOM ) {
          $catch_priority = 100;
      }
      add_action( 'init', array( $this, 'callback_action_catch_sitemap_urls' ), $catch_priority );

      // Add generator to wp_cron task list.
      $this->schedule_sitemap_generation();

      // Add sitemap to robots.txt.
      add_action( 'do_robotstxt', array( $this, 'callback_action_do_robotstxt' ), 20 );

      // The news sitemap is cached; flush that cache whenever a post is published.
      add_action( 'publish_post', array( $this, 'callback_action_flush_news_sitemap_cache' ), 10 );

      // Lets other code request a purge of all sitemap data through an action.
      add_action( 'jetpack_sitemaps_purge_data', array( $this, 'callback_action_purge_data' ) );

      /*
       * Module parameters are stored as options in the database.
       * This allows us to avoid having to process all of init
       * before serving the sitemap data. The following action
       * processes the location filter and stores its result.
       */
      add_action( 'init', array( $this, 'callback_action_filter_sitemap_location' ), 999 );
  }
  /**
   * Send a raw string to the client with the given content type, then end the request.
   *
   * An empty string is treated as "no sitemap available": instead of echoing
   * nothing, the request is answered with a 404 wp_die() page. Users who can
   * 'manage_options' are additionally told when the next generation run is
   * due, provided one is actually scheduled.
   *
   * @access private
   * @since 4.8.0
   *
   * @param string $the_content_type The content type to be served.
   * @param string $the_content The string to be echoed.
   */
  private function serve_raw_and_die( $the_content_type, $the_content ) {
      header( 'Content-Type: ' . $the_content_type . '; charset=UTF-8' );

      // Mark the main query as a 'sitemap' feed request.
      global $wp_query;
      $wp_query->is_feed = true;
      set_query_var( 'feed', 'sitemap' );

      if ( '' === $the_content ) {
          $error = __( 'No sitemap found. Please try again later.', 'jetpack' );

          if ( current_user_can( 'manage_options' ) ) {
              $next_run = wp_next_scheduled( 'jp_sitemap_cron_hook' );

              /*
               * wp_next_scheduled() returns false when no event is queued. Passing
               * that to human_time_diff() would measure from the Unix epoch and
               * report a nonsensical delay, so keep the generic message instead.
               */
              if ( false !== $next_run ) {
                  $next = human_time_diff( $next_run );
                  /* translators: %s is a human_time_diff until next sitemap generation. */
                  $error = sprintf( __( 'No sitemap found. The system will try to build it again in %s.', 'jetpack' ), $next );
              }
          }

          wp_die(
              esc_html( $error ),
              esc_html__( 'Sitemaps', 'jetpack' ),
              array(
                  'response' => 404,
              )
          );
      }

      echo $the_content; // phpcs:ignore WordPress.Security.EscapeOutput.OutputNotEscaped -- All content created by Jetpack.
      die();
  }
  /**
   * Callback that intercepts requests for sitemap URLs and serves the matching file.
   *
   * The requested URI is handed to the finder; when it yields a sitemap name,
   * that name is matched against the known sitemap and stylesheet filename
   * patterns. Every match ends the request inside serve_raw_and_die(); if
   * nothing matches, the method simply returns.
   *
   * @access public
   * @since 4.8.0
   */
  public function callback_action_catch_sitemap_urls() {
      // The raw path(+query) of the requested URI.
      $raw_uri = '';
      if ( isset( $_SERVER['REQUEST_URI'] ) ) { // WPCS: Input var okay.
          $raw_uri = sanitize_text_field(
              wp_unslash( $_SERVER['REQUEST_URI'] ) // WPCS: Input var okay.
          );
      }

      $request = $this->finder->recognize_sitemap_uri( $raw_uri );
      if ( ! isset( $request['sitemap_name'] ) ) {
          return;
      }
      $sitemap_name = $request['sitemap_name'];

      /**
       * Filter the content type used to serve the sitemap XML files.
       *
       * @module sitemaps
       *
       * @since 3.9.0
       *
       * @param string $xml_content_type By default, it's 'text/xml'.
       */
      $xml_content_type = apply_filters( 'jetpack_sitemap_content_type', 'text/xml' );

      // Master sitemap: fall back to an empty sitemap rather than a 404 when none is stored yet.
      if ( preg_match( '/^sitemap\.xml$/', $sitemap_name ) ) {
          $master_xml = $this->librarian->get_sitemap_text(
              jp_sitemap_filename( JP_MASTER_SITEMAP_TYPE, 0 ),
              JP_MASTER_SITEMAP_TYPE
          );
          if ( empty( $master_xml ) ) {
              $empty_builder = new Jetpack_Sitemap_Builder();
              $master_xml    = $empty_builder->empty_sitemap_xml();
          }
          $this->serve_raw_and_die( $xml_content_type, $master_xml );
      }

      // Stored sitemaps and sitemap indexes: filename pattern => sitemap type passed to the librarian.
      $stored_routes = array(
          '/^sitemap-[1-9][0-9]*\.xml$/'             => JP_PAGE_SITEMAP_TYPE,
          '/^sitemap-index-[1-9][0-9]*\.xml$/'       => JP_PAGE_SITEMAP_INDEX_TYPE,
          '/^image-sitemap-[1-9][0-9]*\.xml$/'       => JP_IMAGE_SITEMAP_TYPE,
          '/^image-sitemap-index-[1-9][0-9]*\.xml$/' => JP_IMAGE_SITEMAP_INDEX_TYPE,
          '/^video-sitemap-[1-9][0-9]*\.xml$/'       => JP_VIDEO_SITEMAP_TYPE,
          '/^video-sitemap-index-[1-9][0-9]*\.xml$/' => JP_VIDEO_SITEMAP_INDEX_TYPE,
      );
      foreach ( $stored_routes as $pattern => $sitemap_type ) {
          if ( preg_match( $pattern, $sitemap_name ) ) {
              $this->serve_raw_and_die(
                  $xml_content_type,
                  $this->librarian->get_sitemap_text( $sitemap_name, $sitemap_type )
              );
          }
      }

      // XSL stylesheets: filename pattern => static Jetpack_Sitemap_Stylist method producing the stylesheet.
      $stylesheet_routes = array(
          '/^sitemap\.xsl$/'       => 'sitemap_xsl',
          '/^sitemap-index\.xsl$/' => 'sitemap_index_xsl',
          '/^image-sitemap\.xsl$/' => 'image_sitemap_xsl',
          '/^video-sitemap\.xsl$/' => 'video_sitemap_xsl',
          '/^news-sitemap\.xsl$/'  => 'news_sitemap_xsl',
      );
      foreach ( $stylesheet_routes as $pattern => $stylist_method ) {
          if ( preg_match( $pattern, $sitemap_name ) ) {
              $this->serve_raw_and_die(
                  'application/xml',
                  call_user_func( array( 'Jetpack_Sitemap_Stylist', $stylist_method ) )
              );
          }
      }

      // News sitemap: obtained from the builder at request time rather than from the librarian.
      if ( preg_match( '/^news-sitemap\.xml$/', $sitemap_name ) ) {
          $news_builder = new Jetpack_Sitemap_Builder();
          $this->serve_raw_and_die(
              $xml_content_type,
              $news_builder->news_sitemap_xml()
          );
      }
  }
  /**
   * Callback that registers the 'sitemap-interval' recurrence with WP_Cron.
   *
   * @access public
   * @since 4.8.0
   *
   * @param array $schedules The array of WP_Cron schedules.
   *
   * @return array The same array with a 'sitemap-interval' entry added (or replaced).
   */
  public function callback_add_sitemap_schedule( $schedules ) {
      $sitemap_schedule = array(
          'interval' => JP_SITEMAP_INTERVAL,
          'display'  => __( 'Sitemap Interval', 'jetpack' ),
      );

      $schedules['sitemap-interval'] = $sitemap_schedule;

      return $schedules;
  }
  /**
   * Callback for the 'jp_sitemap_cron_hook' cron event.
   *
   * Creates a fresh sitemap builder and asks it to update the sitemap.
   *
   * @access public
   */
  public function callback_sitemap_cron_hook() {
      $builder = new Jetpack_Sitemap_Builder();
      $builder->update_sitemap();
  }
  /**
   * Register the cron schedule and cron hook, and queue sitemap generation
   * if no run is queued yet.
   *
   * Should only be called once, in the constructor.
   *
   * @access private
   * @since 4.8.0
   */
  private function schedule_sitemap_generation() {
      // Make the 'sitemap-interval' recurrence available and attach the generator to its hook.
      add_filter( 'cron_schedules', array( $this, 'callback_add_sitemap_schedule' ) ); // phpcs:ignore WordPress.WP.CronInterval.ChangeDetected
      add_action( 'jp_sitemap_cron_hook', array( $this, 'callback_sitemap_cron_hook' ) );

      // Nothing more to do when a run is already queued.
      if ( wp_next_scheduled( 'jp_sitemap_cron_hook' ) ) {
          return;
      }

      /**
       * Filter the delay in seconds until sitemap generation cron job is started.
       *
       * This filter allows a site operator or hosting provider to potentially spread out sitemap generation for a
       * lot of sites over time. By default, it will be randomly done over 15 minutes.
       *
       * @module sitemaps
       * @since 6.6.1
       *
       * @param int $delay Time to delay in seconds.
       */
      $delay = apply_filters( 'jetpack_sitemap_generation_delay', MINUTE_IN_SECONDS * wp_rand( 1, 15 ) ); // Randomly space it out to start within next fifteen minutes.

      $first_run = time() + $delay;
      wp_schedule_event( $first_run, 'sitemap-interval', 'jp_sitemap_cron_hook' );
  }
  /**
   * Callback that prints `Sitemap:` lines for robots.txt.
   *
   * The main sitemap and the news sitemap are each listed only when their
   * filtered flag is strictly `true`.
   *
   * @access public
   * @since 4.8.0
   */
  public function callback_action_do_robotstxt() {
      /**
       * Filter whether to make the default sitemap discoverable to robots or not. Default true.
       *
       * @module sitemaps
       * @since 3.9.0
       * @deprecated 7.4.0
       *
       * @param bool $discover_sitemap Make default sitemap discoverable to robots.
       */
      $discover_sitemap = apply_filters_deprecated( 'jetpack_sitemap_generate', array( true ), 'jetpack-7.4.0', 'jetpack_sitemap_include_in_robotstxt' );

      /**
       * Filter whether to make the default sitemap discoverable to robots or not. Default true.
       *
       * @module sitemaps
       * @since 7.4.0
       *
       * @param bool $discover_sitemap Make default sitemap discoverable to robots.
       */
      $discover_sitemap = apply_filters( 'jetpack_sitemap_include_in_robotstxt', $discover_sitemap );

      if ( true === $discover_sitemap ) {
          printf(
              "Sitemap: %s\n",
              esc_url( $this->finder->construct_sitemap_url( 'sitemap.xml' ) )
          );
      }

      /**
       * Filter whether to make the news sitemap discoverable to robots or not. Default true.
       *
       * @module sitemaps
       * @since 3.9.0
       * @deprecated 7.4.0
       *
       * @param bool $discover_news_sitemap Make default news sitemap discoverable to robots.
       */
      $discover_news_sitemap = apply_filters_deprecated( 'jetpack_news_sitemap_generate', array( true ), 'jetpack-7.4.0', 'jetpack_news_sitemap_include_in_robotstxt' );

      /**
       * Filter whether to make the news sitemap discoverable to robots or not. Default true.
       *
       * @module sitemaps
       * @since 7.4.0
       *
       * @param bool $discover_news_sitemap Make default news sitemap discoverable to robots.
       */
      $discover_news_sitemap = apply_filters( 'jetpack_news_sitemap_include_in_robotstxt', $discover_news_sitemap );

      if ( true === $discover_news_sitemap ) {
          printf(
              "Sitemap: %s\n",
              esc_url( $this->finder->construct_sitemap_url( 'news-sitemap.xml' ) )
          );
      }
  }
  /**
   * Callback that discards the cached news sitemap.
   *
   * Deletes the 'jetpack_news_sitemap_xml' transient.
   *
   * @access public
   * @since 4.8.0
   */
  public function callback_action_flush_news_sitemap_cache() {
      delete_transient( 'jetpack_news_sitemap_xml' );
  }
  /**
   * Callback that wipes stored sitemap data and queues a one-off regeneration.
   *
   * Flushes the news sitemap cache, asks the librarian to delete all stored
   * sitemap data, then schedules a single 'jp_sitemap_cron_hook' event after
   * a filterable delay.
   *
   * @access public
   * @since 5.3.0
   * @since 6.7.0 Schedules a regeneration.
   */
  public function callback_action_purge_data() {
      $this->callback_action_flush_news_sitemap_cache();
      $this->librarian->delete_all_stored_sitemap_data();

      /** This filter is documented in modules/sitemaps/sitemaps.php */
      $delay = apply_filters( 'jetpack_sitemap_generation_delay', MINUTE_IN_SECONDS * wp_rand( 1, 15 ) ); // Randomly space it out to start within next fifteen minutes.

      $regenerate_at = time() + $delay;
      wp_schedule_single_event( $regenerate_at, 'jp_sitemap_cron_hook' );
  }
  /**
   * Callback that evaluates the sitemap location filter and stores the result
   * in the 'jetpack_sitemap_location' option.
   *
   * @access public
   * @since 4.8.0
   */
  public function callback_action_filter_sitemap_location() {
      /**
       * Additional path for sitemap URIs. Default value is empty.
       *
       * This string is any additional path fragment you want included between
       * the home URL and the sitemap filenames. Exactly how this fragment is
       * interpreted depends on your permalink settings. For example:
       *
       * Pretty permalinks:
       * home_url() . jetpack_sitemap_location . '/sitemap.xml'
       *
       * Plain ("ugly") permalinks:
       * home_url() . jetpack_sitemap_location . '/?jetpack-sitemap=sitemap.xml'
       *
       * PATHINFO permalinks:
       * home_url() . '/index.php' . jetpack_sitemap_location . '/sitemap.xml'
       *
       * where 'sitemap.xml' is the name of a specific sitemap file.
       * The value of this filter must be a valid path fragment per RFC 3986;
       * in particular it must either be empty or begin with a '/'.
       * Also take care that any restrictions on sitemap location imposed by
       * the sitemap protocol are satisfied.
       *
       * The result of this filter is stored in an option, 'jetpack_sitemap_location';
       * that option is what gets read when the sitemap location is needed.
       * This way we don't have to wait for init to finish before building sitemaps.
       *
       * @link https://tools.ietf.org/html/rfc3986#section-3.3 RFC 3986
       * @link https://www.sitemaps.org/ The sitemap protocol
       *
       * @since 4.8.0
       */
      $sitemap_location = apply_filters( 'jetpack_sitemap_location', '' );

      update_option( 'jetpack_sitemap_location', $sitemap_location );
  }
  503. } // End Jetpack_Sitemap_Manager class.
  // Instantiate the manager as soon as this file loads; its constructor registers all sitemap hooks.
  new Jetpack_Sitemap_Manager();
  /**
   * Absolute URL of the current blog's sitemap.
   *
   * The stored 'jetpack_sitemap_location' option is combined with the file
   * name according to the active permalink structure, turned into an absolute
   * URL with home_url(), and finally passed through a filter.
   *
   * @module sitemaps
   *
   * @since 3.9.0
   * @since 4.8.1 Code uses method found in Jetpack_Sitemap_Finder::construct_sitemap_url in 4.8.0.
   * It has been moved here to avoid fatal errors with other plugins that were expecting to find this function.
   *
   * @param string $filename Sitemap file name. Defaults to 'sitemap.xml', the initial sitemaps page.
   *
   * @return string Sitemap URL.
   */
  function jetpack_sitemap_uri( $filename = 'sitemap.xml' ) {
      global $wp_rewrite;

      $location = Jetpack_Options::get_option_and_ensure_autoload( 'jetpack_sitemap_location', '' );

      // Build the path relative to the home URL for the active permalink mode.
      if ( $wp_rewrite->using_index_permalinks() ) {
          // PATHINFO permalinks.
          $relative_path = '/index.php' . $location . '/' . $filename;
      } elseif ( $wp_rewrite->using_permalinks() ) {
          // Pretty permalinks.
          $relative_path = $location . '/' . $filename;
      } else {
          // Plain permalinks: the file name travels in a query argument.
          $relative_path = $location . '/?jetpack-sitemap=' . $filename;
      }

      $sitemap_url = home_url( $relative_path );

      /**
       * Filter sitemap URL relative to home URL.
       *
       * NOTE(review): this hook shares its name with the 'jetpack_sitemap_location'
       * filter that Jetpack_Sitemap_Manager applies to the location path fragment,
       * so a callback on that name receives a full URL here. Kept unchanged for
       * backward compatibility.
       *
       * @module sitemaps
       *
       * @since 3.9.0
       *
       * @param string $sitemap_url Sitemap URL.
       */
      return apply_filters( 'jetpack_sitemap_location', $sitemap_url );
  }