Sin descripción

class.media-extractor.php 19KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519
  1. <?php // phpcs:ignore WordPress.Files.FileName.InvalidClassFileName
  2. /**
  3. * Class with methods to extract metadata from a post/page about videos, images, links, mentions embedded
  4. * in or attached to the post/page.
  5. *
  6. * @package automattic/jetpack
  7. */
  8. /**
  9. * Class with methods to extract metadata from a post/page about videos, images, links, mentions embedded
  10. * in or attached to the post/page.
  11. *
  12. * @todo Additionally, have some filters on number of items in each field
  13. */
  14. class Jetpack_Media_Meta_Extractor {
  // Bit flags selecting what to extract; combine them with `|` to build a mask.
  const LINKS      = 1;
  const MENTIONS   = 2;
  const IMAGES     = 4;
  const SHORTCODES = 8; // Every shortcode is counted; IDs are looked up only for the keeper shortcodes.
  const EMBEDS     = 16;
  const HASHTAGS   = 32;
  // Mask with all eight low bits set, i.e. every flag above (plus the currently unused bits 64 and 128).
  const ALL = 255;
  /**
   * Shortcodes whose target ID we try to record.
   *
   * Every shortcode found in the content has its presence counted. For the ones
   * listed here the extractor additionally asks a helper for the ID of the embedded
   * object: either a function named jetpack_shortcode_get_{shortcode}_id( $atts ) or
   * a static method {Shortcode}Shortcode::get_{shortcode}_id( $atts ), whichever exists.
   *
   * @var string[]
   */
  private static $keeper_shortcodes = array(
      'youtube',
      'vimeo',
      'hulu',
      'ted',
      'wpvideo',
      'videopress',
  );
  /**
   * Gets the specified media and meta info from the given post.
   *
   * NOTE: If you have the post's HTML content already and don't need image data,
   * use extract_from_content() instead.
   *
   * The post title and content are concatenated and capped at 100000 bytes before
   * the text-based extraction runs; image extraction works on the full post object.
   *
   * @param int $blog_id         The ID of the blog.
   * @param int $post_id         The ID of the post.
   * @param int $what_to_extract A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS.
   *
   * @return array A structure containing metadata about the embedded things, or an empty array
   *               if the post does not exist or nothing was found.
   */
  public static function extract( $blog_id, $post_id, $what_to_extract = self::ALL ) {
      // switch_to_blog() is only defined on multisite installs.
      if ( function_exists( 'switch_to_blog' ) ) {
          switch_to_blog( $blog_id );
      }

      $post = get_post( $post_id );
      if ( ! $post instanceof WP_Post ) {
          // Always undo the blog switch before bailing out.
          if ( function_exists( 'restore_current_blog' ) ) {
              restore_current_blog();
          }
          return array();
      }

      $content = $post->post_title . "\n\n" . $post->post_content;

      // Prevent running extraction on really huge amounts of content.
      $max_bytes = 100000; // about 20k English words.
      if ( strlen( $content ) > $max_bytes ) {
          // A plain substr() can cut a multibyte character in half; the resulting invalid
          // UTF-8 makes every later preg_*() call with the /u modifier fail, silently losing
          // mentions and hashtags. mb_strcut() keeps the byte limit but backs up to a
          // character boundary. Assumes the content is UTF-8 (the WordPress default).
          if ( function_exists( 'mb_strcut' ) ) {
              $content = mb_strcut( $content, 0, $max_bytes, 'UTF-8' );
          } else {
              $content = substr( $content, 0, $max_bytes );
          }
      }

      $extracted = array();

      // Get images first, we need the full post for that.
      if ( self::IMAGES & $what_to_extract ) {
          $extracted = self::get_image_fields( $post );

          // Clear the IMAGES bit so extract_from_content() below does not look for images again.
          $what_to_extract &= ~self::IMAGES;
      }

      if ( function_exists( 'restore_current_blog' ) ) {
          restore_current_blog();
      }

      // All of the other things besides images can be extracted from just the content.
      return self::extract_from_content( $content, $what_to_extract, $extracted );
  }
  /**
   * Gets the specified meta info from the given post content.
   *
   * NOTE: If you want IMAGES, call extract( $blog_id, $post_id, ... ), which gives more/better
   * image extraction. When IMAGES is requested here, only the tag-stripped text is searched.
   *
   * Keys that may be added to the result: 'mention', 'hashtag', 'shortcode', 'shortcode_types',
   * 'link', 'embed', plus a 'has' map holding the count for each kind that was found.
   *
   * @param string $content           The HTML post_content of a post.
   * @param int    $what_to_extract   A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS.
   * @param array  $already_extracted Previously extracted things, e.g. images from extract(), which can be used for x-referencing here.
   *
   * @return array A structure containing metadata about the embedded things; $already_extracted
   *               is returned unchanged when nothing new is found.
   */
  public static function extract_from_content( $content, $what_to_extract = self::ALL, $already_extracted = array() ) {
      $stripped_content = self::get_stripped_content( $content );

      // Maybe start with some previously extracted things (e.g. images from extract()).
      $extracted = $already_extracted;

      // ----------------------------------- IMAGES ------------------------------
      // Embedded media objects will have already been converted to shortcodes by pre_kses hooks on save.
      if ( self::IMAGES & $what_to_extract ) {
          // NOTE(review): this searches the tag-stripped text, which presumably no longer holds
          // any <img> markup - confirm whether the raw $content was intended here.
          $images    = self::extract_images_from_content( $stripped_content, array() );
          $extracted = array_merge( $extracted, $images );
      }

      // ----------------------------------- MENTIONS ------------------------------
      if ( self::MENTIONS & $what_to_extract ) {
          if ( preg_match_all( '/(^|\s)@(\w+)/u', $stripped_content, $matches ) ) {
              $mentions             = self::lowercase_unique( $matches[2] );
              $extracted['mention'] = array( 'name' => $mentions );
              if ( ! isset( $extracted['has'] ) ) {
                  $extracted['has'] = array();
              }
              $extracted['has']['mention'] = count( $mentions );
          }
      }

      // ----------------------------------- HASHTAGS ------------------------------
      /**
       * Some hosts may not compile with --enable-unicode-properties and kick a warning:
       * Warning: preg_match_all() [function.preg-match-all]: Compilation failed: support for \P, \p, and \X has not been compiled
       * Therefore, we only run this code block on wpcom, not in Jetpack.
       */
      if ( ( defined( 'IS_WPCOM' ) && IS_WPCOM ) && ( self::HASHTAGS & $what_to_extract ) ) {
          // This regex does not exactly match Twitter's
          // if there are problems/complaints we should implement this:
          // https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java .
          if ( preg_match_all( '/(?:^|\s)#(\w*\p{L}+\w*)/u', $stripped_content, $matches ) ) {
              $hashtags             = self::lowercase_unique( $matches[1] );
              $extracted['hashtag'] = array( 'name' => $hashtags );
              if ( ! isset( $extracted['has'] ) ) {
                  $extracted['has'] = array();
              }
              $extracted['has']['hashtag'] = count( $hashtags );
          }
      }

      // ----------------------------------- SHORTCODES ------------------------------
      // Always look for shortcodes.
      // If we don't want them, we'll just remove them, so we don't grab them as links below.
      $shortcode_pattern = '/' . get_shortcode_regex() . '/s';
      if ( preg_match_all( $shortcode_pattern, $content, $matches ) ) {
          if ( self::SHORTCODES & $what_to_extract ) {
              $shortcode_total_count = 0;
              $shortcode_type_counts = array();
              $shortcode_types       = array();
              $shortcode_details     = array();

              foreach ( $matches[2] as $key => $shortcode ) {
                  // Elasticsearch (and probably other things) doesn't deal well with some chars as key names.
                  $shortcode_name = preg_replace( '/[.,*"\'\/\\\\#+ ]/', '_', $shortcode );

                  ++$shortcode_total_count;
                  if ( ! isset( $shortcode_type_counts[ $shortcode_name ] ) ) {
                      $shortcode_type_counts[ $shortcode_name ] = 0;
                  }
                  ++$shortcode_type_counts[ $shortcode_name ];

                  // Store (uniquely) presence of all shortcode regardless of whether it's a keeper (for those, get ID below).
                  // @todo Store number of occurrences?
                  if ( ! in_array( $shortcode_name, $shortcode_types, true ) ) {
                      $shortcode_types[] = $shortcode_name;
                  }

                  // Only keeper shortcodes get the id/url of the object (e.g. youtube video, TED talk, etc.) stored.
                  if ( ! in_array( $shortcode, self::$keeper_shortcodes, true ) ) {
                      continue;
                  }

                  $attr = shortcode_parse_atts( $matches[3][ $key ] );
                  $id   = null;

                  // Prefer the function jetpack_shortcode_get_xyz_id(); otherwise fall back to
                  // the static method XyzShortcode::get_xyz_id() when the shortcode is a class.
                  $shortcode_get_id_func   = "jetpack_shortcode_get_{$shortcode}_id";
                  $shortcode_class_name    = ucfirst( $shortcode ) . 'Shortcode';
                  $shortcode_get_id_method = "get_{$shortcode}_id";
                  if ( function_exists( $shortcode_get_id_func ) ) {
                      $id = call_user_func( $shortcode_get_id_func, $attr );
                  } elseif ( method_exists( $shortcode_class_name, $shortcode_get_id_method ) ) {
                      $id = call_user_func( array( $shortcode_class_name, $shortcode_get_id_method ), $attr );
                  }

                  $already_known = isset( $shortcode_details[ $shortcode_name ] )
                      && in_array( $id, $shortcode_details[ $shortcode_name ], true );
                  if ( ! empty( $id ) && ! $already_known ) {
                      $shortcode_details[ $shortcode_name ][] = $id;
                  }
              }

              if ( $shortcode_total_count > 0 ) {
                  // Add the shortcode info to the $extracted array.
                  if ( ! isset( $extracted['has'] ) ) {
                      $extracted['has'] = array();
                  }
                  $extracted['has']['shortcode'] = $shortcode_total_count;
                  $extracted['shortcode']        = array();
                  foreach ( $shortcode_type_counts as $type => $count ) {
                      $extracted['shortcode'][ $type ] = array( 'count' => $count );
                  }
                  if ( ! empty( $shortcode_types ) ) {
                      $extracted['shortcode_types'] = $shortcode_types;
                  }
                  foreach ( $shortcode_details as $type => $ids ) {
                      $extracted['shortcode'][ $type ]['id'] = $ids;
                  }
              }
          }

          // Remove the shortcodes from our copy of $content, so we don't count links in them as links below.
          // preg_replace() returns null on a PCRE error; keep the content in that case rather than
          // handing null to the regex calls further down.
          $without_shortcodes = preg_replace( $shortcode_pattern, ' ', $content );
          if ( null !== $without_shortcodes ) {
              $content = $without_shortcodes;
          }
      }

      // ----------------------------------- LINKS ------------------------------
      if ( self::LINKS & $what_to_extract ) {
          // To hold the extracted stuff we find.
          $links = array();

          // Collect the URLs of images found earlier so they are not reported as links again.
          // Images are stored as a list of array( 'url' => ... ) entries; a flat
          // $extracted['image']['url'] list is accepted too.
          $image_urls = array();
          if ( isset( $extracted['image'] ) && is_array( $extracted['image'] ) ) {
              if ( isset( $extracted['image']['url'] ) ) {
                  $image_urls = (array) $extracted['image']['url'];
              }
              foreach ( $extracted['image'] as $image ) {
                  if ( is_array( $image ) && isset( $image['url'] ) && is_string( $image['url'] ) ) {
                      $image_urls[] = $image['url'];
                  }
              }
          }

          // @todo Get the text inside the links?
          // Grab any links, whether in <a href="..." or not, but subtract those from shortcodes and images.
          // (we treat embed links as just another link).
          if ( preg_match_all( '#(?:^|\s|"|\')(https?://([^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/))))#', $content, $matches ) ) {
              foreach ( $matches[1] as $link_raw ) {
                  $url = wp_parse_url( $link_raw );

                  // Data URI links.
                  if ( ! isset( $url['scheme'] ) || 'data' === $url['scheme'] ) {
                      continue;
                  }

                  // Reject invalid URLs.
                  if ( ! isset( $url['host'] ) ) {
                      continue;
                  }

                  // Remove large (and likely invalid) links.
                  if ( 4096 < strlen( $link_raw ) ) {
                      continue;
                  }

                  // Build a simple form of the URL so we can compare it to the image URLs and exclude those.
                  $simple_url = $url['scheme'] . '://' . $url['host'] . ( ! empty( $url['path'] ) ? $url['path'] : '' );
                  if ( in_array( $simple_url, $image_urls, true ) ) {
                      continue;
                  }

                  // Limit the split to 2 parts: a later '://' (e.g. a URL inside the query string)
                  // must stay part of the link instead of truncating it.
                  $link_parts         = explode( '://', $link_raw, 2 );
                  $link_all_but_proto = isset( $link_parts[1] ) ? $link_parts[1] : $link_raw;

                  // Build a reversed hostname, e.g. "www.example.com" becomes "com.example.www".
                  $host_reversed = implode( '.', array_reverse( explode( '.', $url['host'] ) ) );

                  // @todo Check unique before adding
                  $links[] = array(
                      'url'           => $link_all_but_proto,
                      'host_reversed' => $host_reversed,
                      'host'          => $url['host'],
                  );
              }
          }

          $link_count = count( $links );
          if ( $link_count ) {
              $extracted['link'] = $links;
              if ( ! isset( $extracted['has'] ) ) {
                  $extracted['has'] = array();
              }
              $extracted['has']['link'] = $link_count;
          }
      }

      // ----------------------------------- EMBEDS ------------------------------
      // Embeds are just individual links on their own line.
      if ( self::EMBEDS & $what_to_extract ) {
          if ( ! function_exists( '_wp_oembed_get_object' ) ) {
              include ABSPATH . WPINC . '/class-oembed.php';
          }

          // get an oembed object.
          $oembed = _wp_oembed_get_object();

          // Grab any links on their own lines that may be embeds.
          if ( preg_match_all( '|^\s*(https?://[^\s"]+)\s*$|im', $content, $matches ) ) {
              // To hold the extracted stuff we find.
              $embeds = array();

              foreach ( $matches[1] as $link_raw ) {
                  // Same 2-part split as for links, so nested '://' cannot truncate the URL.
                  $link_parts         = explode( '://', $link_raw, 2 );
                  $link_all_but_proto = isset( $link_parts[1] ) ? $link_parts[1] : $link_raw;

                  // Check whether this "link" is really an embed.
                  foreach ( $oembed->providers as $matchmask => $data ) {
                      // Each provider entry is ( provider URL, is-regex flag ); only the flag is needed here.
                      $is_regex = ! empty( $data[1] );

                      // Turn the asterisk-type provider URLs into regex.
                      if ( ! $is_regex ) {
                          $matchmask = '#' . str_replace( '___wildcard___', '(.+)', preg_quote( str_replace( '*', '___wildcard___', $matchmask ), '#' ) ) . '#i';
                          $matchmask = preg_replace( '|^#http\\\://|', '#https?\://', $matchmask );
                      }

                      if ( preg_match( $matchmask, $link_raw ) ) {
                          $embeds[] = $link_all_but_proto; // @todo Check unique before adding
                          // @todo Try to get ID's for the ones we care about (shortcode_keepers)
                          break;
                      }
                  }
              }

              if ( ! empty( $embeds ) ) {
                  if ( ! isset( $extracted['has'] ) ) {
                      $extracted['has'] = array();
                  }
                  $extracted['has']['embed'] = count( $embeds );
                  $extracted['embed']        = array( 'url' => $embeds );
              }
          }
      }

      return $extracted;
  }

  /**
   * Lower-cases a list of names and removes duplicates case-insensitively.
   *
   * Lower-casing happens before de-duplication so that "Bob" and "bob" collapse into one
   * entry. mb_strtolower() is used when available so non-ASCII names are handled as UTF-8.
   *
   * @param string[] $names Raw names as matched in the content.
   *
   * @return string[] Lower-cased, unique names, re-indexed from 0 in first-seen order.
   */
  private static function lowercase_unique( $names ) {
      $use_mb  = function_exists( 'mb_strtolower' );
      $lowered = array();
      foreach ( $names as $name ) {
          $lowered[] = $use_mb ? mb_strtolower( $name, 'UTF-8' ) : strtolower( $name );
      }

      // array_unique() retains the keys, so re-index afterwards.
      return array_values( array_unique( $lowered ) );
  }
  /**
   * Collects the image URLs belonging to a post and wraps them in the image structure.
   *
   * Sources are consulted in this order: featured image, slideshow, gallery, and finally
   * the <img> tags found in the post content.
   *
   * @uses Jetpack_PostImages
   *
   * @param WP_Post $post A post object.
   * @param array   $args Optional. 'width' and 'height' (both default to 200) are passed to the
   *                      featured-image and slideshow lookups as minimum dimensions.
   *
   * @return array The structure produced by build_image_struct(), or an empty array when
   *               $post is not a WP_Post or no image was found.
   */
  private static function get_image_fields( $post, $args = array() ) {
      if ( ! $post instanceof WP_Post ) {
          return array();
      }

      $args = wp_parse_args(
          $args,
          array(
              'width'  => 200, // Required minimum width (if possible to determine).
              'height' => 200, // Required minimum height (if possible to determine).
          )
      );

      $urls  = array();
      $flags = array( 'gallery' => 0 );

      $featured = Jetpack_PostImages::from_thumbnail( $post->ID, $args['width'], $args['height'] );
      if ( ! empty( $featured ) ) {
          $urls = array_merge( $urls, wp_list_pluck( $featured, 'src' ) );
      }

      $slideshow = Jetpack_PostImages::from_slideshow( $post->ID, $args['width'], $args['height'] );
      if ( ! empty( $slideshow ) ) {
          $urls = array_merge( $urls, wp_list_pluck( $slideshow, 'src' ) );
      }

      $gallery = Jetpack_PostImages::from_gallery( $post->ID );
      if ( ! empty( $gallery ) ) {
          $urls = array_merge( $urls, wp_list_pluck( $gallery, 'src' ) );
          ++$flags['gallery']; // @todo This count isn't correct, will only ever count 1
      }

      // @todo Can we check width/height of these efficiently? Could maybe use query args at least, before we strip them out
      $urls = self::get_images_from_html( $post->post_content, $urls );

      return self::build_image_struct( $urls, $flags );
  }
  /**
   * Finds the images in a piece of HTML and returns them in the standard image structure.
   *
   * @param string $content    HTML content.
   * @param array  $image_list Image URLs that were already found; new ones are appended.
   *
   * @return array|array[] The structure produced by build_image_struct(), empty if there are no images.
   */
  public static function extract_images_from_content( $content, $image_list ) {
      return self::build_image_struct(
          self::get_images_from_html( $content, $image_list ),
          array()
      );
  }
  /**
   * Wraps a flat list of image URLs in the structure used for extracted media.
   *
   * Duplicate URLs are dropped. The result has an 'image' list of array( 'url' => ... )
   * entries and a 'has' map made of $image_booleans plus an 'image' count.
   *
   * @param array $image_list     Array of image URLs.
   * @param array $image_booleans Flags/counters to carry over into the 'has' map.
   *
   * @return array|array[] The image structure, or an empty array when $image_list is empty.
   */
  public static function build_image_struct( $image_list, $image_booleans ) {
      if ( empty( $image_list ) ) {
          return array();
      }

      $entries = array();
      foreach ( array_unique( $image_list ) as $url ) {
          $entries[] = array( 'url' => $url );
      }

      // Once the count is stored the map can never be empty, so 'has' is always present.
      $image_booleans['image'] = count( $entries );

      return array(
          'image' => $entries,
          'has'   => $image_booleans,
      );
  }
  /**
   * Pulls image URLs out of HTML, without their query strings.
   *
   * @param string $html                     Some markup, possibly containing image tags.
   * @param array  $images_already_extracted Plain list of image URLs (no query strings, no special structure), used for de-duplication.
   *
   * @return array The given list with any newly found, query-less image URLs appended.
   */
  public static function get_images_from_html( $html, $images_already_extracted ) {
      $found = Jetpack_PostImages::from_html( $html );
      if ( empty( $found ) ) {
          return $images_already_extracted;
      }

      $collected = $images_already_extracted;

      foreach ( wp_list_pluck( $found, 'src' ) as $candidate ) {
          $query_at = strpos( $candidate, '?' );
          $parts    = wp_parse_url( $candidate );

          if ( $parts && isset( $parts['scheme'], $parts['host'], $parts['path'] ) ) {
              // Reassemble the URL from its parts, leaving the query string out.
              $clean = $parts['scheme'] . '://' . $parts['host'] . $parts['path'];
          } elseif ( $query_at ) {
              // Parsing did not yield all parts: cut at the first '?' instead.
              $clean = substr( $candidate, 0, $query_at );
          } else {
              // No usable '?' position either, keep the URL as it is.
              $clean = $candidate;
          }

          // URLs over 4KB are likely data URIs or malformed HTML; skip them.
          if ( strlen( $clean ) > 4096 ) {
              continue;
          }

          if ( in_array( $clean, $collected, true ) ) {
              continue;
          }

          $collected[] = $clean;
      }

      return $collected;
  }
  /**
   * Reduces content to plain text: tags removed, HTML entities decoded, shortcodes removed.
   *
   * The three steps run in exactly that order.
   *
   * @param string $content Original content.
   *
   * @return string Cleaned content.
   */
  private static function get_stripped_content( $content ) {
      $text_only = html_entity_decode( wp_strip_all_tags( $content ) );

      // Completely strip shortcodes and any content they enclose.
      return strip_shortcodes( $text_only );
  }
  439. }