| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
2771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683 |
- <?php
- /**
- * Tweetstorm block and API helper.
- *
- * @package automattic/jetpack
- * @since 8.7.0
- */
- use Automattic\Jetpack\Connection\Client;
- use Automattic\Jetpack\Status;
- use Twitter\Text\Regex as Twitter_Regex;
- use Twitter\Text\Validator as Twitter_Validator;
- /**
- * Class Jetpack_Tweetstorm_Helper
- *
- * @since 8.7.0
- */
- class Jetpack_Tweetstorm_Helper {
- /**
- * Blocks that can be converted to tweets.
- *
- * @var array {
- * The key for each element must match the registered block name.
- *
- * @type string $type Required. The type of content this block produces. Can be one of 'break', 'embed', 'image',
- * 'multiline', 'text', or 'video'.
- * @type string $content_location Optional. Where the block content can be found. Can be 'html', if we need to parse
- * it out of the block HTML text, 'html-attributes', if we need to parse it out of HTML attributes
- * in the block HTML, or 'block-attributes', if the content can be found in the block attributes.
- * Note that these attributes need to be available when the serialised block is
- * parsed using `parse_blocks()`. If it isn't set, it's assumed the block doesn't add
- * any content to the Twitter thread.
- * @type array $content Optional. Defines what parts of the block content need to be extracted. Behaviour can vary based on
- * `$content_location`, and `$type`:
- *
- * - When `$content_location` is 'html', a value of `array()` or `array( 'content' )` have the same meaning:
- * The entire block HTML should be used. In both cases, 'content' will be the corresponding tag in `$template`.
- * - When `$content_location` is 'html', it should be formatted as `array( 'container' => 'tag' )`,
- * where 'container' is the name of the corresponding RichText container in the block editor, and is also the name
- * of the corresponding tag in the $template string. 'tag' is the HTML tag within the block that corresponds to this
- * container. When `$type` is 'multiline', there must only be one element in the array, and tag should be set to the HTML
- * tag that corresponds to each line, though the 'container' should still be the RichText container name. (Eg, in the core/list block, the tag is 'li'.)
- * - When `$content_location` is 'html-attributes', the array should be formatted as `array( 'name' => array( 'tag', 'attribute') )`,
- * where 'name' is the name of a particular value that different block types require, 'tag' is the name of the HTML tag where 'attribute'
- * can be found, containing the value to use for 'name'. When `$type` is 'image', 'url' and 'alt' must be defined. When `$type` is 'video',
- * 'url' must be defined.
- * - When `$content_location` is 'block-attributes', it must be an array of block attribute names. When `$type` is 'embed', there
- * must only be one element, corresponding to the URL for the embed.
- * @type string $template Required for 'text' and 'multiline' types, ignored for all other types. Describes how the block content will be formatted when tweeted.
- * Tags should match the keys of `$content`, except for the special "{{content}}", which matches the entire HTML content of the block.
- * For 'multiline' types, the template will be repeated for every line in the block.
- * @type boolean $force_new Required. Whether or not a new tweet should be started when this block is encountered.
- * @type boolean $force_finished Required. Whether or not a new tweet should be started after this block is finished.
- * }
- */
- private static $supported_blocks = array(
- // Embeds contribute a single URL, read from the block's `url` attribute.
- 'core/embed' => array(
- 'type' => 'embed',
- 'content_location' => 'block-attributes',
- 'content' => array( 'url' ),
- 'force_new' => false,
- 'force_finished' => true,
- ),
- // Gallery images are read from the `src` and `alt` attributes of the <img> tags.
- 'core/gallery' => array(
- 'type' => 'image',
- 'content_location' => 'html-attributes',
- 'content' => array(
- 'url' => array( 'img', 'src' ),
- 'alt' => array( 'img', 'alt' ),
- ),
- 'force_new' => false,
- 'force_finished' => true,
- ),
- // Headings are the only block type here that forces a new tweet to be started.
- 'core/heading' => array(
- 'type' => 'text',
- 'content_location' => 'html',
- 'content' => array(),
- 'template' => '{{content}}',
- 'force_new' => true,
- 'force_finished' => false,
- ),
- 'core/image' => array(
- 'type' => 'image',
- 'content_location' => 'html-attributes',
- 'content' => array(
- 'url' => array( 'img', 'src' ),
- 'alt' => array( 'img', 'alt' ),
- ),
- 'force_new' => false,
- 'force_finished' => true,
- ),
- 'core/list' => array(
- 'type' => 'multiline',
- 'content_location' => 'html',
- // It looks a little weird to use the 'values' key for a single line,
- // but 'values' is the name of the RichText content area.
- 'content' => array(
- 'values' => 'li',
- ),
- // Each list item is rendered on its own line, prefixed with a dash.
- 'template' => '- {{values}}',
- 'force_new' => false,
- 'force_finished' => false,
- ),
- 'core/paragraph' => array(
- 'type' => 'text',
- 'content_location' => 'html',
- 'content' => array(),
- 'template' => '{{content}}',
- 'force_new' => false,
- 'force_finished' => false,
- ),
- 'core/quote' => array(
- 'type' => 'text',
- 'content_location' => 'html',
- // The quote content will always be inside <p> tags.
- 'content' => array(
- 'value' => 'p',
- 'citation' => 'cite',
- ),
- 'template' => '“{{value}}” – {{citation}}',
- 'force_new' => false,
- 'force_finished' => false,
- ),
- // Break blocks define no content location, so they add nothing to the thread;
- // they only mark the current tweet as finished.
- 'core/separator' => array(
- 'type' => 'break',
- 'force_new' => false,
- 'force_finished' => true,
- ),
- 'core/spacer' => array(
- 'type' => 'break',
- 'force_new' => false,
- 'force_finished' => true,
- ),
- 'core/verse' => array(
- 'type' => 'text',
- 'content_location' => 'html',
- 'content' => array(),
- 'template' => '{{content}}',
- 'force_new' => false,
- 'force_finished' => false,
- ),
- // Videos are read from the `src` attribute of the <video> tag.
- 'core/video' => array(
- 'type' => 'video',
- 'content_location' => 'html-attributes',
- 'content' => array(
- 'url' => array( 'video', 'src' ),
- ),
- 'force_new' => false,
- 'force_finished' => true,
- ),
- // The GIF block is treated as an embed of its `giphyUrl` block attribute.
- 'jetpack/gif' => array(
- 'type' => 'embed',
- 'content_location' => 'block-attributes',
- 'content' => array( 'giphyUrl' ),
- 'force_new' => false,
- 'force_finished' => true,
- ),
- );
- /**
- * A cache of _wp_emoji_list( 'entities' ), after being run through html_entity_decode().
- *
- * Initialised lazily in ::is_within_twitter_length(), the first time non-ASCII text is measured.
- *
- * @var array
- */
- private static $emoji_list = array();
- /**
- * Special line separator character, for multiline text.
- *
- * These bytes are the UTF-8 encoding of U+2028 (LINE SEPARATOR).
- *
- * @var string
- */
- private static $line_separator = "\xE2\x80\xA8";
- /**
- * Special inline placeholder character, for inline tags that change content length in the RichText.
- *
- * These bytes are the UTF-8 encoding of U+2063 (INVISIBLE SEPARATOR).
- *
- * @var string
- */
- private static $inline_placeholder = "\xE2\x81\xA3";
- /**
- * URLs always take up a fixed length from the text limit.
- *
- * @var int
- */
- private static $characters_per_url = 24;
- /**
- * Every media attachment takes up some space from the text limit.
- *
- * @var int
- */
- private static $characters_per_media = 24;
- /**
- * An array to store all the tweets in.
- *
- * Reset at the start of every ::parse() call.
- *
- * @var array
- */
- private static $tweets = array();
- /**
- * While we're caching everything, we want to keep track of the URLs we're adding.
- *
- * @var array
- */
- private static $urls = array();
/**
 * Gather the Tweetstorm.
 *
 * On WordPress.com the thread is gathered locally through WPCOM_Gather_Tweetstorm;
 * everywhere else the request is proxied to the WordPress.com REST API as the blog.
 *
 * @param string $url The tweet URL to gather from.
 * @return mixed The decoded API response on success, or a WP_Error when the site is in
 *               offline mode, the site ID can't be determined, the request fails, or
 *               the API answers with an HTTP status of 400 or higher.
 */
public static function gather( $url ) {
	if ( ( new Status() )->is_offline_mode() ) {
		return new WP_Error(
			'dev_mode',
			__( 'Tweet unrolling is not available in offline mode.', 'jetpack' )
		);
	}

	$site_id = self::get_site_id();
	if ( is_wp_error( $site_id ) ) {
		return $site_id;
	}

	if ( defined( 'IS_WPCOM' ) && IS_WPCOM ) {
		if ( ! class_exists( 'WPCOM_Gather_Tweetstorm' ) ) {
			\jetpack_require_lib( 'gather-tweetstorm' );
		}

		return WPCOM_Gather_Tweetstorm::gather( $url );
	}

	$response = Client::wpcom_json_api_request_as_blog(
		sprintf( '/sites/%d/tweetstorm/gather?url=%s', $site_id, rawurlencode( $url ) ),
		2,
		array( 'headers' => array( 'content-type' => 'application/json' ) ),
		null,
		'wpcom'
	);
	if ( is_wp_error( $response ) ) {
		return $response;
	}

	$data = json_decode( wp_remote_retrieve_body( $response ) );

	if ( wp_remote_retrieve_response_code( $response ) >= 400 ) {
		// Error bodies aren't guaranteed to be JSON (e.g. a gateway error page), in which
		// case json_decode() returns null and there is no code/message to read from.
		if ( ! is_object( $data ) || ! isset( $data->code ) ) {
			return new WP_Error(
				'tweetstorm_http_error',
				wp_remote_retrieve_response_message( $response )
			);
		}

		return new WP_Error(
			$data->code,
			isset( $data->message ) ? $data->message : '',
			isset( $data->data ) ? $data->data : ''
		);
	}

	return $data;
}
/**
 * Convert a list of blocks into a list of tweets.
 *
 * @param array $blocks {
 *     An array of blocks, with optional editor-specific information, that need to be parsed into tweets.
 *
 *     @type array  $block      A single block, in the form produced by parse_blocks().
 *     @type array  $attributes Optional. A list of block attributes and their values from the block editor.
 *     @type string $clientId   Optional. The clientId of this block in the block editor.
 * }
 * @return array An array of tweets.
 */
public static function parse( $blocks ) {
	// Every run starts from a clean slate.
	self::$tweets = array();

	$parsed_blocks = self::extract_blocks( $blocks );
	if ( empty( $parsed_blocks ) ) {
		return array();
	}

	// Seed the list with a blank tweet, so the loop below never has to
	// special-case the very first tweet.
	self::start_new_tweet();

	foreach ( $parsed_blocks as $parsed_block ) {
		$definition   = self::get_block_definition( $parsed_block['name'] );
		$latest_tweet = self::get_current_tweet();

		// Break blocks carry no content: record them against the tweet and move on.
		if ( 'break' === $definition['type'] ) {
			self::save_current_tweet( $latest_tweet, $parsed_block );
			continue;
		}

		// A finished tweet, or a block that demands a fresh start, opens a new tweet.
		$needs_new_tweet = $latest_tweet['finished'] || $definition['force_new'];
		if ( $needs_new_tweet ) {
			self::start_new_tweet();
		}

		// Each helper is a no-op when the block has nothing of that kind to add.
		self::add_text_to_tweets( $parsed_block );
		self::add_media_to_tweets( $parsed_block );
		self::add_tweet_to_tweets( $parsed_block );
		self::add_embed_to_tweets( $parsed_block );
	}

	return self::clean_return_tweets();
}
/**
 * Look up the definition of a supported block.
 *
 * @param string $block_name The registered block name.
 * @return array|null The matching entry of self::$supported_blocks, or null when the block isn't supported.
 */
private static function get_block_definition( $block_name ) {
	return isset( self::$supported_blocks[ $block_name ] )
		? self::$supported_blocks[ $block_name ]
		: null;
}
- /**
- * If the block has any text, process it, and add it to the tweet list.
- *
- * The text is appended to the current tweet when it fits. When it doesn't, it is
- * split at progressively finer granularity: first by line (for 'multiline' blocks),
- * then by sentence, then by word. Whenever a block is split across tweets, the
- * boundary of the split is recorded on the tweet that ends at the split.
- *
- * @param array $block The block to process.
- */
- private static function add_text_to_tweets( $block ) {
- // This is a text block, is there any text?
- if ( 0 === strlen( $block['text'] ) ) {
- return;
- }
- $block_def = self::get_block_definition( $block['name'] );
- // Grab the most recent tweet, so we can append to that if we can.
- $current_tweet = self::get_current_tweet();
- // If the entire block can't be fit in this tweet, we need to start a new tweet.
- if ( $current_tweet['changed'] && ! self::is_valid_tweet( trim( $current_tweet['text'] ) . "\n\n{$block['text']}" ) ) {
- self::start_new_tweet();
- }
- // Multiline blocks prioritise splitting by line, but are otherwise identical to
- // normal text blocks. This means we can treat normal text blocks as being
- // "multiline", but with a single line.
- if ( 'multiline' === $block_def['type'] ) {
- $lines = explode( self::$line_separator, $block['text'] );
- } else {
- $lines = array( $block['text'] );
- }
- $line_total = count( $lines );
- // Keep track of how many characters from this block we've allocated to tweets.
- $current_character_count = 0;
- for ( $line_count = 0; $line_count < $line_total; $line_count++ ) {
- $line_text = $lines[ $line_count ];
- // Make sure we have the most recent tweet at the start of every loop.
- $current_tweet = self::get_current_tweet();
- if ( $current_tweet['changed'] ) {
- // When it's the first line, add an extra blank line to separate
- // the tweet text from that of the previous block.
- $separator = "\n\n";
- if ( $line_count > 0 ) {
- $separator = "\n";
- }
- // Is this line short enough to append to the current tweet?
- if ( self::is_valid_tweet( trim( $current_tweet['text'] ) . "$separator$line_text" ) ) {
- // Don't trim the text yet, as we may need it for boundary calculations.
- $current_tweet['text'] = $current_tweet['text'] . "$separator$line_text";
- self::save_current_tweet( $current_tweet, $block );
- continue;
- }
- // This line is too long, and lines *must* be split to a new tweet if they don't fit
- // into the current tweet. If this isn't the first line, record where we split the block.
- if ( $line_count > 0 ) {
- // Increment by 1 to allow for the \n between lines to be counted by ::get_boundary().
- $current_character_count += strlen( $current_tweet['text'] ) + 1;
- $current_tweet['boundary'] = self::get_boundary( $block, $current_character_count );
- self::save_current_tweet( $current_tweet );
- }
- // Start a new tweet.
- $current_tweet = self::start_new_tweet();
- }
- // Since we're now at the start of a new tweet, is this line short enough to be a tweet by itself?
- if ( self::is_valid_tweet( $line_text ) ) {
- $current_tweet['text'] = $line_text;
- self::save_current_tweet( $current_tweet, $block );
- continue;
- }
- // The line is too long for a single tweet, so split it by sentences, or linebreaks.
- $sentences = preg_split( '/(?|(?<!\.\.\.)(?<=[.?!]|\.\)|\.["\'])(\s+)(?=[\p{L}\'"\(])|(\n+))/u', $line_text, -1, PREG_SPLIT_DELIM_CAPTURE );
- $sentence_total = count( $sentences );
- // preg_split() puts the blank space between sentences into a separate entry in the result,
- // so we need to step through the result array by two, and append the blank space when needed.
- for ( $sentence_count = 0; $sentence_count < $sentence_total; $sentence_count += 2 ) {
- $current_sentence = $sentences[ $sentence_count ];
- if ( isset( $sentences[ $sentence_count + 1 ] ) ) {
- $current_sentence .= $sentences[ $sentence_count + 1 ];
- }
- // Make sure we have the most recent tweet.
- $current_tweet = self::get_current_tweet();
- // After the first sentence, we can try and append sentences to the previous sentence.
- if ( $current_tweet['changed'] && $sentence_count > 0 ) {
- // Is this sentence short enough for appending to the current tweet?
- if ( self::is_valid_tweet( $current_tweet['text'] . rtrim( $current_sentence ) ) ) {
- $current_tweet['text'] .= $current_sentence;
- self::save_current_tweet( $current_tweet, $block );
- continue;
- }
- }
- // Will this sentence fit in its own tweet?
- if ( self::is_valid_tweet( trim( $current_sentence ) ) ) {
- if ( $current_tweet['changed'] ) {
- // If we're already in the middle of a block, record the boundary
- // before creating a new tweet.
- if ( $line_count > 0 || $sentence_count > 0 ) {
- $current_character_count += strlen( $current_tweet['text'] );
- $current_tweet['boundary'] = self::get_boundary( $block, $current_character_count );
- self::save_current_tweet( $current_tweet );
- }
- $current_tweet = self::start_new_tweet();
- }
- $current_tweet['text'] = $current_sentence;
- self::save_current_tweet( $current_tweet, $block );
- continue;
- }
- // This long sentence will start the next tweet that this block is going
- // to be turned into, so we need to record the boundary and start a new tweet.
- if ( $current_tweet['changed'] ) {
- $current_character_count += strlen( $current_tweet['text'] );
- $current_tweet['boundary'] = self::get_boundary( $block, $current_character_count );
- self::save_current_tweet( $current_tweet );
- $current_tweet = self::start_new_tweet();
- }
- // Split the long sentence into words.
- // As with the sentences above, the captured separators sit at the odd indexes,
- // so the loop steps through the words two entries at a time.
- $words = preg_split( '/(\p{Z})/u', $current_sentence, -1, PREG_SPLIT_DELIM_CAPTURE );
- $word_total = count( $words );
- for ( $word_count = 0; $word_count < $word_total; $word_count += 2 ) {
- // Make sure we have the most recent tweet.
- $current_tweet = self::get_current_tweet();
- // If we're on a new tweet, we don't want to add a space at the start.
- if ( ! $current_tweet['changed'] ) {
- $current_tweet['text'] = $words[ $word_count ];
- self::save_current_tweet( $current_tweet, $block );
- continue;
- }
- // Can we add this word to the current tweet?
- // The trailing ellipsis is included in the check, so there is always room to
- // append it if the tweet has to be cut off after this word.
- if ( self::is_valid_tweet( "{$current_tweet['text']} {$words[ $word_count ]}…" ) ) {
- $space = isset( $words[ $word_count - 1 ] ) ? $words[ $word_count - 1 ] : ' ';
- $current_tweet['text'] .= $space . $words[ $word_count ];
- self::save_current_tweet( $current_tweet, $block );
- continue;
- }
- // Add one for the space character that we won't include in the tweet text.
- $current_character_count += strlen( $current_tweet['text'] ) + 1;
- // We're starting a new tweet with this word. Append ellipsis to
- // the current tweet, then move on.
- $current_tweet['text'] .= '…';
- $current_tweet['boundary'] = self::get_boundary( $block, $current_character_count );
- self::save_current_tweet( $current_tweet );
- $current_tweet = self::start_new_tweet();
- // If this is the second tweet created by the split sentence, it'll start
- // with ellipsis, which we don't want to count, but we do want to count the space
- // that was replaced by this ellipsis.
- $current_tweet['text'] = "…{$words[ $word_count ]}";
- $current_character_count -= strlen( '…' );
- self::save_current_tweet( $current_tweet, $block );
- }
- }
- }
- }
/**
 * Attach the block's media, if it has any, to the tweet list.
 *
 * A tweet carries either a single video/GIF, or up to four still images. Which
 * of the two applies is decided by the first uploadable item in the block.
 *
 * @param array $block The block to process.
 */
private static function add_media_to_tweets( $block ) {
	$attachment_total = count( $block['media'] );
	if ( 0 === $attachment_total ) {
		return;
	}

	$tweet = self::get_current_tweet();

	// A tweet that already holds media can't take any more.
	if ( count( $tweet['media'] ) > 0 ) {
		$tweet = self::start_new_tweet();
	}

	// Media eats into the text limit; if the existing text no longer fits, move on to a new tweet.
	$reserved = $attachment_total * self::$characters_per_media;
	if ( ! self::is_valid_tweet( $tweet['text'], $reserved ) ) {
		$tweet = self::start_new_tweet();
	}

	$is_video_or_gif = function ( $item ) {
		return 0 === strpos( $item['type'], 'video/' ) || 'image/gif' === $item['type'];
	};

	// Only images and videos can be uploaded.
	$uploadable = array();
	foreach ( $block['media'] as $item ) {
		if ( 0 === strpos( $item['type'], 'image/' ) || 0 === strpos( $item['type'], 'video/' ) ) {
			$uploadable[] = $item;
		}
	}

	if ( 0 === count( $uploadable ) ) {
		return;
	}

	if ( $is_video_or_gif( $uploadable[0] ) ) {
		// A video or GIF has to be the only attachment.
		$tweet['media'] = array_slice( $uploadable, 0, 1 );
	} else {
		// The first item is a still image, so drop every video and GIF,
		// then keep the first four images that remain.
		$stills = array();
		foreach ( $uploadable as $item ) {
			if ( ! $is_video_or_gif( $item ) ) {
				$stills[] = $item;
			}
		}

		$tweet['media'] = array_slice( $stills, 0, 4 );
	}

	self::save_current_tweet( $tweet, $block );
}
/**
 * Attach the block's tweet, if it has one, to the current tweet as a quote.
 *
 * @param array $block The block to process.
 */
private static function add_tweet_to_tweets( $block ) {
	$quoted_tweet = $block['tweet'];
	if ( 0 === strlen( $quoted_tweet ) ) {
		return;
	}

	$tweet = self::get_current_tweet();

	// Only one tweet can be quoted per tweet, so a tweet that already
	// quotes something forces a new one.
	$already_quoting = strlen( $tweet['tweet'] ) > 0;
	if ( $already_quoting ) {
		$tweet = self::start_new_tweet();
	}

	$tweet['tweet'] = $quoted_tweet;
	self::save_current_tweet( $tweet, $block );
}
/**
 * Check if the block has an embed URL that we can append to the current tweet text.
 *
 * The URL is stored in the tweet text as a placeholder produced by
 * ::generate_url_placeholder(). It is appended to the current tweet unless that
 * tweet already contains a URL placeholder, or would become too long, in which
 * case the placeholder starts a new tweet.
 *
 * @param array $block The block to process.
 */
private static function add_embed_to_tweets( $block ) {
	if ( 0 === strlen( $block['embed'] ) ) {
		return;
	}

	$current_tweet = self::get_current_tweet();

	// Reserve room for any media already attached, plus the URL and the space before it.
	$reserved_characters  = count( $current_tweet['media'] ) * self::$characters_per_media;
	$reserved_characters += 1 + self::$characters_per_url;

	// We can only attach an embed to the previous tweet if it doesn't already
	// have any URLs in it. Also, we can't attach it if it'll make the tweet too long.
	if ( preg_match( '/url-placeholder-\d+-*/', $current_tweet['text'] ) || ! self::is_valid_tweet( $current_tweet['text'], $reserved_characters ) ) {
		$current_tweet         = self::start_new_tweet();
		$current_tweet['text'] = self::generate_url_placeholder( $block['embed'] );
	} else {
		// Compare against the empty string rather than using empty(): the text "0"
		// is real content, and still needs a space before the URL.
		$space                  = '' === $current_tweet['text'] ? '' : ' ';
		$current_tweet['text'] .= $space . self::generate_url_placeholder( $block['embed'] );
	}

	self::save_current_tweet( $current_tweet, $block );
}
/**
 * Given an array of blocks and optional editor information, this will extract them into
 * the internal representation used during parsing.
 *
 * Entries for unsupported blocks, and entries that don't carry a block name, are dropped.
 * Each remaining entry keeps its original keys and gains 'name', 'text', 'media', 'tweet'
 * and 'embed'.
 *
 * @param array $blocks An array of blocks and optional editor-related information.
 * @return array An array of blocks, in our internal representation, indexed from 0.
 */
private static function extract_blocks( $blocks ) {
	if ( empty( $blocks ) ) {
		return array();
	}

	$extracted = array();

	// Iterate by value rather than by numeric index, so input whose keys aren't
	// a gapless 0..n-1 sequence is handled without touching undefined offsets.
	foreach ( $blocks as $entry ) {
		if ( ! isset( $entry['block']['blockName'] ) ) {
			continue;
		}

		if ( ! self::get_block_definition( $entry['block']['blockName'] ) ) {
			continue;
		}

		$entry['name']  = $entry['block']['blockName'];
		$entry['text']  = self::extract_text_from_block( $entry['block'] );
		$entry['media'] = self::extract_media_from_block( $entry['block'] );
		$entry['tweet'] = self::extract_tweet_from_block( $entry['block'] );
		$entry['embed'] = self::extract_embed_from_block( $entry['block'] );

		$extracted[] = $entry;
	}

	return $extracted;
}
/**
 * Append a blank tweet to the tweets array, and return it.
 *
 * @return array The blank tweet.
 */
private static function start_new_tweet() {
	$blank_tweet = array(
		// The blocks that contributed to this tweet.
		'blocks'   => array(),
		// When the tweet ends part-way through a block, this describes
		// where in the block the tweet ends.
		'boundary' => false,
		// The text content of the tweet.
		'text'     => '',
		// The media attached to the tweet.
		'media'    => array(),
		// The tweet quoted by this tweet.
		'tweet'    => '',
		// Set once a block has forced the tweet to end, even if later
		// blocks could technically still be appended.
		'finished' => false,
		// Set once anything has been saved into the tweet.
		'changed'  => false,
	);

	self::$tweets[] = $blank_tweet;

	return self::get_current_tweet();
}
/**
 * Fetch the most recently added tweet.
 *
 * @return array The last tweet in the tweets array.
 */
private static function get_current_tweet() {
	$latest_tweet = end( self::$tweets );

	return $latest_tweet;
}
/**
 * Store the passed tweet as the last tweet, replacing the former last tweet.
 *
 * Before storing, the tweet is flagged as "changed". When a block is passed, the
 * tweet is also flagged as "finished" if the block type requires it, and the block
 * is recorded against the tweet (provided it has a clientId, and isn't already the
 * last block recorded).
 *
 * @param array $tweet The tweet being stored.
 * @param array $block Optional. The block that was used to modify this tweet.
 * @return array The tweet, as it was stored.
 */
private static function save_current_tweet( $tweet, $block = null ) {
	$tweet['changed'] = true;

	if ( null !== $block ) {
		$definition = self::get_block_definition( $block['name'] );

		// Some block types close the tweet as soon as they've been added.
		if ( $definition['force_finished'] ) {
			$tweet['finished'] = true;
		}

		// Record the block, unless it's the one that was recorded last.
		if ( isset( $block['clientId'] ) ) {
			$previous_block = end( $tweet['blocks'] );

			if ( false === $previous_block || $previous_block['clientId'] !== $block['clientId'] ) {
				$tweet['blocks'][] = $block;
			}
		}
	}

	// Move to the last tweet in the array, and overwrite it in place.
	end( self::$tweets );
	self::$tweets[ key( self::$tweets ) ] = $tweet;

	return $tweet;
}
/**
 * Decide whether the passed text fits in a tweet.
 *
 * @param string $text                The text to check.
 * @param int    $reserved_characters Optional. The number of characters to reduce the maximum tweet length by.
 * @return bool Whether or not the text is valid.
 */
private static function is_valid_tweet( $text, $reserved_characters = 0 ) {
	$available_length = 280 - $reserved_characters;

	return self::is_within_twitter_length( $text, $available_length );
}
/**
 * Decide whether the passed text fits in the alt text of an image.
 *
 * @param string $text The text to check.
 * @return bool Whether or not the text is valid.
 */
private static function is_valid_alt_text( $text ) {
	$alt_text_limit = 1000;

	return self::is_within_twitter_length( $text, $alt_text_limit );
}
/**
 * Check if a string is no longer than a given length, according to Twitter's rules for counting string length.
 *
 * The text is consumed in stages, cheapest first: ellipses, then emoji, then (if
 * everything left is ASCII) a plain byte count. Only text that still contains
 * other non-ASCII glyphs goes through the weighted per-glyph count.
 *
 * @param string $text       The text to check.
 * @param int    $max_length The number of characters long this string can be.
 * @return bool Whether or not the string is no longer than the length limit.
 */
private static function is_within_twitter_length( $text, $max_length ) {
	// Replace all multiline separators with a \n, since that's the
	// character we actually want to count.
	$text = str_replace( self::$line_separator, "\n", $text );

	// Keep a running total of the characters we've accounted for.
	$stripped_characters = 0;

	// Since we use '…' a lot, strip it out, so we can still use the ASCII checks.
	// The ellipsis glyph counts for two characters.
	$ellipsis_count       = 0;
	$text                 = str_replace( '…', '', $text, $ellipsis_count );
	$stripped_characters += $ellipsis_count * 2;

	// Try filtering out emoji first, since ASCII text + emoji is a relatively common case.
	if ( ! self::is_ascii( $text ) ) {
		// Initialise the emoji cache.
		if ( 0 === count( self::$emoji_list ) ) {
			self::$emoji_list = array_map( 'html_entity_decode', _wp_emoji_list( 'entities' ) );
		}

		// Emoji graphemes count as 2 characters each.
		$emoji_count          = 0;
		$text                 = str_replace( self::$emoji_list, '', $text, $emoji_count );
		$stripped_characters += $emoji_count * 2;
	}

	if ( self::is_ascii( $text ) ) {
		return ( $stripped_characters + strlen( $text ) ) <= $max_length;
	}

	// Remove any glyphs that count as 1 character.
	// Source: https://github.com/twitter/twitter-text/blob/master/config/v3.json .
	// Note that the source ranges are in decimal, the regex ranges are converted to hex.
	$single_character_count = 0;
	$remaining              = preg_replace( '/[\x{0000}-\x{10FF}\x{2000}-\x{200D}\x{2010}-\x{201F}\x{2032}-\x{2037}]/uS', '', $text, -1, $single_character_count );

	// preg_replace() returns null when the /u pattern is fed malformed UTF-8. Rather than
	// treating that as "nothing left to count", fall back to counting everything that
	// is still uncounted at the heavier weight below.
	if ( null === $remaining ) {
		$remaining              = $text;
		$single_character_count = 0;
	}

	$stripped_characters += $single_character_count;

	// Any remaining glyphs count as 2 characters each.
	if ( '' !== $remaining ) {
		// WP provides a compat version of mb_strlen(), no need to check if it exists.
		$stripped_characters += mb_strlen( $remaining, 'UTF-8' ) * 2;
	}

	return $stripped_characters <= $max_length;
}
/**
 * Reports whether a string is made up exclusively of 7-bit ASCII characters.
 *
 * Uses mb_check_encoding() when the mbstring extension is loaded, and falls back to a
 * byte-range regular expression otherwise.
 *
 * @param string $text The string to inspect.
 * @return bool True if every character is ASCII, false otherwise.
 */
private static function is_ascii( $text ) {
	if ( ! function_exists( 'mb_check_encoding' ) ) {
		// No mbstring: the string is ASCII when no byte falls outside 0x00-0x7F.
		return ! preg_match( '/[^\x00-\x7F]/', $text );
	}

	return mb_check_encoding( $text, 'ASCII' );
}
/**
 * A block will generate a certain amount of text to be inserted into a tweet. If that text is too
 * long for a tweet, we already know where the text will be split when it's published as a tweet,
 * but we need to calculate where that corresponds to in the block edit UI.
 *
 * The tweet template for that block may add extra characters, extra characters are added for URL
 * placeholders, and the block may contain multiple RichText areas (corresponding to attributes),
 * so we need to keep track of both until this function calculates which attribute area (in the
 * block editor, the richTextIdentifier) that offset corresponds to, and how far into that
 * attribute area it is.
 *
 * @param array   $block  The block being checked.
 * @param integer $offset The position (in bytes) in the tweet text where it will be split.
 * @return array|false `false` if the boundary can't be determined. Otherwise, an array with
 *                     either `start`, `end`, `container` and `type` keys (`type` being
 *                     `normal` or `line-break`), or `line`, `container` and `type` keys
 *                     (`type` being `end-of-line`).
 */
private static function get_boundary( $block, $offset ) {
	// If we don't have a clientId, there's no point in generating a boundary, since this
	// parse request doesn't have a way to map blocks back to editor UI.
	if ( ! isset( $block['clientId'] ) ) {
		return false;
	}

	$block_def = self::get_block_definition( $block['name'] );

	if ( isset( $block_def['content'] ) && count( $block_def['content'] ) > 0 ) {
		$tags = $block_def['content'];
	} else {
		$tags = array( 'content' );
	}

	$tag_content = self::extract_tag_content_from_html( $tags, $block['block']['innerHTML'] );

	// $tag_content is split up by tag first, then lines. We want to remap it to split it by lines
	// first, then tag.
	$lines = array();
	foreach ( $tag_content as $tag => $content ) {
		if ( 'content' === $tag ) {
			$attribute_name = 'content';
		} else {
			$attribute_name = array_search( $tag, $block_def['content'], true );
		}

		foreach ( $content as $id => $content_string ) {
			// Multiline blocks can have multiple lines, but other blocks will always only have 1.
			$line_number = 'multiline' === $block_def['type'] ? $id : 0;

			if ( ! isset( $lines[ $line_number ] ) ) {
				$lines[ $line_number ] = array();
			}

			if ( ! isset( $lines[ $line_number ][ $attribute_name ] ) ) {
				// For multiline blocks, or the first time this attribute has been encountered
				// in single line blocks, assign the string to the line/attribute.
				$lines[ $line_number ][ $attribute_name ] = $content_string;
			} else {
				// For subsequent times this line/attribute is encountered (only in single line blocks),
				// append the string with a line break.
				$lines[ $line_number ][ $attribute_name ] .= "\n$content_string";
			}
		}
	}

	$line_count     = count( $lines );
	$template_parts = preg_split( '/({{\w+}})/', $block_def['template'], -1, PREG_SPLIT_DELIM_CAPTURE );

	// Converts tweet text back into what the editor shows: the " (url-placeholder-N---)"
	// suffixes are removed entirely, and remaining placeholders become the URL they stand for.
	$to_editor_text = function ( $data ) {
		$data = preg_replace( '/ \(url-placeholder-\d+-*\)/', '', $data );

		return preg_replace_callback(
			'/url-placeholder-(\d+)-*/',
			function ( $matches ) {
				return self::$urls[ $matches[1] ];
			},
			$data
		);
	};

	// Keep track of the total number of bytes we've processed from this block.
	$total_bytes_processed = 0;
	// Keep track of the number of characters that the processed data translates to in the editor.
	$characters_processed = 0;

	foreach ( $lines as $line_number => $line ) {
		// Add up the length of all the parts of this line.
		$line_byte_total = array_sum( array_map( 'strlen', $line ) );

		if ( $line_byte_total > 0 ) {
			// We have something to use in the template, so loop over each part of the template, and count it.
			foreach ( $template_parts as $template_part ) {
				$matches = array();
				if ( ! preg_match( '/{{(\w+)}}/', $template_part, $matches ) ) {
					// Literal template text only contributes bytes to the tweet.
					$total_bytes_processed += strlen( $template_part );
					continue;
				}

				$part_name = $matches[1];
				// A line doesn't necessarily carry data for every placeholder in the template.
				$line_part_data  = isset( $line[ $part_name ] ) ? $line[ $part_name ] : '';
				$line_part_bytes = strlen( $line_part_data );

				if ( $total_bytes_processed + $line_part_bytes < $offset ) {
					// The boundary is beyond this part: count it, and move on.
					$total_bytes_processed += $line_part_bytes;
					$characters_processed  += self::utf_16_code_unit_length( $to_editor_text( $line_part_data ) );
					continue;
				}

				// We know that the offset is somewhere inside this part of the tweet, but we need to remove the length
				// of any URL placeholders that appear before the boundary, to be able to calculate the correct attribute offset.

				// $total_bytes_processed is the sum of everything we've processed so far, (including previous parts)
				// on this line. This makes it relatively easy to calculate the number of bytes into this part
				// that the boundary will occur.
				$line_part_byte_boundary = $offset - $total_bytes_processed;

				// Grab the data from this line part that appears before the boundary, as the editor shows it.
				$line_part_pre_boundary_data = $to_editor_text( substr( $line_part_data, 0, $line_part_byte_boundary ) );

				$boundary_start = self::utf_16_code_unit_length( $line_part_pre_boundary_data ) - 1;

				// Multiline blocks need to offset for the characters that are in the same content area,
				// but which were counted on previous lines.
				if ( 'multiline' === $block_def['type'] ) {
					$boundary_start += $characters_processed;
				}

				// Check if the boundary is happening on a line break or a space. The isset() avoids
				// reading an offset that doesn't exist in the string (for example, when it is empty).
				$last_byte_index = $line_part_byte_boundary - 1;
				if ( isset( $line_part_data[ $last_byte_index ] ) && "\n" === $line_part_data[ $last_byte_index ] ) {
					$type = 'line-break';

					// A line break boundary can actually be multiple consecutive line breaks,
					// count them all up so we know how big the annotation needs to be.
					$matches = array();
					preg_match( '/\n+$/', substr( $line_part_data, 0, $line_part_byte_boundary ), $matches );
					$boundary_end    = $boundary_start + 1;
					$boundary_start -= strlen( $matches[0] ) - 1;
				} else {
					$type         = 'normal';
					$boundary_end = $boundary_start + 1;
				}

				return array(
					'start'     => $boundary_start,
					'end'       => $boundary_end,
					'container' => $part_name,
					'type'      => $type,
				);
			}

			// Are we breaking at the end of this line?
			if ( $total_bytes_processed + 1 === $offset && $line_count > 1 ) {
				reset( $block_def['content'] );
				$container = key( $block_def['content'] );

				return array(
					'line'      => $line_number,
					'container' => $container,
					'type'      => 'end-of-line',
				);
			}

			// The newline at the end of each line is 1 byte, but we don't need to count empty lines.
			$total_bytes_processed++;
		}

		// We do need to count empty lines in the editor, since they'll be displayed.
		$characters_processed++;
	}

	return false;
}
/**
 * Measures a string in UTF-16 code units, which is the unit JavaScript string offsets use,
 * so the block editor can place tweet boundaries at the right position.
 *
 * Every UTF-16 code unit is 2 bytes, and a character is one or two code units long. Once the
 * string is converted to UTF-16, its byte length divided by 2 is the number of code units.
 *
 * @param string $text The natively encoded string to get the length of.
 * @return int The length of the string in UTF-16 code units. Returns -1 if the length could not
 *             be calculated.
 */
private static function utf_16_code_unit_length( $text ) {
	// Without mb_convert_encoding() there is no way to convert the string. Return a result
	// that will avoid an incorrect annotation being added.
	if ( ! function_exists( 'mb_convert_encoding' ) ) {
		return -1;
	}

	// Plain "UTF-16" may prepend a Byte Order Mark, which would be counted as an extra code
	// unit. Endianness doesn't matter for measuring, and UTF-16BE never adds the BOM.
	$utf_16_bytes = mb_convert_encoding( $text, 'UTF-16BE' );

	return strlen( $utf_16_bytes ) / 2;
}
/**
 * Builds the tweetable text for a block by filling the block's template with the text found
 * in its HTML.
 *
 * @param array $block A single block, as generated by parse_block().
 * @return string The tweetable text from the block, in the correct template form. An empty
 *                string when the block has no innerHTML, or its content isn't stored in HTML.
 */
private static function extract_text_from_block( $block ) {
	// No innerHTML means there is no text to find.
	if ( empty( $block['innerHTML'] ) ) {
		return '';
	}

	$block_def = self::get_block_definition( $block['blockName'] );

	// Only text that lives in HTML text nodes is supported.
	$is_html_block = isset( $block_def['content_location'] ) && 'html' === $block_def['content_location'];
	if ( ! $is_html_block ) {
		return '';
	}

	// Work out which tags hold the content, defaulting to the special "content" tag.
	$has_content_tags = isset( $block_def['content'] ) && count( $block_def['content'] ) > 0;
	$tags             = $has_content_tags ? $block_def['content'] : array( 'content' );

	$tag_values   = self::extract_tag_content_from_html( $tags, $block['innerHTML'] );
	$is_multiline = 'multiline' === $block_def['type'];

	// Single line blocks are handled as multiline blocks that only have one line.
	$lines = array();
	foreach ( $tag_values as $tag => $values ) {
		if ( ! $is_multiline ) {
			// Squash every value found for this tag into a single value.
			$values = array( implode( "\n", $values ) );
		}

		// The special "content" tag is its own placeholder name.
		$placeholder = 'content' === $tag ? 'content' : array_search( $tag, $block_def['content'], true );
		$token       = '{{' . $placeholder . '}}';

		// Apply each value to the template of the line it belongs to.
		foreach ( $values as $line_number => $value ) {
			$template              = isset( $lines[ $line_number ] ) ? $lines[ $line_number ] : $block_def['template'];
			$lines[ $line_number ] = str_replace( $token, $value, $template );
		}
	}

	// Drop the lines that are indistinguishable from a template with nothing applied to it.
	$empty_template = preg_replace( '/{{.*?}}/', '', $block_def['template'] );
	$filled_lines   = array();
	foreach ( $lines as $line_number => $line ) {
		if ( $line !== $empty_template ) {
			$filled_lines[ $line_number ] = $line;
		}
	}

	// Join the lines into one string, then trim the trailing whitespace and separators.
	$text = implode( self::$line_separator, $filled_lines );

	return preg_replace( '/(\s|' . self::$line_separator . ')+$/u', '', $text );
}
/**
 * Collects the tweetable media (images or a video) found in a block.
 *
 * @param array $block A single block, as generated by parse_block().
 * @return array {
 *     An array of media. Empty if the block is neither an image nor a video block type.
 *
 *     @type string url  The URL of the media.
 *     @type string alt  The alt text of the media. Only set for images; empty if the alt
 *                       text found is not valid.
 *     @type string type The file type, as detected by wp_check_filetype().
 * }
 */
private static function extract_media_from_block( $block ) {
	$block_def  = self::get_block_definition( $block['blockName'] );
	$block_type = $block_def['type'];

	if ( 'image' === $block_type ) {
		$image_urls = self::extract_attr_content_from_html(
			$block_def['content']['url'][0],
			$block_def['content']['url'][1],
			$block['innerHTML']
		);
		$image_alts = self::extract_attr_content_from_html(
			$block_def['content']['alt'][0],
			$block_def['content']['alt'][1],
			$block['innerHTML']
		);

		$images = array();
		foreach ( $image_urls as $index => $image_url ) {
			$filedata = wp_check_filetype( basename( wp_parse_url( $image_url, PHP_URL_PATH ) ) );
			$alt_text = $image_alts[ $index ];

			$images[] = array(
				'url'  => $image_url,
				'alt'  => self::is_valid_alt_text( $alt_text ) ? $alt_text : '',
				'type' => $filedata['type'],
			);
		}

		return $images;
	}

	if ( 'video' !== $block_type ) {
		return array();
	}

	// VideoPress videos carry their URL in the block attributes, other videos in the HTML.
	$is_videopress = isset( $block['attrs']['src'] ) && 0 === strpos( $block['attrs']['src'], 'https://videos.files.wordpress.com/' );
	if ( $is_videopress ) {
		$video_urls = array( $block['attrs']['src'] );
	} else {
		$video_urls = self::extract_attr_content_from_html(
			$block_def['content']['url'][0],
			$block_def['content']['url'][1],
			$block['innerHTML']
		);
	}

	// Only the first video found can ever be used, so the rest are ignored.
	if ( count( $video_urls ) < 1 ) {
		return array();
	}

	$filedata = wp_check_filetype( basename( wp_parse_url( $video_urls[0], PHP_URL_PATH ) ) );

	return array(
		array(
			'url'  => $video_urls[0],
			'type' => $filedata['type'],
		),
	);
}
/**
 * Extracts the tweet URL from a Twitter embed block.
 *
 * A block qualifies when it is a `core/embed` block whose `providerNameSlug` attribute
 * is `twitter`.
 *
 * @param array $block A single block, as generated by parse_block().
 * @return string The tweet URL. Empty string if there is none available, including when
 *                the embed has no `url` attribute.
 */
private static function extract_tweet_from_block( $block ) {
	$is_twitter_embed = 'core/embed' === $block['blockName']
		&& isset( $block['attrs']['providerNameSlug'] )
		&& 'twitter' === $block['attrs']['providerNameSlug'];

	// A Twitter embed without a URL has nothing to quote: honour the documented empty
	// string return, rather than reading an attribute that isn't set.
	if ( ! $is_twitter_embed || ! isset( $block['attrs']['url'] ) ) {
		return '';
	}

	return $block['attrs']['url'];
}
/**
 * Extracts URL from an embed block.
 *
 * Twitter embeds are deliberately skipped here, they are handled in
 * ::extract_tweet_from_block().
 *
 * @param array $block A single block, as generated by parse_block().
 * @return string The URL. Empty string if there is none available.
 */
private static function extract_embed_from_block( $block ) {
	$block_def = self::get_block_definition( $block['blockName'] );

	if ( 'embed' !== $block_def['type'] ) {
		return '';
	}

	// Twitter embeds are handled in ::extract_tweet_from_block().
	if (
		'core/embed' === $block['blockName']
		&& ( isset( $block['attrs']['providerNameSlug'] ) && 'twitter' === $block['attrs']['providerNameSlug'] )
	) {
		return '';
	}

	$url = '';
	if ( 'block-attributes' === $block_def['content_location'] ) {
		// The first content entry names the block attribute that holds the URL. Only use
		// it when the block actually has that attribute, so a missing one yields ''.
		$url_attribute = $block_def['content'][0];
		if ( isset( $block['attrs'][ $url_attribute ] ) ) {
			$url = $block['attrs'][ $url_attribute ];
		}
	}

	// GIF blocks store the embed URL, swap it for the corresponding /gifs/ URL.
	if ( 'jetpack/gif' === $block['blockName'] ) {
		$url = str_replace( '/embed/', '/gifs/', $url );
	}

	return $url;
}
/**
 * There's a bunch of left-over cruft in the tweets array that we don't need to return. Removing
 * it helps keep the size of the data down.
 *
 * For each tweet, URL placeholders are swapped back for their URLs, inline placeholders and
 * trailing whitespace are removed, internal flags are dropped, and block data is cut down to
 * the `attributes` and `clientId` keys. Tweets that were never changed, or that end up with
 * no text, media or quoted tweet, are left out.
 *
 * @return array The cleaned tweets, re-indexed from 0.
 */
private static function clean_return_tweets() {
	// The placeholder strings are the same for every tweet, so only build them once.
	$placeholders = array();
	foreach ( self::$urls as $id => $url ) {
		$placeholders[ $id ] = str_pad( "url-placeholder-$id", self::$characters_per_url, '-' );
	}

	$cleaned_tweets = array();

	foreach ( self::$tweets as $tweet ) {
		// Remove tweets that don't have anything saved in them. eg, if the last block is a
		// header with no text, it'll force a new tweet, but we won't end up putting anything
		// in that tweet.
		if ( ! $tweet['changed'] ) {
			continue;
		}

		// Replace any URL placeholders that appear in the text.
		$tweet['urls'] = array();
		foreach ( self::$urls as $id => $url ) {
			$count         = 0;
			$tweet['text'] = str_replace( $placeholders[ $id ], $url, $tweet['text'], $count );

			// If we found a URL, keep track of it for the editor.
			if ( $count > 0 ) {
				$tweet['urls'][] = $url;
			}
		}

		// Remove any inline placeholders.
		$tweet['text'] = str_replace( self::$inline_placeholder, '', $tweet['text'] );

		// If the tweet text consists only of whitespace, we can remove all of it.
		if ( preg_match( '/^\s*$/u', $tweet['text'] ) ) {
			$tweet['text'] = '';
		}

		// Remove trailing whitespace from every line.
		$tweet['text'] = preg_replace( '/\p{Z}+$/um', '', $tweet['text'] );

		// Remove all trailing whitespace (including line breaks) from the end of the text.
		$tweet['text'] = rtrim( $tweet['text'] );

		// Remove internal flags.
		unset( $tweet['changed'] );
		unset( $tweet['finished'] );

		// Remove bulky block data.
		if ( ! isset( $tweet['blocks'][0]['attributes'] ) && ! isset( $tweet['blocks'][0]['clientId'] ) ) {
			$tweet['blocks'] = array();
		} else {
			// Remove the parts of the block data that the editor doesn't need.
			$block_count = count( $tweet['blocks'] );
			for ( $ii = 0; $ii < $block_count; $ii++ ) {
				$keys = array_keys( $tweet['blocks'][ $ii ] );
				foreach ( $keys as $key ) {
					// The editor only needs these attributes, everything else will be unset.
					if ( in_array( $key, array( 'attributes', 'clientId' ), true ) ) {
						continue;
					}

					unset( $tweet['blocks'][ $ii ][ $key ] );
				}
			}
		}

		// Once we've finished cleaning up, check if there's anything left to be tweeted.
		// The text is compared against '' rather than passed to empty(), because empty()
		// also treats the perfectly tweetable text "0" as empty.
		if ( '' === $tweet['text'] && empty( $tweet['media'] ) && empty( $tweet['tweet'] ) ) {
			continue;
		}

		$cleaned_tweets[] = $tweet;
	}

	return $cleaned_tweets;
}
/**
 * Given a list of tags and a HTML blob, this will extract the text content inside
 * each of the given tags.
 *
 * While extracting, `</p>` and `<br>` are turned into newlines, URLs in the text are swapped
 * for fixed-length placeholders, link hrefs are appended as a placeholder in parentheses
 * (unless the link text already is that URL), and inline images leave an inline placeholder.
 *
 * @param array  $tags An array of tag names. The special "content" tag means the entire
 *                     text is extracted. An empty array is treated as array( 'content' ).
 * @param string $html A blob of HTML.
 * @return array An array of the extracted content. The keys in the array are the $tags,
 *               each value is an array. The value array is indexed in the same order as the tag
 *               appears in the HTML blob, including nested tags.
 */
private static function extract_tag_content_from_html( $tags, $html ) {
	// Serialised blocks will sometimes wrap the innerHTML in newlines, but those newlines
	// are removed when innerHTML is parsed into an attribute. Remove them so we're working
	// with the same information. An empty string has no first or last character to inspect.
	if ( is_string( $html ) && '' !== $html && "\n" === $html[0] && "\n" === $html[ strlen( $html ) - 1 ] ) {
		$html = substr( $html, 1, strlen( $html ) - 2 );
	}

	// Normalise <br>.
	$html = preg_replace( '/<br\s*\/?>/', '<br>', $html );

	// If there were no tags passed, assume the entire text is required.
	if ( empty( $tags ) ) {
		$tags = array( 'content' );
	}

	$values = array();

	$tokens = wp_html_split( $html );

	$validator = new Twitter_Validator();

	foreach ( $tags as $tag ) {
		$values[ $tag ] = array();

		// Since tags can be nested, keeping track of the nesting level allows
		// us to extract nested content into a flat array. Content is being stored
		// whenever more tags have been opened than closed.
		if ( 'content' === $tag ) {
			// The special "content" tag means we should store the entire content,
			// so assume the tag is open from the beginning.
			$opened = 0;
			$closed = -1;

			$values['content'][0] = '';
		} else {
			$opened = -1;
			$closed = -1;
		}

		// When we come across a URL, we need to keep track of it, so it can then be inserted
		// in the right place.
		$current_url = '';

		foreach ( $tokens as $token ) {
			if ( 0 === strlen( $token ) ) {
				// Skip any empty tokens.
				continue;
			}

			// If we're currently storing content, check if it's a text-formatting
			// tag that we should apply.
			if ( $opened !== $closed ) {
				// End of a paragraph, put in some newlines (as long as we're not extracting paragraphs).
				if ( '</p>' === $token && 'p' !== $tag ) {
					$values[ $tag ][ $opened ] .= "\n\n";
				}

				// A line break gets one newline.
				if ( '<br>' === $token ) {
					$values[ $tag ][ $opened ] .= "\n";
				}

				// A link has opened, grab the URL for inserting later.
				if ( 0 === strpos( $token, '<a ' ) ) {
					$href_values = self::extract_attr_content_from_html( 'a', 'href', $token );
					if ( ! empty( $href_values[0] ) && $validator->isValidURL( $href_values[0] ) ) {
						// Remember the URL.
						$current_url = $href_values[0];
					}
				}

				// A link has closed, insert the URL from that link if we have one.
				if ( '</a>' === $token && '' !== $current_url ) {
					// Generate a unique-to-this-block placeholder which takes up the
					// same number of characters as a URL does.
					$values[ $tag ][ $opened ] .= ' (' . self::generate_url_placeholder( $current_url ) . ')';
					$current_url                = '';
				}

				// We don't return inline images, but they technically take up 1 character in the RichText.
				if ( 0 === strpos( $token, '<img ' ) ) {
					$values[ $tag ][ $opened ] .= self::$inline_placeholder;
				}
			}

			if ( "<$tag>" === $token || 0 === strpos( $token, "<$tag " ) ) {
				// A tag has just been opened.
				$opened++;
				// Set an empty value now, so we're keeping track of empty tags.
				if ( ! isset( $values[ $tag ][ $opened ] ) ) {
					$values[ $tag ][ $opened ] = '';
				}
				continue;
			}

			if ( "</$tag>" === $token ) {
				// The tag has been closed.
				$closed++;
				continue;
			}

			if ( '<' === $token[0] ) {
				// We can skip any other tags.
				continue;
			}

			if ( $opened !== $closed ) {
				// We're currently in a tag, with some content. Start by decoding any HTML entities.
				$token = html_entity_decode( $token, ENT_QUOTES );

				// Find any URLs in this content, and replace them with a placeholder.
				preg_match_all( Twitter_Regex::getValidUrlMatcher(), $token, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE );

				// Earlier replacements change the length of $token, so the captured start
				// positions need shifting by the accumulated difference.
				$offset = 0;
				foreach ( $matches as $match ) {
					list( $url, $start ) = $match[2];

					$token = substr_replace( $token, self::generate_url_placeholder( $url ), $start + $offset, strlen( $url ) );

					$offset += self::$characters_per_url - strlen( $url );

					// If we're in a link with a URL set, there's no need to keep two copies of the same link.
					if ( ! empty( $current_url ) ) {
						$lower_url         = strtolower( $url );
						$lower_current_url = strtolower( $current_url );

						if ( $lower_url === $lower_current_url ) {
							$current_url = '';
						}

						// Check that the link text isn't just a shortened version of the href value.
						$trimmed_current_url = preg_replace( '|^https?://|', '', $lower_current_url );
						if ( $lower_url === $trimmed_current_url || trim( $trimmed_current_url, '/' ) === $lower_url ) {
							$current_url = '';
						}
					}
				}

				// Append it to the right value.
				$values[ $tag ][ $opened ] .= $token;
			}
		}
	}

	return $values;
}
/**
 * Extracts the attribute content from a tag.
 *
 * This method allows for the HTML to have multiple instances of the tag, and will return
 * an array containing the attribute value (or an empty string, if the tag doesn't have the
 * requested attribute) for each occurrence of the tag.
 *
 * @param string $tag          The tag we're looking for.
 * @param string $attr         The name of the attribute we're looking for.
 * @param string $html         The HTML we're searching through.
 * @param array  $attr_filters Optional. Map of attribute name => required value. When given,
 *                             a tag is only included if at least one of its attributes
 *                             matches one of these filters.
 * @return array The array of attribute values found, HTML entity decoded.
 */
private static function extract_attr_content_from_html( $tag, $attr, $html, $attr_filters = array() ) {
	// Given our single tag and attribute, construct a KSES filter for it.
	$kses_filter = array(
		$tag => array(
			$attr => array(),
		),
	);

	// Attributes that are filtered on need to survive KSES, too.
	foreach ( $attr_filters as $filter_attr => $filter_value ) {
		$kses_filter[ $tag ][ $filter_attr ] = array();
	}

	// Remove all HTML except for the tag we're after. On that tag,
	// remove all attributes except for the one we're after.
	$stripped_html = wp_kses( $html, $kses_filter );

	$values = array();

	$tokens = wp_html_split( $stripped_html );
	foreach ( $tokens as $token ) {
		$found_value = '';

		if ( 0 === strlen( $token ) ) {
			// Skip any empty tokens.
			continue;
		}

		if ( '<' !== $token[0] ) {
			// We can skip any non-tag tokens.
			continue;
		}

		$token_attrs = wp_kses_attr_parse( $token );

		// Skip tags that KSES couldn't handle.
		if ( false === $token_attrs ) {
			continue;
		}

		// Remove the tag open and close chunks.
		$found_tag = array_shift( $token_attrs );
		array_pop( $token_attrs );

		// We somehow got a tag that isn't the one we're after. Skip it.
		if ( 0 !== strpos( $found_tag, "<$tag " ) ) {
			continue;
		}

		// We can only fail an attribute filter if one is set.
		$passed_filter = count( $attr_filters ) === 0;

		foreach ( $token_attrs as $token_attr_string ) {
			// The first "=" in the string will be between the attribute name/value. Valueless
			// attributes have no "=" at all, in which case the value is an empty string.
			$attr_parts       = explode( '=', $token_attr_string, 2 );
			$token_attr_name  = trim( $attr_parts[0] );
			$token_attr_value = isset( $attr_parts[1] ) ? trim( $attr_parts[1] ) : '';

			// Remove the quotes from around the value.
			if ( '' !== $token_attr_value && in_array( $token_attr_value[0], array( '"', "'" ), true ) ) {
				$token_attr_value = trim( $token_attr_value, $token_attr_value[0] );
			}

			// If this is the attribute we're after, save the value for the end of the loop.
			if ( $token_attr_name === $attr ) {
				$found_value = $token_attr_value;
			}

			if ( isset( $attr_filters[ $token_attr_name ] ) && $attr_filters[ $token_attr_name ] === $token_attr_value ) {
				$passed_filter = true;
			}
		}

		if ( $passed_filter ) {
			// We always want to append the found value, even if we didn't "find" a matching attribute.
			// An empty string in the return value means that we found the tag, but the attribute was
			// either empty, or not set.
			$values[] = html_entity_decode( $found_value, ENT_QUOTES );
		}
	}

	return $values;
}
/**
 * Registers a URL and returns the placeholder that stands in for it in tweet text.
 *
 * The placeholder is padded with "-" up to the number of characters Twitter counts for a
 * URL, so measuring the text with placeholders imitates how Twitter measures it with URLs.
 *
 * @param string $url The URL to generate a placeholder for.
 * @return string The placeholder.
 */
public static function generate_url_placeholder( $url ) {
	self::$urls[] = $url;

	// The placeholder is numbered one less than the number of URLs registered so far.
	$placeholder_number = count( self::$urls ) - 1;
	$unpadded           = 'url-placeholder-' . $placeholder_number;

	return str_pad( $unpadded, self::$characters_per_url, '-' );
}
/**
 * Retrieves the Twitter card data for a list of URLs.
 *
 * Each valid URL is requested, and the response is searched for the `<meta>` tags that
 * describe its card. URLs that fail validation are left out of the result.
 *
 * @param array $urls The list of URLs to grab Twitter card data for.
 * @return array The Twitter card data, keyed by URL. Each entry holds whichever of the
 *               `creator`, `description`, `image`, `title` and `type` values were found,
 *               or an `error` key (`invalid_url` if the request failed, `no_og_data` if
 *               no card data was found).
 */
public static function generate_cards( $urls ) {
	$validator = new Twitter_Validator();

	$requests = array_map(
		function ( $url ) use ( $validator ) {
			if ( $validator->isValidURL( $url ) ) {
				return array(
					'url' => $url,
				);
			}

			return false;
		},
		$urls
	);

	$requests = array_filter( $requests );

	$results = Requests::request_multiple( $requests );

	// For each card field, the <meta> attribute/value pairs to look for, in order of
	// preference. These are lists of pairs rather than attribute => value maps, because a
	// field can have several candidates that use the same attribute: a map would silently
	// keep only the last one.
	$card_data = array(
		'creator'     => array(
			array( 'name', 'twitter:creator' ),
		),
		'description' => array(
			array( 'name', 'twitter:description' ),
			array( 'property', 'og:description' ),
		),
		'image'       => array(
			array( 'name', 'twitter:image' ),
			// The Open Graph property for the HTTPS version of the image.
			array( 'property', 'og:image:secure_url' ),
			array( 'property', 'og:image' ),
		),
		'title'       => array(
			array( 'name', 'twitter:text:title' ),
			array( 'property', 'og:title' ),
		),
		'type'        => array(
			array( 'name', 'twitter:card' ),
		),
	);

	$cards = array();
	foreach ( $results as $id => $result ) {
		$url = $requests[ $id ]['url'];

		if ( ! $result->success ) {
			$cards[ $url ] = array(
				'error' => 'invalid_url',
			);
			continue;
		}

		$url_card_data = array();

		foreach ( $card_data as $key => $filters ) {
			foreach ( $filters as $filter ) {
				list( $attribute, $value ) = $filter;

				$found_data = self::extract_attr_content_from_html( 'meta', 'content', $result->body, array( $attribute => $value ) );
				if ( count( $found_data ) > 0 && strlen( $found_data[0] ) > 0 ) {
					// The first candidate with a non-empty value wins.
					$url_card_data[ $key ] = html_entity_decode( $found_data[0], ENT_QUOTES );
					break;
				}
			}
		}

		if ( count( $url_card_data ) > 0 ) {
			$cards[ $url ] = $url_card_data;
		} else {
			$cards[ $url ] = array(
				'error' => 'no_og_data',
			);
		}
	}

	return $cards;
}
/**
 * Looks up the ID of the current site: the current blog ID when IS_WPCOM is defined and
 * truthy, the `id` Jetpack option otherwise.
 *
 * @return int|WP_Error The site ID, or a WP_Error if no site ID is available.
 */
public static function get_site_id() {
	if ( defined( 'IS_WPCOM' ) && IS_WPCOM ) {
		$site_id = get_current_blog_id();
	} else {
		$site_id = Jetpack_Options::get_option( 'id' );
	}

	if ( $site_id ) {
		return (int) $site_id;
	}

	return new WP_Error(
		'unavailable_site_id',
		__( 'Sorry, something is wrong with your Jetpack connection.', 'jetpack' ),
		403
	);
}
- }
|