class.media-extractor.php — Jetpack – WP Security, Backup, Speed, & Growth 11.1.1

jetpack / _inc / lib / class.media-extractor.php

jetpack / _inc / lib Last commit date

class.media-extractor.php

574 lines

1	<?php // phpcs:ignore WordPress.Files.FileName.InvalidClassFileName
2	/**
3	* Class with methods to extract metadata from a post/page about videos, images, links, mentions embedded
4	* in or attached to the post/page.
5	*
6	* @package automattic/jetpack
7	*/
8
9	/**
10	* Class with methods to extract metadata from a post/page about videos, images, links, mentions embedded
11	* in or attached to the post/page.
12	*
13	* @todo Additionally, have some filters on number of items in each field
14	*/
15	class Jetpack_Media_Meta_Extractor {
16
17	// Bit flags selecting what to extract; combine them with | to build a $what_to_extract mask.
18	const ALL = 255; // Mask that has every flag below set.
19	const LINKS = 1;
20	const MENTIONS = 2;
21	const IMAGES = 4;
22	const SHORTCODES = 8; // Presence and counts for every shortcode; IDs only for the keeper shortcodes below.
23	const EMBEDS = 16;
24	const HASHTAGS = 32; // Only honored when IS_WPCOM is defined and truthy.
25
26	/**
27	* Shortcodes to keep.
28	*
29	* For these, we try to extract some data from the shortcode, rather than just recording its presence (which we do for all).
30	* The ID is looked up via a function jetpack_shortcode_get_{shortcode}_id( $atts ) or, failing that, a static method {Shortcode}Shortcode::get_{shortcode}_id( $atts ).
31	*
32	* @var string[]
33	*/
34	private static $keeper_shortcodes = array(
35	'youtube',
36	'vimeo',
37	'hulu',
38	'ted',
39	'wpvideo',
40	'videopress',
41	);
42
/**
 * Gets the specified media and meta info from the given post.
 *
 * NOTE: If you have the post's HTML content already and don't need image data, use extract_from_content() instead.
 *
 * The post title and content are joined and capped at 100000 bytes before any pattern
 * matching runs. Image extraction works on the full post object and is not affected by the cap.
 *
 * @param int     $blog_id          The ID of the blog.
 * @param int     $post_id          The ID of the post.
 * @param int     $what_to_extract  A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS.
 * @param boolean $extract_alt_text Should alt_text be extracted, defaults to false.
 *
 * @return array A structure containing metadata about the embedded things, or an empty array if the post does not exist or nothing was found.
 */
public static function extract( $blog_id, $post_id, $what_to_extract = self::ALL, $extract_alt_text = false ) {
	// Upper bound (in bytes) on the content we run extraction on; about 20k English words.
	$max_bytes = 100000;

	// multisite?
	if ( function_exists( 'switch_to_blog' ) ) {
		switch_to_blog( $blog_id );
	}

	$post = get_post( $post_id );
	if ( ! $post instanceof WP_Post ) {
		if ( function_exists( 'restore_current_blog' ) ) {
			restore_current_blog();
		}
		return array();
	}

	$content = $post->post_title . "\n\n" . $post->post_content;

	// Prevent running extraction on really huge amounts of content.
	if ( strlen( $content ) > $max_bytes ) {
		// A plain byte-wise cut can land in the middle of a multi-byte UTF-8 character. The
		// resulting invalid sequence makes every /u regex in extract_from_content() fail on the
		// whole subject, silently dropping mentions and hashtags. mb_strcut() backs the cut up
		// to the start of that character instead.
		if ( function_exists( 'mb_strcut' ) ) {
			$content = mb_strcut( $content, 0, $max_bytes, 'UTF-8' );
		} else {
			$content = substr( $content, 0, $max_bytes );
		}
	}

	$extracted = array();

	// Get images first, we need the full post for that.
	if ( self::IMAGES & $what_to_extract ) {
		$extracted = self::get_image_fields( $post, array(), $extract_alt_text );

		// Clear the IMAGES bit so extract_from_content() below does not extract images again.
		$what_to_extract &= ~self::IMAGES;
	}

	if ( function_exists( 'restore_current_blog' ) ) {
		restore_current_blog();
	}

	// All of the other things besides images can be extracted from just the content.
	return self::extract_from_content( $content, $what_to_extract, $extracted );
}
95
/**
 * Gets the specified meta info from the given post content.
 *
 * NOTE: If you want IMAGES, prefer extract( $blog_id, $post_id, ... ), which also looks at the
 * featured image, slideshows and galleries. Here, images are only looked for in the content itself.
 *
 * Shortcodes are always located and removed from the working copy of the content, even when
 * SHORTCODES is not requested, so that URLs inside them are not reported as links or embeds.
 * HASHTAGS is only honored when IS_WPCOM is defined and truthy.
 *
 * @param string $content           The HTML post_content of a post.
 * @param int    $what_to_extract   A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS.
 * @param array  $already_extracted Previously extracted things, e.g. images from extract(), which can be used for x-referencing here.
 *
 * @return array A structure containing metadata about the embedded things, or $already_extracted unchanged if nothing was found.
 */
public static function extract_from_content( $content, $what_to_extract = self::ALL, $already_extracted = array() ) {
	$stripped_content = self::get_stripped_content( $content );

	// Maybe start with some previously extracted things (e.g. images from extract()).
	$extracted = $already_extracted;

	// Embedded media objects will have already been converted to shortcodes by pre_kses hooks on save.

	if ( self::IMAGES & $what_to_extract ) {
		// NOTE(review): $stripped_content has already had its tags removed by get_stripped_content();
		// confirm Jetpack_PostImages::from_html() can still find images in it.
		$images    = self::extract_images_from_content( $stripped_content, array() );
		$extracted = array_merge( $extracted, $images );
	}

	// ----------------------------------- MENTIONS ------------------------------

	if ( self::MENTIONS & $what_to_extract ) {
		if ( preg_match_all( '/(^|\s)@(\w+)/u', $stripped_content, $matches ) ) {
			// Lowercase BEFORE de-duplicating so "@Bob" and "@bob" count as one mention.
			// array_unique() retains the keys, hence array_values().
			$mentions             = array_values( array_unique( array_map( 'strtolower', $matches[2] ) ) );
			$extracted['mention'] = array( 'name' => $mentions );
			if ( ! isset( $extracted['has'] ) ) {
				$extracted['has'] = array();
			}
			$extracted['has']['mention'] = count( $mentions );
		}
	}

	// ----------------------------------- HASHTAGS ------------------------------
	/**
	 * Some hosts may not compile with --enable-unicode-properties and kick a warning:
	 * Warning: preg_match_all() [function.preg-match-all]: Compilation failed: support for \P, \p, and \X has not been compiled
	 * Therefore, we only run this code block on wpcom, not in Jetpack.
	 */
	if ( ( defined( 'IS_WPCOM' ) && IS_WPCOM ) && ( self::HASHTAGS & $what_to_extract ) ) {
		// This regex does not exactly match Twitter's
		// if there are problems/complaints we should implement this:
		// https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java .
		if ( preg_match_all( '/(?:^|\s)#(\w\p{L}+\w)/u', $stripped_content, $matches ) ) {
			// Same ordering as for mentions: lowercase first, then de-duplicate.
			$hashtags             = array_values( array_unique( array_map( 'strtolower', $matches[1] ) ) );
			$extracted['hashtag'] = array( 'name' => $hashtags );
			if ( ! isset( $extracted['has'] ) ) {
				$extracted['has'] = array();
			}
			$extracted['has']['hashtag'] = count( $hashtags );
		}
	}

	// ----------------------------------- SHORTCODES ------------------------------

	// Always look for shortcodes.
	// If we don't want them, we'll just remove them, so we don't grab them as links below.
	$shortcode_pattern = '/' . get_shortcode_regex() . '/s';
	if ( preg_match_all( $shortcode_pattern, $content, $matches ) ) {

		if ( self::SHORTCODES & $what_to_extract ) {
			$shortcode_total_count = 0;
			$shortcode_type_counts = array();
			$shortcode_types       = array();
			$shortcode_details     = array();

			// $matches[2] holds the shortcode tags, $matches[3] the matching raw attribute strings.
			foreach ( $matches[2] as $key => $shortcode ) {
				// Elasticsearch (and probably other things) doesn't deal well with some chars as key names.
				$shortcode_name = preg_replace( '/[.,*"\'\/\\\\#+ ]/', '_', $shortcode );

				$attr = shortcode_parse_atts( $matches[3][ $key ] );

				$shortcode_total_count++;
				if ( ! isset( $shortcode_type_counts[ $shortcode_name ] ) ) {
					$shortcode_type_counts[ $shortcode_name ] = 0;
				}
				$shortcode_type_counts[ $shortcode_name ]++;

				// Store (uniquely) presence of all shortcode regardless of whether it's a keeper (for those, get ID below)
				// @todo Store number of occurrences?
				if ( ! in_array( $shortcode_name, $shortcode_types, true ) ) {
					$shortcode_types[] = $shortcode_name;
				}

				// For keeper shortcodes, also store the id/url of the object (e.g. youtube video, TED talk, etc.).
				if ( in_array( $shortcode, self::$keeper_shortcodes, true ) ) {
					// Clear shortcode ID data left from the last shortcode.
					$id = null;
					// We'll try to get the salient ID from the function jetpack_shortcode_get_xyz_id().
					// If the shortcode is a class, we'll call XyzShortcode::get_xyz_id().
					$shortcode_get_id_func   = "jetpack_shortcode_get_{$shortcode}_id";
					$shortcode_class_name    = ucfirst( $shortcode ) . 'Shortcode';
					$shortcode_get_id_method = "get_{$shortcode}_id";
					if ( function_exists( $shortcode_get_id_func ) ) {
						$id = call_user_func( $shortcode_get_id_func, $attr );
					} elseif ( method_exists( $shortcode_class_name, $shortcode_get_id_method ) ) {
						$id = call_user_func( array( $shortcode_class_name, $shortcode_get_id_method ), $attr );
					}
					if ( ! empty( $id )
						&& ( ! isset( $shortcode_details[ $shortcode_name ] ) || ! in_array( $id, $shortcode_details[ $shortcode_name ], true ) ) ) {
						$shortcode_details[ $shortcode_name ][] = $id;
					}
				}
			}

			if ( $shortcode_total_count > 0 ) {
				// Add the shortcode info to the $extracted array.
				if ( ! isset( $extracted['has'] ) ) {
					$extracted['has'] = array();
				}
				$extracted['has']['shortcode'] = $shortcode_total_count;
				$extracted['shortcode']        = array();
				foreach ( $shortcode_type_counts as $type => $count ) {
					$extracted['shortcode'][ $type ] = array( 'count' => $count );
				}
				if ( ! empty( $shortcode_types ) ) {
					$extracted['shortcode_types'] = $shortcode_types;
				}
				foreach ( $shortcode_details as $type => $id ) {
					$extracted['shortcode'][ $type ]['id'] = $id;
				}
			}
		}

		// Remove the shortcodes from our copy of $content, so we don't count links in them as links below.
		$content = preg_replace( $shortcode_pattern, ' ', $content );
	}

	// ----------------------------------- LINKS ------------------------------

	if ( self::LINKS & $what_to_extract ) {

		// To hold the extracted stuff we find.
		$links = array();

		// Collect the URLs of images extracted earlier so they are not reported as links too.
		// build_image_struct() stores images as a list of array( 'url' => ... ) entries; the flat
		// array( 'url' => ... ) shape is still honored in case a caller passes it in.
		$image_urls = array();
		if ( isset( $extracted['image'] ) && is_array( $extracted['image'] ) ) {
			if ( isset( $extracted['image']['url'] ) ) {
				$image_urls = (array) $extracted['image']['url'];
			} else {
				foreach ( $extracted['image'] as $image ) {
					if ( is_array( $image ) && isset( $image['url'] ) ) {
						$image_urls[] = $image['url'];
					}
				}
			}
		}

		// @todo Get the text inside the links?

		// Grab any links, whether in <a href="..." or not, but subtract those from shortcodes and images.
		// (we treat embed links as just another link).
		// A URL may end in a balanced "(word)" group (e.g. .../Foo_(bar)), in "/", or in any non-punctuation character.
		if ( preg_match_all( '#(?:^|\s|"|\')(https?://([^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/))))#', $content, $matches ) ) {

			foreach ( $matches[1] as $link_raw ) {
				$url = wp_parse_url( $link_raw );

				// Data URI links.
				if ( ! isset( $url['scheme'] ) || 'data' === $url['scheme'] ) {
					continue;
				}

				// Reject invalid URLs.
				if ( ! isset( $url['host'] ) ) {
					continue;
				}

				// Remove large (and likely invalid) links.
				if ( 4096 < strlen( $link_raw ) ) {
					continue;
				}

				// Build a simple form of the URL so we can compare it to ones we found in IMAGES and exclude those.
				$simple_url = $url['scheme'] . '://' . $url['host'] . ( ! empty( $url['path'] ) ? $url['path'] : '' );
				if ( in_array( $simple_url, $image_urls, true ) ) {
					continue;
				}

				// Limit the split to 2 parts so a second "://" later in the URL (e.g. in a query arg) is kept.
				list( $proto, $link_all_but_proto ) = explode( '://', $link_raw, 2 ); // phpcs:ignore VariableAnalysis.CodeAnalysis.VariableAnalysis.UnusedVariable

				// Build a reversed hostname. implode() keeps the separator even after a "0" label,
				// which an empty()-based join would drop.
				$host_reversed = implode( '.', array_reverse( explode( '.', $url['host'] ) ) );

				// @todo Check unique before adding
				$links[] = array(
					'url'           => $link_all_but_proto,
					'host_reversed' => $host_reversed,
					'host'          => $url['host'],
				);
			}
		}

		$link_count = count( $links );
		if ( $link_count ) {
			$extracted['link'] = $links;
			if ( ! isset( $extracted['has'] ) ) {
				$extracted['has'] = array();
			}
			$extracted['has']['link'] = $link_count;
		}
	}

	// ----------------------------------- EMBEDS ------------------------------

	// Embeds are just individual links on their own line.
	if ( self::EMBEDS & $what_to_extract ) {

		if ( ! function_exists( '_wp_oembed_get_object' ) ) {
			include ABSPATH . WPINC . '/class-oembed.php';
		}

		// get an oembed object.
		$oembed = _wp_oembed_get_object();

		// Grab any links on their own lines that may be embeds; surrounding whitespace is optional.
		if ( preg_match_all( '|^\s*(https?://[^\s"]+)\s*$|im', $content, $matches ) ) {

			// To hold the extracted stuff we find.
			$embeds = array();

			foreach ( $matches[1] as $link_raw ) {
				list( $proto, $link_all_but_proto ) = explode( '://', $link_raw, 2 ); // phpcs:ignore VariableAnalysis.CodeAnalysis.VariableAnalysis.UnusedVariable

				// Check whether this "link" is really an embed.
				foreach ( $oembed->providers as $matchmask => $data ) {
					list( $providerurl, $regex ) = $data; // phpcs:ignore VariableAnalysis.CodeAnalysis.VariableAnalysis.UnusedVariable

					// Turn the asterisk-type provider URLs into regex.
					if ( ! $regex ) {
						$matchmask = '#' . str_replace( '___wildcard___', '(.+)', preg_quote( str_replace( '*', '___wildcard___', $matchmask ), '#' ) ) . '#i';
						$matchmask = preg_replace( '|^#http\\\://|', '#https?\://', $matchmask );
					}

					if ( preg_match( $matchmask, $link_raw ) ) {
						$embeds[] = $link_all_but_proto; // @todo Check unique before adding

						// @todo Try to get ID's for the ones we care about (shortcode_keepers)
						break;
					}
				}
			}

			if ( ! empty( $embeds ) ) {
				if ( ! isset( $extracted['has'] ) ) {
					$extracted['has'] = array();
				}
				$extracted['has']['embed'] = count( $embeds );
				$extracted['embed']        = array( 'url' => $embeds );
			}
		}
	}

	return $extracted;
}
373
/**
 * Collects the images associated with a post and returns them in the standard image structure.
 *
 * Images are gathered, in this order, from the featured image, slideshows, galleries and
 * finally the <img> markup found in the post content.
 *
 * @uses Jetpack_PostImages
 *
 * @param WP_Post $post             A post object.
 * @param array   $args             Optional. 'width' and 'height' give the required minimum dimensions (both default to 200).
 * @param boolean $extract_alt_text Should alt_text be extracted, defaults to false.
 *
 * @return array The structure produced by build_image_struct(), or an empty array when $post is not a WP_Post or no image is found.
 */
private static function get_image_fields( $post, $args = array(), $extract_alt_text = false ) {

	if ( ! ( $post instanceof WP_Post ) ) {
		return array();
	}

	$args = wp_parse_args(
		$args,
		array(
			'width'  => 200, // Required minimum width (if possible to determine).
			'height' => 200, // Required minimum height (if possible to determine).
		)
	);

	// Query every image source up front; they are merged below in this same order.
	$images_by_source = array(
		'featured'  => Jetpack_PostImages::from_thumbnail( $post->ID, $args['width'], $args['height'] ),
		'slideshow' => Jetpack_PostImages::from_slideshow( $post->ID, $args['width'], $args['height'] ),
		'gallery'   => Jetpack_PostImages::from_gallery( $post->ID ),
	);

	$collected = array();
	$flags     = array( 'gallery' => 0 );

	foreach ( $images_by_source as $source => $found ) {
		if ( empty( $found ) ) {
			continue;
		}

		// With alt text we keep url/alt_text pairs where available; otherwise only the src strings.
		if ( $extract_alt_text ) {
			$entries = self::reduce_extracted_images( $found );
		} else {
			$entries = wp_list_pluck( $found, 'src' );
		}
		$collected = array_merge( $collected, $entries );

		if ( 'gallery' === $source ) {
			$flags['gallery']++; // @todo This count isn't correct, will only ever count 1
		}
	}

	// @todo Can we check width/height of these efficiently? Could maybe use query args at least, before we strip them out
	$collected = self::get_images_from_html( $post->post_content, $collected, $extract_alt_text );

	return self::build_image_struct( $collected, $flags );
}
438
/**
 * Boils a list of extracted images down to their src and, when present, alt text.
 *
 * Entries with an empty 'src' are dropped. An entry with a non-empty 'alt_text' becomes
 * array( 'url' => src, 'alt_text' => alt_text ); any other entry becomes the bare src string.
 *
 * @param array $images Extracted image array.
 *
 * @return array Reduced image array.
 */
protected static function reduce_extracted_images( $images ) {
	$reduced = array();

	foreach ( $images as $entry ) {
		if ( ! empty( $entry['src'] ) ) {
			$reduced[] = empty( $entry['alt_text'] )
				? $entry['src']
				: array(
					'url'      => $entry['src'],
					'alt_text' => $entry['alt_text'],
				);
		}
	}

	return $reduced;
}
464
/**
 * Finds the images in a piece of HTML and wraps them in the standard image structure.
 *
 * @param string $content    HTML content.
 * @param array  $image_list Array of already found images.
 *
 * @return array|array[] The structure produced by build_image_struct(); empty when there are no images.
 */
public static function extract_images_from_content( $content, $image_list ) {
	return self::build_image_struct(
		self::get_images_from_html( $content, $image_list ),
		array()
	);
}
477
/**
 * Wraps a flat image list in the structure used for extracted media items.
 *
 * Duplicate entries are removed, bare URL strings are turned into array( 'url' => ... ),
 * and the resulting image count is stored under the 'image' key of the 'has' counters.
 *
 * @param array $image_list     Array of images (URL strings and/or arrays).
 * @param array $image_booleans Counters to report under 'has'; 'image' is set by this method.
 *
 * @return array|array[] array( 'image' => ..., 'has' => ... ), or an empty array when $image_list is empty.
 */
public static function build_image_struct( $image_list, $image_booleans ) {
	if ( empty( $image_list ) ) {
		return array();
	}

	$entries = array();
	foreach ( array_unique( $image_list, SORT_REGULAR ) as $item ) {
		$entries[] = is_string( $item ) ? array( 'url' => $item ) : $item;
	}

	$image_booleans['image'] = count( $entries );

	return array(
		'image' => $entries,
		'has'   => $image_booleans,
	);
}
506
/**
 * Extracts images from html.
 *
 * Each image URL has its query string removed and is skipped when it is longer than 4096
 * bytes or when the same URL is already in the list. De-duplication is by URL, so it also
 * works for entries stored as array( 'url' => ..., 'alt_text' => ... ).
 *
 * @param string  $html                     Some markup, possibly containing image tags.
 * @param array   $images_already_extracted Images found so far (URL strings and/or arrays with a 'url' key), used for de-duplication.
 * @param boolean $extract_alt_text         Should alt_text be extracted, defaults to false.
 *
 * @return array $images_already_extracted followed by the new images: URL strings, or array( 'url', 'alt_text' ) when alt text was requested and is available.
 */
public static function get_images_from_html( $html, $images_already_extracted, $extract_alt_text = false ) {
	$image_list = $images_already_extracted;
	$from_html  = Jetpack_PostImages::from_html( $html );
	// early return if no image in html.
	if ( empty( $from_html ) ) {
		return $image_list;
	}

	// Index the URLs we already have. A strict in_array() of a URL string against the list
	// never matches entries stored as arrays, which let the same image in more than once.
	$known_urls = array();
	foreach ( $image_list as $existing ) {
		if ( is_string( $existing ) ) {
			$known_urls[ $existing ] = true;
		} elseif ( is_array( $existing ) && isset( $existing['url'] ) && is_string( $existing['url'] ) ) {
			$known_urls[ $existing['url'] ] = true;
		}
	}

	// process images.
	foreach ( $from_html as $extracted_image ) {
		// Nothing to record without a source URL.
		if ( empty( $extracted_image['src'] ) ) {
			continue;
		}

		$image_url = $extracted_image['src'];
		$length    = strpos( $image_url, '?' );
		$src       = wp_parse_url( $image_url );

		if ( $src && isset( $src['scheme'], $src['host'], $src['path'] ) ) {
			// Rebuild the URL without the query string.
			$queryless = $src['scheme'] . '://' . $src['host'] . $src['path'];
		} elseif ( $length ) {
			// If wp_parse_url() didn't work, strip off the query string the old fashioned way.
			$queryless = substr( $image_url, 0, $length );
		} else {
			// Failing that, there was no query string to remove.
			$queryless = $image_url;
		}

		// Discard URLs that are longer than 4KB, these are likely data URIs or malformed HTML.
		if ( 4096 < strlen( $queryless ) ) {
			continue;
		}

		if ( isset( $known_urls[ $queryless ] ) ) {
			continue;
		}
		$known_urls[ $queryless ] = true;

		if ( $extract_alt_text && ! empty( $extracted_image['alt_text'] ) ) {
			$image_list[] = array(
				'url'      => $queryless,
				'alt_text' => $extracted_image['alt_text'],
			);
		} else {
			$image_list[] = $queryless;
		}
	}

	return $image_list;
}
558
/**
 * Reduces content to plain text: strips all tags, decodes HTML entities, then removes
 * shortcodes together with anything they enclose.
 *
 * @param string $content Original content.
 *
 * @return string Cleaned content.
 */
private static function get_stripped_content( $content ) {
	$without_tags = wp_strip_all_tags( $content );
	$decoded      = html_entity_decode( $without_tags );

	// Completely strip shortcodes and any content they enclose.
	return strip_shortcodes( $decoded );
}
573	}
574