class.media-extractor.php — Jetpack – WP Security, Backup, Speed, & Growth 3.7.5

jetpack / class.media-extractor.php

jetpack Last commit date

class.media-extractor.php

437 lines

1	<?php
2	/**
3	* Class with methods to extract metadata from a post/page about videos, images, links, mentions embedded
4	* in or attached to the post/page.
5	*
6	* @todo Additionally, have some filters on number of items in each field
7	*/
8	class Jetpack_Media_Meta_Extractor {
9
10	// Bit flags selecting what to extract; combine them with bitwise OR and pass the result as $what_to_extract.
11	const ALL = 255; // Every flag at once: all of the bits listed below are set in 255.
12	const LINKS = 1;
13	const MENTIONS = 2;
14	const IMAGES = 4;
15	const SHORTCODES = 8; // Presence and counts of every shortcode, plus IDs for the keeper shortcodes below
16	const EMBEDS = 16;
17	const HASHTAGS = 32;
18
19	// For these, we try to extract some data from the shortcode, rather than just recording its presence (which we do for all).
20	// The ID comes from a function jetpack_shortcode_get_{shortcode}_id( $atts ) or, failing that, a static method {Shortcode}Shortcode::get_{shortcode}_id( $atts ).
21	private static $KEEPER_SHORTCODES = array(
22	'youtube',
23	'vimeo',
24	'hulu',
25	'ted',
26	'wpvideo',
27	'audio',
28	);
29
/**
 * Gets the specified media and meta info from the given post.
 *
 * NOTE: If you already have the post's HTML content and don't need the richer image data
 * (featured image, slideshow, gallery), use extract_from_content() instead.
 *
 * @param int $blog_id         The ID of the blog the post lives on.
 * @param int $post_id         The ID of the post.
 * @param int $what_to_extract A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS.
 * @return array A structure containing metadata about the embedded things, or an empty array
 *               if the post could not be loaded or nothing was found.
 */
public static function extract( $blog_id, $post_id, $what_to_extract = self::ALL ) {
	// switch_to_blog() only exists on multisite installs.
	$can_switch = function_exists( 'switch_to_blog' );

	if ( $can_switch ) {
		switch_to_blog( $blog_id );
	}

	$post = get_post( $post_id );

	// Unknown/deleted post: undo the blog switch before bailing out so the caller's context is intact.
	if ( ! is_object( $post ) ) {
		if ( $can_switch ) {
			restore_current_blog();
		}
		return array();
	}

	$content = $post->post_title . "\n\n" . $post->post_content;

	// Prevent running extraction on really huge amounts of content (100000 bytes is about 20k English words).
	if ( strlen( $content ) > 100000 ) {
		if ( function_exists( 'mb_strcut' ) ) {
			// Cut on a character boundary; a split multi-byte character would make the /u regexes fail later.
			$content = mb_strcut( $content, 0, 100000, 'UTF-8' );
		} else {
			$content = substr( $content, 0, 100000 );
		}
	}

	$extracted = array();

	// Get images first, we need the full post (and the switched blog) for that.
	if ( self::IMAGES & $what_to_extract ) {
		$extracted = self::get_image_fields( $post );

		// Clear the IMAGES bit so extract_from_content() below does not look for images again.
		$what_to_extract = $what_to_extract & ~self::IMAGES;
	}

	if ( $can_switch ) {
		restore_current_blog();
	}

	// Everything besides images can be extracted from just the content.
	return self::extract_from_content( $content, $what_to_extract, $extracted );
}
71
/**
 * Gets the specified meta info from the given post content.
 *
 * NOTE: If you want IMAGES, prefer extract( $blog_id, $post_id, ... ), which also looks at the
 * featured image, slideshows and galleries. Here images are only taken from the markup in $content.
 *
 * Result keys (each present only when something was found): 'image', 'mention', 'hashtag',
 * 'shortcode', 'shortcode_types', 'link', 'embed', and 'has' (a per-type count).
 *
 * @param string $content           The HTML post_content of a post.
 * @param int    $what_to_extract   A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::LINKS | Jetpack_Media_Meta_Extractor::MENTIONS.
 * @param array  $already_extracted Previously extracted things, e.g. images from extract(), which are used for cross-referencing here.
 * @return array A structure containing metadata about the embedded things, or $already_extracted unchanged if nothing was found.
 */
public static function extract_from_content( $content, $what_to_extract = self::ALL, $already_extracted = array() ) {
	$stripped_content = self::get_stripped_content( $content );

	// Maybe start with some previously extracted things (e.g. images from extract()).
	$extracted = $already_extracted;

	// Embedded media objects will have already been converted to shortcodes by pre_kses hooks on save.

	// ----------------------------------- IMAGES ------------------------------

	if ( self::IMAGES & $what_to_extract ) {
		// Image tags only exist in the raw markup; the stripped text has had every tag removed.
		$images = self::extract_images_from_content( $content, array() );
		if ( ! empty( $images ) ) {
			// Merge the 'has' counters instead of letting array_merge() replace them wholesale.
			$previous_has     = isset( $extracted['has'] ) ? (array) $extracted['has'] : array();
			$image_has        = isset( $images['has'] ) ? (array) $images['has'] : array();
			$extracted        = array_merge( $extracted, $images );
			$extracted['has'] = array_merge( $previous_has, $image_has );
		}
	}

	// ----------------------------------- MENTIONS ------------------------------

	if ( self::MENTIONS & $what_to_extract ) {
		if ( preg_match_all( '/(^|\s)@(\w+)/u', $stripped_content, $matches ) ) {
			// Lowercase before de-duplicating so "@Bob" and "@bob" count once.
			// array_unique() retains the keys, hence array_values().
			$mentions = array_values( array_unique( array_map( 'strtolower', $matches[2] ) ) );

			$extracted['mention'] = array( 'name' => $mentions );
			if ( ! isset( $extracted['has'] ) ) {
				$extracted['has'] = array();
			}
			$extracted['has']['mention'] = count( $mentions );
		}
	}

	// ----------------------------------- HASHTAGS ------------------------------
	/**
	 * Some hosts may not compile with --enable-unicode-properties and kick a warning:
	 *   Warning: preg_match_all() [function.preg-match-all]: Compilation failed: support for \P, \p, and \X has not been compiled
	 * Therefore, we only run this code block on wpcom, not in Jetpack.
	 */
	if ( ( defined( 'IS_WPCOM' ) && IS_WPCOM ) && ( self::HASHTAGS & $what_to_extract ) ) {
		// This regex does not exactly match Twitter's.
		// If there are problems/complaints we should implement this:
		// https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
		if ( preg_match_all( '/(?:^|\s)#(\w*\p{L}+\w*)/u', $stripped_content, $matches ) ) {
			// Same ordering as mentions: lowercase first, then de-duplicate and reindex.
			$hashtags = array_values( array_unique( array_map( 'strtolower', $matches[1] ) ) );

			$extracted['hashtag'] = array( 'name' => $hashtags );
			if ( ! isset( $extracted['has'] ) ) {
				$extracted['has'] = array();
			}
			$extracted['has']['hashtag'] = count( $hashtags );
		}
	}

	// ----------------------------------- SHORTCODES ------------------------------

	// Always look for shortcodes.
	// If we don't want them, we'll just remove them, so we don't grab them as links below.
	$shortcode_pattern = '/' . get_shortcode_regex() . '/s';
	if ( preg_match_all( $shortcode_pattern, $content, $matches ) ) {

		if ( self::SHORTCODES & $what_to_extract ) {
			$shortcode_total_count = 0;
			$shortcode_type_counts = array();
			$shortcode_types       = array();
			$shortcode_details     = array();

			foreach ( $matches[2] as $key => $shortcode ) {
				// Elasticsearch (and probably other things) doesn't deal well with some chars as key names.
				$shortcode_name = preg_replace( '/[.,*"\'\/\\\\#+ ]/', '_', $shortcode );

				$shortcode_total_count++;
				if ( ! isset( $shortcode_type_counts[ $shortcode_name ] ) ) {
					$shortcode_type_counts[ $shortcode_name ] = 0;
				}
				$shortcode_type_counts[ $shortcode_name ]++;

				// Store (uniquely) the presence of every shortcode, keeper or not.
				// @todo Store number of occurrences?
				if ( ! in_array( $shortcode_name, $shortcode_types, true ) ) {
					$shortcode_types[] = $shortcode_name;
				}

				// For keeper shortcodes, also store the id/url of the object (e.g. youtube video, TED talk, etc.)
				if ( ! in_array( $shortcode, self::$KEEPER_SHORTCODES, true ) ) {
					continue;
				}

				$attr = shortcode_parse_atts( $matches[3][ $key ] );
				$id   = null; // Never carry an ID over from the previous shortcode.

				// Try the function jetpack_shortcode_get_xyz_id() first,
				// then the static method XyzShortcode::get_xyz_id().
				$shortcode_get_id_func   = "jetpack_shortcode_get_{$shortcode}_id";
				$shortcode_class_name    = ucfirst( $shortcode ) . 'Shortcode';
				$shortcode_get_id_method = "get_{$shortcode}_id";
				if ( function_exists( $shortcode_get_id_func ) ) {
					$id = call_user_func( $shortcode_get_id_func, $attr );
				} elseif ( method_exists( $shortcode_class_name, $shortcode_get_id_method ) ) {
					$id = call_user_func( array( $shortcode_class_name, $shortcode_get_id_method ), $attr );
				}

				if ( empty( $id ) ) {
					continue;
				}
				if ( ! isset( $shortcode_details[ $shortcode_name ] ) ) {
					$shortcode_details[ $shortcode_name ] = array();
				}
				if ( ! in_array( $id, $shortcode_details[ $shortcode_name ], true ) ) {
					$shortcode_details[ $shortcode_name ][] = $id;
				}
			}

			if ( $shortcode_total_count > 0 ) {
				// Add the shortcode info to the $extracted array.
				if ( ! isset( $extracted['has'] ) ) {
					$extracted['has'] = array();
				}
				$extracted['has']['shortcode'] = $shortcode_total_count;

				$extracted['shortcode'] = array();
				foreach ( $shortcode_type_counts as $type => $count ) {
					$extracted['shortcode'][ $type ] = array( 'count' => $count );
				}
				if ( ! empty( $shortcode_types ) ) {
					$extracted['shortcode_types'] = $shortcode_types;
				}
				foreach ( $shortcode_details as $type => $ids ) {
					$extracted['shortcode'][ $type ]['id'] = $ids;
				}
			}
		}

		// Remove the shortcodes from our copy of $content, so we don't count links in them as links below.
		// preg_replace() returns null on a PCRE error; keep the content we have in that case.
		$without_shortcodes = preg_replace( $shortcode_pattern, ' ', $content );
		if ( null !== $without_shortcodes ) {
			$content = $without_shortcodes;
		}
	}

	// ----------------------------------- LINKS ------------------------------

	if ( self::LINKS & $what_to_extract ) {

		// To hold the extracted stuff we find.
		$links = array();

		// URLs of images we already know about, so they are not reported again as plain links.
		// Accepts both array( array( 'url' => ... ), ... ) and array( 'url' => array( ... ) ).
		$image_urls = array();
		if ( isset( $extracted['image'] ) && is_array( $extracted['image'] ) ) {
			if ( isset( $extracted['image']['url'] ) ) {
				$image_urls = (array) $extracted['image']['url'];
			}
			foreach ( $extracted['image'] as $image ) {
				if ( is_array( $image ) && isset( $image['url'] ) ) {
					$image_urls[] = $image['url'];
				}
			}
		}

		// @todo Get the text inside the links?

		// Grab any links, whether in <a href="..." or not, but subtract those from shortcodes and images
		// (we treat embed links as just another link).
		if ( preg_match_all( '#(?:^|\s|"|\')(https?://([^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/))))#', $content, $matches ) ) {

			foreach ( $matches[1] as $link_raw ) {
				// Remove large (and likely invalid) links.
				if ( 4096 < strlen( $link_raw ) ) {
					continue;
				}

				// parse_url() returns false for seriously malformed URLs and may omit components.
				$url = parse_url( $link_raw );
				if ( ! isset( $url['scheme'], $url['host'] ) ) {
					continue;
				}

				// Data URI links.
				if ( 'data' === $url['scheme'] ) {
					continue;
				}

				// Build a simple form of the URL so we can compare it to the image URLs and exclude those.
				$simple_url = $url['scheme'] . '://' . $url['host'] . ( ! empty( $url['path'] ) ? $url['path'] : '' );
				if ( in_array( $simple_url, $image_urls, true ) ) {
					continue;
				}

				// Limit of 2: a URL may legitimately contain another "://" (e.g. in a query argument).
				$link_parts         = explode( '://', $link_raw, 2 );
				$link_all_but_proto = $link_parts[1];

				// Build a reversed hostname (www.example.com => com.example.www).
				$host_reversed = '';
				foreach ( array_reverse( explode( '.', $url['host'] ) ) as $part ) {
					// Compare against '' rather than using empty(): a "0" label must still get its separator.
					$host_reversed .= ( '' !== $host_reversed ? '.' : '' ) . $part;
				}

				// @todo Check unique before adding
				$links[] = array(
					'url'           => $link_all_but_proto,
					'host_reversed' => $host_reversed,
					'host'          => $url['host'],
				);
			}
		}

		$link_count = count( $links );
		if ( $link_count ) {
			$extracted['link'] = $links;
			if ( ! isset( $extracted['has'] ) ) {
				$extracted['has'] = array();
			}
			$extracted['has']['link'] = $link_count;
		}
	}

	// ----------------------------------- EMBEDS ------------------------------

	// Embeds are just individual links on their own line.
	if ( self::EMBEDS & $what_to_extract ) {

		if ( ! function_exists( '_wp_oembed_get_object' ) ) {
			require_once ABSPATH . WPINC . '/class-oembed.php';
		}

		// Get an oembed object; its provider list tells us which URLs are embeddable.
		$oembed = _wp_oembed_get_object();

		// Grab any links on their own lines that may be embeds.
		if ( preg_match_all( '|^\s*(https?://[^\s"]+)\s*$|im', $content, $matches ) ) {

			// To hold the extracted stuff we find.
			$embeds = array();

			foreach ( $matches[1] as $link_raw ) {
				$link_parts         = explode( '://', $link_raw, 2 );
				$link_all_but_proto = $link_parts[1];

				// Check whether this "link" is really an embed.
				foreach ( $oembed->providers as $matchmask => $data ) {
					// $data is array( provider URL, is-regex flag ); only the flag matters here.
					$is_regex = isset( $data[1] ) ? $data[1] : false;

					// Turn the asterisk-type provider URLs into regex.
					if ( ! $is_regex ) {
						$matchmask = '#' . str_replace( '___wildcard___', '(.+)', preg_quote( str_replace( '*', '___wildcard___', $matchmask ), '#' ) ) . '#i';
						$matchmask = preg_replace( '|^#http\\\://|', '#https?\://', $matchmask );
					}

					if ( preg_match( $matchmask, $link_raw ) ) {
						$embeds[] = $link_all_but_proto; // @todo Check unique before adding

						// @todo Try to get ID's for the ones we care about (shortcode_keepers)
						break;
					}
				}
			}

			if ( ! empty( $embeds ) ) {
				if ( ! isset( $extracted['has'] ) ) {
					$extracted['has'] = array();
				}
				$extracted['has']['embed'] = count( $embeds );
				$extracted['embed']        = array( 'url' => $embeds );
			}
		}
	}

	return $extracted;
}
326
/**
 * Collects the image URLs associated with a post: featured image, slideshow, gallery,
 * and any images found in the post content markup.
 *
 * Uses Jetpack Post Images.
 *
 * @param object $post A post object (its ID and post_content are read).
 * @param array  $args Optional. 'width' and 'height': required minimum dimensions (if possible
 *                     to determine) for featured-image and slideshow images. Both default to 200.
 * @return array The structure produced by build_image_struct(), with an additional
 *               'gallery' entry under 'has' (1 when the post has a gallery, else 0);
 *               an empty array when no images were found.
 */
private static function get_image_fields( $post, $args = array() ) {
	$defaults = array(
		'width'  => 200, // Required minimum width (if possible to determine)
		'height' => 200, // Required minimum height (if possible to determine)
	);
	$args = wp_parse_args( $args, $defaults );

	// Size-constrained sources first, in the same order as before: featured image, then slideshow.
	$sized_sources = array(
		Jetpack_PostImages::from_thumbnail( $post->ID, $args['width'], $args['height'] ),
		Jetpack_PostImages::from_slideshow( $post->ID, $args['width'], $args['height'] ),
	);

	$image_list = array();
	foreach ( $sized_sources as $found ) {
		if ( ! empty( $found ) ) {
			$image_list = array_merge( $image_list, wp_list_pluck( $found, 'src' ) );
		}
	}

	$gallery_flag = 0;
	$from_gallery = Jetpack_PostImages::from_gallery( $post->ID );
	if ( ! empty( $from_gallery ) ) {
		$image_list   = array_merge( $image_list, wp_list_pluck( $from_gallery, 'src' ) );
		$gallery_flag = 1; // @todo This is a presence flag, not a count of galleries.
	}

	// @todo Can we check width/height of these efficiently? Could maybe use query args at least, before we strip them out
	$image_list = self::get_images_from_html( $post->post_content, $image_list );

	$struct = self::build_image_struct( $image_list );

	// The gallery flag used to be computed and then thrown away; report it next to the image count.
	if ( ! empty( $struct ) ) {
		if ( ! isset( $struct['has'] ) || ! is_array( $struct['has'] ) ) {
			$struct['has'] = array();
		}
		$struct['has']['gallery'] = $gallery_flag;
	}

	return $struct;
}
371
/**
 * Finds the images in a piece of markup and returns them in the extractor's image structure.
 *
 * @param string $content    Markup to scan for images.
 * @param array  $image_list Image URLs already known; handed to get_images_from_html() for de-duplication.
 * @return array Whatever build_image_struct() returns for the combined URL list.
 */
public static function extract_images_from_content( $content, $image_list ) {
	return self::build_image_struct(
		self::get_images_from_html( $content, $image_list )
	);
}
376
/**
 * Wraps a flat list of image URLs in the structure used in the extractor's results.
 *
 * @param array $image_list     Image URLs; duplicates are dropped.
 * @param array $image_booleans Optional. Extra counters/flags to report under 'has'
 *                              (e.g. array( 'gallery' => 1 )). An 'image' count is always added.
 * @return array array( 'image' => array( array( 'url' => ... ), ... ), 'has' => array( ..., 'image' => N ) ),
 *               or an empty array when $image_list is empty.
 */
public static function build_image_struct( $image_list, $image_booleans = array() ) {
	if ( empty( $image_list ) ) {
		return array();
	}

	$images = array();
	// array_unique() keeps the original keys; appending with [] reindexes the result from 0.
	foreach ( array_unique( $image_list ) as $img ) {
		$images[] = array( 'url' => $img );
	}

	// Previously this wrote into an undefined local; start from the caller's flags instead.
	$has          = (array) $image_booleans;
	$has['image'] = count( $images );

	return array(
		'image' => $images,
		'has'   => $has,
	);
}
392
/**
 * Adds the images found in a piece of markup to a list of already-known image URLs.
 *
 * @param string $html                     Some markup, possibly containing image tags.
 * @param array  $images_already_extracted Plain array of image URLs without query strings (no special structure), used for de-duplication.
 * @return array $images_already_extracted followed by the newly found image URLs, stripped of query params and de-duped.
 */
public static function get_images_from_html( $html, $images_already_extracted ) {
	$known_urls = $images_already_extracted;

	$found = Jetpack_PostImages::from_html( $html );
	if ( empty( $found ) ) {
		return $known_urls;
	}

	foreach ( wp_list_pluck( $found, 'src' ) as $candidate ) {
		$parts = parse_url( $candidate );

		if ( $parts && isset( $parts['scheme'], $parts['host'], $parts['path'] ) ) {
			// Rebuild the URL from its pieces, leaving the query string out.
			$clean_url = "{$parts['scheme']}://{$parts['host']}{$parts['path']}";
		} else {
			// parse_url() was no help: cut at the first '?' if there is one past position 0,
			// otherwise take the URL as it is.
			$query_start = strpos( $candidate, '?' );
			$clean_url   = $query_start ? substr( $candidate, 0, $query_start ) : $candidate;
		}

		// Skip URLs longer than 4KB (likely data URIs or malformed HTML) and ones we already have.
		if ( 4096 < strlen( $clean_url ) || in_array( $clean_url, $known_urls ) ) {
			continue;
		}

		$known_urls[] = $clean_url;
	}

	return $known_urls;
}
428
/**
 * Reduces post content to plain text: tags are removed first, HTML entities are decoded
 * next, and finally shortcodes are stripped via strip_shortcodes().
 *
 * @param string $content Post content.
 * @return string The cleaned-up text.
 */
private static function get_stripped_content( $content ) {
	$without_tags = strip_tags( $content );
	$decoded      = html_entity_decode( $without_tags );

	return strip_shortcodes( $decoded );
}
436	}
437