broken-link-checker
Last commit date
broken-link-checker.php
18 years ago
readme.txt
18 years ago
wsblc_ajax.php
18 years ago
wsblc_ajax.php
324 lines
| 1 | <?php |
| 2 | /* |
| 3 | The AJAX-y part of the link checker. |
| 4 | */ |
| 5 | require_once("../../../wp-config.php"); |
| 6 | require_once("../../../wp-includes/wp-db.php"); |
| 7 | |
| 8 | //error_reporting(E_ALL); |
| 9 | |
// Wall-clock timestamp (float seconds) taken when this request started;
// execution_time() measures against it.
$execution_start_time = microtime(true);

/**
 * Seconds elapsed since $execution_start_time was recorded.
 *
 * @return float elapsed wall-clock time in seconds
 */
function execution_time(){
    global $execution_start_time;

    $now = microtime(true);
    $elapsed = $now - $execution_start_time;

    return $elapsed;
}
| 16 | |
| 17 | |
// Stop right away when the plugin object is not available. isset() is checked
// first so that an undefined variable does not raise a warning before the
// intended error message is printed.
if(!isset($ws_link_checker) || !is_object($ws_link_checker)) {
    die('Fatal error : undefined object; plugin may not be active.');
};

// Pattern for <a href=...>...</a>. Nothing in the visible part of this file reads
// this global (gather_and_save_links() defines its own copy); it is kept in
// case code outside this view relies on it.
$url_pattern='/(<a[\s]+[^>]*href\s*=\s*[\"\']?)([^\'\" >]+)([\'\"]+[^<>]*>)((?sU).*)(<\/a>)/i';

// Names of the plugin's two tables, built from the WordPress table prefix.
$postdata_name=$wpdb->prefix . "blc_postdata";
$linkdata_name=$wpdb->prefix . "blc_linkdata";

$options=$ws_link_checker->options; //get_option('wsblc_options');
$siteurl=get_option('siteurl');

// Longest time (seconds) one request may keep working; 27 when not configured.
$max_execution_time=isset($options['max_work_session'])?intval($options['max_work_session']):27;

// Check for safe mode
if( ini_get('safe_mode') ){
    // Do it the safe mode way: the time limit cannot be lifted, so keep the
    // work session just below PHP's own limit when that limit is shorter.
    $t=ini_get('max_execution_time');
    if ($t && ($t < $max_execution_time))
        $max_execution_time = $t-1;
} else {
    // Do it the regular way
    @set_time_limit(0);
}
// Keep working even if the browser that triggered the request goes away.
@ignore_user_abort(true);

// Hours after which a post/link is due for another check. Without a usable
// value strtotime() fails and the cut-off collapses to the Unix epoch, which
// silently empties the work queue - so fall back to a sane default instead.
// NOTE(review): the 72 hour fallback is an assumption; align it with the
// plugin's real default options.
$check_hours=(isset($options['check_treshold']) && is_numeric($options['check_treshold']))
    ? intval($options['check_treshold'])
    : 72;
if($check_hours<0){
    $check_hours=0;
}

// Rows last checked before $check_treshold are due for a regular check;
// broken links are retried once they are older than $recheck_treshold.
$check_treshold=date('Y-m-d H:i:s', strtotime('-'.$check_hours.' hours'));
$recheck_treshold=date('Y-m-d H:i:s', strtotime('-20 minutes'));

// Requested operation; a plain request without ?action= runs the checker.
$action=isset($_GET['action'])?$_GET['action']:'run_check';
| 47 | |
// Dispatch on the requested action. Every branch prints an HTML fragment (or an
// HTML comment) that the caller is expected to inject or ignore.
if($action=='dashboard_status'){
    /* displays a notification if broken links have been found */
    $sql="SELECT count(*) FROM $linkdata_name WHERE broken=1";
    // intval() turns a NULL result (failed query) into 0.
    $broken_links=intval($wpdb->get_var($sql));
    if($broken_links>0){
        echo "<div>\n<h3>Broken Links</h3>\n<p><a href='".get_option('siteurl')."/wp-admin/edit.php?page=".
            $ws_link_checker->mybasename."' title='View broken links'>Found $broken_links broken links</a></p>\n</div>";
    };

} else if($action=='full_status'){
    /* give some stats about the current situation */
    $sql="SELECT count(*) FROM $postdata_name WHERE last_check<'$check_treshold'";
    $posts_unchecked=intval($wpdb->get_var($sql));

    $sql="SELECT count(*) FROM $linkdata_name WHERE last_check<'$check_treshold'";
    $links_unchecked=intval($wpdb->get_var($sql));

    $sql="SELECT count(*) FROM $linkdata_name WHERE broken=1";
    $broken_links=intval($wpdb->get_var($sql));

    if($broken_links>0){
        echo "<a href='".get_option('siteurl')."/wp-admin/edit.php?page=".
            $ws_link_checker->mybasename."' title='View broken links'><strong>Found $broken_links broken links</strong></a>";
    } else {
        echo "No broken links found.";
    }

    echo "<br/>";

    if($posts_unchecked || $links_unchecked) {
        echo "$posts_unchecked posts and $links_unchecked links in the work queue.";
    } else {
        echo "The work queue is empty.";
    }

} else if($action=='run_check'){
    /* check for posts that haven't been checked for a long time & parse them for links, put the links in queue */
    echo "<!-- run_check -->";

    // Up to 20 posts whose last scan is older than the threshold, oldest first.
    $sql="SELECT b.* FROM $postdata_name a, $wpdb->posts b
        WHERE a.last_check<'$check_treshold' AND a.post_id=b.id ORDER BY a.last_check ASC LIMIT 20";

    $rows=$wpdb->get_results($sql, OBJECT);
    if($rows && (count($rows)>0)){
        //some rows found
        echo "<!-- parsing pages (rand : ".rand(1,1000).") -->";
        foreach ($rows as $post) {
            // Cast before interpolating into SQL so only an integer can reach the query.
            $post_id=intval($post->ID);
            // Drop the post's old link records, re-extract them, then stamp the post.
            $wpdb->query("DELETE FROM $linkdata_name WHERE post_id=$post_id");
            gather_and_save_links($post->post_content, $post_id);
            $wpdb->query("UPDATE $postdata_name SET last_check=NOW() WHERE post_id=$post_id");
        }
    };

    if(execution_time()>$max_execution_time){
        die('<!-- general timeout -->');
    }

    /* check the queue and process any links unchecked */
    // Due links, plus links already marked broken that have been tried fewer
    // than 5 times and not within the re-check window.
    $sql="SELECT * FROM $linkdata_name WHERE ".
        " ((last_check<'$check_treshold') OR ".
        " (broken=1 AND check_count<5 AND last_check<'$recheck_treshold')) ".
        " LIMIT 100";

    $links=$wpdb->get_results($sql, OBJECT);
    if($links && (count($links)>0)){
        //some unchecked links found
        echo "<!-- checking links (rand : ".rand(1,1000).") -->";
        foreach ($links as $link) {
            $link_id=intval($link->id);
            if(page_exists_simple($link->url)){
                //link OK, remove from queue
                $wpdb->query("DELETE FROM $linkdata_name WHERE id=$link_id");
            } else {
                $wpdb->query("UPDATE $linkdata_name SET broken=1, ".
                    " last_check=NOW(), check_count=check_count+1 WHERE id=$link_id");
            };

            // Stop between links once the work session has used up its time budget.
            if(execution_time()>$max_execution_time){
                die('<!-- url loop timeout -->');
            }
        }
    };

    die('<!-- /run_check -->');

} else if ($action=='discard_link'){
    // NOTE(review): this branch deletes data based only on a GET parameter; no
    // login, capability or nonce check is visible in this file - confirm that
    // access is restricted elsewhere or add one.
    //
    // Only act on a scalar id: a missing parameter used to raise a notice and
    // run a DELETE for id 0, and an array value (?id[]=x) made intval() return
    // 1, deleting an unrelated row.
    if(isset($_GET['id']) && is_scalar($_GET['id'])){
        $id=intval($_GET['id']);
        $wpdb->query("DELETE FROM $linkdata_name WHERE id=$id LIMIT 1");
    }
};
| 141 | |
| 142 | |
/**
 * Clean up the URL of one <a href> regex match and store it in the link table.
 *
 * @param array $matches  one PREG_SET_ORDER match; index 2 is the href value,
 *                        index 4 the markup between <a ...> and </a>
 * @param int   $post_id  ID of the post the link was found in
 * @return bool false when the URL is unparseable or empty after clean-up,
 *              true otherwise (even if the URL was too short to be stored)
 */
function parse_link($matches, $post_id){
    global $wpdb, $siteurl, $linkdata_name;

    $url=$matches[2];

    // Seriously malformed URLs are not worth queueing.
    if(!@parse_url($url)) return false;

    // Patterns are applied in order. The fragment is dropped and "&amp;" is
    // decoded first so that the session-id patterns, which only match at the
    // very end of the URL and after a literal "?" or "&", can see their target.
    $url=preg_replace(
        array(
            '/(#[^\/]*)$/i',            // trailing #fragment
            '/&amp;/i',                 // HTML-encoded ampersand
            '/^(javascript:.*)/i',      // javascript: pseudo links
            '/([\?&]PHPSESSID=\w+)$/i', // trailing PHP session id
            '/([\?&]sid=\w+)$/i',       // trailing "sid" session id
        ),
        array('','&','','',''),
        $url);

    $url=trim($url);
    if($url=='') return false;

    // turn relative URLs into absolute URLs
    $url = relative2absolute($siteurl, $url);

    // relative2absolute() returns false for URLs it cannot parse; those and
    // very short results are silently skipped.
    if(is_string($url) && strlen($url)>5){
        $wpdb->query(
            "INSERT INTO $linkdata_name(post_id, url, link_text)
            VALUES(".intval($post_id).", '".$wpdb->escape($url)."', '".$wpdb->escape(strip_tags($matches[4]))."')"
        );
    };

    return true;
}
| 172 | |
/**
 * Clean up the URL of one <img src> regex match and store it in the link table
 * with the fixed link text "[image]".
 *
 * @param array $matches  one PREG_SET_ORDER match; index 2 is the src value
 * @param int   $post_id  ID of the post the image was found in
 * @return bool false when the URL is unparseable or empty after clean-up,
 *              true otherwise (even if the URL was too short to be stored)
 */
function parse_image($matches, $post_id){
    global $wpdb, $siteurl, $linkdata_name;

    $url=$matches[2];

    // Seriously malformed URLs are not worth queueing.
    if(!@parse_url($url)) return false;

    // Patterns are applied in order. The fragment is dropped and "&amp;" is
    // decoded first so that the session-id patterns, which only match at the
    // very end of the URL and after a literal "?" or "&", can see their target.
    $url=preg_replace(
        array(
            '/(#[^\/]*)$/i',            // trailing #fragment
            '/&amp;/i',                 // HTML-encoded ampersand
            '/^(javascript:.*)/i',      // javascript: pseudo links
            '/([\?&]PHPSESSID=\w+)$/i', // trailing PHP session id
            '/([\?&]sid=\w+)$/i',       // trailing "sid" session id
        ),
        array('','&','','',''),
        $url);

    $url=trim($url);
    if($url=='') return false;

    // turn relative URLs into absolute URLs
    $url = relative2absolute($siteurl, $url);

    // relative2absolute() returns false for URLs it cannot parse; those and
    // very short results are silently skipped.
    if(is_string($url) && strlen($url)>3){
        $wpdb->query(
            "INSERT INTO $linkdata_name(post_id, url, link_text)
            VALUES(".intval($post_id).", '".$wpdb->escape($url)."', '[image]')"
        );
    };

    return true;
}
| 202 | |
/**
 * Scan a post's HTML for links and images and hand every match to its parser.
 *
 * Anchors (<a href=...>...</a>) are passed to parse_link() first, then images
 * (<img src=...>) to parse_image().
 *
 * @param string $content  post HTML to scan
 * @param int    $post_id  ID of the post the content belongs to
 * @return string the content, unmodified
 */
function gather_and_save_links($content, $post_id){
    // handler function => regex that finds the tags it is responsible for
    $scanners = array(
        'parse_link'  => '/(<a[\s]+[^>]*href\s*=\s*[\"\']?)([^\'\" >]+)([\'\"]+[^<>]*>)((?sU).*)(<\/a>)/i',
        'parse_image' => '/(<img[\s]+[^>]*src\s*=\s*[\"\']?)([^\'\" >]+)([\'\"]+[^<>]*>)/i',
    );

    foreach($scanners as $handler => $pattern){
        $hits = array();
        $found = preg_match_all($pattern, $content, $hits, PREG_SET_ORDER);
        if(!$found){
            // no tag of this kind (or the regex engine failed)
            continue;
        }

        foreach($hits as $hit){
            $handler($hit, $post_id);
        }
    }

    return $content;
}
| 224 | |
/**
 * Check whether a URL answers with a non-error HTTP status.
 *
 * Plain URLs are probed with a body-less (HEAD-style) request first and retried
 * with a normal GET when that does not give a 2xx/3xx status; https URLs are
 * fetched with GET right away. Redirects are followed (up to 10).
 *
 * @param string $url  URL to test; "http://" is prepended when it has no scheme
 * @return bool true when the final status code is in the range 200-399
 */
function page_exists_simple($url){
    $parts=parse_url($url);
    if(!$parts) return false;

    // Work out the scheme once. The original read $parts['scheme'] further
    // down even when it was not set, and compared it case-sensitively.
    if(isset($parts['scheme'])){
        $scheme=strtolower($parts['scheme']);
    } else {
        $scheme='http';
        $url='http://'.$url;
    }

    $ch = curl_init();
    if($ch===false) return false;

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');
    // Capture the response instead of letting cURL print it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);

    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);

    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 15);
    curl_setopt($ch, CURLOPT_TIMEOUT, 25);

    // Error statuses must not abort the transfer; the status code is inspected below.
    curl_setopt($ch, CURLOPT_FAILONERROR, false);

    $nobody=false;
    if($scheme=='https'){
        // Certificate checks are switched off for https targets.
        // NOTE(review): presumably because only reachability matters here - confirm.
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    } else {
        // Cheap first attempt: ask for headers only.
        $nobody=true;
        curl_setopt($ch, CURLOPT_NOBODY, true);
    }
    curl_setopt($ch, CURLOPT_HEADER, true);

    curl_exec($ch);
    $code=intval(curl_getinfo($ch, CURLINFO_HTTP_CODE));

    // The body-less request did not succeed: retry with a regular GET before
    // declaring the link dead.
    if ( (($code<200) || ($code>=400)) && $nobody) {
        curl_setopt($ch, CURLOPT_NOBODY, false);
        curl_setopt($ch, CURLOPT_HTTPGET, true);
        curl_exec($ch);
        $code=intval(curl_getinfo($ch, CURLINFO_HTTP_CODE));
    }

    curl_close($ch);

    return (($code>=200) && ($code<400));
}
| 268 | |
/**
 * Resolve a (possibly relative) URL against a base URL.
 *
 * - A URL that already has a scheme is returned unchanged.
 * - A network-path reference ("//host/path") gets the base URL's scheme
 *   ("http" when the base has none).
 * - Otherwise the path is resolved against the base: "." and ".." segments are
 *   removed, and a trailing "." or ".." leaves a trailing slash. The base URL's
 *   scheme, user, password, host and port are kept; its query and fragment
 *   are not.
 * - Anything from the first "?" or "#" of $relative onwards is appended as-is,
 *   so dots and slashes inside a query string are never touched.
 *
 * @param string $absolute  base URL
 * @param string $relative  URL to resolve
 * @return string|false the resolved URL, or false when $relative cannot be parsed
 */
function relative2absolute($absolute, $relative) {
    $p = @parse_url($relative);
    if(!$p) {
        // $relative is a seriously malformed URL
        return false;
    }
    if(isset($p['scheme'])) return $relative;

    $parts = parse_url($absolute);
    if(!is_array($parts)) {
        // unparseable base: nothing to inherit from it
        $parts = array();
    }

    // Network-path reference: only the scheme comes from the base.
    if(substr($relative, 0, 2) == '//') {
        $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
        return $scheme . ':' . $relative;
    }

    // Separate the path from the "?query#fragment" tail.
    $cut = strcspn($relative, '?#');
    $rel_path = (string)substr($relative, 0, $cut);
    $suffix = (string)substr($relative, $cut);

    $base_path = isset($parts['path']) ? $parts['path'] : '';

    if($rel_path === '') {
        // Only a query and/or fragment was given: it applies to the base document.
        $path = ltrim($base_path, '/');
    } else {
        $segments = array();
        if($rel_path[0] == '/') {
            // Absolute path: the base path plays no role.
            $rel_path = (string)substr($rel_path, 1);
        } else {
            // Relative path: start from the base document's directory. Empty
            // segments are dropped explicitly; array_filter() would also drop
            // a directory named "0".
            $base_segments = explode('/', $base_path);
            array_pop($base_segments);
            foreach($base_segments as $segment) {
                if($segment !== '') $segments[] = $segment;
            }
        }
        $segments = array_merge($segments, explode('/', $rel_path));

        // Resolve dot segments with a stack so that consecutive ".." each
        // remove a real parent directory.
        $last = count($segments) - 1;
        $resolved = array();
        foreach($segments as $i => $segment) {
            if($segment === '.' || $segment === '..') {
                if($segment === '..') array_pop($resolved);
                // a trailing dot segment names a directory: keep the final slash
                if($i === $last) $resolved[] = '';
            } else {
                $resolved[] = $segment;
            }
        }
        $path = implode('/', $resolved);
    }

    $url = '';
    if(isset($parts['scheme'])) {
        $url = $parts['scheme'] . '://';
    }
    if(isset($parts['user'])) {
        $url .= $parts['user'];
        if(isset($parts['pass'])) {
            $url .= ':' . $parts['pass'];
        }
        $url .= '@';
    }
    if(isset($parts['host'])) {
        $url .= $parts['host'];
        if(isset($parts['port'])) {
            // a non-default port is part of the site's address
            $url .= ':' . $parts['port'];
        }
        $url .= '/';
    }
    $url .= $path . $suffix;

    return $url;
}
| 323 | |
| 324 | ?> |