PluginProbe ʕ •ᴥ•ʔ
LiteSpeed Cache / 7.6.1
LiteSpeed Cache v7.6.1
trunk 1.0.15 1.9.1.1 2.9.9.2 3.6.4 4.6 5.7.0.1 6.5.4 7.0.0.1 7.0.1 7.1 7.2 7.3 7.3.0.1 7.4 7.5 7.5.0.1 7.6 7.6.1 7.6.2 7.7 7.8 7.8.0.1 7.8.1
litespeed-cache / src / crawler.cls.php
litespeed-cache / src Last commit date
cdn 7 months ago data_structure 7 months ago activation.cls.php 7 months ago admin-display.cls.php 7 months ago admin-settings.cls.php 7 months ago admin.cls.php 7 months ago api.cls.php 7 months ago avatar.cls.php 7 months ago base.cls.php 7 months ago cdn.cls.php 7 months ago cloud.cls.php 7 months ago conf.cls.php 7 months ago control.cls.php 7 months ago core.cls.php 7 months ago crawler-map.cls.php 7 months ago crawler.cls.php 7 months ago css.cls.php 7 months ago data.cls.php 7 months ago data.upgrade.func.php 7 months ago db-optm.cls.php 7 months ago debug2.cls.php 7 months ago doc.cls.php 7 months ago error.cls.php 7 months ago esi.cls.php 7 months ago file.cls.php 7 months ago gui.cls.php 7 months ago health.cls.php 7 months ago htaccess.cls.php 7 months ago img-optm.cls.php 7 months ago import.cls.php 7 months ago import.preset.cls.php 7 months ago lang.cls.php 7 months ago localization.cls.php 7 months ago media.cls.php 7 months ago metabox.cls.php 7 months ago object-cache-wp.cls.php 7 months ago object-cache.cls.php 7 months ago object.lib.php 7 months ago optimize.cls.php 7 months ago optimizer.cls.php 7 months ago placeholder.cls.php 7 months ago purge.cls.php 7 months ago report.cls.php 7 months ago rest.cls.php 7 months ago root.cls.php 7 months ago router.cls.php 7 months ago str.cls.php 7 months ago tag.cls.php 7 months ago task.cls.php 7 months ago tool.cls.php 7 months ago ucss.cls.php 7 months ago utility.cls.php 7 months ago vary.cls.php 7 months ago vpi.cls.php 7 months ago
crawler.cls.php
1494 lines
1 <?php
2 // phpcs:ignoreFile
3
4 /**
5 * The crawler class
6 *
7 * @since 1.1.0
8 */
9
10 namespace LiteSpeed;
11
12 defined('WPINC') || exit();
13
14 class Crawler extends Root {
15
16 const LOG_TAG = '🕸️';
17
18 const TYPE_REFRESH_MAP = 'refresh_map';
19 const TYPE_EMPTY = 'empty';
20 const TYPE_BLACKLIST_EMPTY = 'blacklist_empty';
21 const TYPE_BLACKLIST_DEL = 'blacklist_del';
22 const TYPE_BLACKLIST_ADD = 'blacklist_add';
23 const TYPE_START = 'start';
24 const TYPE_RESET = 'reset';
25
26 const USER_AGENT = 'lscache_walker';
27 const FAST_USER_AGENT = 'lscache_runner';
28 const CHUNKS = 10000;
29
30 const STATUS_WAIT = 'W';
31 const STATUS_HIT = 'H';
32 const STATUS_MISS = 'M';
33 const STATUS_BLACKLIST = 'B';
34 const STATUS_NOCACHE = 'N';
35
36 private $_sitemeta = 'meta.data';
37 private $_resetfile;
38 private $_end_reason;
39 private $_ncpu = 1;
40 private $_server_ip;
41
42 private $_crawler_conf = array(
43 'cookies' => array(),
44 'headers' => array(),
45 'ua' => '',
46 );
47 private $_crawlers = array();
48 private $_cur_threads = -1;
49 private $_max_run_time;
50 private $_cur_thread_time;
51 private $_map_status_list = array(
52 'H' => array(),
53 'M' => array(),
54 'B' => array(),
55 'N' => array(),
56 );
57 protected $_summary;
58
/**
 * Build the crawler instance.
 *
 * Picks the per-site meta file name (blog-specific on multisite), derives the
 * reset-flag file path from it, loads the stored summary, and records the
 * detected CPU core count and the configured server IP.
 *
 * @since 1.1.0
 */
public function __construct() {
    // On multisite each blog gets its own meta file: meta<blog_id>.data
    if (is_multisite()) {
        $this->_sitemeta = sprintf('meta%s.data', get_current_blog_id());
    }

    $this->_resetfile = LITESPEED_STATIC_DIR . "/crawler/{$this->_sitemeta}.reset";
    $this->_summary   = self::get_summary();
    $this->_ncpu      = $this->_get_server_cpu();
    $this->_server_ip = $this->conf(Base::O_SERVER_IP);

    self::debug('Init w/ CPU cores=' . $this->_ncpu);
}
78
/**
 * Try to detect the number of CPU cores by counting "processor" entries in /proc/cpuinfo.
 *
 * Falls back to 1 whenever the count cannot be determined: open_basedir is
 * set, the file is missing or unreadable, the read fails, or no entry matches.
 *
 * @since 5.2
 * @return int Detected core count, never less than 1.
 */
private function _get_server_cpu() {
    // A configured open_basedir means the server restricts file access; do not probe /proc
    if (ini_get('open_basedir')) {
        return 1;
    }

    $cpuinfo_file = '/proc/cpuinfo';
    if (!@is_file($cpuinfo_file) || !@is_readable($cpuinfo_file)) {
        return 1;
    }

    // file_get_contents() returns false on failure; never hand that to the regex engine
    $cpuinfo = @file_get_contents($cpuinfo_file);
    if ($cpuinfo === false || $cpuinfo === '') {
        return 1;
    }

    // preg_match_all() returns the match count, or false on error; both 0 and false fall back to 1
    $cores = preg_match_all('/^processor/m', $cpuinfo);
    return $cores ?: 1;
}
103
/**
 * Tell whether the given crawler is enabled, i.e. not present in the stored bypass list.
 *
 * @since 4.3
 * @param mixed $curr Crawler index to look up.
 * @return bool True when the crawler is not in the bypass list.
 */
public function is_active( $curr ) {
    $is_bypassed = in_array($curr, self::get_option('bypass_list', array()));
    return !$is_bypassed;
}
113
/**
 * Flip a crawler between enabled and bypassed, persist the bypass list, and report the new state.
 *
 * @since 4.3
 * @param mixed $curr Crawler index (stored as int when added to the list).
 * @return bool True if the crawler is active after the call, false if it is now bypassed.
 */
public function toggle_activeness( $curr ) {
    $bypass_list = self::get_option('bypass_list', array());
    $position    = array_search($curr, $bypass_list);

    if ($position === false) {
        // Currently active: add it to the bypass list to switch it off
        $bypass_list[] = (int) $curr;
        self::update_option('bypass_list', $bypass_list);
        return false;
    }

    // Currently bypassed: drop it from the list and reindex to switch it back on
    unset($bypass_list[$position]);
    $bypass_list = array_values($bypass_list);
    self::update_option('bypass_list', $bypass_list);
    return true;
}
135
/**
 * Empty the bypass list so that every crawler counts as active, then notify the admin.
 *
 * @since 4.3
 * @access public
 */
public function clear_disabled_list() {
    self::update_option('bypass_list', array());

    Admin_Display::note(__('Crawler disabled list is cleared! All crawlers are set to active! ', 'litespeed-cache'));

    self::debug('All crawlers are set to active...... ');
}
150
/**
 * Read the crawler summary, with every known field pre-filled with a default.
 *
 * The cached 'alloptions' entry is dropped first so the parent reads fresh data.
 *
 * @since 3.0
 * @access public
 * @param string|false $field Optional single field to return.
 * @return mixed The whole summary array when $field is falsy; otherwise that
 *               field's value, or null when the field does not exist.
 */
public static function get_summary( $field = false ) {
    $defaults = array(
        'list_size' => 0,
        'last_update_time' => 0,
        'curr_crawler' => 0,
        'curr_crawler_beginning_time' => 0,
        'last_pos' => 0,
        'last_count' => 0,
        'last_crawled' => 0,
        'last_start_time' => 0,
        'last_status' => '',
        'is_running' => 0,
        'end_reason' => '',
        'meta_save_time' => 0,
        'pos_reset_check' => 0,
        'done' => 0,
        'this_full_beginning_time' => 0,
        'last_full_time_cost' => 0,
        'last_crawler_total_cost' => 0,
        'crawler_stats' => array(), // per-crawler hit/miss counters
    );

    // Make sure the summary is current
    wp_cache_delete('alloptions', 'options');
    $summary = array_merge($defaults, parent::get_summary());

    if (!$field) {
        return $summary;
    }

    return array_key_exists($field, $summary) ? $summary[$field] : null;
}
193
/**
 * Persist the summary through the parent implementation and mirror it as JSON into the crawler meta file.
 *
 * The instance's 'meta_save_time' is stamped with the current time on every
 * call. When no $data is given, the instance's whole summary is what gets saved.
 *
 * @since 3.0
 * @access public
 * @param array|false $data      Data to save; falsy means "save the current summary".
 * @param bool        $reload    Passed through to the parent.
 * @param bool        $overwrite Passed through to the parent.
 */
public static function save_summary( $data = false, $reload = false, $overwrite = false ) {
    $crawler                             = self::cls();
    $crawler->_summary['meta_save_time'] = time();

    $payload = $data ? $data : $crawler->_summary;

    parent::save_summary($payload, $reload, $overwrite);

    $meta_file = LITESPEED_STATIC_DIR . '/crawler/' . $crawler->_sitemeta;
    File::save($meta_file, \json_encode($payload), true);
}
212
/**
 * Cron entry point: dispatch the 'crawler' async task.
 *
 * @since 5.5
 */
public static function start_async_cron() {
    $task = 'crawler';
    Task::async_call($task);
}
221
/**
 * Manual entry point: dispatch the 'crawler_force' async task and show a success notice.
 *
 * @since 5.5
 */
public static function start_async() {
    Task::async_call('crawler_force');

    Admin_Display::success(__('Started async crawling', 'litespeed-cache'));
}
233
/**
 * Async (ajax) crawl handler: logs the entry and hands over to start().
 *
 * @since 5.5
 * @param bool $manually_run Whether the run was triggered manually; forwarded to start().
 */
public static function async_handler( $manually_run = false ) {
    self::debug('------------async-------------start_async_handler');

    // NOTE: no nonce/referer check is performed here
    self::start($manually_run);
}
244
/**
 * Begin crawling if the router allows it.
 *
 * @since 1.1.0
 * @access public
 * @param bool $manually_run Whether the run was triggered manually.
 * @return false|void False when crawling is not allowed; nothing otherwise.
 */
public static function start( $manually_run = false ) {
    $allowed = Router::can_crawl();
    if (!$allowed) {
        self::debug('......crawler is NOT allowed by the server admin......');
        return false;
    }

    if ($manually_run) {
        self::debug('......crawler manually ran......');
    }

    self::cls()->_crawl_data($manually_run);
}
263
/**
 * Run one crawl pass: claim the lane, make sure the map exists, pick the crawler to run, load its
 * configuration and start the engine.
 *
 * The lane is released on every normal exit path. When the engine throws, the
 * exception message is logged and the lane is left untouched.
 *
 * @since 1.1.0
 * @access private
 * @param bool $manually_run When true, the whole-crawl interval check is skipped.
 */
private function _crawl_data( $manually_run ) {
    // This request's identity for lane ownership checks
    if (!defined('LITESPEED_LANE_HASH')) {
        define('LITESPEED_LANE_HASH', Str::rrand(8));
    }

    if (!$this->_check_valid_lane()) {
        self::debug('⚠️ lane in use');
        return;
    }
    $this->_take_over_lane();

    self::debug('......crawler started......');

    // First run: no summary yet or a crawler table is missing -> build the map
    $is_initialized = $this->_summary && Data::cls()->tb_exist('crawler') && Data::cls()->tb_exist('crawler_blacklist');
    if (!$is_initialized) {
        $this->cls('Crawler_Map')->gen();
    }

    // The previous full crawl reached the end: honour the interval, then rebuild the sitemap
    if ('touchedEnd' === $this->_summary['done']) {
        $last_finished_at = $this->_summary['last_full_time_cost'] + $this->_summary['this_full_beginning_time'];
        if (!$manually_run && time() - $last_finished_at < $this->conf(Base::O_CRAWLER_CRAWL_INTERVAL)) {
            // Interval not reached yet, nothing to do
            self::debug('Cron abort: cache warmed already.');
            $this->Release_lane();
            return;
        }
        self::debug('TouchedEnd. regenerate sitemap....');
        $this->cls('Crawler_Map')->gen();
    }

    $this->list_crawlers();
    $total_crawlers = count($this->_crawlers);

    // Move past crawlers that are in the bypass list
    while (!$this->is_active($this->_summary['curr_crawler']) && $this->_summary['curr_crawler'] < $total_crawlers) {
        self::debug('Skipped the Crawler #' . $this->_summary['curr_crawler'] . ' ......');
        ++$this->_summary['curr_crawler'];
    }

    // Ran off the end of the crawler list: finish this round
    if ($this->_summary['curr_crawler'] >= $total_crawlers) {
        $this->_end_reason = 'end';
        $this->_terminate_running();
        $this->Release_lane();
        return;
    }

    // Empty index or unknown crawler: start again from crawler #0 with fresh stats
    $current = $this->_summary['curr_crawler'];
    if (empty($current) || empty($this->_crawlers[$current])) {
        $this->_summary['curr_crawler']     = 0;
        $this->_summary['crawler_stats'][0] = array();
    }

    if (!$this->load_conf()) {
        self::debug('Load conf failed');
        $this->_terminate_running();
        $this->Release_lane();
        return;
    }

    try {
        $this->_engine_start();
        $this->Release_lane();
    } catch (\Exception $e) {
        self::debug('🛑 ' . $e->getMessage());
    }
}
345
/**
 * Fill $_crawler_conf for the current crawler before it runs.
 *
 * Sets base URL, cookies, WebP/AVIF Accept header, mobile UA, run delay,
 * run duration, load limit and, for role simulation, the vary and hash cookies.
 *
 * @since 3.0
 * @access private
 * @return bool False when the load limit resolves to 0, or when role simulation
 *              is requested but no server IP is set; true otherwise.
 */
private function load_conf() {
    $this->_crawler_conf['base'] = site_url();

    $crawler = $this->_crawlers[$this->_summary['curr_crawler']];

    // Cookie crawler (since 2.8): "cookie:<name>" entries become cookies, '_null' values are ignored
    foreach ($crawler as $key => $val) {
        if (strpos($key, 'cookie:') !== 0 || $val == '_null') {
            continue;
        }
        $this->_crawler_conf['cookies'][substr($key, 7)] = $val;
    }

    // WebP simulation (since 1.9.1): advertise avif when the WebP option equals 2, webp otherwise
    if (!empty($crawler['webp'])) {
        $image_type                          = $this->conf(Base::O_IMG_OPTM_WEBP) == 2 ? 'avif' : 'webp';
        $this->_crawler_conf['headers'][] = 'Accept: image/' . $image_type . ',*/*';
    }

    // Mobile crawler (since 2.8)
    if (!empty($crawler['mobile'])) {
        $this->_crawler_conf['ua'] = 'Mobile iPhone';
    }

    // Delay between batches in microseconds (since 1.8.3); constant/env may only raise it above 500
    $run_delay = 500;
    if (defined('LITESPEED_CRAWLER_USLEEP') && constant('LITESPEED_CRAWLER_USLEEP') > $run_delay) {
        $run_delay = constant('LITESPEED_CRAWLER_USLEEP');
    }
    if (!empty($_SERVER[Base::ENV_CRAWLER_USLEEP]) && $_SERVER[Base::ENV_CRAWLER_USLEEP] > $run_delay) {
        $run_delay = $_SERVER[Base::ENV_CRAWLER_USLEEP];
    }
    $this->_crawler_conf['run_delay'] = $run_delay;

    $this->_crawler_conf['run_duration'] = $this->get_crawler_duration();

    // Load limit: the enforced env value wins; the plain env value may only lower the setting
    $load_limit = $this->conf(Base::O_CRAWLER_LOAD_LIMIT);
    if (!empty($_SERVER[Base::ENV_CRAWLER_LOAD_LIMIT_ENFORCE])) {
        $load_limit = $_SERVER[Base::ENV_CRAWLER_LOAD_LIMIT_ENFORCE];
    } elseif (!empty($_SERVER[Base::ENV_CRAWLER_LOAD_LIMIT]) && $_SERVER[Base::ENV_CRAWLER_LOAD_LIMIT] < $load_limit) {
        $load_limit = $_SERVER[Base::ENV_CRAWLER_LOAD_LIMIT];
    }
    $this->_crawler_conf['load_limit'] = $load_limit;
    if ($load_limit == 0) {
        self::debug('🛑 Terminated crawler due to load limit set to 0');
        return false;
    }

    // Role simulation (since 1.9.1) only applies when the crawler carries a uid
    if (empty($crawler['uid'])) {
        return true;
    }
    if (!$this->_server_ip) {
        self::debug('🛑 Terminated crawler due to Server IP not set');
        return false;
    }

    $vary      = $this->cls('Vary');
    $vary_name = $vary->get_vary_name();
    $vary_val  = $vary->finalize_default_vary($crawler['uid']);

    $this->_crawler_conf['cookies'][$vary_name]       = $vary_val;
    $this->_crawler_conf['cookies']['litespeed_hash'] = Router::cls()->get_hash($crawler['uid']);

    return true;
}
437
/**
 * Get how long one crawler run may last, in seconds.
 *
 * Uses LITESPEED_CRAWLER_DURATION when defined, but never more than 900.
 *
 * @since 7.0
 * @return mixed The constant's value when it is defined and not above 900; otherwise 900.
 */
public function get_crawler_duration() {
    $ceiling = 900;

    if (!defined('LITESPEED_CRAWLER_DURATION')) {
        return $ceiling;
    }

    $configured = constant('LITESPEED_CRAWLER_DURATION');

    // Values above the ceiling fall back to it (security hardening)
    return $configured > $ceiling ? $ceiling : $configured;
}
450
/**
 * Start the crawl engine: check load, work out the time budget, then prepare, run and terminate.
 *
 * Returns early with end reason 'stopped_highload' when the thread count resolves to 0.
 *
 * @since 1.1.0
 * @access private
 */
private function _engine_start() {
    // Check current load first
    $this->_adjust_current_threads();
    if ($this->_cur_threads == 0) {
        $this->_end_reason = 'stopped_highload';
        self::debug('Stopped due to heavy load.');
        return;
    }

    // Log the start time
    self::save_summary(array( 'last_start_time' => time() ));

    // Time budget: PHP's own limit minus a 5s margin, or 300s when PHP reports no limit
    $run_duration = $this->_crawler_conf['run_duration'];
    $maxTime      = (int) ini_get('max_execution_time');
    self::debug('ini_get max_execution_time=' . $maxTime);
    $maxTime = 0 == $maxTime ? 300 : $maxTime - 5;

    if ($maxTime >= $run_duration) {
        $maxTime = $run_duration;
        self::debug('Use run_duration setting as max_execution_time=' . $maxTime);
    } elseif (false !== ini_set('max_execution_time', $run_duration + 15)) {
        // PHP accepted a higher limit, so the full run duration can be used
        $maxTime = $run_duration;
        self::debug('ini_set max_execution_time=' . $maxTime);
    }
    self::debug('final max_execution_time=' . $maxTime);
    $this->_max_run_time = $maxTime + time();

    $this->_prepare_running();
    $this->_do_running();
    $this->_terminate_running();
}
500
/**
 * Get the first value reported by sys_getloadavg().
 *
 * @since 5.5
 * @see https://wordpress.org/support/topic/crawler-keeps-causing-crashes/
 * @return mixed -1 when sys_getloadavg() is unavailable (e.g. Windows); otherwise the first load figure.
 */
public function get_server_load() {
    if (!function_exists('sys_getloadavg')) {
        return -1;
    }

    $averages = sys_getloadavg();
    $current  = $averages[0];
    self::debug('Server load: ' . $current);

    return $current;
}
521
/**
 * Choose the number of parallel requests from the per-core server load and the configured load limit.
 *
 * On the first call ($_cur_threads is -1) the count is derived from the gap
 * between limit and load; afterwards it moves by one step per call. When the
 * load cannot be read, threads are set to 0 and the thread timestamp is left alone.
 *
 * @since 1.1.0
 * @access private
 */
private function _adjust_current_threads() {
    $load = $this->get_server_load();
    if ($load == -1) {
        self::debug('set threads=0 due to func sys_getloadavg not exist!');
        $this->_cur_threads = 0;
        return;
    }

    // Normalise by core count
    $load /= $this->_ncpu;

    $max_threads = defined('LITESPEED_CRAWLER_THREADS') ? constant('LITESPEED_CRAWLER_THREADS') : 3;
    $limit       = $this->_crawler_conf['load_limit'];

    if ($this->_cur_threads == -1) {
        // Initial pick
        if ($load > $limit) {
            $threads = 0;
        } elseif ($load >= $limit - 1) {
            $threads = 1;
        } else {
            $threads = intval($limit - $load);
            if ($threads > $max_threads) {
                $threads = $max_threads;
            }
        }
    } else {
        // Step adjustment
        $threads = $this->_cur_threads;
        if ($load >= $limit + 1) {
            // Well over the limit: pause 5 seconds, then shed a thread if any is left
            sleep(5);
            if ($threads >= 1) {
                --$threads;
            }
        } elseif ($load >= $limit) {
            // At the limit: always shed one, even if that leaves none
            --$threads;
        } elseif ($load + 1 < $limit) {
            // Comfortable headroom: add a thread up to the maximum
            if ($threads < $max_threads) {
                ++$threads;
            }
        }
    }

    $this->_cur_threads     = $threads;
    $this->_cur_thread_time = time();
}
577
/**
 * Mark the crawler as running in the summary and save it.
 *
 * Also stamps the beginning times and list size when starting from position 0,
 * and clears the current crawler's stats when the previous end reason was 'end'.
 *
 * @since 1.1.0
 * @access private
 */
private function _prepare_running() {
    $this->_summary['is_running']   = time();
    $this->_summary['done']         = 0; // reset done status
    $this->_summary['last_status']  = 'prepare running';
    $this->_summary['last_crawled'] = 0;

    $at_first_url = $this->_summary['last_pos'] == 0;

    // This crawler starts from its first URL
    if ($at_first_url) {
        $this->_summary['curr_crawler_beginning_time'] = time();
    }

    // The first crawler starts from its first URL: a new full round begins
    if ($at_first_url && $this->_summary['curr_crawler'] == 0) {
        $this->_summary['this_full_beginning_time'] = time();
        $this->_summary['list_size']                = $this->cls('Crawler_Map')->count_map();
    }

    if ($at_first_url && $this->_summary['end_reason'] == 'end') {
        $this->_summary['crawler_stats'][$this->_summary['curr_crawler']] = array();
    }

    self::save_summary();
}
606
/**
 * Claim the lane by writing this request's lane hash into the .pid file.
 *
 * @since 6.1
 */
private function _take_over_lane() {
    $pid_file = $this->json_local_path() . '.pid';

    self::debug('Take over lane as lane is free: ' . $pid_file);
    File::save($pid_file, LITESPEED_LANE_HASH);
}
616
/**
 * Refresh the lane .pid file's modification time.
 *
 * @since 6.1
 */
private function _touch_lane() {
    $pid_file = $this->json_local_path() . '.pid';
    touch($pid_file);
}
625
/**
 * Delete the lane .pid file if it exists.
 *
 * @since 6.1
 */
public function Release_lane() {
    $lane_file = $this->json_local_path() . '.pid';

    if (file_exists($lane_file)) {
        self::debug('Release lane');
        unlink($lane_file);
    }
}
640
/**
 * Check whether this request may use the crawler lane.
 *
 * The lane is usable when the .pid file is empty or unreadable, holds this
 * request's hash, or belongs to someone else but has not been touched for more
 * than an hour (in which case it is released first).
 *
 * @since 6.1
 * @param bool $strict_mode When true, a missing lane file counts as invalid.
 * @return bool True when the lane can be used by this request.
 */
private function _check_valid_lane( $strict_mode = false ) {
    $lane_file = $this->json_local_path() . '.pid';

    if ($strict_mode && !file_exists($lane_file)) {
        self::debug("lane file not existed, strict mode is false [file] $lane_file");
        return false;
    }

    $owner = File::read($lane_file);

    // No owner recorded, or we are the owner
    if (!$owner || LITESPEED_LANE_HASH == $owner) {
        return true;
    }

    // Held by another run: only a lane untouched for over an hour may be reclaimed
    $is_stale = time() - filemtime($lane_file) > 3600;
    if ($is_stale) {
        self::debug('Lane file is older than 1h, releasing lane');
        $this->Release_lane();
        return true;
    }

    return false;
}
667
/**
 * Find out which local port (443, then 80) serves the site when its host is pinned to the server IP.
 *
 * A probe file containing home_url() is written under the static dir and
 * fetched through cURL with CURLOPT_RESOLVE. The port that returns the exact
 * content is defined as LITESPEED_CRAWLER_LOCAL_PORT and stored in the summary.
 * A test is not repeated within 120 seconds; the stored outcome is reused.
 *
 * @since 7.0
 * @access private
 * @return bool true if success and can continue crawling, false if failed and need to stop
 */
private function _test_port() {
    if (!$this->_server_ip) {
        // Without a server IP the test only matters for role simulation (crawler has a uid)
        if (empty($this->_crawlers[$this->_summary['curr_crawler']]['uid'])) {
            self::debug('Bypass test port as Server IP is not set');
            return true;
        }
        self::debug('❌ Server IP not set');
        return false;
    }

    if (defined('LITESPEED_CRAWLER_LOCAL_PORT')) {
        self::debug('✅ LITESPEED_CRAWLER_LOCAL_PORT already defined');
        return true;
    }

    // Don't repeat testing in 120s
    if (!empty($this->_summary['test_port_tts']) && time() - $this->_summary['test_port_tts'] < 120) {
        if (!empty($this->_summary['test_port'])) {
            self::debug('✅ Use tested local port: ' . $this->_summary['test_port']);
            define('LITESPEED_CRAWLER_LOCAL_PORT', $this->_summary['test_port']);
            return true;
        }
        return false;
    }
    $this->_summary['test_port_tts'] = time();
    self::save_summary();

    $options = $this->_get_curl_options();
    $home    = home_url();
    File::save(LITESPEED_STATIC_DIR . '/crawler/test_port.html', $home, true);
    $url        = LITESPEED_STATIC_URL . '/crawler/test_port.html';
    $parsed_url = parse_url($url);
    if (empty($parsed_url['host'])) {
        self::debug('❌ Test port failed, invalid URL: ' . $url);
        return false;
    }

    // First attempt: pin the host to the server IP on port 443
    $resolved                              = $parsed_url['host'] . ':443:' . $this->_server_ip;
    $options[CURLOPT_RESOLVE]              = array( $resolved );
    $options[CURLOPT_DNS_USE_GLOBAL_CACHE] = false;
    $options[CURLOPT_HEADER]               = false;
    self::debug('Test local 443 port for ' . $resolved);

    $ch = curl_init();
    curl_setopt_array($ch, $options);
    curl_setopt($ch, CURLOPT_URL, $url);
    $result      = curl_exec($ch);
    $test_result = false;

    if (curl_errno($ch) || $result !== $home) {
        if (curl_errno($ch)) {
            self::debug('❌ Test port curl error: [errNo] ' . curl_errno($ch) . ' [err] ' . curl_error($ch));
        } elseif ($result !== $home) {
            self::debug('❌ Test port response is wrong: ' . $result);
        }
        self::debug('❌ Test local 443 port failed, try port 80');

        // Release the 443 handle before it is replaced by the port-80 one
        curl_close($ch);

        // Second attempt: plain HTTP on port 80, flagged as forwarded HTTPS
        $resolved                 = $parsed_url['host'] . ':80:' . $this->_server_ip;
        $options[CURLOPT_RESOLVE] = array( $resolved );
        $url                      = str_replace('https://', 'http://', $url);
        if (!in_array('X-Forwarded-Proto: https', $options[CURLOPT_HTTPHEADER])) {
            $options[CURLOPT_HTTPHEADER][] = 'X-Forwarded-Proto: https';
        }

        $ch = curl_init();
        curl_setopt_array($ch, $options);
        curl_setopt($ch, CURLOPT_URL, $url);
        $result = curl_exec($ch);
        if (curl_errno($ch)) {
            self::debug('❌ Test port curl error: [errNo] ' . curl_errno($ch) . ' [err] ' . curl_error($ch));
        } elseif ($result !== $home) {
            self::debug('❌ Test port response is wrong: ' . $result);
        } else {
            self::debug('✅ Test local 80 port successfully');
            define('LITESPEED_CRAWLER_LOCAL_PORT', 80);
            $this->_summary['test_port'] = 80;
            $test_result                 = true;
        }
    } else {
        self::debug('✅ Tested local 443 port successfully');
        define('LITESPEED_CRAWLER_LOCAL_PORT', 443);
        $this->_summary['test_port'] = 443;
        $test_result                 = true;
    }

    self::save_summary();
    curl_close($ch);

    return $test_result;
}
767
/**
 * Main crawl loop for the current crawler.
 *
 * Pulls URLs from the map in pages of self::CHUNKS starting at 'last_pos',
 * requests them in batches of $_cur_threads, records each URL's status and
 * advances the position. Stops early, with $_end_reason set, on: failed port
 * test, HTTP 428, exceeded run time, reset file, or thread count dropping to 0.
 * When the map is exhausted the end reason is 'end'.
 *
 * @since 1.1.0
 * @access private
 * @throws \Exception When the lane is no longer valid for this run.
 */
private function _do_running() {
    // Options are built before the port test
    $options = $this->_get_curl_options(true);

    // Role simulation without a defined local port needs a working port first
    if (!$this->_test_port()) {
        $this->_end_reason = 'port_test_failed';
        self::debug('❌ Test port failed, crawler stopped.');
        return;
    }

    while ($urlChunks = $this->cls('Crawler_Map')->list_map(self::CHUNKS, $this->_summary['last_pos'])) {
        // One batch per round of parallel requests
        foreach (array_chunk($urlChunks, $this->_cur_threads) as $rows) {
            if (!$this->_check_valid_lane(true)) {
                $this->_end_reason = 'lane_invalid';
                self::debug('🛑 The crawler lane is used by newer crawler.');
                throw new \Exception('invalid crawler lane');
            }
            // Keep the lane file fresh
            $this->_touch_lane();

            $rets = $this->_multi_request($rows, $options);

            // Inspect the response of every row in this batch
            foreach ($rows as $row) {
                $id = $row['id'];
                if (empty($rets[$id])) {
                    // No request was made for this row (already blacklisted), so no record
                    continue;
                }
                $ret = $rets[$id];

                if ($ret['code'] == 428) {
                    // HTTP/1.1 428 Precondition Required (need to test)
                    $this->_end_reason = 'crawler_disabled';
                    self::debug('crawler_disabled');
                    return;
                }

                // B or H or M or N(nocache)
                $status = $this->_status_parse($ret['header'], $ret['code'], $row['url']);
                self::debug('[status] ' . $this->_status2title($status) . "\t\t [url] " . $row['url']);

                $this->_map_status_list[$status][$id] = array(
                    'url' => $row['url'],
                    'code' => $ret['code'], // 201 or 200 or 404
                );

                if (empty($this->_summary['crawler_stats'][$this->_summary['curr_crawler']][$status])) {
                    $this->_summary['crawler_stats'][$this->_summary['curr_crawler']][$status] = 0;
                }
                ++$this->_summary['crawler_stats'][$this->_summary['curr_crawler']][$status];
            }

            // Advance the offset by the size of this batch
            $_time                               = time();
            $batch_size                          = count($rows);
            $this->_summary['last_count']        = $batch_size;
            $this->_summary['last_pos']         += $batch_size;
            $this->_summary['last_crawled']     += $batch_size;
            $this->_summary['last_update_time']  = $_time;
            $this->_summary['last_status']       = 'updated position';

            // Time budget used up?
            if ($this->_summary['last_update_time'] > $this->_max_run_time) {
                $this->_end_reason = 'stopped_maxtime';
                self::debug('Terminated due to maxtime');
                return;
            }

            // Save meta & map status at least once every 10s
            if ($_time - $this->_summary['meta_save_time'] > 10) {
                $this->_map_status_list = $this->cls('Crawler_Map')->save_map_status($this->_map_status_list, $this->_summary['curr_crawler']);
                self::save_summary();
            }

            // Look for the reset file every 5s
            if ($_time > $this->_summary['pos_reset_check']) {
                $this->_summary['pos_reset_check'] = $_time + 5;
                if (file_exists($this->_resetfile) && unlink($this->_resetfile)) {
                    self::debug('Terminated due to reset file');

                    $this->_summary['last_pos']                 = 0;
                    $this->_summary['curr_crawler']             = 0;
                    $this->_summary['crawler_stats'][0]         = array();
                    $this->_summary['done']                     = 0; // reset done status
                    $this->_summary['this_full_beginning_time'] = 0;
                    $this->_end_reason                          = 'stopped_reset';
                    return;
                }
            }

            // Re-evaluate the server load once more than 60s passed since the last adjustment
            if ($this->_summary['last_update_time'] - $this->_cur_thread_time > 60) {
                $this->_adjust_current_threads();
                if ($this->_cur_threads == 0) {
                    $this->_end_reason = 'stopped_highload';
                    self::debug('🛑 Terminated due to highload');
                    return;
                }
            }

            $this->_summary['last_status'] = 'sleeping ' . $this->_crawler_conf['run_delay'] . 'ms';

            usleep($this->_crawler_conf['run_delay']);
        }
    }

    // All URLs are done for the current crawler
    $this->_end_reason = 'end';
    $this->_summary['crawler_stats'][$this->_summary['curr_crawler']]['W'] = 0;
    self::debug('Crawler #' . $this->_summary['curr_crawler'] . ' touched end');
}
894
/**
 * Decide whether requests should pin DNS resolution.
 *
 * @since 7.3.0.1
 * @return bool True when a server IP is set or a 'litespeed_hash' cookie is configured.
 */
private function _should_force_resolve_dns() {
    $has_server_ip   = (bool) $this->_server_ip;
    $has_hash_cookie = !empty($this->_crawler_conf['cookies']['litespeed_hash']);

    return $has_server_ip || $has_hash_cookie;
}
909
/**
 * Request a batch of URLs in parallel with curl_multi.
 *
 * Rows whose status character for the current crawler is blacklist or nocache
 * are not requested and get no entry in the result.
 *
 * @since 1.1.0
 * @access private
 * @param array $rows    Map rows; each is read for 'id', 'url' and 'res'.
 * @param array $options cURL options applied to every handle.
 * @return array Map of row id => array('header' => string, 'code' => int).
 */
private function _multi_request( $rows, $options ) {
    if (!function_exists('curl_multi_init')) {
        exit('curl_multi_init disabled');
    }

    $mh          = curl_multi_init();
    $drop_domain = defined('LITESPEED_CRAWLER_DROP_DOMAIN') ? constant('LITESPEED_CRAWLER_DROP_DOMAIN') : false;
    $crawler_idx = $this->_summary['curr_crawler'];

    // A row is skipped when its status character for this crawler says blacklist or nocache
    $is_skipped = function ( $row ) use ( $crawler_idx ) {
        $flag = substr($row['res'], $crawler_idx, 1);
        return $flag == self::STATUS_BLACKLIST || $flag == self::STATUS_NOCACHE;
    };

    $curls = array();
    foreach ($rows as $row) {
        if ($is_skipped($row)) {
            continue;
        }

        if (!function_exists('curl_init')) {
            exit('curl_init disabled');
        }

        $id         = $row['id'];
        $curls[$id] = curl_init();

        // With the drop-domain constant set, the stored URL is appended to the base URL
        $url = $drop_domain ? $this->_crawler_conf['base'] . $row['url'] : $row['url'];

        // Pin the host to the server IP when required
        if ($this->_should_force_resolve_dns()) {
            $parsed_url = parse_url($url);

            if (!empty($parsed_url['host'])) {
                $dom      = $parsed_url['host'];
                $port     = defined('LITESPEED_CRAWLER_LOCAL_PORT') ? LITESPEED_CRAWLER_LOCAL_PORT : '443';
                $resolved = $dom . ':' . $port . ':' . $this->_server_ip;

                $options[CURLOPT_RESOLVE]              = array( $resolved );
                $options[CURLOPT_DNS_USE_GLOBAL_CACHE] = false;

                if ($port == 80) {
                    // Plain HTTP locally, flagged as forwarded HTTPS
                    $url = str_replace('https://', 'http://', $url);
                    if (!in_array('X-Forwarded-Proto: https', $options[CURLOPT_HTTPHEADER])) {
                        $options[CURLOPT_HTTPHEADER][] = 'X-Forwarded-Proto: https';
                    }
                }
                self::debug('Resolved DNS for ' . $resolved);
            }
        }

        curl_setopt($curls[$id], CURLOPT_URL, $url);
        self::debug('Crawling [url] ' . $url . ($url == $row['url'] ? '' : ' [ori] ' . $row['url']));

        curl_setopt_array($curls[$id], $options);

        curl_multi_add_handle($mh, $curls[$id]);
    }

    // Drive all transfers until none is active or curl reports an error
    if ($curls) {
        do {
            $status = curl_multi_exec($mh, $active);
            if ($active) {
                curl_multi_select($mh);
            }
        } while ($active && $status == CURLM_OK);
    }

    // Collect header + status code for every requested row
    $ret = array();
    foreach ($rows as $row) {
        if ($is_skipped($row)) {
            continue;
        }

        $ch = $curls[$row['id']];

        // Only the header part of the response is kept
        $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $content     = curl_multi_getcontent($ch);

        $ret[$row['id']] = array(
            'header' => substr($content, 0, $header_size),
            'code' => curl_getinfo($ch, CURLINFO_HTTP_CODE),
        );

        curl_multi_remove_handle($mh, $ch);
        curl_close($ch);
    }

    curl_multi_close($mh);

    return $ret;
}
1014
/**
 * Translate a crawl status code into a short, emoji-prefixed title.
 *
 * The status is compared loosely (`==`) against the self::STATUS_* constants.
 *
 * @since 6.0
 *
 * @param mixed $status Status code to translate.
 * @return string Title for the status; '🛸 Unknown' when no constant matches.
 */
private function _status2title( $status ) {
	if ($status == self::STATUS_HIT) {
		return '✅ Hit';
	}
	if ($status == self::STATUS_MISS) {
		return '😊 Miss';
	}
	// The blocklist status and the no-cache status share the same title.
	if ($status == self::STATUS_BLACKLIST || $status == self::STATUS_NOCACHE) {
		return '😅 Blacklisted';
	}
	return '🛸 Unknown';
}
1038
/**
 * Decide the crawl status of a URL from the response curl returned for it.
 *
 * Decision order:
 *  1. HTTP code 201 is a hit.
 *  2. An `X-Litespeed-Cache-Control: no-cache` header is a miss when
 *     LITESPEED_CRAWLER_IGNORE_NONCACHEABLE is set or the blocklist is
 *     disabled, otherwise no-cache.
 *  3. The first cache header found (`x-litespeed-cache`, `x-qc-cache`,
 *     `x-lsadc-cache`) is a miss only when it reads `miss` and not `bkn`,
 *     otherwise a hit.
 *  4. With no cache header at all: a miss when the blocklist is disabled,
 *     otherwise blacklist.
 *
 * @since 2.0
 * @access private
 *
 * @param string $header Raw response header text.
 * @param int    $code   HTTP response code.
 * @param string $url    Crawled URL; handed to the `litespeed_crawler_disable_blocklist` filter.
 * @return mixed One of self::STATUS_HIT, self::STATUS_MISS, self::STATUS_NOCACHE, self::STATUS_BLACKLIST.
 */
private function _status_parse( $header, $code, $url ) {
	if (201 == $code) {
		return self::STATUS_HIT;
	}

	// Wrapped in a closure so the filter only fires on the paths that actually need the answer.
	$blocklist_off = function () use ( $url ) {
		if (defined('LITESPEED_CRAWLER_DISABLE_BLOCKLIST') && constant('LITESPEED_CRAWLER_DISABLE_BLOCKLIST')) {
			return true;
		}
		return (bool) apply_filters('litespeed_crawler_disable_blocklist', false, $url);
	};

	$has_nocache = false !== stripos($header, 'X-Litespeed-Cache-Control: no-cache');
	if ($has_nocache) {
		// If is from DIVI, taken as miss
		$ignore_noncacheable = defined('LITESPEED_CRAWLER_IGNORE_NONCACHEABLE') && LITESPEED_CRAWLER_IGNORE_NONCACHEABLE;
		if ($ignore_noncacheable || $blocklist_off()) {
			return self::STATUS_MISS;
		}

		return self::STATUS_NOCACHE;
	}

	// NOTE: these are case-insensitive substring matches against the whole header text.
	foreach (array( 'x-litespeed-cache', 'x-qc-cache', 'x-lsadc-cache' ) as $name) {
		if (false === stripos($header, $name)) {
			continue;
		}

		$is_bkn  = false !== stripos($header, $name . ': bkn');
		$is_miss = false !== stripos($header, $name . ': miss');

		// `bkn` wins over `miss`; anything else carrying the header counts as a hit.
		return (!$is_bkn && $is_miss) ? self::STATUS_MISS : self::STATUS_HIT;
	}

	return $blocklist_off() ? self::STATUS_MISS : self::STATUS_BLACKLIST;
}
1086
/**
 * Assemble the curl option set for a crawler request.
 *
 * Reads the headers, user agent and cookies from `$this->_crawler_conf`.
 * Certificate verification is switched off and HTTP/1.1 is forced.
 *
 * @since 1.1.0
 * @access private
 *
 * @param bool $crawler_only When truthy, the configured user agent is prefixed with
 *                           self::FAST_USER_AGENT (unless it already starts with it) and
 *                           the prefixed value is stored back on the crawler conf.
 * @return array Options keyed by CURLOPT_* constant, suitable for curl_setopt_array().
 */
private function _get_curl_options( $crawler_only = false ) {
	// Larger timeout to avoid incorrect blacklist addition #900171
	$timeout = 30;
	if (defined('LITESPEED_CRAWLER_TIMEOUT')) {
		$timeout = constant('LITESPEED_CRAWLER_TIMEOUT');
	}

	$request_headers   = $this->_crawler_conf['headers'];
	$request_headers[] = 'Cache-Control: max-age=0';

	$options = array(
		CURLOPT_RETURNTRANSFER => true,
		CURLOPT_HEADER         => true,
		CURLOPT_CUSTOMREQUEST  => 'GET',
		CURLOPT_FOLLOWLOCATION => false,
		CURLOPT_ENCODING       => 'gzip',
		CURLOPT_CONNECTTIMEOUT => 10,
		CURLOPT_TIMEOUT        => $timeout,
		CURLOPT_SSL_VERIFYHOST => 0,
		CURLOPT_SSL_VERIFYPEER => false,
		CURLOPT_NOBODY         => false,
		CURLOPT_HTTPHEADER     => $request_headers,
		// HTTP/2 was tried in 1.9.1 and dropped in 2.2.7 because it caused a no-cache issue.
		CURLOPT_HTTP_VERSION   => CURL_HTTP_VERSION_1_1,
	);

	// Referer: only available when running inside a web request.
	if (isset($_SERVER['HTTP_HOST'], $_SERVER['REQUEST_URI'])) {
		$options[CURLOPT_REFERER] = 'http://' . $_SERVER['HTTP_HOST'] . $_SERVER['REQUEST_URI'];
	}

	// User agent
	if ($crawler_only && 0 !== strpos($this->_crawler_conf['ua'], self::FAST_USER_AGENT)) {
		$this->_crawler_conf['ua'] = self::FAST_USER_AGENT . ' ' . $this->_crawler_conf['ua'];
	}
	$options[CURLOPT_USERAGENT] = $this->_crawler_conf['ua'];

	// Cookies: entries with a falsy value are left out.
	$pairs = array();
	foreach ($this->_crawler_conf['cookies'] as $cookie_name => $cookie_value) {
		if ($cookie_value) {
			$pairs[] = $cookie_name . '=' . urlencode($cookie_value);
		}
	}
	if ($pairs) {
		$options[CURLOPT_COOKIE] = implode('; ', $pairs);
	}

	return $options;
}
1150
/**
 * Request a URL with curl and return the response body.
 *
 * Overwrites the `base` and `ua` entries of `$this->_crawler_conf` (and
 * `headers` when `$accept` is given), builds the options through
 * `_get_curl_options()`, then follows redirects and excludes the response
 * header from the returned content.
 *
 * When `$uid` is given, a `litespeed_flash_hash` cookie obtained from
 * `Router::cls()->get_flash_hash()` is sent, and the URL's host is pinned to
 * `$this->_server_ip` on LITESPEED_CRAWLER_LOCAL_PORT (default 443).
 *
 * @since 3.3
 *
 * @param string $url    URL to fetch.
 * @param string $ua     User agent to send.
 * @param mixed  $uid    Optional. User ID to request the page as; falsy for no user.
 * @param mixed  $accept Optional. Value for the `Accept` request header; falsy to leave headers untouched.
 * @return string|false Response body, or false when the response code is not 200.
 */
public function self_curl( $url, $ua, $uid = false, $accept = false ) {
	$this->_crawler_conf['base'] = site_url();
	$this->_crawler_conf['ua']   = $ua;
	if ($accept) {
		$this->_crawler_conf['headers'] = array( 'Accept: ' . $accept );
	}

	// The cookie has to be stored BEFORE the options are built: _get_curl_options()
	// flattens the cookie list into CURLOPT_COOKIE, so anything added afterwards
	// would not be sent with this request.
	if ($uid) {
		$this->_crawler_conf['cookies']['litespeed_flash_hash'] = Router::cls()->get_flash_hash($uid);
	} else {
		// Do not let a hash left behind by an earlier call ride along on this request.
		unset($this->_crawler_conf['cookies']['litespeed_flash_hash']);
	}

	$options = $this->_get_curl_options();

	if ($uid) {
		$parsed_url = parse_url($url);

		if (!empty($parsed_url['host'])) {
			$dom                                   = $parsed_url['host'];
			$port                                  = defined('LITESPEED_CRAWLER_LOCAL_PORT') ? LITESPEED_CRAWLER_LOCAL_PORT : '443'; // TODO: need to test port?
			$resolved                              = $dom . ':' . $port . ':' . $this->_server_ip;
			$options[CURLOPT_RESOLVE]              = array( $resolved );
			$options[CURLOPT_DNS_USE_GLOBAL_CACHE] = false;
			$options[CURLOPT_PORT]                 = $port;
			self::debug('Resolved DNS for ' . $resolved);
		}
	}

	// Only the body is wanted here, and redirects should be followed to the final page.
	$options[CURLOPT_HEADER]         = false;
	$options[CURLOPT_FOLLOWLOCATION] = true;

	$ch = curl_init();
	curl_setopt_array($ch, $options);
	curl_setopt($ch, CURLOPT_URL, $url);
	$result = curl_exec($ch);
	$code   = curl_getinfo($ch, CURLINFO_HTTP_CODE);
	curl_close($ch);

	if ($code != 200) {
		self::debug('❌ Response code is not 200 in self_curl() [code] ' . var_export($code, true));
		return false;
	}

	return $result;
}
1197
/**
 * Wind down the current crawl run and persist its state.
 *
 * Saves the collected map statuses for the current crawler. When the run
 * stopped because the crawler reached its end, moves on to the next crawler
 * (wrapping to the first one after the last) and records the time costs.
 * Finally marks the crawler as stopped in the summary and saves it.
 *
 * @since 1.1.0
 * @access private
 */
private function _terminate_running() {
	$map                    = $this->cls('Crawler_Map');
	$this->_map_status_list = $map->save_map_status($this->_map_status_list, $this->_summary['curr_crawler']);

	if ('end' == $this->_end_reason) {
		// Current crawler is fully done: record its cost, rewind, and jump to the next crawler.
		$this->_summary['last_crawler_total_cost'] = time() - $this->_summary['curr_crawler_beginning_time'];
		$this->_summary['last_pos']                = 0;
		$this->_summary['curr_crawler']++;

		$crawler_total = count($this->list_crawlers());
		if ($this->_summary['curr_crawler'] >= $crawler_total) {
			self::debug('_terminate_running Touched end, whole crawled. Reload crawler!');
			// Every crawler has run: start over from the first one and log the full-cycle cost.
			$this->_summary['curr_crawler']        = 0;
			$this->_summary['done']                = 'touchedEnd';
			$this->_summary['last_full_time_cost'] = time() - $this->_summary['this_full_beginning_time'];
		}
	}

	$this->_summary['end_reason']  = $this->_end_reason;
	$this->_summary['is_running']  = 0;
	$this->_summary['last_status'] = 'stopped';
	self::save_summary();
}
1228
/**
 * List all crawlers.
 *
 * Collects the crawler "factors" ( factor => [ value => title, ... ] ) from the
 * settings (user roles, WebP, Guest Mode, mobile, cookies) and crosses them
 * through `_recursive_build_crawler()`, so each returned crawler holds a
 * `title` plus one value per factor. The result is memoised in `$this->_crawlers`.
 *
 * @since 1.9.1
 * @access public
 *
 * @return array List of crawler definitions.
 */
public function list_crawlers() {
	if ($this->_crawlers) {
		return $this->_crawlers;
	}

	$crawler_factors = array();

	// Add default Guest crawler
	$crawler_factors['uid'] = array( 0 => __('Guest', 'litespeed-cache') );

	// WebP on/off
	if ($this->conf(Base::O_IMG_OPTM_WEBP)) {
		$crawler_factors['webp'] = array( 1 => $this->cls('Media')->next_gen_image_title() );
		if (apply_filters('litespeed_crawler_webp', false)) {
			$crawler_factors['webp'][0] = '';
		}
	}

	// Guest Mode on/off
	if ($this->conf(Base::O_GUEST)) {
		$vary_name = $this->cls('Vary')->get_vary_name();
		$vary_val  = 'guest_mode:1';
		if (!defined('LSCWP_LOG')) {
			$vary_val = md5($this->conf(Base::HASH) . $vary_val);
		}
		$crawler_factors['cookie:' . $vary_name] = array(
			$vary_val => '',
			'_null'   => '<font data-balloon-pos="up" aria-label="Guest Mode">👒</font>',
		);
	}

	// Mobile crawler
	if ($this->conf(Base::O_CACHE_MOBILE)) {
		$crawler_factors['mobile'] = array(
			1 => '<font data-balloon-pos="up" aria-label="Mobile">📱</font>',
			0 => '',
		);
	}

	// Role crawlers: each configured user ID is titled with that user's first role.
	foreach ($this->conf(Base::O_CRAWLER_ROLES) as $v) {
		$role_title = '';
		$udata      = get_userdata($v);
		if (isset($udata->roles) && is_array($udata->roles)) {
			$tmp        = array_values($udata->roles);
			$role_title = array_shift($tmp);
		}
		if (!$role_title) {
			continue;
		}

		$crawler_factors['uid'][$v] = ucfirst($role_title);
	}

	// Cookie crawler
	foreach ($this->conf(Base::O_CRAWLER_COOKIES) as $v) {
		if (empty($v['name'])) {
			continue;
		}

		// A factor without any value would make the cross-product below come out
		// empty, i.e. wipe out every crawler. Ignore such incomplete entries.
		if (empty($v['vals']) || !is_array($v['vals'])) {
			continue;
		}

		$this_cookie_key = 'cookie:' . $v['name'];

		$crawler_factors[$this_cookie_key] = array();

		foreach ($v['vals'] as $v2) {
			$crawler_factors[$this_cookie_key][$v2] =
				$v2 == '_null' ? '' : '<font data-balloon-pos="up" aria-label="Cookie">🍪</font>' . esc_html($v['name']) . '=' . esc_html($v2);
		}
	}

	// Crossing generate the crawler list
	$this->_crawlers = $this->_recursive_build_crawler($crawler_factors);

	return $this->_crawlers;
}
1311
/**
 * Cross every crawler factor with every other one, one factor per recursion level.
 *
 * Each resulting crawler is an array holding a `title` (the non-empty value
 * titles joined with ' - ') plus one `factor => value` entry per factor.
 *
 * @since 2.8
 * @access private
 *
 * @param array $crawler_factors Factors as factor => [ value => title, ... ].
 * @param array $group           Partially built crawler carried down from the previous levels.
 * @param int   $i               Index of the factor handled at this level.
 * @return array List of fully built crawlers.
 */
private function _recursive_build_crawler( $crawler_factors, $group = array(), $i = 0 ) {
	$factor_names = array_keys($crawler_factors);
	$factor       = $factor_names[$i];
	$is_last      = count($crawler_factors) <= $i + 1;
	$parent_title = empty($group['title']) ? '' : $group['title'];

	$crawlers = array();

	foreach ($crawler_factors[$factor] as $value => $label) {
		// Work on a copy: $group is reused for every value of this factor.
		$entry          = $group;
		$entry['title'] = $parent_title;
		if ($label) {
			$entry['title'] .= ($entry['title'] ? ' - ' : '') . $label;
		}
		$entry[$factor] = $value;

		if ($is_last) {
			$crawlers[] = $entry;
			continue;
		}

		// Descend into the next factor and collect everything it produces.
		foreach ($this->_recursive_build_crawler($crawler_factors, $entry, $i + 1) as $child) {
			$crawlers[] = $child;
		}
	}

	return $crawlers;
}
1348
/**
 * Build the local filesystem path of the crawler meta file.
 *
 * The path is returned whether or not the file exists.
 *
 * @since 6.1
 * @access public
 *
 * @return string Path under LITESPEED_STATIC_DIR . '/crawler/'.
 */
public function json_local_path() {
	$crawler_dir = LITESPEED_STATIC_DIR . '/crawler/';

	return $crawler_dir . $this->_sitemeta;
}
1362
/**
 * Return the URL of the crawler meta file.
 *
 * @since 1.1.0
 * @access public
 *
 * @return string|false URL under LITESPEED_STATIC_URL, or false when the file does not exist locally.
 */
public function json_path() {
	$relative = '/crawler/' . $this->_sitemeta;

	return file_exists(LITESPEED_STATIC_DIR . $relative) ? LITESPEED_STATIC_URL . $relative : false;
}
1376
/**
 * Create the reset-position file and flag the crawler as not running.
 *
 * Writes the current timestamp to `$this->_resetfile` and saves
 * `is_running = 0` into the summary.
 *
 * @since 1.1.0
 * @access public
 */
public function reset_pos() {
	$stamp = time();
	File::save($this->_resetfile, $stamp, true);

	$summary_patch = array( 'is_running' => 0 );
	self::save_summary($summary_patch);
}
1388
/**
 * Render the per-crawler status dots for one URL.
 *
 * Each character of `$status_row` is the status for the crawler at that
 * position; the matching entry of `$reason_set` (comma separated) becomes the
 * tooltip of the dot.
 *
 * @since 3.0
 * @access public
 *
 * @param string $status_row One status character per crawler.
 * @param string $reason_set Comma-separated reasons, in the same order as the statuses.
 * @return string HTML for the dots, or '' when `$status_row` is empty.
 */
public function display_status( $status_row, $reason_set ) {
	if (!$status_row) {
		return '';
	}

	$_status_list = array(
		'-'                    => 'default',
		self::STATUS_MISS      => 'primary',
		self::STATUS_HIT       => 'success',
		self::STATUS_BLACKLIST => 'danger',
		self::STATUS_NOCACHE   => 'warning',
	);

	// Cast first: a null reason set must not reach explode().
	$reason_set = explode(',', (string) $reason_set);

	$status = '';
	foreach (str_split($status_row) as $k => $v) {
		// The reason list may be shorter than the status row.
		$reason = isset($reason_set[$k]) ? $reason_set[$k] : '';
		if ($reason == 'Man') {
			$reason = __('Manually added to blocklist', 'litespeed-cache');
		}
		if ($reason == 'Existed') {
			$reason = __('Previously existed in blocklist', 'litespeed-cache');
		}

		$tooltip = '';
		if ($reason) {
			// The reason ends up inside an attribute value, so escape it for that context.
			$tooltip = 'data-balloon-pos="up" aria-label="' . esc_attr($reason) . '"';
		}

		// Unknown status characters fall back to the neutral style.
		$style = isset($_status_list[$v]) ? $_status_list[$v] : 'default';

		$status .= '<i class="litespeed-dot litespeed-bg-' . $style . '" ' . $tooltip . '>' . ($k + 1) . '</i>';
	}

	return $status;
}
1427
/**
 * Output a message.
 *
 * Under WP-Cron the message is echoed as is; otherwise it is shown through a
 * JavaScript `alert()`. The method does not terminate the request.
 *
 * @since 1.1.0
 * @access protected
 * @param string $msg Error info
 */
protected function output( $msg ) {
	if (wp_doing_cron()) {
		echo $msg;
		return;
	}

	// json_encode() produces a valid JavaScript string literal (quotes, backslashes
	// and newlines included); the HEX flags additionally keep `<`, `>`, `&` and
	// quotes from terminating the inline <script> element.
	$js_msg = json_encode((string) $msg, JSON_HEX_TAG | JSON_HEX_AMP | JSON_HEX_APOS | JSON_HEX_QUOT);
	if (false === $js_msg) {
		// Encoding failed (e.g. invalid UTF-8): still emit syntactically valid script.
		$js_msg = '""';
	}

	echo '<script>alert(' . $js_msg . ');</script>';
}
1444
/**
 * Dispatch a crawler action requested through the main class.
 *
 * Reads the action type from `Router::verify_type()`, runs the matching
 * operation (unknown types do nothing) and then calls `Admin::redirect()`.
 * The blocklist add/delete actions only run when `$_GET['id']` is non-empty.
 *
 * @since 3.0
 * @access public
 */
public function handler() {
	$type = Router::verify_type();

	if (self::TYPE_REFRESH_MAP == $type) {
		$this->cls('Crawler_Map')->gen(true);
	} elseif (self::TYPE_EMPTY == $type) {
		$this->cls('Crawler_Map')->empty_map();
	} elseif (self::TYPE_BLACKLIST_EMPTY == $type) {
		$this->cls('Crawler_Map')->blacklist_empty();
	} elseif (self::TYPE_BLACKLIST_DEL == $type) {
		if (!empty($_GET['id'])) {
			$this->cls('Crawler_Map')->blacklist_del($_GET['id']);
		}
	} elseif (self::TYPE_BLACKLIST_ADD == $type) {
		if (!empty($_GET['id'])) {
			$this->cls('Crawler_Map')->blacklist_add($_GET['id']);
		}
	} elseif (self::TYPE_START == $type) {
		// Handle the ajax request to proceed crawler manually by admin
		self::start_async();
	} elseif (self::TYPE_RESET == $type) {
		$this->reset_pos();
	}

	Admin::redirect();
}
1493 }
1494