PluginProbe ʕ •ᴥ•ʔ
GPTranslate – Multilingual AI Translation for WordPress: Automatically Translate Websites / 2.18.1
GPTranslate – Multilingual AI Translation for WordPress: Automatically Translate Websites v2.18.1
2.33.6 2.33.5 2.33.2 2.32.10 2.33 2.33.1 2.32.6 2.32.7 2.32.8 trunk 2.10.3 2.10.4 2.10.5 2.10.6 2.11 2.12 2.13 2.14 2.14.1 2.15 2.15.1 2.16.1 2.16.2 2.17 2.18 2.18.1 2.18.2 2.19 2.20 2.21 2.22 2.23 2.24 2.25 2.25.1 2.25.2 2.26 2.27 2.27.10 2.27.5 2.28 2.28.1 2.29 2.30 2.31 2.32 2.32.5
gptranslate / simplehtmldom.php
gptranslate Last commit date
assets 6 months ago flags 6 months ago language 6 months ago gptranslate.php 6 months ago multilang-routing.php 6 months ago readme.txt 6 months ago serverside-translations.php 6 months ago settings.php 6 months ago simplehtmldom.php 6 months ago uninstall.php 6 months ago
simplehtmldom.php
1757 lines
1 <?php
2 if (!defined('ABSPATH')) exit;
3
// Node types (stored in a node's $nodetype).
define('GPTRANSLATE_HDOM_TYPE_ELEMENT', 1);
define('GPTRANSLATE_HDOM_TYPE_COMMENT', 2);
define('GPTRANSLATE_HDOM_TYPE_TEXT', 3);
define('GPTRANSLATE_HDOM_TYPE_ENDTAG', 4);
define('GPTRANSLATE_HDOM_TYPE_ROOT', 5);
define('GPTRANSLATE_HDOM_TYPE_UNKNOWN', 6);

// Quoting style recorded for each attribute value.
define('GPTRANSLATE_HDOM_QUOTE_DOUBLE', 0);
define('GPTRANSLATE_HDOM_QUOTE_SINGLE', 1);
define('GPTRANSLATE_HDOM_QUOTE_NO', 3);

// Keys of a node's "info" array ($_).
define('GPTRANSLATE_HDOM_INFO_BEGIN', 0);
define('GPTRANSLATE_HDOM_INFO_END', 1);
define('GPTRANSLATE_HDOM_INFO_QUOTE', 2);
define('GPTRANSLATE_HDOM_INFO_SPACE', 3);
define('GPTRANSLATE_HDOM_INFO_TEXT', 4);
define('GPTRANSLATE_HDOM_INFO_INNER', 5);
define('GPTRANSLATE_HDOM_INFO_OUTER', 6);
define('GPTRANSLATE_HDOM_INFO_ENDSPACE', 7);

// Parser defaults and limits.
define('GPTRANSLATE_DEFAULT_TARGET_CHARSET', 'UTF-8');
define('GPTRANSLATE_DEFAULT_BR_TEXT', "\r\n");
define('GPTRANSLATE_DEFAULT_SPAN_TEXT', " ");
define('GPTRANSLATE_MAX_FILE_SIZE', 600000);
25 // helper functions
26 // -----------------------------------------------------------------------------
27 // get html dom from file
28 // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
/**
 * Read a file or URL and parse its contents into a DOM.
 *
 * @param string   $url              Path or URL handed to file_get_contents().
 * @param bool     $use_include_path Forwarded to file_get_contents().
 * @param resource $context          Optional stream context, forwarded to file_get_contents().
 * @param int      $offset           Read offset. The legacy default of -1 is treated as "start of stream".
 * @param int      $maxLen           Accepted for signature compatibility; not applied to the read.
 * @param bool     $lowercase        Forwarded to the parser constructor and load().
 * @param bool     $forceTagsClosed  Forwarded to the parser constructor.
 * @param string   $target_charset   Forwarded to the parser constructor.
 * @param bool     $stripRN          Forwarded to the parser constructor and load().
 * @param string   $defaultBRText    Forwarded to the parser constructor.
 * @param string   $defaultSpanText  Forwarded to the parser constructor.
 * @return GPTranslateSimpleHtmlDom|false The loaded DOM, or false when nothing could be read
 *                                        or the content exceeds GPTRANSLATE_MAX_FILE_SIZE bytes.
 */
function gptranslate_simplehtmldom_file_get_html($url, $use_include_path = false, $context = null, $offset = -1, $maxLen = -1, $lowercase = true, $forceTagsClosed = true, $target_charset = GPTRANSLATE_DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = GPTRANSLATE_DEFAULT_BR_TEXT, $defaultSpanText = GPTRANSLATE_DEFAULT_SPAN_TEXT) {
    // We DO force the tags to be terminated.
    $dom = new GPTranslateSimpleHtmlDom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // Since PHP 7.1 a negative offset seeks from the END of the stream, so forwarding the
    // legacy default of -1 would read only the last byte (or fail on non-seekable streams).
    // Treat that sentinel as "read from the beginning"; any other value is passed through.
    $start = ($offset === -1) ? 0 : $offset;

    $contents = file_get_contents($url, $use_include_path, $context, $start);

    // empty() also covers the false returned when the read fails.
    if (empty($contents) || strlen($contents) > GPTRANSLATE_MAX_FILE_SIZE) {
        // Release the unused parser, matching gptranslate_simplehtmldom_str_get_html().
        $dom->clear();
        return false;
    }

    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
43
/**
 * Parse an HTML string into a DOM.
 *
 * @param string $str HTML markup to parse.
 * @return GPTranslateSimpleHtmlDom|false The loaded DOM, or false when $str is empty
 *                                        or longer than GPTRANSLATE_MAX_FILE_SIZE bytes.
 */
function gptranslate_simplehtmldom_str_get_html($str, $lowercase = true, $forceTagsClosed = true, $target_charset = GPTRANSLATE_DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = GPTRANSLATE_DEFAULT_BR_TEXT, $defaultSpanText = GPTRANSLATE_DEFAULT_SPAN_TEXT) {
    $document = new GPTranslateSimpleHtmlDom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    $parsable = !empty($str) && strlen($str) <= GPTRANSLATE_MAX_FILE_SIZE;
    if (!$parsable) {
        $document->clear();
        return false;
    }

    $document->load($str, $lowercase, $stripRN);
    return $document;
}
54
55 /**
56 * simple html dom node
57 * PaperG - added ability for "find" routine to lowercase the value of the selector.
58 * PaperG - added $tag_start to track the start position of the tag in the total byte index
59 *
60 * @package PlaceLocalInclude
61 */
62 class GPTranslateSimpleHtmlDomNode {
63 public $nodetype = GPTRANSLATE_HDOM_TYPE_TEXT;
64 public $tag = 'text';
65 public $attr = array ();
66 public $children = array ();
67 public $nodes = array ();
68 public $parent = null;
69 // The "info" array - see GPTRANSLATE_HDOM_INFO_... for what each element contains.
70 public $_ = array ();
71 public $tag_start = 0;
72 public $no_remove = null;
73 private $dom = null;
74 private $gptranslate_debug_object = null;
/**
 * Bind the node to its owning document and register it in that document's node list.
 *
 * @param object $dom Owning document; must expose a public $nodes array.
 */
public function __construct($dom) {
    $this->dom = $dom;
    $dom->nodes[] = $this;
}
/**
 * Drop all object references held by this node when it is destroyed.
 */
public function __destruct() {
    $this->clear();
}
/**
 * String conversion yields the node's outer text.
 *
 * @return string
 */
public function __toString() {
    return $this->outertext();
}
85
/**
 * Null out every reference to other nodes and to the document so that
 * circular references do not keep the tree alive.
 */
public function clear() {
    $this->children = null;
    $this->nodes = null;
    $this->parent = null;
    $this->dom = null;
}
93
/**
 * Echo an outline of this node and, recursively, of everything in $nodes (debug aid).
 *
 * @param bool $show_attr Whether to print the attributes after the tag name.
 * @param int  $deep      Nesting depth; one leading space is printed per level.
 */
public function dump($show_attr = true, $deep = 0) {
    // phpcs:ignore WordPress.Security.EscapeOutput.OutputNotEscaped -- output is safe
    echo str_repeat(' ', $deep) . $this->tag;

    if ($show_attr && count($this->attr) > 0) {
        echo '(';
        foreach (array_keys($this->attr) as $name) {
            // phpcs:ignore WordPress.Security.EscapeOutput.OutputNotEscaped -- output is safe
            echo "[$name]=>\"" . $this->$name . '", ';
        }
        echo ')';
    }
    echo "\n";

    if (!$this->nodes) {
        return;
    }
    foreach ($this->nodes as $child) {
        $child->dump($show_attr, $deep + 1);
    }
}
114
/**
 * Debugging helper: describe this single node (tag, attributes, info array,
 * inner info, child counts and start position) on one line.
 *
 * @param bool $echo When true the description is echoed and nothing is returned;
 *                   when false it is returned instead.
 * @return string|null The description when $echo is false, otherwise null.
 */
public function dump_node($echo = true) {
    $string = $this->tag;

    if (count($this->attr) > 0) {
        $string .= '(';
        foreach ($this->attr as $k => $v) {
            $string .= "[$k]=>\"" . $this->$k . '", ';
        }
        $string .= ')';
    }

    if (count($this->_) > 0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k => $v) {
            if (is_array($v)) {
                $string .= "[$k]=>(";
                foreach ($v as $k2 => $v2) {
                    // Attribute spacing entries are themselves arrays of strings;
                    // join them rather than triggering an array-to-string conversion.
                    $string .= "[$k2]=>\"" . (is_array($v2) ? implode('', $v2) : $v2) . '", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"" . $v . '", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text)) {
        $string .= " text: (" . $this->text . ")";
    }

    // Report this node's own inner info (previously read from an undefined $node,
    // so the branch below always printed NULL).
    $string .= " GPTRANSLATE_HDOM_INNER_INFO: '";
    if (isset($this->_[GPTRANSLATE_HDOM_INFO_INNER])) {
        $string .= $this->_[GPTRANSLATE_HDOM_INFO_INNER] . "'";
    } else {
        $string .= ' NULL ';
    }

    // clear() sets both lists to null; count() on null is a TypeError on PHP 8.
    $string .= " children: " . (is_array($this->children) ? count($this->children) : 0);
    $string .= " nodes: " . (is_array($this->nodes) ? count($this->nodes) : 0);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo) {
        // phpcs:ignore WordPress.Security.EscapeOutput.OutputNotEscaped -- output is safe
        echo $string;
        return;
    }
    return $string;
}
165
/**
 * Get the parent node, or re-parent this node when one is supplied.
 *
 * Re-parenting only appends this node to the new parent's $nodes and $children;
 * it does not detach the node from its previous parent's lists.
 *
 * @param object|null $parent New parent, or null to just read the current one.
 * @return object|null The (possibly updated) parent.
 */
public function parent($parent = null) {
    if ($parent === null) {
        return $this->parent;
    }

    $this->parent = $parent;
    $this->parent->nodes[] = $this;
    $this->parent->children[] = $this;

    return $this->parent;
}
179
/**
 * Tell whether the node has at least one entry in $children.
 *
 * @return bool
 */
public function has_child() {
    return (bool) $this->children;
}
184
/**
 * Return all children, or one child by position.
 *
 * @param int $idx -1 for the whole list, otherwise the index to fetch.
 * @return array|object|null The list, the child at $idx, or null if no such child exists.
 */
public function children($idx = -1) {
    if ($idx === -1) {
        return $this->children;
    }
    return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
195
/**
 * Collect every descendant of $element, depth-first, parents before their children.
 *
 * @param object $element     Node whose descendants are gathered.
 * @param array  $allChildren Accumulator, filled by reference and also returned.
 * @return array
 */
public function childrenRecursive($element, &$allChildren = array()) {
    if (!$element->hasChildNodes()) {
        return $allChildren;
    }

    foreach ($element->childNodes() as $descendant) {
        $allChildren[] = $descendant;
        if ($descendant->hasChildNodes()) {
            $this->childrenRecursive($descendant, $allChildren);
        }
    }
    return $allChildren;
}
208
/**
 * Return the first entry of $children, or null when there are none.
 *
 * @return object|null
 */
public function first_child() {
    return (count($this->children) > 0) ? $this->children[0] : null;
}
216
/**
 * Return the last entry of $children, or null when there are none.
 *
 * @return object|null
 */
public function last_child() {
    $total = count($this->children);
    if ($total <= 0) {
        return null;
    }
    return $this->children[$total - 1];
}
224
/**
 * Return the entry that follows this node in its parent's $children.
 *
 * @return object|null Null when there is no parent, no following entry,
 *                     or this node is not found in the parent's $children.
 */
public function next_sibling() {
    if ($this->parent === null) {
        return null;
    }

    $total = count($this->parent->children);
    for ($position = 0; $position < $total; ++$position) {
        if ($this === $this->parent->children[$position]) {
            break;
        }
    }

    $following = $position + 1;
    if ($following >= $total) {
        return null;
    }
    return $this->parent->children[$following];
}
241
/**
 * Return the entry that precedes this node in its parent's $children.
 *
 * @return object|null Null when there is no parent, this node is the first entry,
 *                     or this node is not listed in the parent's $children.
 */
public function prev_sibling() {
    if ($this->parent === null) {
        return null;
    }

    $siblings = $this->parent->children;
    $count = count($siblings);

    $idx = 0;
    while ($idx < $count && $this !== $siblings[$idx]) {
        ++$idx;
    }

    // $idx === $count means the search fell off the end: this node is not one of the
    // parent's children. Previously that case stepped back to the parent's LAST child
    // and returned it as if it were the previous sibling.
    if ($idx >= $count || $idx === 0) {
        return null;
    }
    return $siblings[$idx - 1];
}
254
/**
 * Walk from this node up through $parent links and return the first node
 * (this node included) whose tag equals $tag.
 *
 * @param string $tag Tag name to look for (loose comparison).
 * @return object|null The matching node, or null when the chain ends without a match.
 */
public function find_ancestor_tag($tag) {
    $tracing = is_object($this->gptranslate_debug_object);
    if ($tracing) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    // Start by including ourselves in the comparison.
    for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent) {
        if ($tracing) {
            $this->gptranslate_debug_object->debug_log(2, "Current tag is: " . $candidate->tag);
        }
        if ($candidate->tag == $tag) {
            return $candidate;
        }
    }
    return null;
}
276
/**
 * Return the node's inner HTML.
 *
 * An explicitly stored inner value wins; otherwise stored text is returned with
 * noise restored; otherwise the outer text of every entry in $nodes is concatenated.
 *
 * @return string
 */
public function innertext() {
    if (isset($this->_[GPTRANSLATE_HDOM_INFO_INNER])) {
        return $this->_[GPTRANSLATE_HDOM_INFO_INNER];
    }
    if (isset($this->_[GPTRANSLATE_HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[GPTRANSLATE_HDOM_INFO_TEXT]);
    }

    $html = '';
    foreach ($this->nodes as $child) {
        $html .= $child->outertext();
    }
    return $html;
}
289
/**
 * Return the node's outer HTML (begin tag, content, end tag).
 *
 * The root node renders as its inner text. The document callback, when set, is
 * invoked with this node before rendering. A stored outer value or stored text
 * short-circuits the tag/inner/end assembly.
 *
 * @return string
 */
public function outertext() {
    if (is_object($this->gptranslate_debug_object)) {
        $suffix = '';
        if ($this->tag == 'text' && !empty($this->text)) {
            $suffix = " with text: " . $this->text;
        }
        $this->gptranslate_debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $suffix);
    }

    if ($this->tag === 'root') {
        return $this->innertext();
    }

    // trigger callback
    if ($this->dom && $this->dom->callback !== null) {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[GPTRANSLATE_HDOM_INFO_OUTER])) {
        return $this->_[GPTRANSLATE_HDOM_INFO_OUTER];
    }
    if (isset($this->_[GPTRANSLATE_HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[GPTRANSLATE_HDOM_INFO_TEXT]);
    }

    // begin tag
    $html = "";
    if ($this->dom && $this->dom->nodes[$this->_[GPTRANSLATE_HDOM_INFO_BEGIN]]) {
        $html = $this->dom->nodes[$this->_[GPTRANSLATE_HDOM_INFO_BEGIN]]->makeup();
    }

    // content
    if (isset($this->_[GPTRANSLATE_HDOM_INFO_INNER])) {
        // A br tag never emits the stored inner info.
        if ($this->tag != "br") {
            $html .= $this->_[GPTRANSLATE_HDOM_INFO_INNER];
        }
    } elseif ($this->nodes) {
        foreach ($this->nodes as $child) {
            $html .= $this->convert_text($child->outertext());
        }
    }

    // end tag
    if (isset($this->_[GPTRANSLATE_HDOM_INFO_END]) && $this->_[GPTRANSLATE_HDOM_INFO_END] != 0) {
        $html .= '</' . $this->tag . '>';
    }
    return $html;
}
343
/**
 * Return the node's plain text.
 *
 * Comment and unknown nodes, style elements, and (unless $script is true)
 * script elements yield an empty string. The text of a span gets the document's
 * default span text appended.
 *
 * @param bool $script When true, script elements are not blanked out.
 * @return string
 */
public function text($script = false) {
    if (isset($this->_[GPTRANSLATE_HDOM_INFO_INNER])) {
        return $this->_[GPTRANSLATE_HDOM_INFO_INNER];
    }

    $type = $this->nodetype;
    if ($type == GPTRANSLATE_HDOM_TYPE_TEXT) {
        return $this->dom->restore_noise($this->_[GPTRANSLATE_HDOM_INFO_TEXT]);
    }
    if ($type == GPTRANSLATE_HDOM_TYPE_COMMENT || $type == GPTRANSLATE_HDOM_TYPE_UNKNOWN) {
        return '';
    }

    if (strcasecmp($this->tag, 'script') === 0 && !$script) {
        return '';
    }
    if (strcasecmp($this->tag, 'style') === 0) {
        return '';
    }

    // $nodes can be null (clear() sets it so); there is nothing to render then.
    if (is_null($this->nodes)) {
        return '';
    }

    $plain = '';
    foreach ($this->nodes as $child) {
        $plain .= $this->convert_text($child->text());
    }

    // Keep adjacent spans from running into each other in plain text.
    if ($this->tag == "span") {
        $plain .= $this->dom->default_span_text;
    }
    return $plain;
}
/**
 * Return the inner text with CDATA section markers removed.
 *
 * @return string
 */
public function xmltext() {
    $withoutOpeners = str_ireplace('<![CDATA[', '', $this->innertext());
    return str_replace(']]>', '', $withoutOpeners);
}
383
/**
 * Build the node's opening tag, including its attributes.
 *
 * Nodes that carry stored text (text, comment, unknown) return that text with
 * noise restored instead of a tag.
 *
 * @return string
 */
public function makeup() {
    if (isset($this->_[GPTRANSLATE_HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[GPTRANSLATE_HDOM_INFO_TEXT]);
    }

    $markup = '<' . $this->tag;
    $slot = -1;

    foreach ($this->attr as $name => $value) {
        // The slot advances for every attribute, removed ones included, so it
        // stays aligned with the per-attribute spacing and quote info.
        $slot++;

        if ($value === null || $value === false) {
            continue;
        }

        $markup .= $this->_[GPTRANSLATE_HDOM_INFO_SPACE][$slot][0];

        // Attribute without a value: nowrap, checked, selected...
        if ($value === true) {
            $markup .= $name;
            continue;
        }

        $style = $this->_[GPTRANSLATE_HDOM_INFO_QUOTE][$slot];
        if ($style == GPTRANSLATE_HDOM_QUOTE_DOUBLE) {
            $quote = '"';
        } elseif ($style == GPTRANSLATE_HDOM_QUOTE_SINGLE) {
            $quote = '\'';
        } else {
            $quote = '';
        }

        $markup .= $name . $this->_[GPTRANSLATE_HDOM_INFO_SPACE][$slot][1] . '=' . $this->_[GPTRANSLATE_HDOM_INFO_SPACE][$slot][2] . $quote . $value . $quote;
    }

    $markup = $this->dom->restore_noise($markup);
    return $markup . $this->_[GPTRANSLATE_HDOM_INFO_ENDSPACE] . '>';
}
421
/**
 * Find descendant nodes by CSS-style selector.
 *
 * @param string   $selector  Selector string; comma-separated groups are unioned.
 * @param int|null $idx       Null for every match; otherwise the position to return
 *                            (negative values count back from the end).
 * @param bool     $lowercase Passed to seek() for case-insensitive value matching.
 * @return array|object|null  Matches ordered by node index, a single match, or null
 *                            when $idx is out of range.
 */
public function find($selector, $idx = null, $lowercase = false) {
    $groups = $this->parse_selector($selector);
    if (count($groups) === 0) {
        return array();
    }

    $matchedIds = array();

    foreach ($groups as $chain) {
        if (count($chain) === 0) {
            return array();
        }
        if (!isset($this->_[GPTRANSLATE_HDOM_INFO_BEGIN])) {
            return array();
        }

        // Descendant selectors are resolved level by level, without recursion.
        $frontier = array(
            $this->_[GPTRANSLATE_HDOM_INFO_BEGIN] => 1
        );
        foreach ($chain as $step) {
            $reached = array();
            foreach ($frontier as $nodeId => $unused) {
                $origin = ($nodeId === -1) ? $this->dom->root : $this->dom->nodes[$nodeId];
                $origin->seek($step, $reached, $lowercase);
            }
            $frontier = $reached;
        }

        foreach ($frontier as $nodeId => $unused) {
            $matchedIds[$nodeId] = 1;
        }
    }

    ksort($matchedIds);

    $found = array();
    foreach ($matchedIds as $nodeId => $unused) {
        $found[] = $this->dom->nodes[$nodeId];
    }

    if (is_null($idx)) {
        return $found;
    }
    if ($idx < 0) {
        $idx = count($found) + $idx;
    }
    return isset($found[$idx]) ? $found[$idx] : null;
}
475
/**
 * Collect the indexes of the document nodes inside this node that satisfy one
 * parsed selector step.
 *
 * @param array $selector  One step as produced by parse_selector():
 *                         array($tag, $key, $val, $exp, $no_key).
 * @param array $ret       Result set, filled by reference: node index => 1.
 * @param bool  $lowercase When true, attribute values are compared case-insensitively.
 */
protected function seek($selector, &$ret, $lowercase = false) {
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index: pick the $key-th child with a matching tag.
    if ($tag && $key && is_numeric($key)) {
        $count = 0;
        foreach ($this->children as $c) {
            if ($tag === '*' || $tag === $c->tag) {
                if (++$count == $key) {
                    $ret[$c->_[GPTRANSLATE_HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    $end = (!empty($this->_[GPTRANSLATE_HDOM_INFO_END])) ? $this->_[GPTRANSLATE_HDOM_INFO_END] : 0;
    if ($end == 0) {
        // No end index of our own: borrow one from the nearest ancestor that has it.
        $parent = $this->parent;
        while ($parent !== null && !isset($parent->_[GPTRANSLATE_HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        // The walk can run off the top of the tree; only read the ancestor's end
        // index when one was actually found (previously this dereferenced null).
        if ($parent !== null) {
            $end += $parent->_[GPTRANSLATE_HDOM_INFO_END];
        }
    }

    for ($i = $this->_[GPTRANSLATE_HDOM_INFO_BEGIN] + 1; $i < $end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        // Bare "*" selects direct children only.
        if ($tag === '*' && !$key) {
            if (in_array($node, $this->children, true)) {
                $ret[$i] = 1;
            }
            continue;
        }

        // compare tag
        if ($tag && $tag != $node->tag && $tag !== '*') {
            $pass = false;
        }
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) {
                    $pass = false;
                }
            } else {
                if (($key != "plaintext") && !isset($node->attr[$key])) {
                    $pass = false;
                }
            }
        }
        // compare value
        if ($pass && $key && $val && $val !== '*') {
            // A "plaintext" search compares against the node's plain text
            // rather than against an attribute value.
            if ($key == "plaintext") {
                $nodeKeyValue = $node->text();
            } else {
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($this->gptranslate_debug_object)) {
                $this->gptranslate_debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);
            }

            // If lowercase is set, do a case insensitive test of the value of the selector.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($this->gptranslate_debug_object)) {
                $this->gptranslate_debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));
            }

            // handle multiple class
            if (!$check && strcasecmp($key, 'class') === 0) {
                foreach (explode(' ', $node->attr[$key]) as $k) {
                    // Leading, trailing or doubled spaces produce blank entries; skip them.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) {
                            break;
                        }
                    }
                }
            }
            if (!$check) {
                $pass = false;
            }
        }
        if ($pass) {
            $ret[$i] = 1;
        }
        unset($node);
    }
    // $ret is passed by reference, so it is effectively this function's return value.
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log(1, "EXIT - ret: ", $ret);
    }
}
/**
 * Test a value against a selector pattern using the given comparison operator.
 *
 * @param string $exp     Operator: '=', '!=', '^=' (prefix), '$=' (suffix) or '*=' (contains).
 * @param string $pattern Selector value. For '*=', a pattern that starts with '/' is used
 *                        verbatim as a regular expression.
 * @param string $value   Value taken from the node.
 * @return bool|int Boolean for '=' and '!=', the preg_match() result for the other
 *                  operators, false for an unknown operator.
 */
protected function match($exp, $pattern, $value) {
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    switch ($exp) {
        case '=':
            return ($value === $pattern);
        case '!=':
            return ($value !== $pattern);
        case '^=':
            return preg_match("/^" . preg_quote($pattern, '/') . "/", $value);
        case '$=':
            return preg_match("/" . preg_quote($pattern, '/') . "$/", $value);
        case '*=':
            // Explicit regular expression supplied by the caller.
            if (isset($pattern[0]) && $pattern[0] == '/') {
                return preg_match($pattern, $value);
            }
            // Plain "contains" test. The pattern is quoted like the '^=' and '$=' cases:
            // unquoted, a value containing '/' produced an invalid expression and other
            // metacharacters ('.', '?', '(' ...) changed what was matched.
            return preg_match("/" . preg_quote($pattern, '/') . "/i", $value);
    }
    return false;
}
/**
 * Parse a selector string into groups of steps.
 *
 * Each comma-separated group becomes a list of steps; each step is
 * array($tag, $key, $val, $exp, $no_key). '#x' maps to key 'id', '.x' to key
 * 'class', and a bracketed attribute overrides the key (and, when present, the
 * operator and value). A key prefixed with '!' sets $no_key.
 *
 * @param string $selector_string
 * @return array List of groups, each a list of steps.
 */
protected function parse_selector($selector_string) {
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    // Pattern of CSS selectors, modified from mootools. The attribute name may
    // contain a colon (<tag attr:ibute="x">); such attributes must be read with
    // getAttribute() because $node->x:y is not valid PHP. An optional leading '@'
    // inside the brackets is accepted but not captured.
    $pattern = "/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string) . ' ', $matches, PREG_SET_ORDER);
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log(2, "Matches Array: ", $matches);
    }

    $selectors = array();
    $steps = array();

    foreach ($matches as $m) {
        $whole = trim($m[0]);
        if ($whole === '' || $whole === '/' || $whole === '//') {
            continue;
        }
        // for browser generated xpath
        if ($m[1] === 'tbody') {
            continue;
        }

        $tag = $m[1];
        $key = '';
        $val = null;
        $exp = '=';
        $no_key = false;

        if (!empty($m[2])) {
            $key = 'id';
            $val = $m[2];
        }
        if (!empty($m[3])) {
            $key = 'class';
            $val = $m[3];
        }
        if (!empty($m[4])) {
            $key = $m[4];
        }
        if (!empty($m[5])) {
            $exp = $m[5];
        }
        if (!empty($m[6])) {
            $val = $m[6];
        }

        // convert to lowercase
        if ($this->dom->lowercase) {
            $tag = strtolower($tag);
            $key = strtolower($key);
        }
        // elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0] === '!') {
            $key = substr($key, 1);
            $no_key = true;
        }

        $steps[] = array(
            $tag,
            $key,
            $val,
            $exp,
            $no_key
        );

        // A comma closes the current group.
        if (trim($m[7]) === ',') {
            $selectors[] = $steps;
            $steps = array();
        }
    }

    if (count($steps) > 0) {
        $selectors[] = $steps;
    }
    return $selectors;
}
/**
 * Magic read access: attributes first, then the virtual text properties.
 *
 * @param string $name Attribute name, or one of outertext / innertext / plaintext / xmltext.
 * @return mixed The charset-converted attribute value, the requested text, or for any
 *               other name a boolean telling whether the attribute key exists.
 */
public function __get($name) {
    if (isset($this->attr[$name])) {
        return $this->convert_text($this->attr[$name]);
    }

    if ($name == 'outertext') {
        return $this->outertext();
    }
    if ($name == 'innertext') {
        return $this->innertext();
    }
    if ($name == 'plaintext') {
        return $this->text();
    }
    if ($name == 'xmltext') {
        return $this->xmltext();
    }
    return array_key_exists($name, $this->attr);
}
/**
 * Magic write access: overrides outer/inner text or sets an attribute.
 *
 * A new attribute also gets default spacing (single leading space) and
 * double-quote info so makeup() can render it.
 *
 * @param string $name  'outertext', 'innertext', or an attribute name.
 * @param mixed  $value Value to store.
 * @return mixed The stored value for outertext/innertext; nothing for attributes.
 */
public function __set($name, $value) {
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    if ($name == 'outertext') {
        return $this->_[GPTRANSLATE_HDOM_INFO_OUTER] = $value;
    }
    if ($name == 'innertext') {
        // Nodes that carry stored text have it replaced; others get an inner override.
        if (isset($this->_[GPTRANSLATE_HDOM_INFO_TEXT])) {
            return $this->_[GPTRANSLATE_HDOM_INFO_TEXT] = $value;
        }
        return $this->_[GPTRANSLATE_HDOM_INFO_INNER] = $value;
    }

    if (!isset($this->attr[$name])) {
        $this->_[GPTRANSLATE_HDOM_INFO_SPACE][] = array(
            ' ',
            '',
            ''
        );
        $this->_[GPTRANSLATE_HDOM_INFO_QUOTE][] = GPTRANSLATE_HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
/**
 * Magic isset: the virtual text properties always exist; anything else is
 * looked up among the attributes (value-less attributes count as present).
 *
 * @param string $name
 * @return bool
 */
public function __isset($name) {
    switch ($name) {
        case 'outertext':
        case 'innertext':
        case 'plaintext':
            return true;
    }
    // no value attr: nowrap, checked selected...
    return array_key_exists($name, $this->attr) || isset($this->attr[$name]);
}
/**
 * Magic unset: remove an attribute that currently has a non-null value.
 *
 * @param string $name Attribute name.
 */
public function __unset($name) {
    if (!isset($this->attr[$name])) {
        return;
    }
    unset($this->attr[$name]);
}
745
/**
 * Convert text from the document's source charset to its target charset when
 * the two differ, and strip a UTF-8 byte order mark from either end of the result.
 *
 * @param string $text
 * @return string|false The converted text (iconv() may return false on failure).
 */
public function convert_text($text) {
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    $sourceCharset = "";
    $targetCharset = "";
    if ($this->dom) {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);
    }

    $result = $text;

    $charsetsDiffer = !empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0);
    if ($charsetsDiffer) {
        // The reported source encoding may be wrong: text that is already UTF-8
        // is left untouched when UTF-8 is the target.
        $alreadyTarget = (strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text));
        if (!$alreadyTarget) {
            $result = iconv($sourceCharset, $targetCharset, $text);
        }
    }

    // Never let a BOM leak into UTF-8 output.
    if ($targetCharset == 'UTF-8') {
        $bom = "\xef\xbb\xbf";
        if (substr($result, 0, 3) == $bom) {
            $result = substr($result, 3);
        }
        if (substr($result, -3) == $bom) {
            $result = substr($result, 0, -3);
        }
    }

    return $result;
}
786
/**
 * Returns true if $str is valid UTF-8 and false otherwise.
 *
 * Validation is delegated to PCRE: matching the empty pattern with the "u"
 * modifier fails when the subject is not valid UTF-8. This replaces a hand-written
 * byte scanner that treated the lone continuation byte 0x80 as valid and accepted
 * the obsolete 5- and 6-byte forms.
 *
 * @param mixed $str Value to test. Non-string and empty values are reported as
 *                   valid, as before.
 * @return boolean
 */
public static function is_utf8($str) {
    if (!is_string($str) || $str === '') {
        return true;
    }
    return preg_match('//u', $str) === 1;
}
831
/**
 * Try to determine the displayed size of an img on the page.
 * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
 *
 * The width/height attributes take precedence. For a dimension without such an
 * attribute, an inline style declaration in whole pixels ("width: 120px") is used.
 *
 * @author John Schlick
 * @version April 19 2012
 * @return array|false array('height' => ..., 'width' => ...) where each entry is the
 *                     value found or -1 if it could not be determined; false when
 *                     the node is not an img.
 */
public function get_display_size() {
    if ($this->tag !== 'img') {
        return false;
    }

    $dimensions = array('width', 'height');
    $size = array(
        'width' => -1,
        'height' => -1
    );

    // Explicit attributes on the tag itself.
    foreach ($dimensions as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Inline style declarations.
    if (isset($this->attr['style'])) {
        $declarations = array();
        preg_match_all("/([\w\-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            // The value capture is greedy and keeps whitespace that precedes ';',
            // which would defeat the "px" suffix test below.
            $declarations[$match[1]] = trim($match[2]);
        }

        foreach ($dimensions as $dimension) {
            if (!isset($declarations[$dimension]) || $size[$dimension] != -1) {
                continue;
            }
            // Only pixel lengths are understood.
            if (strtolower(substr($declarations[$dimension], -2)) != 'px') {
                continue;
            }
            $proposed = substr($declarations[$dimension], 0, -2);
            // filter_var() returns the integer itself, so a strict comparison is
            // needed for "0" to count as a valid size.
            if (filter_var($proposed, FILTER_VALIDATE_INT) !== false) {
                $size[$dimension] = $proposed;
            }
        }
    }

    // Not handled: sizes that come from class/id rules, from ancestor selectors,
    // or from external stylesheets.

    return array(
        'height' => $size['height'],
        'width' => $size['width']
    );
}
907
908 // camel naming conventions
/**
 * @return array The node's attribute map.
 */
public function getAllAttributes() {
    return $this->attr;
}
/**
 * Alias of magic property read; needed for names that are not valid PHP identifiers.
 *
 * @param string $name
 * @return mixed
 */
public function getAttribute($name) {
    return $this->__get($name);
}
/**
 * Alias of magic property write.
 *
 * @param string $name
 * @param mixed  $value
 */
public function setAttribute($name, $value) {
    $this->__set($name, $value);
}
/**
 * Alias of magic isset.
 *
 * @param string $name
 * @return bool
 */
public function hasAttribute($name) {
    return $this->__isset($name);
}
/**
 * Mark an attribute as removed by setting its value to null.
 *
 * @param string $name
 */
public function removeAttribute($name) {
    $this->__set($name, null);
}
/**
 * First descendant matching the id selector "#$id".
 *
 * @param string $id
 * @return object|null
 */
public function getElementById($id) {
    return $this->find("#$id", 0);
}
/**
 * Descendants matching the id selector "#$id".
 *
 * @param string   $id
 * @param int|null $idx Null for all matches, otherwise the position to return.
 * @return array|object|null
 */
public function getElementsById($id, $idx = null) {
    return $this->find("#$id", $idx);
}
/**
 * First descendant matching the selector $name.
 *
 * @param string $name
 * @return object|null
 */
public function getElementByTagName($name) {
    return $this->find($name, 0);
}
/**
 * Descendants matching the selector $name.
 *
 * @param string   $name
 * @param int|null $idx Null for all matches, otherwise the position to return.
 * @return array|object|null
 */
public function getElementsByTagName($name, $idx = null) {
    return $this->find($name, $idx);
}
/**
 * Alias of parent() without an argument.
 *
 * @return object|null
 */
public function parentNode() {
    return $this->parent();
}
/**
 * Alias of children().
 *
 * @param int $idx -1 for the whole list, otherwise the index to fetch.
 * @return array|object|null
 */
public function childNodes($idx = -1) {
    return $this->children($idx);
}
/**
 * Alias of first_child().
 *
 * @return object|null
 */
public function firstChild() {
    return $this->first_child();
}
/**
 * Alias of last_child().
 *
 * @return object|null
 */
public function lastChild() {
    return $this->last_child();
}
/**
 * Alias of next_sibling().
 *
 * @return object|null
 */
public function nextSibling() {
    return $this->next_sibling();
}
/**
 * Alias of prev_sibling().
 *
 * @return object|null
 */
public function previousSibling() {
    return $this->prev_sibling();
}
/**
 * Alias of has_child().
 *
 * @return bool
 */
public function hasChildNodes() {
    return $this->has_child();
}
/**
 * @return string The node's tag name.
 */
public function nodeName() {
    return $this->tag;
}
/**
 * Make this node the parent of $node (see parent()).
 *
 * @param object $node Node to adopt.
 * @return object The same $node.
 */
public function appendChild($node) {
    $node->parent($this);
    return $node;
}
964 }
965
966 /**
967 * simple html dom parser
968 * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
969 * Paperg - change $size from protected to public so we can easily access it
970 * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
971 *
972 * @package PlaceLocalInclude
973 */
974 class GPTranslateSimpleHtmlDom {
// Root node of the parsed tree; created by prepare().
public $root = null;
// Node list that clear() walks; reset by prepare().
public $nodes = array();
// Callback registered via set_callback() / removed via remove_callback().
public $callback = null;
// When true, tag and attribute names are lower-cased while parsing.
public $lowercase = false;
// Byte length of the input exactly as it was handed to prepare().
public $original_size;
// Byte length of the working document (changes as noise is swapped out).
public $size;
// Parser state: byte offset, working document, current character, node counter, open parent.
protected $pos;
protected $doc;
protected $char;
protected $cursor;
protected $parent;
// Placeholder key => markup fragment removed by remove_noise().
protected $noise = array();
// Character sets handed to the skip()/copy_until() scanners.
protected $token_blank = " \t\r\n";
protected $token_equal = ' =/>';
protected $token_slash = " />\r\n\t";
protected $token_attr = ' >';
// Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
public $_charset = '';
public $_target_charset = '';
// Replacement text for <br> (applied in read_tag()) and the default span text.
protected $default_br_text = "";
public $default_span_text = "";
// Optional debug logger; only used when it is an object.
protected $gptranslate_debug_object = null;

// Lookup tables are keyed by tag name so membership is an isset() test
// rather than an in_array() scan.

// Tags that never become the open parent even without a trailing "/".
protected $self_closing_tags = array(
    'img' => 1, 'br' => 1, 'input' => 1, 'meta' => 1, 'link' => 1,
    'hr' => 1, 'base' => 1, 'embed' => 1, 'spacer' => 1
);

// End tags that may close several still-open ancestors at once.
protected $block_tags = array(
    'root' => 1, 'body' => 1, 'form' => 1, 'div' => 1, 'span' => 1, 'table' => 1
);

// Known sourceforge issue #2977341
// B tags that are not closed cause us to return everything to the end of the document.
// Outer key: the tag being opened. Inner keys: open parent tags that the
// new tag closes implicitly.
protected $optional_closing_tags = array(
    'tr' => array('tr' => 1, 'td' => 1, 'th' => 1),
    'th' => array('th' => 1),
    'td' => array('td' => 1),
    'li' => array('li' => 1),
    'dt' => array('dt' => 1, 'dd' => 1),
    'dd' => array('dd' => 1, 'dt' => 1),
    'dl' => array('dd' => 1, 'dt' => 1),
    'p' => array('p' => 1),
    'nobr' => array('nobr' => 1),
    'b' => array('b' => 1),
    'option' => array('option' => 1)
);
/**
 * Create a DOM object and, when $str is non-empty, parse it immediately.
 *
 * @param string|null $str             HTML markup, or an http:// URL / existing file path to read.
 * @param bool        $lowercase       Lower-case tag and attribute names while parsing.
 * @param bool        $forceTagsClosed When false, the optional-closing-tag table is emptied so
 *                                     the parser trusts the markup's own nesting.
 * @param string      $target_charset  Charset stored in $_target_charset.
 * @param bool        $stripRN         Replace CR/LF with spaces before parsing.
 * @param string      $defaultBRText   Text stored as the inner text of <br> nodes.
 * @param string      $defaultSpanText Stored in $default_span_text.
 */
function __construct($str = null, $lowercase = true, $forceTagsClosed = true, $target_charset = GPTRANSLATE_DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = GPTRANSLATE_DEFAULT_BR_TEXT, $defaultSpanText = GPTRANSLATE_DEFAULT_SPAN_TEXT)
{
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to
    // parsing errors if we SHOULD trust the html.
    // This must target the real $optional_closing_tags table (the previous code wrote to an
    // undeclared property, so the flag did nothing) and must happen BEFORE the document is
    // parsed, otherwise the markup passed in $str is still parsed with the table populated.
    if (! $forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str) {
        if (preg_match("/^http:\/\//i", $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }

    $this->_target_charset = $target_charset;
}
/** Release the node graph when the DOM object is destroyed. */
public function __destruct()
{
    $this->clear();
}
1078
1079 // load html from string
/**
 * Parse HTML from a string.
 *
 * The document is first prepared, then a fixed sequence of "noise" passes
 * swaps selected markup for placeholder keys (see remove_noise()), and
 * finally the tokenizer runs until parse() reports no more input.
 *
 * @return $this so calls can be chained.
 */
function load($str, $lowercase = false, $stripRN = false, $defaultBRText = GPTRANSLATE_DEFAULT_BR_TEXT, $defaultSpanText = GPTRANSLATE_DEFAULT_SPAN_TEXT)
{
    $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);

    // Ordered list of array(pattern, remove_tag). Order matters: script removal
    // precedes style removal (sourceforge tracker item 2949097).
    $noise_passes = array(
        // cdata
        array("'<!\[CDATA\[(.*?)\]\]>'is", true),
        // comments
        array("'<!--(.*?)-->'is", false),
        // <script> tags
        array("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is", false),
        array("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is", false),
        // <style> tags
        array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is", false),
        array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is", false),
        // preformatted tags
        array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is", false),
        // server side scripts
        array("'(<\?)(.*?)(\?>)'s", true),
        // smarty scripts
        array("'(\{\w)(.*?)(\})'s", true),
    );
    foreach ($noise_passes as $pass) {
        $this->remove_noise($pass[0], $pass[1]);
    }

    // Tokenize until parse() signals the end of the document.
    do {
        $more = $this->parse();
    } while ($more);

    $this->root->_[GPTRANSLATE_HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    return $this;
}
1113
1114 // load html from file
/**
 * Parse HTML read from a file or URL.
 *
 * All arguments are forwarded unchanged to file_get_contents().
 *
 * Failure is detected from file_get_contents()'s own return value. The
 * previous implementation consulted error_get_last(), which also reports
 * errors raised earlier in the request, so a perfectly good document could be
 * discarded; it also parsed the `false` result before checking anything.
 *
 * @return false|null false when the source could not be read (the DOM is
 *                    cleared); null after a successful load.
 */
function load_file()
{
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    if ($contents === false) {
        $this->clear();
        return false;
    }

    $this->load($contents, true);
}
1124
1125 // set callback function
/** Register $function_name as the DOM's callback. */
public function set_callback($function_name)
{
    $this->callback = $function_name;
}

/** Drop any registered callback. */
public function remove_callback()
{
    $this->callback = null;
}
1134
1135 // save dom as string
/**
 * Serialize the DOM to a string and optionally write it to disk.
 *
 * @param string $filepath When not '', the markup is also written there
 *                         under an exclusive lock.
 * @return string The root node's inner text.
 */
public function save($filepath = '')
{
    $markup = $this->root->innertext();

    if ($filepath === '') {
        return $markup;
    }

    file_put_contents($filepath, $markup, LOCK_EX);
    return $markup;
}
1142
1143 // find dom node by css selector
1144 // Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
/**
 * Run a CSS selector query against the root node.
 * $idx and $lowercase are passed straight through to the root's find().
 */
public function find($selector, $idx = null, $lowercase = false)
{
    $root = $this->root;
    return $root->find($selector, $idx, $lowercase);
}
1148
1149 // clean up memory due to php5 circular references memory leak...
/**
 * Break the references held by this DOM so memory can be reclaimed
 * (works around circular-reference leaks).
 */
public function clear()
{
    foreach ($this->nodes as $node) {
        $node->clear();
    }

    // Documented in the sourceforge repository (2977248) as a fix for memory
    // leaks that persisted even when clear() was used.
    if (isset($this->children)) {
        foreach ($this->children as $child) {
            $child->clear();
        }
    }

    if (isset($this->parent)) {
        $this->parent->clear();
        unset($this->parent);
    }

    if (isset($this->root)) {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc);
    unset($this->noise);
}
/** Forward to the root node's dump(); $show_attr is passed through. */
public function dump($show_attr = true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
1175
1176 // prepare HTML data and init everything
/**
 * Reset all parser state and install $str as the working document.
 *
 * Records the input length (before and after optional CR/LF stripping),
 * rewinds the position/cursor, empties the noise and node tables and creates
 * a fresh root node that becomes the current parent.
 */
protected function prepare($str, $lowercase = true, $stripRN = true, $defaultBRText = GPTRANSLATE_DEFAULT_BR_TEXT, $defaultSpanText = GPTRANSLATE_DEFAULT_SPAN_TEXT)
{
    $this->clear();

    // Length of the content exactly as received; kept for callers.
    $this->original_size = $this->size = strlen($str);

    if ($stripRN) {
        // Carriage returns and line feeds each become a single space.
        $str = str_replace(array("\r", "\n"), " ", $str);
        $this->size = strlen($str);
    }

    // Scanner state.
    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();

    // Options.
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    // Synthetic root node; everything parsed hangs below it.
    $root = new GPTranslateSimpleHtmlDomNode($this);
    $this->root = $root;
    $root->tag = 'root';
    $root->_[GPTRANSLATE_HDOM_INFO_BEGIN] = -1;
    $root->nodetype = GPTRANSLATE_HDOM_TYPE_ROOT;
    $this->parent = $root;

    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
1210
1211 // parse html content
/**
 * Consume the next token: either a run of text up to the next '<' or,
 * when there is no such text, a tag.
 *
 * @return bool true after a text node was created; otherwise whatever
 *              read_tag() returns.
 */
protected function parse()
{
    $text = $this->copy_until_char('<');

    if ($text === '') {
        return $this->read_tag();
    }

    // Plain text between tags becomes its own node.
    $text_node = new GPTranslateSimpleHtmlDomNode($this);
    ++$this->cursor;
    $text_node->_[GPTRANSLATE_HDOM_INFO_TEXT] = $text;
    $this->link_nodes($text_node, false);

    return true;
}
1224
1225 // PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later.
1226 // NOTE: IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec
1227 // (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism.
/**
 * Work out the character set of the parsed document and store it in $_charset.
 *
 * Sources are tried in order: (1) an optional global helper
 * get_last_retrieve_url_contents_content_type() returning a Content-Type
 * header, (2) a <meta http-equiv=Content-Type> element, (3) mb_detect_encoding()
 * over the plain text, (4) a UTF-8 fallback. ISO-8859-1 and its Latin1 aliases
 * are reported as CP1252.
 *
 * @return string The detected charset (also assigned to $this->_charset).
 */
protected function parse_charset()
{
    $logger = is_object($this->gptranslate_debug_object) ? $this->gptranslate_debug_object : null;
    $charset = null;

    // (1) Content-Type header supplied by the host application, if any.
    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        if (preg_match('/charset=(.+)/', $contentTypeHeader, $matches)) {
            $charset = $matches[1];
            if ($logger !== null) {
                $logger->debug_log(2, 'header content-type found charset of: ' . $charset);
            }
        }
    }

    // (2) <meta http-equiv="Content-Type" content="...">
    if (empty($charset)) {
        $meta = $this->root->find('meta[http-equiv=Content-Type]', 0, true);
        if (! empty($meta)) {
            $fullvalue = $meta->content;
            if ($logger !== null) {
                $logger->debug_log(2, 'meta content-type tag found' . $fullvalue);
            }

            if (! empty($fullvalue)) {
                if (preg_match('/charset=(.+)/i', $fullvalue, $matches)) {
                    $charset = $matches[1];
                } else {
                    // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                    if ($logger !== null) {
                        $logger->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');
                    }
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // (3) + (4) Detect from the text itself, then fall back to UTF-8.
    if (empty($charset)) {
        // Stays false when the mbstring function is unavailable.
        $charset = false;
        if (function_exists('mb_detect_encoding')) {
            $candidates = array("UTF-8", "CP1252");
            $charset = mb_detect_encoding($this->root->plaintext . "ascii", $candidates);
            if ($logger !== null) {
                $logger->debug_log(2, 'mb_detect found: ' . $charset);
            }
        }

        if ($charset === false) {
            if ($logger !== null) {
                $logger->debug_log(2, 'since mb_detect failed - using default of utf-8');
            }
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want it instead.
    $latin_aliases = array('iso-8859-1', 'latin1', 'latin-1');
    if (in_array(strtolower($charset), $latin_aliases, true)) {
        if ($logger !== null) {
            $logger->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');
        }
        $charset = 'CP1252';
    }

    if ($logger !== null) {
        $logger->debug_log(1, 'EXIT - ' . $charset);
    }

    $this->_charset = $charset;
    return $charset;
}
1303
1304 // read tag info
/**
 * Consume one tag that starts at the current '<' and attach the resulting
 * node(s) to the tree.
 *
 * Handled, in order: end tags (including closing several open ancestors or
 * degrading an unmatched end tag to text), "<!...>" declarations/comments,
 * stray '<' that turns out to be text, names that are not valid tag names,
 * and finally real start tags with their attributes.
 *
 * @return bool false when the current character is not '<' (nothing left to
 *              read; the root's end marker is set), true after anything was
 *              consumed.
 */
protected function read_tag()
{
    if ($this->char !== '<') {
        $this->root->_[GPTRANSLATE_HDOM_INFO_END] = $this->cursor;
        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // ---- end tag -----------------------------------------------------------
    if ($this->char === '/') {
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' ')) !== false) {
            $tag = substr($tag, 0, $pos);
        }

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        if ($parent_lower !== $tag_lower) {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                // Open parent may be closed implicitly: walk up looking for the matching block tag.
                $this->parent->_[GPTRANSLATE_HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag) !== $tag_lower) {
                    // No matching ancestor: restore the original parent, step up once, emit text.
                    $this->parent = $org_parent;
                    if ($this->parent->parent) {
                        $this->parent = $this->parent->parent;
                    }
                    $this->parent->_[GPTRANSLATE_HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[GPTRANSLATE_HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag) !== $tag_lower) {
                    // No matching ancestor: restore the original parent and emit text.
                    $this->parent = $org_parent;
                    $this->parent->_[GPTRANSLATE_HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent) && strtolower($this->parent->parent->tag) === $tag_lower) {
                // The end tag closes the grandparent: the parent ends implicitly.
                $this->parent->_[GPTRANSLATE_HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            } else {
                return $this->as_text_node($tag);
            }
        }

        $this->parent->_[GPTRANSLATE_HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) {
            $this->parent = $this->parent->parent;
        }

        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // ---- anything that opens with "<name" ----------------------------------
    $node = new GPTranslateSimpleHtmlDomNode($this);
    $node->_[GPTRANSLATE_HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0] === '!') {
        $node->_[GPTRANSLATE_HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') {
            $node->nodetype = GPTRANSLATE_HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = GPTRANSLATE_HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char === '>') {
            $node->_[GPTRANSLATE_HDOM_INFO_TEXT] .= '>';
        }
        $this->link_nodes($node, true);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // A second '<' inside the name means the first one was literal text.
    // (Previously written as "$pos = strpos(...) !== false", which assigned a
    // boolean to an unused variable because of operator precedence.)
    if (strpos($tag, '<') !== false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[GPTRANSLATE_HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // Not a valid tag name: keep the raw characters as text.
    if (! preg_match("/^[\w\-:]+$/", $tag)) {
        $node->_[GPTRANSLATE_HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
        if ($this->char === '<') {
            $this->link_nodes($node, false);
            return true;
        }

        if ($this->char === '>') {
            $node->_[GPTRANSLATE_HDOM_INFO_TEXT] .= '>';
        }
        $this->link_nodes($node, false);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // ---- begin tag ---------------------------------------------------------
    $node->nodetype = GPTRANSLATE_HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags: opening this tag implicitly closes matching open parents
    if (isset($this->optional_closing_tags[$tag_lower])) {
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            $this->parent->_[GPTRANSLATE_HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop
    $space = array(
        $this->copy_skip($this->token_blank),
        '',
        ''
    );

    // attributes
    do {
        // No whitespace after the previous token means no further attribute follows.
        if ($this->char !== null && $space[0] === '') {
            break;
        }
        $name = $this->copy_until($this->token_equal);
        if ($guard === $this->pos) {
            // The scanner did not advance: step one character to avoid spinning.
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if ($this->pos >= $this->size - 1 && $this->char !== '>') {
            $node->nodetype = GPTRANSLATE_HDOM_TYPE_TEXT;
            $node->_[GPTRANSLATE_HDOM_INFO_END] = 0;
            $node->_[GPTRANSLATE_HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        if ($this->doc[$this->pos - 1] == '<') {
            $node->nodetype = GPTRANSLATE_HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[GPTRANSLATE_HDOM_INFO_END] = 0;
            $node->_[GPTRANSLATE_HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos - $begin_tag_pos - 1);
            $this->pos -= 2;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name !== '/' && $name !== '') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) {
                $name = strtolower($name);
            }
            if ($this->char === '=') {
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            } else {
                // no value attr: nowrap, checked selected...
                $node->_[GPTRANSLATE_HDOM_INFO_QUOTE][] = GPTRANSLATE_HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char != '>') {
                    $this->char = $this->doc[--$this->pos]; // prev
                }
            }
            $node->_[GPTRANSLATE_HDOM_INFO_SPACE][] = $space;
            $space = array(
                $this->copy_skip($this->token_blank),
                '',
                ''
            );
        } else {
            break;
        }
    } while ($this->char !== '>' && $this->char !== '/');

    $this->link_nodes($node, true);
    $node->_[GPTRANSLATE_HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>') === '/') {
        $node->_[GPTRANSLATE_HDOM_INFO_ENDSPACE] .= '/';
        $node->_[GPTRANSLATE_HDOM_INFO_END] = 0;
    } else {
        // reset parent: the new element becomes the open parent unless it is a void tag
        if (! isset($this->self_closing_tags[strtolower($node->tag)])) {
            $this->parent = $node;
        }
    }
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set its text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // since a br tag never has sub nodes, this works well.
    if ($node->tag == "br") {
        $node->_[GPTRANSLATE_HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
1521
1522 // parse attributes
/**
 * Read the value of attribute $name (the scanner sits just after '=') and
 * store it on $node.
 *
 * When the attribute already exists on the node only the FIRST occurrence is
 * kept (sourceforge tracker item 3061408). The duplicate's value is still
 * consumed from the input: previously the method returned without reading it,
 * which left the scanner on the value, made the caller stop reading
 * attributes, silently dropped every following attribute and defeated "/>"
 * detection.
 *
 * @param object $node   Node receiving the attribute.
 * @param string $name   Attribute name.
 * @param array  $space  Whitespace record of the caller; slot 2 receives the
 *                       blanks between '=' and the value (first occurrence only).
 */
protected function parse_attr($node, $name, &$space)
{
    $is_duplicate = isset($node->attr[$name]);

    // Blanks between '=' and the value are always skipped, but only recorded
    // for the occurrence that is actually stored.
    $gap = $this->copy_skip($this->token_blank);
    if (! $is_duplicate) {
        $space[2] = $gap;
    }

    switch ($this->char) {
        case '"':
            $quote_type = GPTRANSLATE_HDOM_QUOTE_DOUBLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $value = $this->copy_until_char_escape('"');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            break;
        case '\'':
            $quote_type = GPTRANSLATE_HDOM_QUOTE_SINGLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $value = $this->copy_until_char_escape('\'');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            break;
        default:
            $quote_type = GPTRANSLATE_HDOM_QUOTE_NO;
            $value = $this->copy_until($this->token_attr);
    }

    if ($is_duplicate) {
        // Value consumed and discarded; the first definition wins.
        return;
    }

    $node->_[GPTRANSLATE_HDOM_INFO_QUOTE][] = $quote_type;

    $value = $this->restore_noise($value);
    // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
    $value = str_replace(array("\r", "\n"), "", $value);
    // PaperG: for "class", drop leading and trailing space since some people
    // leave it in the multi class case.
    if ($name === 'class') {
        $value = trim($value);
    }

    $node->attr[$name] = $value;
}
1556
1557 // link node's parent
/**
 * Attach $node beneath the currently open parent.
 *
 * Every node is appended to the parent's node list; only nodes flagged
 * with $is_child are also appended to its children list.
 */
protected function link_nodes(&$node, $is_child)
{
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if (! $is_child) {
        return;
    }
    $owner->children[] = $node;
}
1565
1566 // as a text node
/**
 * Record an end tag that matches nothing as literal text ("</tag>"),
 * then advance the scanner by one character.
 *
 * @return bool always true.
 */
protected function as_text_node($tag)
{
    $text_node = new GPTranslateSimpleHtmlDomNode($this);
    ++$this->cursor;
    $text_node->_[GPTRANSLATE_HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($text_node, false);

    ++$this->pos;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    return true;
}
/** Advance past any run of characters contained in $chars. */
protected function skip($chars)
{
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
}
/**
 * Advance past any run of characters contained in $chars and return the
 * skipped run ('' when nothing was skipped).
 */
protected function copy_skip($chars)
{
    $from = $this->pos;
    $run = strspn($this->doc, $chars, $from);

    $this->pos += $run;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    return ($run === 0) ? '' : substr($this->doc, $from, $run);
}
/**
 * Advance up to (not including) the first character contained in $chars
 * and return the text that was passed over.
 */
protected function copy_until($chars)
{
    $from = $this->pos;
    $run = strcspn($this->doc, $chars, $from);

    $this->pos += $run;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    return substr($this->doc, $from, $run);
}
/**
 * Advance to the next occurrence of $char and return the text before it.
 *
 * Returns '' when the input is exhausted or the scanner already sits on
 * $char. When $char does not occur again, the remainder of the document is
 * returned and the scanner moves to the end.
 */
protected function copy_until_char($char)
{
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $hit = strpos($this->doc, $char, $from);

    if ($hit === false) {
        $rest = substr($this->doc, $from, $this->size - $from);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    if ($hit === $from) {
        return '';
    }

    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return substr($this->doc, $from, $hit - $from);
}
/**
 * Like copy_until_char(), but an occurrence of $char that is directly
 * preceded by a backslash is skipped over rather than treated as the stop.
 */
protected function copy_until_char_escape($char)
{
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $search_from = $from;

    for (;;) {
        $hit = strpos($this->doc, $char, $search_from);

        if ($hit === false) {
            // No terminator left: hand back everything up to the end.
            $rest = substr($this->doc, $from, $this->size - $from);
            $this->char = null;
            $this->pos = $this->size;
            return $rest;
        }

        if ($hit === $from) {
            return '';
        }

        if ($this->doc[$hit - 1] !== '\\') {
            $this->char = $this->doc[$hit];
            $this->pos = $hit;
            return substr($this->doc, $from, $hit - $from);
        }

        // Backslash-escaped occurrence: keep looking after it.
        $search_from = $hit + 1;
    }
}
1640
1641 // remove noise from html content
1642 // save the noise in the $this->noise array.
/**
 * Replace every match of $pattern in the working document with a
 * placeholder key and remember the removed text in $this->noise.
 *
 * @param string $pattern    PCRE pattern.
 * @param bool   $remove_tag true: the whole match is replaced; false: only
 *                           capture group 1 is replaced.
 */
protected function remove_noise($pattern, $remove_tag = false)
{
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
    $group = $remove_tag ? 0 : 1;

    // Work from the last match backwards so earlier offsets stay valid.
    for ($i = $count - 1; $i >= 0; $i--) {
        $key = sprintf('___noise___% 5d', count($this->noise) + 1000);
        if (is_object($this->gptranslate_debug_object)) {
            $this->gptranslate_debug_object->debug_log(2, 'key is: ' . $key);
        }

        $fragment = $matches[$i][$group][0];
        $offset = $matches[$i][$group][1];

        $this->noise[$key] = $fragment;
        $this->doc = substr_replace($this->doc, $key, $offset, strlen($fragment));
    }

    // The document length changed; refresh size and the current character.
    $this->size = strlen($this->doc);
    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
1666
1667 // restore noise to html content
/**
 * Put back the markup that remove_noise() swapped for placeholder keys.
 *
 * A key is the marker '___noise___' followed by five characters. Restored
 * fragments are rescanned because they may themselves contain keys (noise
 * removed in an earlier pass can be nested inside noise from a later one).
 *
 * The diagnostic inserted for an unknown key contains the marker itself;
 * scanning therefore resumes AFTER that diagnostic. Without this the same
 * marker was found again on every iteration and the loop never ended while
 * the string kept growing.
 *
 * @param string $text Text possibly containing placeholder keys.
 * @return string Text with known keys expanded.
 */
function restore_noise($text)
{
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    $offset = 0;
    while (($pos = strpos($text, '___noise___', $offset)) !== false) {
        // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos + 15) {
            $key = substr($text, $pos, 16);
            if (is_object($this->gptranslate_debug_object)) {
                $this->gptranslate_debug_object->debug_log(2, 'located key of: ' . $key);
            }

            if (isset($this->noise[$key])) {
                $text = substr($text, 0, $pos) . $this->noise[$key] . substr($text, $pos + 16);
                // Rescan the restored fragment: it may hold nested keys.
                $offset = $pos;
            } else {
                $message = 'UNDEFINED NOISE FOR KEY: ' . $key;
                $text = substr($text, 0, $pos) . $message . substr($text, $pos + 16);
                // Skip the diagnostic, which embeds the marker, to prevent an infinite loop.
                $offset = $pos + strlen($message);
            }
        } else {
            // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
            $message = 'NO NUMERIC NOISE KEY';
            $text = substr($text, 0, $pos) . $message . substr($text, $pos + 11);
            $offset = $pos + strlen($message);
        }
    }
    return $text;
}
1694
1695 // Sometimes we NEED one of the noise elements.
/**
 * Return the first stored noise fragment that contains $text.
 * Nothing (null) is returned when no fragment matches.
 */
public function search_noise($text)
{
    $logger = $this->gptranslate_debug_object;
    if (is_object($logger)) {
        $logger->debug_log_entry(1);
    }

    foreach ($this->noise as $fragment) {
        if (strpos($fragment, $text) === false) {
            continue;
        }
        return $fragment;
    }
}
/** String form of the DOM: the root node's inner text. */
public function __toString()
{
    $root = $this->root;
    return $root->innertext();
}
/**
 * Virtual read-only properties of the DOM.
 *
 * 'outertext' and 'innertext' both yield the root's inner text, 'plaintext'
 * the root's text(), 'charset' / 'target_charset' the stored charsets.
 * Any other name yields null.
 */
public function __get($name)
{
    if ($name === 'outertext' || $name === 'innertext') {
        return $this->root->innertext();
    }
    if ($name === 'plaintext') {
        return $this->root->text();
    }
    if ($name === 'charset') {
        return $this->_charset;
    }
    if ($name === 'target_charset') {
        return $this->_target_charset;
    }
    return null;
}
1724
1725 // camel naming conventions
/** camelCase alias: the root's childNodes(); $idx is forwarded unchanged. */
public function childNodes($idx = -1)
{
    $root = $this->root;
    return $root->childNodes($idx);
}

/** camelCase alias: the root's first_child(). */
public function firstChild()
{
    $root = $this->root;
    return $root->first_child();
}

/** camelCase alias: the root's last_child(). */
public function lastChild()
{
    $root = $this->root;
    return $root->last_child();
}
/**
 * Build "<name>value</name>", parse it with
 * gptranslate_simplehtmldom_str_get_html() and return the first child of the
 * result. Errors raised by the expression are suppressed with "@".
 */
public function createElement($name, $value = null)
{
    return @gptranslate_simplehtmldom_str_get_html('<' . $name . '>' . $value . '</' . $name . '>')->first_child();
}

/**
 * Parse $value with gptranslate_simplehtmldom_str_get_html() and return the
 * last entry of the resulting node list. Errors are suppressed with "@".
 */
public function createTextNode($value)
{
    return @end(gptranslate_simplehtmldom_str_get_html($value)->nodes);
}
/** First result of find() for the id selector "#<id>". */
public function getElementById($id)
{
    return $this->find('#' . $id, 0);
}

/** find() for the id selector "#<id>"; $idx is forwarded unchanged. */
public function getElementsById($id, $idx = null)
{
    return $this->find('#' . $id, $idx);
}

/** First result of find() for the tag selector $name. */
public function getElementByTagName($name)
{
    return $this->find($name, 0);
}

/**
 * find() for the tag selector $name; $idx is forwarded unchanged.
 * NOTE(review): the default here is -1, whereas getElementsById() above and
 * the node class's getElementsByTagName() default to null - confirm which
 * result callers expect when $idx is omitted.
 */
public function getElementsByTagName($name, $idx = -1)
{
    return $this->find($name, $idx);
}
/**
 * camelCase alias of load_file().
 *
 * The arguments are spread back out when forwarding. Previously the whole
 * argument array was passed as ONE argument, so load_file() ended up calling
 * file_get_contents() with an array instead of a path.
 *
 * @return mixed Whatever load_file() returns.
 */
function loadFile()
{
    $args = func_get_args();
    return call_user_func_array(array($this, 'load_file'), $args);
}
1757 }