gptranslate
Last commit date
assets
4 months ago
flags
4 months ago
language
4 months ago
gptranslate.php
4 months ago
multilang-routing.php
4 months ago
readme.txt
4 months ago
serverside-translations.php
4 months ago
settings.php
4 months ago
simplehtmldom.php
4 months ago
uninstall.php
4 months ago
simplehtmldom.php
1757 lines
| 1 | <?php |
| 2 | if (!defined('ABSPATH')) exit; |
| 3 | |
// Node types (stored in a node's $nodetype).
define('GPTRANSLATE_HDOM_TYPE_ELEMENT', 1);
define('GPTRANSLATE_HDOM_TYPE_COMMENT', 2);
define('GPTRANSLATE_HDOM_TYPE_TEXT', 3);
define('GPTRANSLATE_HDOM_TYPE_ENDTAG', 4);
define('GPTRANSLATE_HDOM_TYPE_ROOT', 5);
define('GPTRANSLATE_HDOM_TYPE_UNKNOWN', 6);

// Attribute quoting styles used when a tag is rendered back to markup.
define('GPTRANSLATE_HDOM_QUOTE_DOUBLE', 0);
define('GPTRANSLATE_HDOM_QUOTE_SINGLE', 1);
define('GPTRANSLATE_HDOM_QUOTE_NO', 3);

// Keys of a node's "info" array ($node->_).
define('GPTRANSLATE_HDOM_INFO_BEGIN', 0);
define('GPTRANSLATE_HDOM_INFO_END', 1);
define('GPTRANSLATE_HDOM_INFO_QUOTE', 2);
define('GPTRANSLATE_HDOM_INFO_SPACE', 3);
define('GPTRANSLATE_HDOM_INFO_TEXT', 4);
define('GPTRANSLATE_HDOM_INFO_INNER', 5);
define('GPTRANSLATE_HDOM_INFO_OUTER', 6);
define('GPTRANSLATE_HDOM_INFO_ENDSPACE', 7);

// Defaults for the helper functions below.
define('GPTRANSLATE_DEFAULT_TARGET_CHARSET', 'UTF-8');
define('GPTRANSLATE_DEFAULT_BR_TEXT', "\r\n");
define('GPTRANSLATE_DEFAULT_SPAN_TEXT', " ");

// Inputs longer than this many bytes are rejected by the helper functions.
define('GPTRANSLATE_MAX_FILE_SIZE', 600000);
| 25 | // helper functions |
| 26 | // ----------------------------------------------------------------------------- |
| 27 | // get html dom from file |
| 28 | // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1. |
/**
 * Build a DOM tree from the contents of a file or URL.
 *
 * @param string        $url              Path or URL handed to file_get_contents().
 * @param bool          $use_include_path Forwarded to file_get_contents().
 * @param resource|null $context          Stream context forwarded to file_get_contents().
 * @param int           $offset           Read offset. The legacy default -1 means "from the start".
 * @param int           $maxLen           Maximum number of bytes to read; values <= 0 mean "no limit".
 * @param bool          $lowercase        Forwarded to the DOM constructor and to load().
 * @param bool          $forceTagsClosed  Forwarded to the DOM constructor.
 * @param string        $target_charset   Forwarded to the DOM constructor.
 * @param bool          $stripRN          Forwarded to the DOM constructor and to load().
 * @param string        $defaultBRText    Forwarded to the DOM constructor.
 * @param string        $defaultSpanText  Forwarded to the DOM constructor.
 * @return GPTranslateSimpleHtmlDom|false The loaded DOM, or false when the contents could not be
 *                                        read, are empty, or exceed GPTRANSLATE_MAX_FILE_SIZE bytes.
 */
function gptranslate_simplehtmldom_file_get_html($url, $use_include_path = false, $context = null, $offset = -1, $maxLen = -1, $lowercase = true, $forceTagsClosed = true, $target_charset = GPTRANSLATE_DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = GPTRANSLATE_DEFAULT_BR_TEXT, $defaultSpanText = GPTRANSLATE_DEFAULT_SPAN_TEXT) {
    // We DO force the tags to be terminated.
    $dom = new GPTranslateSimpleHtmlDom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // -1 is the historical "whole file" sentinel. Since PHP 7.1 a negative offset is
    // counted from the end of the stream and fails on non-seekable (remote) streams,
    // so translate the sentinel to an explicit 0.
    if ((int) $offset === -1) {
        $offset = 0;
    }

    // Only pass a length when the caller asked for one: PHP 8 rejects negative
    // lengths, and omitting the argument reads the whole stream.
    if ($maxLen > 0) {
        $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);
    } else {
        $contents = file_get_contents($url, $use_include_path, $context, $offset);
    }

    // A failed read (false), an empty document or an oversized one is rejected.
    if (empty($contents) || strlen($contents) > GPTRANSLATE_MAX_FILE_SIZE) {
        // Release the unused DOM, as the string-based helper does.
        $dom->clear();
        return false;
    }

    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
| 43 | |
| 44 | // get html dom from string |
/**
 * Build a DOM tree from an HTML string.
 *
 * @param string $str             Markup to parse.
 * @param bool   $lowercase       Forwarded to the DOM constructor and to load().
 * @param bool   $forceTagsClosed Forwarded to the DOM constructor.
 * @param string $target_charset  Forwarded to the DOM constructor.
 * @param bool   $stripRN         Forwarded to the DOM constructor and to load().
 * @param string $defaultBRText   Forwarded to the DOM constructor.
 * @param string $defaultSpanText Forwarded to the DOM constructor.
 * @return GPTranslateSimpleHtmlDom|false The loaded DOM, or false when $str is empty or longer
 *                                        than GPTRANSLATE_MAX_FILE_SIZE bytes.
 */
function gptranslate_simplehtmldom_str_get_html($str, $lowercase = true, $forceTagsClosed = true, $target_charset = GPTRANSLATE_DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = GPTRANSLATE_DEFAULT_BR_TEXT, $defaultSpanText = GPTRANSLATE_DEFAULT_SPAN_TEXT) {
    $dom = new GPTranslateSimpleHtmlDom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    $acceptable = ! empty($str) && strlen($str) <= GPTRANSLATE_MAX_FILE_SIZE;
    if ($acceptable) {
        $dom->load($str, $lowercase, $stripRN);
        return $dom;
    }

    // Nothing usable to parse: release the DOM again and signal failure.
    $dom->clear();
    return false;
}
| 54 | |
| 55 | /** |
| 56 | * simple html dom node |
| 57 | * PaperG - added ability for "find" routine to lowercase the value of the selector. |
| 58 | * PaperG - added $tag_start to track the start position of the tag in the total byte index |
| 59 | * |
| 60 | * @package PlaceLocalInclude |
| 61 | */ |
| 62 | class GPTranslateSimpleHtmlDomNode { |
| 63 | public $nodetype = GPTRANSLATE_HDOM_TYPE_TEXT; |
| 64 | public $tag = 'text'; |
| 65 | public $attr = array (); |
| 66 | public $children = array (); |
| 67 | public $nodes = array (); |
| 68 | public $parent = null; |
| 69 | // The "info" array - see GPTRANSLATE_HDOM_INFO_... for what each element contains. |
| 70 | public $_ = array (); |
| 71 | public $tag_start = 0; |
| 72 | public $no_remove = null; |
| 73 | private $dom = null; |
| 74 | private $gptranslate_debug_object = null; |
/**
 * Attach the node to its owning DOM and register it in that DOM's node list.
 *
 * @param object $dom Owning DOM object; must expose a $nodes array.
 */
function __construct($dom) {
    $this->dom = $dom;
    // Every node is appended to the DOM-wide list.
    $dom->nodes[] = $this;
}
/**
 * Release the node's references when it is destroyed.
 */
function __destruct() {
    $this->clear();
}
/**
 * String conversion yields the node's outer text.
 *
 * @return string
 */
function __toString() {
    $rendered = $this->outertext();
    return $rendered;
}
| 85 | |
| 86 | // clean up memory due to php5 circular references memory leak... |
/**
 * Drop the references this node holds (children, nodes, parent, owning DOM)
 * by setting each of them to null.
 */
function clear() {
    $this->children = null;
    $this->nodes = null;
    $this->parent = null;
    $this->dom = null;
}
| 93 | |
| 94 | // dump node's tree |
/**
 * Echo this node's tag (optionally with its attributes) followed by the same
 * dump of every entry in $this->nodes, one level deeper.
 *
 * @param bool $show_attr Whether to print the attribute list after the tag.
 * @param int  $deep      Current depth; controls the leading padding.
 */
function dump($show_attr = true, $deep = 0) {
    $padding = str_repeat(' ', $deep);
    // phpcs:ignore WordPress.Security.EscapeOutput.OutputNotEscaped -- output is safe
    echo $padding . $this->tag;

    if ($show_attr && count($this->attr) > 0) {
        $listing = '';
        foreach ($this->attr as $k => $v) {
            // Values are read through property access, as in the original dump.
            $listing .= "[$k]=>\"" . $this->$k . '", ';
        }
        // phpcs:ignore WordPress.Security.EscapeOutput.OutputNotEscaped -- output is safe
        echo '(' . $listing . ')';
    }
    echo "\n";

    if (! $this->nodes) {
        return;
    }
    foreach ($this->nodes as $child) {
        $child->dump($show_attr, $deep + 1);
    }
}
| 114 | |
| 115 | // Debugging function to dump a single dom node with a bunch of information about it. |
/**
 * Debugging aid: describe this single node (tag, attributes, info array,
 * inner info, child/node counts and tag start) on one line.
 *
 * @param bool $echo When true the line is echoed and nothing is returned;
 *                   when false the line is returned instead.
 * @return string|void
 */
function dump_node($echo = true) {
    $string = $this->tag;

    if (count($this->attr) > 0) {
        $string .= '(';
        foreach ($this->attr as $k => $v) {
            $string .= "[$k]=>\"" . $this->$k . '", ';
        }
        $string .= ')';
    }

    if (count($this->_) > 0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k => $v) {
            if (is_array($v)) {
                $string .= "[$k]=>(";
                foreach ($v as $k2 => $v2) {
                    $string .= "[$k2]=>\"" . $v2 . '", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"" . $v . '", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text)) {
        $string .= " text: (" . $this->text . ")";
    }

    // Report this node's own inner info. The previous code read an undefined
    // $node variable here, so the value was never shown.
    $string .= " GPTRANSLATE_HDOM_INNER_INFO: '";
    if (isset($this->_[GPTRANSLATE_HDOM_INFO_INNER])) {
        $string .= $this->_[GPTRANSLATE_HDOM_INFO_INNER] . "'";
    } else {
        $string .= ' NULL ';
    }

    // clear() sets children/nodes to null; count() must not be given null.
    $string .= " children: " . (is_array($this->children) ? count($this->children) : 0);
    $string .= " nodes: " . (is_array($this->nodes) ? count($this->nodes) : 0);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo) {
        // phpcs:ignore WordPress.Security.EscapeOutput.OutputNotEscaped -- output is safe
        echo $string;
        return;
    }
    return $string;
}
| 165 | |
| 166 | // returns the parent of node |
| 167 | // If a node is passed in, it will reset the parent of the current node to that one. |
/**
 * Get the parent node, optionally assigning a new one first.
 *
 * When $parent is given, it becomes this node's parent and this node is
 * appended to the new parent's $nodes and $children lists. The node is NOT
 * removed from the lists of its previous parent.
 *
 * @param object|null $parent New parent node, or null to only read.
 * @return object|null The (possibly new) parent.
 */
function parent($parent = null) {
    if ($parent === null) {
        return $this->parent;
    }

    $this->parent = $parent;
    $this->parent->nodes[] = $this;
    $this->parent->children[] = $this;

    return $this->parent;
}
| 179 | |
| 180 | // verify that node has children |
/**
 * Tell whether the node has at least one entry in $children.
 *
 * @return bool
 */
function has_child() {
    return $this->children ? true : false;
}
| 184 | |
| 185 | // returns children of node |
/**
 * Return all children, or one child by index.
 *
 * @param int $idx -1 (default) returns the whole $children list; any other
 *                 value returns that entry, or null when it does not exist.
 * @return mixed
 */
function children($idx = -1) {
    if ($idx === -1) {
        return $this->children;
    }
    return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
| 195 | |
| 196 | // returns children of node recursively |
/**
 * Collect every descendant of $element, depth first, parents before their
 * own children.
 *
 * @param object $element     Node to walk; must provide hasChildNodes() and childNodes().
 * @param array  $allChildren Accumulator, filled by reference.
 * @return array The accumulator.
 */
function childrenRecursive($element, &$allChildren = array()) {
    if (! $element->hasChildNodes()) {
        return $allChildren;
    }

    foreach ($element->childNodes() as $descendant) {
        $allChildren[] = $descendant;
        if ($descendant->hasChildNodes()) {
            $this->childrenRecursive($descendant, $allChildren);
        }
    }

    return $allChildren;
}
| 208 | |
| 209 | // returns the first child of node |
/**
 * Return the first entry of $children, or null when there are none.
 *
 * @return object|null
 */
function first_child() {
    return (count($this->children) > 0) ? $this->children[0] : null;
}
| 216 | |
| 217 | // returns the last child of node |
/**
 * Return the last entry of $children, or null when there are none.
 *
 * @return object|null
 */
function last_child() {
    $total = count($this->children);
    if ($total <= 0) {
        return null;
    }
    return $this->children[$total - 1];
}
| 224 | |
| 225 | // returns the next sibling of node |
/**
 * Return the entry that follows this node in its parent's $children list.
 *
 * @return object|null Null when there is no parent, when this node is the
 *                     last child, or when it is not in the list at all.
 */
function next_sibling() {
    if ($this->parent === null) {
        return null;
    }

    $total = count($this->parent->children);

    // Locate this node's position among its siblings.
    for ($pos = 0; $pos < $total; ++$pos) {
        if ($this === $this->parent->children[$pos]) {
            break;
        }
    }

    $following = $pos + 1;
    return ($following < $total) ? $this->parent->children[$following] : null;
}
| 241 | |
| 242 | // returns the previous sibling of node |
/**
 * Return the entry that precedes this node in its parent's $children list.
 *
 * @return object|null Null when there is no parent, when this node is the
 *                     first child, or when it is not in the list at all.
 */
function prev_sibling() {
    if ($this->parent === null) {
        return null;
    }

    $count = count($this->parent->children);

    // Locate this node's position among its siblings.
    $idx = 0;
    while ($idx < $count && $this !== $this->parent->children[$idx]) {
        ++$idx;
    }

    // Not found: previously the index ran off the end and the parent's LAST
    // child was returned as the "previous sibling". Mirror next_sibling() and
    // report that there is none.
    if ($idx >= $count) {
        return null;
    }

    if ($idx === 0) {
        return null;
    }
    return $this->parent->children[$idx - 1];
}
| 254 | |
| 255 | // function to locate a specific ancestor tag in the path to the root. |
/**
 * Walk from this node towards the root and return the first node whose tag
 * equals $tag. The node itself is included in the comparison.
 *
 * @param string $tag Tag name to look for (compared with ==).
 * @return object|null The matching node, or null when the walk runs out of parents.
 */
function find_ancestor_tag($tag) {
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    for ($candidate = $this; ! is_null($candidate); $candidate = $candidate->parent) {
        if (is_object($this->gptranslate_debug_object)) {
            $this->gptranslate_debug_object->debug_log(2, "Current tag is: " . $candidate->tag);
        }

        if ($candidate->tag == $tag) {
            return $candidate;
        }
    }

    return null;
}
| 276 | |
| 277 | // get dom node's inner html |
/**
 * Return the node's inner HTML.
 *
 * Precedence: an explicitly stored inner value, then stored text (with noise
 * restored by the DOM), otherwise the concatenated outer text of $this->nodes.
 *
 * @return string
 */
function innertext() {
    if (isset($this->_[GPTRANSLATE_HDOM_INFO_INNER])) {
        return $this->_[GPTRANSLATE_HDOM_INFO_INNER];
    }

    if (isset($this->_[GPTRANSLATE_HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[GPTRANSLATE_HDOM_INFO_TEXT]);
    }

    $html = '';
    foreach ($this->nodes as $child) {
        $html .= $child->outertext();
    }
    return $html;
}
| 289 | |
| 290 | // get dom node's outer text (with tag) |
/**
 * Return the node's outer HTML (the tag itself plus its content).
 *
 * The root node renders as its inner text. For other nodes the DOM callback,
 * when set, is invoked with this node first; then a stored outer value or
 * stored text wins, and otherwise the markup is assembled from the begin tag,
 * the content and, when an end position is recorded, the closing tag.
 *
 * @return string
 */
function outertext() {
    if (is_object($this->gptranslate_debug_object)) {
        $text = '';
        if ($this->tag == 'text' && ! empty($this->text)) {
            $text = " with text: " . $this->text;
        }
        $this->gptranslate_debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag === 'root') {
        return $this->innertext();
    }

    // trigger callback
    if ($this->dom && $this->dom->callback !== null) {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[GPTRANSLATE_HDOM_INFO_OUTER])) {
        return $this->_[GPTRANSLATE_HDOM_INFO_OUTER];
    }
    if (isset($this->_[GPTRANSLATE_HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[GPTRANSLATE_HDOM_INFO_TEXT]);
    }

    // Begin tag: rendered by the node stored at this node's begin index.
    $ret = "";
    if ($this->dom && $this->dom->nodes[$this->_[GPTRANSLATE_HDOM_INFO_BEGIN]]) {
        $ret = $this->dom->nodes[$this->_[GPTRANSLATE_HDOM_INFO_BEGIN]]->makeup();
    }

    // Content: either the child nodes, or a stored inner value.
    if (! isset($this->_[GPTRANSLATE_HDOM_INFO_INNER])) {
        if ($this->nodes) {
            foreach ($this->nodes as $child) {
                $ret .= $this->convert_text($child->outertext());
            }
        }
    } elseif ($this->tag != "br") {
        // For a br tag the stored inner info is deliberately not emitted.
        $ret .= $this->_[GPTRANSLATE_HDOM_INFO_INNER];
    }

    // End tag: only when an end position is recorded and non-zero.
    if (isset($this->_[GPTRANSLATE_HDOM_INFO_END]) && $this->_[GPTRANSLATE_HDOM_INFO_END] != 0) {
        $ret .= '</' . $this->tag . '>';
    }

    return $ret;
}
| 343 | |
| 344 | // get dom node's plain text |
/**
 * Return the node's plain text.
 *
 * A stored inner value is returned as is. Text nodes return their stored text
 * (noise restored); comment and unknown nodes, style elements and - unless
 * $script is true - script elements return ''. Otherwise the text of every
 * entry in $this->nodes is concatenated; a span additionally gets the DOM's
 * default span text appended.
 *
 * @param bool $script When true, a script element is not blanked out.
 * @return string
 */
function text($script = false) {
    if (isset($this->_[GPTRANSLATE_HDOM_INFO_INNER])) {
        return $this->_[GPTRANSLATE_HDOM_INFO_INNER];
    }

    $type = $this->nodetype;
    if ($type == GPTRANSLATE_HDOM_TYPE_TEXT) {
        return $this->dom->restore_noise($this->_[GPTRANSLATE_HDOM_INFO_TEXT]);
    }
    if ($type == GPTRANSLATE_HDOM_TYPE_COMMENT || $type == GPTRANSLATE_HDOM_TYPE_UNKNOWN) {
        return '';
    }

    if (strcasecmp($this->tag, 'script') === 0 && ! $script) {
        return '';
    }
    if (strcasecmp($this->tag, 'style') === 0) {
        return '';
    }

    // $this->nodes has been observed to be null for some element nodes
    // (see clear()); in that case there is nothing to collect.
    if (is_null($this->nodes)) {
        return '';
    }

    $plain = '';
    foreach ($this->nodes as $child) {
        $plain .= $this->convert_text($child->text());
    }

    // Separate consecutive spans so their text does not run together.
    if ($this->tag == "span") {
        $plain .= $this->dom->default_span_text;
    }

    return $plain;
}
/**
 * Return the inner text with CDATA wrappers removed: every '<![CDATA['
 * (case-insensitive) and every ']]>' is stripped.
 *
 * @return string
 */
function xmltext() {
    $withoutOpeners = str_ireplace('<![CDATA[', '', $this->innertext());
    return str_replace(']]>', '', $withoutOpeners);
}
| 383 | |
| 384 | // build node's text with tag |
/**
 * Render the node's opening tag, including its attributes.
 *
 * Text, comment and unknown nodes simply return their stored text. For
 * elements, the spacing and quote style of each attribute are looked up by
 * the attribute's ordinal position in $this->attr.
 *
 * @return string
 */
function makeup() {
    // text, comment, unknown
    if (isset($this->_[GPTRANSLATE_HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[GPTRANSLATE_HDOM_INFO_TEXT]);
    }

    $markup = '<' . $this->tag;
    $position = -1;

    foreach ($this->attr as $name => $value) {
        // The position advances for removed attributes too.
        ++$position;

        // skip removed attribute
        if ($value === null || $value === false) {
            continue;
        }

        $markup .= $this->_[GPTRANSLATE_HDOM_INFO_SPACE][$position][0];

        // no value attr: nowrap, checked selected...
        if ($value === true) {
            $markup .= $name;
            continue;
        }

        $style = $this->_[GPTRANSLATE_HDOM_INFO_QUOTE][$position];
        if ($style == GPTRANSLATE_HDOM_QUOTE_DOUBLE) {
            $quote = '"';
        } elseif ($style == GPTRANSLATE_HDOM_QUOTE_SINGLE) {
            $quote = '\'';
        } else {
            $quote = '';
        }

        $markup .= $name . $this->_[GPTRANSLATE_HDOM_INFO_SPACE][$position][1] . '=' . $this->_[GPTRANSLATE_HDOM_INFO_SPACE][$position][2] . $quote . $value . $quote;
    }

    $markup = $this->dom->restore_noise($markup);
    return $markup . $this->_[GPTRANSLATE_HDOM_INFO_ENDSPACE] . '>';
}
| 421 | |
| 422 | // find elements by css selector |
| 423 | // PaperG - added ability for find to lowercase the value of the selector. |
/**
 * Find descendant nodes by CSS-like selector.
 *
 * @param string   $selector  Selector string; comma separated groups are unioned.
 * @param int|null $idx       Null returns every match; otherwise the match at that
 *                            index (negative values count from the end).
 * @param bool     $lowercase Passed on to seek() for case-insensitive value tests.
 * @return array|object|null Matches ordered by node index, a single node, or null
 *                           when $idx does not exist.
 */
function find($selector, $idx = null, $lowercase = false) {
    $groups = $this->parse_selector($selector);
    if (count($groups) === 0) {
        return array();
    }

    $found_keys = array();

    // find each selector
    foreach ($groups as $steps) {
        if (count($steps) === 0) {
            return array();
        }
        if (! isset($this->_[GPTRANSLATE_HDOM_INFO_BEGIN])) {
            return array();
        }

        // Start from this node, then narrow step by step (descendant
        // selectors are handled iteratively, not recursively).
        $head = array(
            $this->_[GPTRANSLATE_HDOM_INFO_BEGIN] => 1
        );
        foreach ($steps as $step) {
            $matched = array();
            foreach ($head as $k => $v) {
                $start = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
                $start->seek($step, $matched, $lowercase);
            }
            $head = $matched;
        }

        foreach ($head as $k => $v) {
            if (! isset($found_keys[$k])) {
                $found_keys[$k] = 1;
            }
        }
    }

    // sort keys
    ksort($found_keys);

    $found = array();
    foreach ($found_keys as $k => $v) {
        $found[] = $this->dom->nodes[$k];
    }

    // return nth-element or array
    if (is_null($idx)) {
        return $found;
    }
    if ($idx < 0) {
        $idx = count($found) + $idx;
    }
    return isset($found[$idx]) ? $found[$idx] : null;
}
| 475 | |
| 476 | // seek for given conditions |
| 477 | // PaperG - added parameter to allow for case insensitive testing of the value of a selector. |
/**
 * Collect the nodes below this one that satisfy one parsed selector step.
 *
 * Matches are reported through $ret, keyed by the node's index in the owning
 * DOM's node list with the value 1; the method itself returns nothing.
 *
 * @param array $selector  Tuple (tag, key, val, exp, no_key) as produced by parse_selector().
 * @param array $ret       Result map, filled by reference.
 * @param bool  $lowercase When true, attribute values are compared case-insensitively.
 * @return void
 */
protected function seek($selector, &$ret, $lowercase = false) {
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index: pick the $key-th direct child with a matching tag.
    if ($tag && $key && is_numeric($key)) {
        $count = 0;
        foreach ($this->children as $c) {
            if ($tag === '*' || $tag === $c->tag) {
                if (++$count == $key) {
                    $ret[$c->_[GPTRANSLATE_HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    // Work out where this node's range ends in the DOM node list. When the
    // node has no end of its own, borrow it from the nearest ancestor that has one.
    $end = (! empty($this->_[GPTRANSLATE_HDOM_INFO_END])) ? $this->_[GPTRANSLATE_HDOM_INFO_END] : 0;
    if ($end == 0) {
        $parent = $this->parent;
        while ($parent !== null && ! isset($parent->_[GPTRANSLATE_HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        // The walk can run out of ancestors; previously null was dereferenced
        // here. Without an ancestor end, $end stays <= 0 and nothing is scanned.
        if ($parent !== null) {
            $end += $parent->_[GPTRANSLATE_HDOM_INFO_END];
        }
    }

    for ($i = $this->_[GPTRANSLATE_HDOM_INFO_BEGIN] + 1; $i < $end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        // A bare '*' selects direct children only.
        if ($tag === '*' && ! $key) {
            if (in_array($node, $this->children, true)) {
                $ret[$i] = 1;
            }
            continue;
        }

        // compare tag
        if ($tag && $tag != $node->tag && $tag !== '*') {
            $pass = false;
        }

        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) {
                    $pass = false;
                }
            } else {
                if (($key != "plaintext") && ! isset($node->attr[$key])) {
                    $pass = false;
                }
            }
        }

        // compare value
        if ($pass && $key && $val && $val !== '*') {
            if ($key == "plaintext") {
                // A "plaintext" search tests the node's plain text.
                $nodeKeyValue = $node->text();
            } else {
                // A normal search tests the value of that attribute.
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($this->gptranslate_debug_object)) {
                $this->gptranslate_debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);
            }

            // If lowercase is set, do a case insensitive test of the value of the selector.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($this->gptranslate_debug_object)) {
                $this->gptranslate_debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));
            }

            // handle multiple class: test each space separated class name on its own.
            if (! $check && strcasecmp($key, 'class') === 0) {
                foreach (explode(' ', $node->attr[$key]) as $k) {
                    // Skip the blanks produced by leading, trailing or double spaces.
                    if (! empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) {
                            break;
                        }
                    }
                }
            }
            if (! $check) {
                $pass = false;
            }
        }

        if ($pass) {
            $ret[$i] = 1;
        }
        unset($node);
    }

    // $ret is passed by reference, so it is what this function effectively returns.
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log(1, "EXIT - ret: ", $ret);
    }
}
/**
 * Test a value against a selector pattern using the given operator.
 *
 * '=' and '!=' compare strictly; '^=' and '$=' test a literal prefix/suffix;
 * '*=' treats the pattern as a regular expression (used verbatim when it
 * starts with '/', otherwise wrapped as a case-insensitive pattern).
 *
 * @param string $exp     Operator: '=', '!=', '^=', '$=' or '*='.
 * @param string $pattern Pattern from the selector.
 * @param string $value   Value taken from the node.
 * @return bool|int|false Boolean for '='/'!=', preg_match()'s result for the
 *                        other operators, false for an unknown operator.
 */
protected function match($exp, $pattern, $value) {
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    if ($exp == '=') {
        return ($value === $pattern);
    }
    if ($exp == '!=') {
        return ($value !== $pattern);
    }
    if ($exp == '^=') {
        return preg_match("/^" . preg_quote($pattern, '/') . "/", $value);
    }
    if ($exp == '$=') {
        return preg_match("/" . preg_quote($pattern, '/') . "$/", $value);
    }
    if ($exp == '*=') {
        $regex = ($pattern[0] == '/') ? $pattern : "/" . $pattern . "/i";
        return preg_match($regex, $value);
    }

    return false;
}
/**
 * Parse a selector string into groups of selector steps.
 *
 * @param string $selector_string Selector such as "div.item a[href^=http], p#intro".
 * @return array List of groups (one per comma separated selector); each group is a
 *               list of steps, each step being array(tag, key, val, exp, no_key).
 */
protected function parse_selector($selector_string) {
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    // Pattern of CSS selectors, modified from mootools.
    // The attribute name may contain a colon so that <tag attr:ibute="x"> can be
    // selected; such attributes must be read with getAttribute because
    // $dom->x:y is not valid PHP syntax.
    // The optional "@" right after "[" is accepted but not captured.
    $pattern = "/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string) . ' ', $matches, PREG_SET_ORDER);
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log(2, "Matches Array: ", $matches);
    }

    $selectors = array();
    $result = array();

    foreach ($matches as $m) {
        $whole = trim($m[0]);
        if ($whole === '' || $whole === '/' || $whole === '//') {
            continue;
        }
        // for browser generated xpath
        if ($m[1] === 'tbody') {
            continue;
        }

        // Defaults for one step; overridden below by whatever was captured.
        $tag = $m[1];
        $key = '';
        $val = null;
        $exp = '=';
        $no_key = false;

        if (! empty($m[2])) {
            $key = 'id';
            $val = $m[2];
        }
        if (! empty($m[3])) {
            $key = 'class';
            $val = $m[3];
        }
        if (! empty($m[4])) {
            $key = $m[4];
        }
        if (! empty($m[5])) {
            $exp = $m[5];
        }
        if (! empty($m[6])) {
            $val = $m[6];
        }

        // convert to lowercase
        if ($this->dom->lowercase) {
            $tag = strtolower($tag);
            $key = strtolower($key);
        }

        // elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0] === '!') {
            $key = substr($key, 1);
            $no_key = true;
        }

        $result[] = array($tag, $key, $val, $exp, $no_key);

        // A comma closes the current group.
        if (trim($m[7]) === ',') {
            $selectors[] = $result;
            $result = array();
        }
    }

    if (count($result) > 0) {
        $selectors[] = $result;
    }
    return $selectors;
}
/**
 * Magic read access.
 *
 * A set attribute wins and is returned charset-converted. Otherwise the
 * virtual properties outertext, innertext, plaintext and xmltext map to the
 * matching methods. Any other name yields a boolean telling whether the
 * attribute key exists.
 *
 * @param string $name Property or attribute name.
 * @return mixed
 */
function __get($name) {
    if (isset($this->attr[$name])) {
        return $this->convert_text($this->attr[$name]);
    }

    if ($name == 'outertext') {
        return $this->outertext();
    }
    if ($name == 'innertext') {
        return $this->innertext();
    }
    if ($name == 'plaintext') {
        return $this->text();
    }
    if ($name == 'xmltext') {
        return $this->xmltext();
    }

    return array_key_exists($name, $this->attr);
}
/**
 * Magic write access.
 *
 * 'outertext' stores an outer override; 'innertext' overwrites the stored
 * text when the node has one and stores an inner override otherwise. Any
 * other name sets an attribute; for an attribute that is not currently set,
 * default spacing and double-quote info entries are appended first.
 *
 * @param string $name  Property or attribute name.
 * @param mixed  $value New value.
 * @return mixed The assigned value for the two virtual properties, nothing otherwise.
 */
function __set($name, $value) {
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    if ($name == 'outertext') {
        return $this->_[GPTRANSLATE_HDOM_INFO_OUTER] = $value;
    }

    if ($name == 'innertext') {
        $slot = isset($this->_[GPTRANSLATE_HDOM_INFO_TEXT]) ? GPTRANSLATE_HDOM_INFO_TEXT : GPTRANSLATE_HDOM_INFO_INNER;
        return $this->_[$slot] = $value;
    }

    if (! isset($this->attr[$name])) {
        $this->_[GPTRANSLATE_HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[GPTRANSLATE_HDOM_INFO_QUOTE][] = GPTRANSLATE_HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
/**
 * Magic isset(): the virtual properties outertext, innertext and plaintext
 * always exist; anything else exists when it is a key of $this->attr
 * (including no-value attributes such as nowrap, checked, selected).
 *
 * @param string $name Property or attribute name.
 * @return bool
 */
function __isset($name) {
    switch ($name) {
        case 'outertext':
        case 'innertext':
        case 'plaintext':
            return true;
    }

    if (array_key_exists($name, $this->attr)) {
        return true;
    }
    return isset($this->attr[$name]);
}
/**
 * Magic unset(): remove an attribute from the node.
 *
 * makeup() looks up each attribute's spacing and quote style by the
 * attribute's ordinal position in $this->attr, and __set() appends one entry
 * to both info lists per new attribute. Removing only the attribute would
 * therefore shift every later attribute onto its predecessor's spacing/quote
 * entry, so the matching entries are removed as well.
 *
 * @param string $name Attribute name.
 * @return void
 */
function __unset($name) {
    if (! isset($this->attr[$name])) {
        return;
    }

    // Ordinal position of the attribute among the current attributes.
    $position = 0;
    foreach ($this->attr as $key => $unused) {
        if ((string) $key === (string) $name) {
            break;
        }
        ++$position;
    }

    unset($this->attr[$name]);

    // Keep the positional info lists aligned with the remaining attributes.
    // NOTE(review): assumes the parser fills these lists one entry per attribute,
    // in attribute order, as __set() does - confirm against the parser.
    foreach (array(GPTRANSLATE_HDOM_INFO_SPACE, GPTRANSLATE_HDOM_INFO_QUOTE) as $slot) {
        if (isset($this->_[$slot]) && is_array($this->_[$slot]) && array_key_exists($position, $this->_[$slot])) {
            array_splice($this->_[$slot], $position, 1);
        }
    }
}
| 745 | |
| 746 | // PaperG - Function to convert the text from one character set to another if the two sets are not the same. |
/**
 * Convert text from the DOM's source charset to its target charset when the
 * two differ, and strip a UTF-8 byte order mark from either end of the result.
 *
 * Text that already validates as UTF-8 is left alone when the target is
 * UTF-8, since the declared source charset may simply have been wrong.
 *
 * @param string $text Text to convert.
 * @return string The converted text, or the unconverted text when no
 *                conversion applies or the conversion fails.
 */
function convert_text($text) {
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    $converted_text = $text;

    $sourceCharset = "";
    $targetCharset = "";

    if ($this->dom) {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);
    }

    if (! empty($sourceCharset) && ! empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
        // The reported encoding may be wrong and the text already UTF-8.
        $alreadyUtf8 = (strcasecmp($targetCharset, 'UTF-8') == 0) && $this->is_utf8($text);
        if (! $alreadyUtf8) {
            $attempt = iconv($sourceCharset, $targetCharset, $text);
            // iconv() returns false on failure; keep the original text rather
            // than letting false (rendered as '') silently swallow the content.
            if ($attempt !== false) {
                $converted_text = $attempt;
            }
        }
    }

    // Make sure no UTF-8 BOM survives at either end of the text we output.
    if ($targetCharset == 'UTF-8') {
        if (substr($converted_text, 0, 3) == "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 3);
        }
        if (substr($converted_text, -3) == "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
| 786 | |
| 787 | /** |
| 788 | * Returns true if $string is valid UTF-8 and false otherwise. |
| 789 | * |
| 790 | * @param mixed $str |
| 791 | * String to be tested |
| 792 | * @return boolean |
| 793 | */ |
/**
 * Byte-level check that a string is structured as UTF-8: every lead byte is
 * followed by the right number of continuation bytes (0x80-0xBF).
 *
 * Non-string and empty input is reported as valid, as before. The legacy
 * 5- and 6-byte lead bytes (0xF8-0xFD) are still accepted.
 *
 * @param mixed $str Value to test.
 * @return bool
 */
static function is_utf8($str) {
    // Nothing to scan; checked once here instead of on every loop iteration.
    if (! is_string($str) || $str === '') {
        return true;
    }

    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);

        // Plain ASCII byte.
        if ($c < 128) {
            continue;
        }

        // Lead byte: determine the total length of the sequence. The test used
        // to be "$c > 128", which let a lone 0x80 continuation byte pass as valid.
        if ($c >= 254) {
            return false;
        } elseif ($c >= 252) {
            $bits = 6;
        } elseif ($c >= 248) {
            $bits = 5;
        } elseif ($c >= 240) {
            $bits = 4;
        } elseif ($c >= 224) {
            $bits = 3;
        } elseif ($c >= 192) {
            $bits = 2;
        } else {
            // 0x80-0xBF: a continuation byte where a lead byte is required.
            return false;
        }

        // The sequence must fit inside the string.
        if (($i + $bits) > $len) {
            return false;
        }

        // Every following byte of the sequence must be a continuation byte.
        while ($bits > 1) {
            $i++;
            $b = ord($str[$i]);
            if ($b < 128 || $b > 191) {
                return false;
            }
            $bits--;
        }
    }

    return true;
}
| 831 | |
/**
 * Tries to determine the displayed size of an img from its own markup.
 * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
 *
 * The width/height attributes are used when present; otherwise a pixel value
 * ("123px") from the inline style attribute is used. Classes, ids and external
 * stylesheets are not consulted.
 *
 * @author John Schlick
 * @version April 19 2012
 * @return array|false an array containing the 'height' and 'width' of the image on the page
 *         (-1 for a dimension we can't figure out), or false when this node is not an img.
 */
function get_display_size() {
    if ($this->tag !== 'img') {
        return false;
    }

    // -1 marks a dimension as unknown.
    $size = array (
        'height' => - 1,
        'width' => - 1
    );
    $dimensions = array (
        'width',
        'height'
    );

    // Attributes on the tag itself take precedence.
    foreach ( $dimensions as $dimension ) {
        if (isset ( $this->attr [$dimension] )) {
            $size [$dimension] = $this->attr [$dimension];
        }
    }

    // Now look for an inline style.
    if (isset ( $this->attr ['style'] )) {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = array ();
        preg_match_all ( "/([\w\-]+)\s*:\s*([^;]+)\s*;?/", $this->attr ['style'], $matches, PREG_SET_ORDER );
        foreach ( $matches as $match ) {
            // The value group is greedy and keeps trailing blanks ("100px ;"), so trim it.
            $declarations [$match [1]] = trim ( $match [2] );
        }

        foreach ( $dimensions as $dimension ) {
            if ($size [$dimension] != - 1 || ! isset ( $declarations [$dimension] )) {
                continue;
            }
            $value = $declarations [$dimension];
            // Only pixel values are understood.
            if (strtolower ( substr ( $value, - 2 ) ) !== 'px') {
                continue;
            }
            $pixels = trim ( substr ( $value, 0, - 2 ) );
            // filter_var() returns int 0 for "0", so compare against false explicitly.
            if (filter_var ( $pixels, FILTER_VALIDATE_INT ) !== false) {
                $size [$dimension] = $pixels;
            }
        }
    }

    return $size;
}
| 907 | |
// ---------------------------------------------------------------------
// camelCase aliases: each method forwards to a member of this node class.
// ---------------------------------------------------------------------

/** Returns the attribute array of this node. */
public function getAllAttributes() {
    return $this->attr;
}

/** Reads one attribute through the magic getter. */
public function getAttribute($name) {
    return $this->__get ( $name );
}

/** Writes one attribute through the magic setter. */
public function setAttribute($name, $value) {
    $this->__set ( $name, $value );
}

/** Asks the magic isset handler about one attribute. */
public function hasAttribute($name) {
    return $this->__isset ( $name );
}

/** Passes null to the magic setter for the given attribute. */
public function removeAttribute($name) {
    $this->__set ( $name, null );
}

/** Runs find() with the selector "#id", index 0. */
public function getElementById($id) {
    return $this->find ( '#' . $id, 0 );
}

/** Runs find() with the selector "#id" and the given index. */
public function getElementsById($id, $idx = null) {
    return $this->find ( '#' . $id, $idx );
}

/** Runs find() with the tag name as selector, index 0. */
public function getElementByTagName($name) {
    return $this->find ( $name, 0 );
}

/** Runs find() with the tag name as selector and the given index. */
public function getElementsByTagName($name, $idx = null) {
    return $this->find ( $name, $idx );
}

/** Alias of parent(). */
public function parentNode() {
    return $this->parent ();
}

/** Alias of children(). */
public function childNodes($idx = -1) {
    return $this->children ( $idx );
}

/** Alias of first_child(). */
public function firstChild() {
    return $this->first_child ();
}

/** Alias of last_child(). */
public function lastChild() {
    return $this->last_child ();
}

/** Alias of next_sibling(). */
public function nextSibling() {
    return $this->next_sibling ();
}

/** Alias of prev_sibling(). */
public function previousSibling() {
    return $this->prev_sibling ();
}

/** Alias of has_child(). */
public function hasChildNodes() {
    return $this->has_child ();
}

/** Returns the tag name of this node. */
public function nodeName() {
    return $this->tag;
}

/** Hands this node to $node->parent() and returns $node. */
public function appendChild($node) {
    $node->parent ( $this );
    return $node;
}
| 964 | } |
| 965 | |
| 966 | /** |
| 967 | * simple html dom parser |
| 968 | * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector. |
| 969 | * Paperg - change $size from protected to public so we can easily access it |
| 970 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it. |
| 971 | * |
| 972 | * @package PlaceLocalInclude |
| 973 | */ |
| 974 | class GPTranslateSimpleHtmlDom { |
    // Root node of the parsed tree; created by prepare().
    public $root = null;
    // Node list; reset by prepare() and walked by clear().
    public $nodes = array ();
    // Callback name stored by set_callback() / cleared by remove_callback().
    public $callback = null;
    // When true, tag and attribute names are lower-cased while parsing.
    public $lowercase = false;
    // Used to keep track of how large the text was when we started.
    public $original_size;
    // Length in bytes of the document currently being parsed.
    public $size;
    // Current byte offset into $doc.
    protected $pos;
    // The document text being parsed.
    protected $doc;
    // Character at $pos, or null once the end of the document is reached.
    protected $char;
    // Running counter advanced as nodes are created; stored in the nodes' BEGIN/END info.
    protected $cursor;
    // Node that newly parsed nodes are currently attached to.
    protected $parent;
    // Noise storage; reset by prepare().
    protected $noise = array ();
    // Character sets handed to skip() / copy_skip() / copy_until() while tokenizing.
    protected $token_blank = " \t\r\n";
    protected $token_equal = ' =/>';
    protected $token_slash = " />\r\n\t";
    protected $token_attr = ' >';
    // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
    // Set by parse_charset().
    public $_charset = '';
    // Set by the constructor.
    public $_target_charset = '';
    // Text stored as the inner text of every br element (see read_tag()).
    protected $default_br_text = "";
    // Set by prepare() from $defaultSpanText.
    public $default_span_text = "";
    // Optional logger; only used when it holds an object.
    protected $gptranslate_debug_object = null;

    // use isset instead of in_array, performance boost about 30%...
    // Tags that never become the current parent in read_tag().
    protected $self_closing_tags = array (
        'img' => 1,
        'br' => 1,
        'input' => 1,
        'meta' => 1,
        'link' => 1,
        'hr' => 1,
        'base' => 1,
        'embed' => 1,
        'spacer' => 1
    );
    // Tags whose end tag may close intermediate open elements (see the end-tag branch of read_tag()).
    protected $block_tags = array (
        'root' => 1,
        'body' => 1,
        'form' => 1,
        'div' => 1,
        'span' => 1,
        'table' => 1
    );
    // Known sourceforge issue #2977341
    // B tags that are not closed cause us to return everything to the end of the document.
    // Map: opening tag => set of currently open parent tags that it implicitly closes.
    protected $optional_closing_tags = array (
        'tr' => array (
            'tr' => 1,
            'td' => 1,
            'th' => 1
        ),
        'th' => array (
            'th' => 1
        ),
        'td' => array (
            'td' => 1
        ),
        'li' => array (
            'li' => 1
        ),
        'dt' => array (
            'dt' => 1,
            'dd' => 1
        ),
        'dd' => array (
            'dd' => 1,
            'dt' => 1
        ),
        'dl' => array (
            'dd' => 1,
            'dt' => 1
        ),
        'p' => array (
            'p' => 1
        ),
        'nobr' => array (
            'nobr' => 1
        ),
        'b' => array (
            'b' => 1
        ),
        'option' => array (
            'option' => 1
        )
    );
/**
 * Creates the parser and, when $str is given, loads it immediately.
 *
 * @param string|null $str
 *        	HTML text, or an http:// URL / existing file path to read the HTML from.
 * @param bool $lowercase
 *        	Lower-case tag and attribute names while parsing (string input only).
 * @param bool $forceTagsClosed
 *        	When false the HTML is trusted and no element is closed implicitly.
 * @param string $target_charset
 *        	Stored in $_target_charset.
 * @param bool $stripRN
 *        	Replace \r and \n with spaces before parsing (string input only).
 * @param string $defaultBRText
 *        	Text stored as the inner text of br elements.
 * @param string $defaultSpanText
 *        	Stored in $default_span_text.
 */
function __construct($str = null, $lowercase = true, $forceTagsClosed = true, $target_charset = GPTRANSLATE_DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = GPTRANSLATE_DEFAULT_BR_TEXT, $defaultSpanText = GPTRANSLATE_DEFAULT_SPAN_TEXT) {
    // Configure the parser BEFORE loading: read_tag() consults $optional_closing_tags while the
    // document is parsed, so changing it afterwards has no effect on the tree.
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
    if (! $forceTagsClosed) {
        $this->optional_closing_tags = array ();
    }
    $this->_target_charset = $target_charset;

    if ($str) {
        if (preg_match ( "/^http:\/\//i", $str ) || is_file ( $str )) {
            $this->load_file ( $str );
        } else {
            $this->load ( $str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText );
        }
    }
}
/**
 * Releases the node tree when the parser object is destroyed.
 */
public function __destruct() {
    $this->clear ();
}
| 1078 | |
/**
 * Loads html from a string and parses it into the node tree.
 *
 * @return $this so that calls can be chained
 */
function load($str, $lowercase = false, $stripRN = false, $defaultBRText = GPTRANSLATE_DEFAULT_BR_TEXT, $defaultSpanText = GPTRANSLATE_DEFAULT_SPAN_TEXT) {
    $this->prepare ( $str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText );

    // Noise passes, applied strictly in this order: array ( pattern, remove_tag ).
    // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
    // script removal precedes style removal.
    $noise_passes = array (
        // cdata
        array ( "'<!\[CDATA\[(.*?)\]\]>'is", true ),
        // comments
        array ( "'<!--(.*?)-->'is", false ),
        // <script> tags
        array ( "'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is", false ),
        array ( "'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is", false ),
        // <style> tags
        array ( "'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is", false ),
        array ( "'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is", false ),
        // <code> tags
        array ( "'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is", false ),
        // server side scripts
        array ( "'(<\?)(.*?)(\?>)'s", true ),
        // smarty scripts
        array ( "'(\{\w)(.*?)(\})'s", true )
    );
    foreach ( $noise_passes as $pass ) {
        $this->remove_noise ( $pass [0], $pass [1] );
    }

    // Keep parsing until parse() reports that nothing is left.
    do {
        $more = $this->parse ();
    } while ( $more );

    $this->root->_ [GPTRANSLATE_HDOM_INFO_END] = $this->cursor;
    $this->parse_charset ();

    return $this;
}
| 1113 | |
/**
 * Loads html from a file or URL. All arguments are forwarded to file_get_contents().
 *
 * @return false|null false when the contents could not be read (the parser is cleared
 *         in that case); nothing on success.
 */
function load_file() {
    $args = func_get_args ();
    $contents = call_user_func_array ( 'file_get_contents', $args );

    // Judge success by the return value of this very call. error_get_last() also reports
    // errors raised earlier in the request, which would discard a perfectly good document.
    if ($contents === false) {
        $this->clear ();
        return false;
    }

    $this->load ( $contents, true );
}
| 1124 | |
/**
 * Stores the callback function name in $this->callback.
 */
public function set_callback($function_name) {
    $this->callback = $function_name;
}

/**
 * Resets $this->callback to null.
 */
public function remove_callback() {
    $this->callback = null;
}
| 1134 | |
/**
 * Returns the dom as a string and, when $filepath is not '', also writes it
 * to that file under an exclusive lock.
 */
function save($filepath = '') {
    $html = $this->root->innertext ();
    if ($filepath === '') {
        return $html;
    }
    file_put_contents ( $filepath, $html, LOCK_EX );
    return $html;
}
| 1142 | |
/**
 * Finds dom nodes by css selector, delegating to the root node.
 * Paperg - $lowercase allows case insensitive testing of the value of the selector.
 */
public function find($selector, $idx = null, $lowercase = false) {
    $root = $this->root;
    return $root->find ( $selector, $idx, $lowercase );
}
| 1148 | |
/**
 * Cleans up memory (php5 circular references memory leak): clears every node,
 * then drops the parent/root links, the document text and the noise table.
 */
function clear() {
    foreach ( $this->nodes as $node ) {
        $node->clear ();
    }

    // Documented in the sourceforge repository (2977248) as a fix for ongoing memory leaks
    // that occur even with the use of clear.
    if (isset ( $this->children )) {
        foreach ( $this->children as $child ) {
            $child->clear ();
        }
    }

    if (isset ( $this->parent )) {
        $this->parent->clear ();
        unset ( $this->parent );
    }

    if (isset ( $this->root )) {
        $this->root->clear ();
        unset ( $this->root );
    }

    unset ( $this->doc, $this->noise );
}
/**
 * Dumps the tree by delegating to the root node's dump().
 */
public function dump($show_attr = true) {
    $root = $this->root;
    $root->dump ( $show_attr );
}
| 1175 | |
/**
 * Prepares the HTML data and resets all parser state for a fresh parse.
 */
protected function prepare($str, $lowercase = true, $stripRN = true, $defaultBRText = GPTRANSLATE_DEFAULT_BR_TEXT, $defaultSpanText = GPTRANSLATE_DEFAULT_SPAN_TEXT) {
    $this->clear ();

    // Remember how long the html was before we touch it. It might be useful to someone.
    $incoming_length = strlen ( $str );
    $this->size = $incoming_length;
    $this->original_size = $incoming_length;

    // Turn \r and \n into spaces if we are told to, then re-measure.
    if ($stripRN) {
        $str = str_replace ( array (
            "\r",
            "\n"
        ), " ", $str );
        $this->size = strlen ( $str );
    }

    // Cursor and bookkeeping.
    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array ();
    $this->nodes = array ();
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    // Fresh root node; it is also the initial parent for parsed nodes.
    $root = new GPTranslateSimpleHtmlDomNode ( $this );
    $this->root = $root;
    $root->tag = 'root';
    $root->_ [GPTRANSLATE_HDOM_INFO_BEGIN] = - 1;
    $root->nodetype = GPTRANSLATE_HDOM_TYPE_ROOT;
    $this->parent = $root;

    if ($this->size > 0) {
        $this->char = $this->doc [0];
    }
}
| 1210 | |
/**
 * Parses one piece of html content: the text up to the next '<' becomes a text
 * node; when there is no such text the tag reader takes over.
 *
 * @return bool true after adding a text node, otherwise whatever read_tag() returns
 */
protected function parse() {
    $text = $this->copy_until_char ( '<' );
    if ($text === '') {
        return $this->read_tag ();
    }

    $text_node = new GPTranslateSimpleHtmlDomNode ( $this );
    $this->cursor ++;
    $text_node->_ [GPTRANSLATE_HDOM_INFO_TEXT] = $text;
    $this->link_nodes ( $text_node, false );

    return true;
}
| 1224 | |
/**
 * PAPERG - dkchou - tries to identify the character set of the page we have just parsed so we
 * know better how to spit it out later. The result is stored in $this->_charset and returned.
 *
 * Lookup order:
 * 1. The charset parameter of the content type returned by a routine called
 *    get_last_retrieve_url_contents_content_type(), IF you provide one (the CURLINFO_CONTENT_TYPE
 *    from the last curl_exec, or the content_type header from the last transfer).
 * 2. The charset parameter of <meta http-equiv=Content-Type>; ISO-8859-1 when that tag has
 *    content but names no charset.
 * 3. mb_detect_encoding() over the plain text (UTF-8, CP1252), when available.
 * 4. UTF-8.
 *
 * ISO-8859-1 / Latin1 / Latin-1 are reported as their superset CP1252.
 *
 * @return string the detected charset
 */
protected function parse_charset() {
    $debug = is_object ( $this->gptranslate_debug_object ) ? $this->gptranslate_debug_object : null;

    // Captures just the charset token: optional blanks and quotes are skipped and the value stops
    // at the first blank, quote or ';' so charset="utf-8" and "charset=utf-8; x=y" both yield utf-8.
    $charset_pattern = '/charset\s*=\s*["\']?([^\s;"\']+)/i';

    $charset = null;

    if (function_exists ( 'get_last_retrieve_url_contents_content_type' )) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type ();
        if (is_string ( $contentTypeHeader ) && preg_match ( $charset_pattern, $contentTypeHeader, $matches )) {
            $charset = $matches [1];
            if ($debug) {
                $debug->debug_log ( 2, 'header content-type found charset of: ' . $charset );
            }
        }
    }

    if (empty ( $charset )) {
        $el = $this->root->find ( 'meta[http-equiv=Content-Type]', 0, true );
        if (! empty ( $el )) {
            $fullvalue = $el->content;
            if ($debug) {
                $debug->debug_log ( 2, 'meta content-type tag found' . $fullvalue );
            }

            if (! empty ( $fullvalue )) {
                if (preg_match ( $charset_pattern, $fullvalue, $matches )) {
                    $charset = $matches [1];
                } else {
                    // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                    if ($debug) {
                        $debug->debug_log ( 2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.' );
                    }
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // If we couldn't find a charset above, then lets try to detect one based on the text we got...
    if (empty ( $charset )) {
        // Stays false in case mb_detect_encoding isn't installed/loaded on this machine.
        $charset = false;
        if (function_exists ( 'mb_detect_encoding' )) {
            // Have php try to detect the encoding from the text given to us.
            $charset = mb_detect_encoding ( $this->root->plaintext . "ascii", array (
                "UTF-8",
                "CP1252"
            ) );
            if ($debug) {
                $debug->debug_log ( 2, 'mb_detect found: ' . $charset );
            }
        }

        // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
        if ($charset === false) {
            if ($debug) {
                $debug->debug_log ( 2, 'since mb_detect failed - using default of utf-8' );
            }
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of it's subsets, we want it instead.
    $cp1252_subsets = array (
        'iso-8859-1',
        'latin1',
        'latin-1'
    );
    if (in_array ( strtolower ( $charset ), $cp1252_subsets, true )) {
        if ($debug) {
            $debug->debug_log ( 2, 'replacing ' . $charset . ' with CP1252 as its a superset' );
        }
        $charset = 'CP1252';
    }

    if ($debug) {
        $debug->debug_log ( 1, 'EXIT - ' . $charset );
    }

    return $this->_charset = $charset;
}
| 1303 | |
/**
 * Reads one tag starting at the current '<' and links the resulting node into the tree.
 *
 * Handles, in this order: end tags, "<!...>" constructs (comments / doctype / other),
 * stray '<' that turn out to be text, and ordinary begin tags with their attributes.
 *
 * @return bool false when the current character is not '<' (nothing left to read), true otherwise
 */
protected function read_tag() {
    // Not positioned on a tag: record where the root ends and report that parsing is done.
    if ($this->char !== '<') {
        $this->root->_ [GPTRANSLATE_HDOM_INFO_END] = $this->cursor;
        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++ $this->pos < $this->size) ? $this->doc [$this->pos] : null; // next

    // end tag
    if ($this->char === '/') {
        $this->char = (++ $this->pos < $this->size) ? $this->doc [$this->pos] : null; // next
        // This represents the change in the GPTranslateSimpleHtmlDom trunk from revision 180 to 181.
        // $this->skip($this->token_blank_t);
        $this->skip ( $this->token_blank );
        $tag = $this->copy_until_char ( '>' );

        // skip attributes in end tag
        if (($pos = strpos ( $tag, ' ' )) !== false)
            $tag = substr ( $tag, 0, $pos );

        $parent_lower = strtolower ( $this->parent->tag );
        $tag_lower = strtolower ( $tag );

        // The end tag does not close the element that is currently open.
        if ($parent_lower !== $tag_lower) {
            if (isset ( $this->optional_closing_tags [$parent_lower] ) && isset ( $this->block_tags [$tag_lower] )) {
                // Open element may be closed implicitly and the end tag is a block tag:
                // walk up the ancestors looking for the element this end tag belongs to.
                $this->parent->_ [GPTRANSLATE_HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while ( ($this->parent->parent) && strtolower ( $this->parent->tag ) !== $tag_lower )
                    $this->parent = $this->parent->parent;

                // No matching ancestor: step up one level from the original parent and keep the end tag as text.
                if (strtolower ( $this->parent->tag ) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    if ($this->parent->parent)
                        $this->parent = $this->parent->parent;
                    $this->parent->_ [GPTRANSLATE_HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node ( $tag );
                }
            } else if (($this->parent->parent) && isset ( $this->block_tags [$tag_lower] )) {
                // Block end tag below a non-root parent: same ancestor search as above.
                $this->parent->_ [GPTRANSLATE_HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while ( ($this->parent->parent) && strtolower ( $this->parent->tag ) !== $tag_lower )
                    $this->parent = $this->parent->parent;

                // No matching ancestor: stay on the original parent and keep the end tag as text.
                if (strtolower ( $this->parent->tag ) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_ [GPTRANSLATE_HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node ( $tag );
                }
            } else if (($this->parent->parent) && strtolower ( $this->parent->parent->tag ) === $tag_lower) {
                // The end tag matches the grandparent: close the current element implicitly.
                $this->parent->_ [GPTRANSLATE_HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            } else
                // Unmatched end tag: keep it as plain text.
                return $this->as_text_node ( $tag );
        }

        // Close the matched element and make its parent (if any) current again.
        $this->parent->_ [GPTRANSLATE_HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent)
            $this->parent = $this->parent->parent;

        $this->char = (++ $this->pos < $this->size) ? $this->doc [$this->pos] : null; // next
        return true;
    }

    // Anything else starts a new node; the tag name runs up to the first blank, '/' or '>'.
    $node = new GPTranslateSimpleHtmlDomNode ( $this );
    $node->_ [GPTRANSLATE_HDOM_INFO_BEGIN] = $this->cursor;
    ++ $this->cursor;
    $tag = $this->copy_until ( $this->token_slash );
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    if (isset ( $tag [0] ) && $tag [0] === '!') {
        $node->_ [GPTRANSLATE_HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char ( '>' );

        // "!--" marks a comment; every other "<!" construct is typed as unknown.
        if (isset ( $tag [2] ) && $tag [1] === '-' && $tag [2] === '-') {
            $node->nodetype = GPTRANSLATE_HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = GPTRANSLATE_HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char === '>')
            $node->_ [GPTRANSLATE_HDOM_INFO_TEXT] .= '>';
        $this->link_nodes ( $node, true );
        $this->char = (++ $this->pos < $this->size) ? $this->doc [$this->pos] : null; // next
        return true;
    }

    // text
    // The supposed tag name contains another '<', so it is kept as text.
    // NOTE: because of operator precedence $pos receives the boolean result of the comparison;
    // it is not read again in this branch.
    if ($pos = strpos ( $tag, '<' ) !== false) {
        $tag = '<' . substr ( $tag, 0, - 1 );
        $node->_ [GPTRANSLATE_HDOM_INFO_TEXT] = $tag;
        $this->link_nodes ( $node, false );
        $this->char = $this->doc [-- $this->pos]; // prev
        return true;
    }

    // Not a plausible tag name (only word characters, '-' and ':' are allowed): keep it as text.
    if (! preg_match ( "/^[\w\-:]+$/", $tag )) {
        $node->_ [GPTRANSLATE_HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until ( '<>' );
        if ($this->char === '<') {
            $this->link_nodes ( $node, false );
            return true;
        }

        if ($this->char === '>')
            $node->_ [GPTRANSLATE_HDOM_INFO_TEXT] .= '>';
        $this->link_nodes ( $node, false );
        $this->char = (++ $this->pos < $this->size) ? $this->doc [$this->pos] : null; // next
        return true;
    }

    // begin tag
    $node->nodetype = GPTRANSLATE_HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower ( $tag );
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    // Opening this tag implicitly closes every open parent listed for it in $optional_closing_tags.
    if (isset ( $this->optional_closing_tags [$tag_lower] )) {
        while ( isset ( $this->optional_closing_tags [$tag_lower] [strtolower ( $this->parent->tag )] ) ) {
            $this->parent->_ [GPTRANSLATE_HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop
    // $space holds the whitespace found at three spots around an attribute:
    // [0] before the name, [1] after the name, [2] after the '=' (filled in by parse_attr()).
    $space = array (
        $this->copy_skip ( $this->token_blank ),
        '',
        ''
    );

    // attributes
    do {
        // No blank after the tag name / previous attribute: there are no further attributes.
        if ($this->char !== null && $space [0] === '') {
            break;
        }
        $name = $this->copy_until ( $this->token_equal );
        // The position did not move since the last round: force progress by one character.
        if ($guard === $this->pos) {
            $this->char = (++ $this->pos < $this->size) ? $this->doc [$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        // The document ends before the tag is closed: keep what was read as text.
        if ($this->pos >= $this->size - 1 && $this->char !== '>') {
            $node->nodetype = GPTRANSLATE_HDOM_TYPE_TEXT;
            $node->_ [GPTRANSLATE_HDOM_INFO_END] = 0;
            $node->_ [GPTRANSLATE_HDOM_INFO_TEXT] = '<' . $tag . $space [0] . $name;
            $node->tag = 'text';
            $this->link_nodes ( $node, false );
            return true;
        }

        // handle mismatch '<'
        // Another '<' was swallowed as part of the attribute name: turn this node into text
        // and step back so that the '<' is read again as the start of the next tag.
        if ($this->doc [$this->pos - 1] == '<') {
            $node->nodetype = GPTRANSLATE_HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array ();
            $node->_ [GPTRANSLATE_HDOM_INFO_END] = 0;
            $node->_ [GPTRANSLATE_HDOM_INFO_TEXT] = substr ( $this->doc, $begin_tag_pos, $this->pos - $begin_tag_pos - 1 );
            $this->pos -= 2;
            $this->char = (++ $this->pos < $this->size) ? $this->doc [$this->pos] : null; // next
            $this->link_nodes ( $node, false );
            return true;
        }

        if ($name !== '/' && $name !== '') {
            $space [1] = $this->copy_skip ( $this->token_blank );
            $name = $this->restore_noise ( $name );
            if ($this->lowercase)
                $name = strtolower ( $name );
            if ($this->char === '=') {
                $this->char = (++ $this->pos < $this->size) ? $this->doc [$this->pos] : null; // next
                $this->parse_attr ( $node, $name, $space );
            } else {
                // no value attr: nowrap, checked selected...
                $node->_ [GPTRANSLATE_HDOM_INFO_QUOTE] [] = GPTRANSLATE_HDOM_QUOTE_NO;
                $node->attr [$name] = true;
                if ($this->char != '>')
                    $this->char = $this->doc [-- $this->pos]; // prev
            }
            // Remember the whitespace around this attribute and start collecting for the next one.
            $node->_ [GPTRANSLATE_HDOM_INFO_SPACE] [] = $space;
            $space = array (
                $this->copy_skip ( $this->token_blank ),
                '',
                ''
            );
        } else
            break;
    } while ( $this->char !== '>' && $this->char !== '/' );

    $this->link_nodes ( $node, true );
    $node->_ [GPTRANSLATE_HDOM_INFO_ENDSPACE] = $space [0];

    // check self closing
    if ($this->copy_until_char_escape ( '>' ) === '/') {
        $node->_ [GPTRANSLATE_HDOM_INFO_ENDSPACE] .= '/';
        $node->_ [GPTRANSLATE_HDOM_INFO_END] = 0;
    } else {
        // reset parent
        // Tags listed in $self_closing_tags never become the current parent.
        if (! isset ( $this->self_closing_tags [strtolower ( $node->tag )] ))
            $this->parent = $node;
    }
    $this->char = (++ $this->pos < $this->size) ? $this->doc [$this->pos] : null; // next

    // If it's a BR tag, we need to set it's text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // since a br tag never has sub nodes, this works well.
    if ($node->tag == "br") {
        $node->_ [GPTRANSLATE_HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
| 1521 | |
/**
 * Parses the value of the attribute $name (the '=' has already been consumed)
 * and stores it on $node together with the quote style that was used.
 */
protected function parse_attr($node, $name, &$space) {
    // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
    // When an attribute is defined more than once inside a tag, the first definition wins.
    if (isset ( $node->attr [$name] )) {
        return;
    }

    $space [2] = $this->copy_skip ( $this->token_blank );

    $delimiter = $this->char;
    if ($delimiter === '"' || $delimiter === '\'') {
        // Quoted value: read up to the matching, unescaped quote.
        $node->_ [GPTRANSLATE_HDOM_INFO_QUOTE] [] = ($delimiter === '"') ? GPTRANSLATE_HDOM_QUOTE_DOUBLE : GPTRANSLATE_HDOM_QUOTE_SINGLE;
        $this->char = (++ $this->pos < $this->size) ? $this->doc [$this->pos] : null; // step over the opening quote
        $node->attr [$name] = $this->restore_noise ( $this->copy_until_char_escape ( $delimiter ) );
        $this->char = (++ $this->pos < $this->size) ? $this->doc [$this->pos] : null; // step over the closing quote
    } else {
        // Unquoted value: read up to the next blank or '>'.
        $node->_ [GPTRANSLATE_HDOM_INFO_QUOTE] [] = GPTRANSLATE_HDOM_QUOTE_NO;
        $node->attr [$name] = $this->restore_noise ( $this->copy_until ( $this->token_attr ) );
    }

    // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
    $node->attr [$name] = str_replace ( array (
        "\r",
        "\n"
    ), "", $node->attr [$name] );

    // PaperG: a "class" value loses its leading and trailing space since some people leave it in the multi class case.
    if ($name == "class") {
        $node->attr [$name] = trim ( $node->attr [$name] );
    }
}
| 1556 | |
/**
 * Links $node to the current parent: always into the parent's node list and,
 * when $is_child is set, into its children list as well.
 */
protected function link_nodes(&$node, $is_child) {
    $owner = $this->parent;
    $node->parent = $owner;
    $owner->nodes [] = $node;
    if (! $is_child) {
        return;
    }
    $owner->children [] = $node;
}
| 1565 | |
/**
 * Stores an end tag as a text node ("</tag>") and advances past the current character.
 *
 * @return bool always true
 */
protected function as_text_node($tag) {
    $text_node = new GPTranslateSimpleHtmlDomNode ( $this );
    $this->cursor ++;
    $text_node->_ [GPTRANSLATE_HDOM_INFO_TEXT] = "</{$tag}>";
    $this->link_nodes ( $text_node, false );

    $this->pos ++;
    $this->char = ($this->pos < $this->size) ? $this->doc [$this->pos] : null; // next
    return true;
}
/**
 * Advances past the run of characters from $chars at the current position.
 */
protected function skip($chars) {
    $run = strspn ( $this->doc, $chars, $this->pos );
    $this->pos += $run;
    $this->char = ($this->pos < $this->size) ? $this->doc [$this->pos] : null; // next
}

/**
 * Like skip(), but returns the characters that were skipped ('' when there were none).
 */
protected function copy_skip($chars) {
    $start = $this->pos;
    $run = strspn ( $this->doc, $chars, $start );
    $this->pos = $start + $run;
    $this->char = ($this->pos < $this->size) ? $this->doc [$this->pos] : null; // next
    return ($run === 0) ? '' : substr ( $this->doc, $start, $run );
}

/**
 * Advances to the first character that is in $chars and returns the text that was passed over.
 */
protected function copy_until($chars) {
    $start = $this->pos;
    $run = strcspn ( $this->doc, $chars, $start );
    $this->pos = $start + $run;
    $this->char = ($this->pos < $this->size) ? $this->doc [$this->pos] : null; // next
    return substr ( $this->doc, $start, $run );
}
// Consume and return the text between the current position and the next
// occurrence of $char (exclusive).
//  - returns '' when already at end of document or when $char is at the
//    current position;
//  - when $char does not occur again, returns the rest of the document and
//    moves the position to the end (current character becomes null).
protected function copy_until_char($char) {
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $hit = strpos($this->doc, $char, $from);

    if ($hit === false) {
        // No terminator left: hand back the remainder and park at the end.
        $rest = substr($this->doc, $from, $this->size - $from);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    if ($hit === $from) {
        return '';
    }

    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return substr($this->doc, $from, $hit - $from);
}
// Same contract as copy_until_char(), except that an occurrence of $char
// immediately preceded by a backslash is treated as escaped and skipped.
// Only the single preceding byte is inspected.
protected function copy_until_char_escape($char) {
    if ($this->char === null) {
        return '';
    }

    $searchFrom = $this->pos;
    while (true) {
        $hit = strpos($this->doc, $char, $searchFrom);

        if ($hit === false) {
            // No unescaped terminator left: return the remainder and park
            // at the end of the document.
            $rest = substr($this->doc, $this->pos, $this->size - $this->pos);
            $this->char = null;
            $this->pos = $this->size;
            return $rest;
        }

        if ($hit === $this->pos) {
            return '';
        }

        if ($this->doc[$hit - 1] !== '\\') {
            break;
        }

        // Escaped occurrence: keep looking just past it.
        $searchFrom = $hit + 1;
    }

    $begin = $this->pos;
    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return substr($this->doc, $begin, $hit - $begin);
}
| 1640 | |
| 1641 | // remove noise from html content |
| 1642 | // save the noise in the $this->noise array. |
// Replace every match of $pattern in $this->doc with a placeholder key and
// remember the removed text in $this->noise under that key.
//
// $pattern    PCRE pattern applied to the whole document.
// $remove_tag When truthy the whole match (group 0) is replaced; otherwise
//             only capture group 1 is replaced.
//
// Keys have the form '___noise___' followed by a 5-character, space-padded
// number that starts at 1000 and grows with the size of $this->noise.
protected function remove_noise($pattern, $remove_tag = false) {
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    $total = preg_match_all($pattern, $this->doc, $found, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
    $group = $remove_tag ? 0 : 1;

    // Walk the matches last-to-first so the byte offsets of the matches
    // still to be processed are not shifted by earlier replacements.
    for ($i = $total - 1; $i >= 0; $i--) {
        $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);
        if (is_object($this->gptranslate_debug_object)) {
            $this->gptranslate_debug_object->debug_log(2, 'key is: ' . $key);
        }

        $removed = $found[$i][$group][0];
        $offset = $found[$i][$group][1];

        $this->noise[$key] = $removed;
        $this->doc = substr_replace($this->doc, $key, $offset, strlen($removed));
    }

    // The document changed length; resync the cached size and first char.
    $this->size = strlen($this->doc);
    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
| 1666 | |
| 1667 | // restore noise to html content |
// Put previously removed noise back into $text.
//
// Every '___noise___' placeholder followed by its 5-character id is looked
// up in $this->noise and replaced by the stored text.
//  - Unknown keys are replaced by 'UNDEFINED NOISE FOR KEY: <key>'.
//  - A marker too close to the end of $text to carry an id is replaced by
//    'NO NUMERIC NOISE KEY'.
//
// Returns the text with all placeholders processed.
function restore_noise($text) {
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    $marker = '___noise___';
    $markerLen = strlen($marker); // 11
    $keyLen = $markerLen + 5;     // marker + 5-character id = 16

    // Where the next search starts. Without it, the "undefined key" message
    // (which echoes the key, marker included) would be found again on the
    // next pass and the loop would never terminate.
    $offset = 0;

    while (($pos = strpos($text, $marker, $offset)) !== false) {
        // Sometimes the markup is broken and the id that should follow the
        // marker (bytes pos+11 .. pos+15) is missing.
        if (strlen($text) > $pos + 15) {
            $key = substr($text, $pos, $keyLen);
            if (is_object($this->gptranslate_debug_object)) {
                $this->gptranslate_debug_object->debug_log(2, 'located key of: ' . $key);
            }

            if (isset($this->noise[$key])) {
                $text = substr($text, 0, $pos) . $this->noise[$key] . substr($text, $pos + $keyLen);
                // Rescan from the same spot: restored noise may itself
                // contain placeholders that still need restoring.
                $offset = $pos;
            } else {
                $notice = 'UNDEFINED NOISE FOR KEY: ' . $key;
                $text = substr($text, 0, $pos) . $notice . substr($text, $pos + $keyLen);
                // Continue after the notice so the echoed key is not
                // matched again.
                $offset = $pos + strlen($notice);
            }
        } else {
            // No complete key follows the marker; drop the marker itself so
            // it cannot be matched again.
            $text = substr($text, 0, $pos) . 'NO NUMERIC NOISE KEY' . substr($text, $pos + $markerLen);
            $offset = $pos;
        }
    }

    return $text;
}
| 1694 | |
| 1695 | // Sometimes we NEED one of the noise elements. |
// Look through the stored noise fragments and return the first one that
// contains $text as a substring; returns null when none does.
function search_noise($text) {
    if (is_object($this->gptranslate_debug_object)) {
        $this->gptranslate_debug_object->debug_log_entry(1);
    }

    foreach ($this->noise as $fragment) {
        if (strpos($fragment, $text) === false) {
            continue;
        }
        return $fragment;
    }

    return null;
}
// String conversion of the DOM: the inner text of the root node.
function __toString() {
    $root = $this->root;
    return $root->innertext();
}
// Magic read access for a handful of virtual properties:
//   outertext, innertext -> root node's innertext()
//   plaintext            -> root node's text()
//   charset              -> $this->_charset
//   target_charset       -> $this->_target_charset
// Any other name yields null.
function __get($name) {
    switch ($name) {
        case 'outertext':
        case 'innertext':
            return $this->root->innertext();

        case 'plaintext':
            return $this->root->text();

        case 'charset':
            return $this->_charset;

        case 'target_charset':
            return $this->_target_charset;

        default:
            return null;
    }
}
| 1724 | |
| 1725 | // camel naming conventions |
// camelCase alias: forwards to the root node's childNodes() with $idx.
function childNodes($idx = -1) {
    $root = $this->root;
    return $root->childNodes($idx);
}
// camelCase alias: forwards to the root node's first_child().
function firstChild() {
    $root = $this->root;
    return $root->first_child();
}
// camelCase alias: forwards to the root node's last_child().
function lastChild() {
    $root = $this->root;
    return $root->last_child();
}
// Build the markup "<$name>$value</$name>", parse it with
// gptranslate_simplehtmldom_str_get_html() and return the first child of
// the parsed result. Errors raised by the expression are silenced with @.
function createElement($name, $value = null) {
    return @gptranslate_simplehtmldom_str_get_html('<' . $name . '>' . $value . '</' . $name . '>')->first_child();
}
// Parse $value with gptranslate_simplehtmldom_str_get_html() and return
// the last entry of the parsed result's node list. Errors raised along the
// way are silenced with @.
function createTextNode($value) {
    $parsed = @gptranslate_simplehtmldom_str_get_html($value);
    return @end($parsed->nodes);
}
// camelCase alias: find() with the selector "#<id>", index 0.
function getElementById($id) {
    $selector = '#' . $id;
    return $this->find($selector, 0);
}
// camelCase alias: find() with the selector "#<id>" and the given $idx
// (null by default).
function getElementsById($id, $idx = null) {
    $selector = '#' . $id;
    return $this->find($selector, $idx);
}
// camelCase alias: find() with $name as the selector, index 0.
function getElementByTagName($name) {
    $selector = $name;
    return $this->find($selector, 0);
}
// camelCase alias: find() with $name as the selector and the given $idx
// (-1 by default).
function getElementsByTagName($name, $idx = -1) {
    $selector = $name;
    return $this->find($selector, $idx);
}
// camelCase alias for load_file(). Returns nothing.
// NOTE(review): the collected arguments are handed to load_file() as ONE
// array argument rather than being spread into separate arguments; confirm
// that this is what load_file() expects.
function loadFile() {
    $this->load_file(func_get_args());
}
| 1757 | } |