gan_tokenizer.php — MailPoet – Newsletters, Email Marketing, and Automation 4.4.0

mailpoet / lib-3rd-party / pquery / gan_tokenizer.php

mailpoet / lib-3rd-party / pquery Last commit date

third_party 3 years ago IQuery.php 3 years ago LICENSE 4 years ago gan_formatter.php 3 years ago gan_node_html.php 3 years ago gan_parser_html.php 3 years ago gan_selector_html.php 3 years ago gan_tokenizer.php 3 years ago gan_xml2array.php 3 years ago ganon.php 3 years ago index.php 4 years ago pQuery.php 3 years ago

gan_tokenizer.php

568 lines

1	<?php // phpcs:ignore SlevomatCodingStandard.TypeHints.DeclareStrictTypes.DeclareStrictTypesMissing
2	/**
3	* @author Niels A.D.
4	* @author Todd Burry <todd@vanillaforums.com>
5	* @copyright 2010 Niels A.D., 2014 Todd Burry
6	* @license http://opensource.org/licenses/LGPL-2.1 LGPL-2.1
7	* @package pQuery
8	*/
9
10	namespace MailPoetVendor\pQuery;
11
12	if (!defined('ABSPATH')) exit;
13
14
/**
 * Converts a document into tokens
 *
 * Can convert any string into tokens. The base class only supports
 * identifier/whitespace tokens. For more tokens, the class can be
 * easily extended.
 *
 * The constructor already advances to the first token, so read the current
 * token before calling {@link next()}:
 * <code>
 * $a = new TokenizerBase('hello world');
 * while ($a->token !== $a::TOK_NULL) {
 *   echo $a->token, ': ', $a->getTokenString(), "<br>\n";
 *   $a->next();
 * }
 * </code>
 *
 * @internal The tokenizer works with a character map that connects a certain
 * character to a certain function/token. This class is built with speed in mind.
 */
class TokenizerBase {

  /**
   * NULL Token, used at end of document (parsing should stop after this token)
   */
  const TOK_NULL = 0;
  /**
   * Unknown token, used at unidentified character
   */
  const TOK_UNKNOWN = 1;
  /**
   * Whitespace token, used with whitespace
   */
  const TOK_WHITESPACE = 2;
  /**
   * Identifier token, used with identifiers
   */
  const TOK_IDENTIFIER = 3;

  /**
   * The document that is being tokenized
   * @var string
   * @internal Public for faster access!
   * @see setDoc()
   * @see getDoc()
   * @access private
   */
  public $doc = '';

  /**
   * The size of the document (length of string in bytes)
   * @var int
   * @internal Public for faster access!
   * @see $doc
   * @access private
   */
  public $size = 0;

  /**
   * Current (character) position in the document.
   * After a multi-character token this is the index of the token's LAST character.
   * @var int
   * @internal Public for faster access!
   * @see setPos()
   * @see getPos()
   * @access private
   */
  public $pos = 0;

  /**
   * Current (Line/Column) position in document
   * @var array (Current_Line, Line_Starting_Pos) - the line is zero-based, the
   * second entry is the index of the most recent line break character
   * @internal Public for faster access!
   * @see getLinePos()
   * @access private
   */
  public $line_pos = array(0, 0);

  /**
   * Current token
   * @var int
   * @internal Public for faster access!
   * @see getToken()
   * @access private
   */
  public $token = self::TOK_NULL;

  /**
   * Start position of token. If NULL, then current position is used.
   * @var int|null
   * @internal Public for faster access!
   * @see getTokenString()
   * @access private
   */
  public $token_start = null;

  /**
   * List with all the characters that can be considered as whitespace
   * @var array|string
   * @internal Variable is public + associated array for faster access!
   * @internal array(' ' => true) will recognize space (' ') as whitespace
   * @internal String will be converted to array in constructor
   * @internal Result token will be {@link self::TOK_WHITESPACE};
   * @see setWhitespace()
   * @see getWhitespace()
   * @access private
   */
  public $whitespace = " \t\n\r\0\x0B";

  /**
   * List with all the characters that can be considered as identifier
   * @var array|string
   * @internal Variable is public + associated array for faster access!
   * @internal array('a' => true) will recognize 'a' as identifier
   * @internal String will be converted to array in constructor
   * @internal Result token will be {@link self::TOK_IDENTIFIER};
   * @see setIdentifiers()
   * @see getIdentifiers()
   * @access private
   */
  public $identifiers = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_';

  /**
   * All characters that should be mapped to a token/function that cannot be considered as whitespace or identifier
   * @var array
   * @internal Variable is public + associated array for faster access!
   * @internal array('a' => 'parse_a') will call $this->parse_a() if it matches the character 'a'
   * @internal array('a' => self::TOK_A) will set token to TOK_A if it matches the character 'a'
   * @see mapChar()
   * @see unmapChar()
   * @access private
   */
  public $custom_char_map = array();

  /**
   * Automatically built character map. Built using {@link $identifiers}, {@link $whitespace} and {@link $custom_char_map}
   * @var array
   * @internal Public for faster access!
   * @access private
   */
  public $char_map = array();

  /**
   * All errors found while parsing the document
   * @var array
   * @see addError()
   */
  public $errors = array();

  /**
   * Class constructor
   *
   * Converts the whitespace/identifier definitions to lookup arrays and then
   * loads the document, which immediately reads the first token.
   * @param string $doc Document to be tokenized
   * @param int $pos Position to start parsing
   * @see setDoc()
   * @see setPos()
   */
  public function __construct($doc = '', $pos = 0) {
    $this->setWhitespace($this->whitespace);
    $this->setIdentifiers($this->identifiers);

    $this->setDoc($doc, $pos);
  }

  /**
   * Sets target document
   * @param string $doc Document to be tokenized
   * @param int $pos Position to start parsing
   * @see getDoc()
   * @see setPos()
   */
  public function setDoc($doc, $pos = 0) {
    $this->doc = $doc;
    $this->size = strlen($doc);
    $this->setPos($pos);
  }

  /**
   * Returns target document
   * @return string
   * @see setDoc()
   */
  public function getDoc() {
    return $this->doc;
  }

  /**
   * Sets position in document and reads the token found there
   *
   * The line/column bookkeeping is reset to the first line, whatever $pos is.
   * @param int $pos
   * @see getPos()
   */
  public function setPos($pos = 0) {
    // next() pre-increments, so step back one to land exactly on $pos.
    $this->pos = $pos - 1;
    $this->line_pos = array(0, 0);
    $this->next();
  }

  /**
   * Returns current position in document (Index)
   * @return int
   * @see setPos()
   */
  public function getPos() {
    return $this->pos;
  }

  /**
   * Returns current position in document (Line/Char)
   * @return array array(Line, Column) - the line is zero-based, the column is
   * the distance from the most recent line break character
   */
  public function getLinePos() {
    return array($this->line_pos[0], $this->pos - $this->line_pos[1]);
  }

  /**
   * Returns current token
   * @return int
   * @see $token
   */
  public function getToken() {
    return $this->token;
  }

  /**
   * Returns current token as string
   * @param int $start_offset Offset from token start
   * @param int $end_offset Offset from token end
   * @return string Empty string if the resulting range is empty or lies past the document end
   */
  public function getTokenString($start_offset = 0, $end_offset = 0) {
    $start = (is_int($this->token_start) ? $this->token_start : $this->pos) + $start_offset;
    $length = $this->pos - $start + 1 + $end_offset;
    if ($length <= 0) {
      return '';
    }

    // substr() yields false instead of '' on PHP < 8 when $start is past the end.
    return (string) substr($this->doc, $start, $length);
  }

  /**
   * Sets characters to be recognized as whitespace
   *
   * Used like: setWhitespace('ab') or setWhitespace(array('a' => true, 'b', 'c'));
   * @param string|array $ws
   * @see getWhitespace();
   */
  public function setWhitespace($ws) {
    $this->whitespace = self::toCharSet($ws);
    $this->buildCharMap();
  }

  /**
   * Returns whitespace characters as string/array
   * @param bool $as_string Should the result be a string or an array?
   * @return string|array
   * @see setWhitespace()
   */
  public function getWhitespace($as_string = true) {
    $chars = array_keys($this->whitespace);
    if ($as_string) {
      return implode('', $chars);
    }
    return $chars;
  }

  /**
   * Sets characters to be recognized as identifier
   *
   * Used like: setIdentifiers('ab') or setIdentifiers(array('a' => true, 'b', 'c'));
   * @param string|array $ident
   * @see getIdentifiers();
   */
  public function setIdentifiers($ident) {
    $this->identifiers = self::toCharSet($ident);
    $this->buildCharMap();
  }

  /**
   * Returns identifier characters as string/array
   *
   * In array form digit characters come back as integers, because PHP stores
   * numeric string array keys as integers.
   * @param bool $as_string Should the result be a string or an array?
   * @return string|array
   * @see setIdentifiers()
   */
  public function getIdentifiers($as_string = true) {
    $chars = array_keys($this->identifiers);
    if ($as_string) {
      return implode('', $chars);
    }
    return $chars;
  }

  /**
   * Maps a custom character to a token/function
   *
   * Used like: mapChar('a', self::{@link TOK_IDENTIFIER}) or mapChar('a', 'parse_identifier');
   * An existing custom mapping for $char is replaced. Characters registered as
   * whitespace or identifier keep their built-in handling (see {@link buildCharMap()}).
   * @param string $char Character that should be mapped
   * @param int|string $map If function name, then $this->function will be called, otherwise token is set to $map
   * @see unmapChar()
   */
  public function mapChar($char, $map) {
    $this->custom_char_map[$char] = $map;
    $this->buildCharMap();
  }

  /**
   * Removes a char mapped with {@link mapChar()}
   * @param string $char Character that should be unmapped
   * @see mapChar()
   */
  public function unmapChar($char) {
    unset($this->custom_char_map[$char]);
    $this->buildCharMap();
  }

  /**
   * Builds the {@link $char_map} array
   *
   * Precedence (later wins): custom map, whitespace, identifiers.
   * @internal Builds single array that maps all characters. Gets called if {@link $whitespace}, {@link $identifiers} or {@link $custom_char_map} get modified
   */
  protected function buildCharMap() {
    $map = $this->custom_char_map;

    // During construction these may still be the raw strings; skip them until
    // the matching setter has converted them to lookup arrays.
    if (is_array($this->whitespace)) {
      foreach (array_keys($this->whitespace) as $char) {
        $map[$char] = 'parse_whitespace';
      }
    }
    if (is_array($this->identifiers)) {
      foreach (array_keys($this->identifiers) as $char) {
        $map[$char] = 'parse_identifier';
      }
    }

    $this->char_map = $map;
  }

  /**
   * Add error to the array and appends current position (one-based line and column)
   * @param string $error
   */
  public function addError($error) {
    $line = $this->line_pos[0] + 1;
    $column = $this->pos - $this->line_pos[1] + 1;
    $this->errors[] = htmlentities($error.' at '.$line.', '.$column.'!');
  }

  /**
   * Parse line breaks and increase line number
   *
   * Handles "\n", "\r" and "\r\n"; for "\r\n" the position is moved onto the "\n".
   * Does nothing if the current character is not a line break.
   * @internal Gets called to process line breaks
   */
  protected function parse_linebreak() {
    $char = $this->doc[$this->pos];

    if ($char === "\r") {
      ++$this->line_pos[0];
      // Swallow the "\n" of a "\r\n" pair so it is not counted twice.
      if ((($this->pos + 1) < $this->size) && ($this->doc[$this->pos + 1] === "\n")) {
        ++$this->pos;
      }
      $this->line_pos[1] = $this->pos;
    } elseif ($char === "\n") {
      ++$this->line_pos[0];
      $this->line_pos[1] = $this->pos;
    }
  }

  /**
   * Parse whitespace
   *
   * Consumes the whole run of whitespace and counts every line break in it,
   * including one at the very first character of the run.
   * @return int Token
   * @internal Gets called with {@link $whitespace} characters
   */
  protected function parse_whitespace() {
    $this->token_start = $this->pos;

    do {
      // No caller counts the character that triggered this method, so the
      // first character has to be checked here as well.
      $this->parse_linebreak();
    } while ((++$this->pos < $this->size) && isset($this->whitespace[$this->doc[$this->pos]]));

    // Step back onto the last character that belongs to the token.
    --$this->pos;
    return self::TOK_WHITESPACE;
  }

  /**
   * Parse identifiers
   * @return int Token
   * @internal Gets called with {@link $identifiers} characters
   */
  protected function parse_identifier() {
    $this->token_start = $this->pos;

    do {
      ++$this->pos;
    } while (($this->pos < $this->size) && isset($this->identifiers[$this->doc[$this->pos]]));

    // Step back onto the last character that belongs to the token.
    --$this->pos;
    return self::TOK_IDENTIFIER;
  }

  /**
   * Continues to the next token
   * @return int Next token ({@link TOK_NULL} if none)
   */
  public function next() {
    $this->token_start = null;

    if (++$this->pos < $this->size) {
      return $this->resolveCurrentChar();
    }

    return ($this->token = self::TOK_NULL);
  }

  /**
   * Finds the next token, but skips whitespace
   * @return int Next token ({@link TOK_NULL} if none)
   */
  public function next_no_whitespace() {
    $this->token_start = null;

    while (++$this->pos < $this->size) {
      if (!isset($this->whitespace[$this->doc[$this->pos]])) {
        return $this->resolveCurrentChar();
      }
      // Skipped whitespace still has to update the line counter.
      $this->parse_linebreak();
    }

    return ($this->token = self::TOK_NULL);
  }

  /**
   * Finds the next token using stop characters.
   *
   * Used like: next_search('abc') or next_search(array('a' => true, 'b' => true, 'c' => true));
   * An array is used as a lookup table as-is, so the characters must be its keys.
   * @param string|array $characters Characters to search for
   * @param bool $callback Should the function check the charmap after finding a character?
   * @return int Next token ({@link TOK_NULL} if none)
   */
  public function next_search($characters, $callback = true) {
    $this->token_start = $this->pos;
    if (!is_array($characters)) {
      $characters = array_fill_keys(str_split($characters), true);
    }

    while (++$this->pos < $this->size) {
      if (isset($characters[$this->doc[$this->pos]])) {
        if ($callback) {
          return $this->resolveCurrentChar();
        }
        return ($this->token = self::TOK_UNKNOWN);
      }
      $this->parse_linebreak();
    }

    return ($this->token = self::TOK_NULL);
  }

  /**
   * Finds the next token by searching for a string (case-insensitive)
   *
   * On a match the position is moved to the first character of the needle.
   * If the needle is not found (or is empty) the position is moved to the end
   * of the document.
   * @param string $needle The needle that's being searched for
   * @param bool $callback Should the function check the charmap after finding the needle?
   * @return int Next token ({@link TOK_NULL} if none)
   */
  public function next_pos($needle, $callback = true) {
    $this->token_start = $this->pos;

    $found = false;
    // An empty needle can never be located; on PHP 8 stripos() would report a
    // match at the offset, possibly past the last character.
    if (($needle !== '') && ($this->pos < $this->size)) {
      $found = stripos($this->doc, $needle, $this->pos + 1);
    }

    if ($found === false) {
      $this->pos = $this->size;
      return ($this->token = self::TOK_NULL);
    }

    // Account for the line breaks in the text that is jumped over.
    $gap = $found - $this->pos - 1;
    if ($gap > 0) {
      $skipped = substr($this->doc, $this->pos + 1, $gap);
      $last_break = strrpos($skipped, "\n");
      if ($last_break !== false) {
        $this->line_pos[0] += substr_count($skipped, "\n");
        $this->line_pos[1] = $this->pos + 1 + $last_break;
      }
    }

    $this->pos = $found;
    if ($callback) {
      return $this->resolveCurrentChar();
    }
    return ($this->token = self::TOK_UNKNOWN);
  }

  /**
   * Expect a specific token or character. Adds error if token doesn't match.
   *
   * Reaching the end of the document while looking for a character counts as
   * a mismatch.
   * @param string|int $token Character or token to expect
   * @param bool|int $do_next Go to next character before evaluating. 1 for next char, true to ignore whitespace
   * @param bool|int $try_next Try next character if current doesn't match. 1 for next char, true to ignore whitespace
   * @param bool|int $next_on_match Go to next character after evaluating. 1 for next char, true to ignore whitespace
   * @return bool
   */
  protected function expect($token, $do_next = true, $try_next = false, $next_on_match = 1) {
    if ($do_next) {
      $this->step($do_next);
    }

    $matched = $this->matchesExpected($token);
    if (!$matched && $try_next) {
      $this->step($try_next);
      $matched = $this->matchesExpected($token);
    }

    if (!$matched) {
      if (is_int($token)) {
        $this->addError('Unexpected "'.$this->getTokenString().'"');
      } else {
        $this->addError('Expected "'.$token.'", but found "'.$this->getTokenString().'"');
      }
      return false;
    }

    if ($next_on_match) {
      $this->step($next_on_match);
    }
    return true;
  }

  /**
   * Turns the token for the character at the current position into the current token.
   *
   * A string entry in {@link $char_map} is called as a method of this object,
   * any other entry is used as the token itself; unmapped characters yield
   * {@link TOK_UNKNOWN}. The position must be inside the document.
   * @return int Token
   */
  private function resolveCurrentChar() {
    $char = $this->doc[$this->pos];
    if (!isset($this->char_map[$char])) {
      return ($this->token = self::TOK_UNKNOWN);
    }

    $target = $this->char_map[$char];
    if (is_string($target)) {
      return ($this->token = $this->{$target}());
    }
    return ($this->token = $target);
  }

  /**
   * Advances one token: 1 moves to the very next token, any other value skips whitespace.
   * @param bool|int $mode
   * @return int Token
   */
  private function step($mode) {
    if ($mode === 1) {
      return $this->next();
    }
    return $this->next_no_whitespace();
  }

  /**
   * Checks the current state against an expectation of {@link expect()}.
   * @param string|int $token Token id (int) or literal character (string)
   * @return bool
   */
  private function matchesExpected($token) {
    if (is_int($token)) {
      return ($this->token === $token);
    }

    // Outside the document there is no character that could match.
    if (($this->pos < 0) || ($this->pos >= $this->size)) {
      return false;
    }
    return ($this->doc[$this->pos] === $token);
  }

  /**
   * Converts a character list into a lookup array (character => true).
   *
   * Accepts a string ('ab'), a list (array('a', 'b')), a map
   * (array('a' => true)) or a mix of the last two.
   * @param string|array $chars
   * @return array
   */
  private static function toCharSet($chars) {
    if (!is_array($chars)) {
      $chars = (string) $chars;
      $chars = ($chars === '') ? array() : str_split($chars);
    }

    $set = array();
    foreach ($chars as $key => $value) {
      if ($value === true) {
        // Map form: the character is the key.
        $set[(string) $key] = true;
      } else {
        // List form: the character is the value.
        $set[(string) $value] = true;
      }
    }
    return $set;
  }
}
568