AbstractHtmlProcessor.php — WooCommerce 6.9.0-beta.1

woocommerce / vendor / pelago / emogrifier / src / HtmlProcessor / AbstractHtmlProcessor.php

woocommerce / vendor / pelago / emogrifier / src / HtmlProcessor Last commit date

AbstractHtmlProcessor.php 4 years ago CssToAttributeConverter.php 4 years ago HtmlNormalizer.php 4 years ago HtmlPruner.php 4 years ago

AbstractHtmlProcessor.php

473 lines

1	<?php
2
3	declare(strict_types=1);
4
5	namespace Pelago\Emogrifier\HtmlProcessor;
6
/**
 * Base class for HTML processors that e.g., can remove, add or modify nodes or attributes.
 *
 * The "vanilla" subclass is the HtmlNormalizer.
 *
 * Instances are created via the named constructors `fromHtml()` and `fromDomDocument()`; the actual constructor
 * is private.
 *
 * @psalm-consistent-constructor
 */
abstract class AbstractHtmlProcessor
{
    /**
     * @var string document type that gets prepended to HTML that does not contain any `<!DOCTYPE`
     */
    protected const DEFAULT_DOCUMENT_TYPE = '<!DOCTYPE html>';

    /**
     * @var string meta tag that gets inserted if the HTML has no `Content-Type` meta tag within its `<head>`
     */
    protected const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';

    /**
     * @var string Regular expression part to match tag names that PHP's DOMDocument implementation is not aware are
     *      self-closing. These are mostly HTML5 elements, but for completeness <command> (obsolete) and <keygen>
     *      (deprecated) are also included.
     *
     * @see https://bugs.php.net/bug.php?id=73175
     */
    protected const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command|embed|keygen|source|track|wbr)';

    /**
     * Regular expression part to match tag names that may appear before the start of the `<body>` element. A start tag
     * for any other element would implicitly start the `<body>` element due to tag omission rules.
     *
     * @var string
     */
    protected const TAGNAME_ALLOWED_BEFORE_BODY_MATCHER
        = '(?:html|head|base|command|link|meta|noscript|script|style|template|title)';

    /**
     * Regular expression pattern to match an HTML comment, including delimiters and modifiers.
     *
     * The comment body may be empty and may contain dashes; an unterminated comment matches up to the end of the
     * string.
     *
     * @var string
     */
    protected const HTML_COMMENT_PATTERN = '/<!--[^-]*+(?:-(?!->)[^-]*+)*+(?:-->|$)/';

    /**
     * Regular expression pattern to match an HTML `<template>` element, including delimiters and modifiers.
     *
     * The element content may be empty; an element without an end tag matches up to the end of the string.
     *
     * @var string
     */
    protected const HTML_TEMPLATE_ELEMENT_PATTERN
        = '%<template[\\s>][^<]*+(?:<(?!/template>)[^<]*+)*+(?:</template>|$)%i';

    /**
     * @var ?\DOMDocument the document being processed, or null as long as none has been set
     */
    protected $domDocument = null;

    /**
     * @var ?\DOMXPath XPath helper bound to `$domDocument`, created together with it in `setDomDocument()`
     */
    private $xPath = null;

    /**
     * The constructor.
     *
     * Please use `::fromHtml` or `::fromDomDocument` instead.
     */
    private function __construct()
    {
    }

    /**
     * Builds a new instance from the given HTML.
     *
     * @param string $unprocessedHtml raw HTML, must be UTF-8-encoded, must not be empty
     *
     * @return static
     *
     * @throws \InvalidArgumentException if $unprocessedHtml is an empty string
     */
    public static function fromHtml(string $unprocessedHtml): self
    {
        if ($unprocessedHtml === '') {
            throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647);
        }

        $instance = new static();
        $instance->setHtml($unprocessedHtml);

        return $instance;
    }

    /**
     * Builds a new instance from the given DOM document.
     *
     * The document is used as-is (it is not copied), so changes made by this instance are visible through the
     * passed object.
     *
     * @param \DOMDocument $document a DOM document returned by getDomDocument() of another instance
     *
     * @return static
     */
    public static function fromDomDocument(\DOMDocument $document): self
    {
        $instance = new static();
        $instance->setDomDocument($document);

        return $instance;
    }

    /**
     * Sets the HTML to process.
     *
     * @param string $html the HTML to process, must be UTF-8-encoded
     */
    private function setHtml(string $html): void
    {
        $this->createUnifiedDomDocument($html);
    }

    /**
     * Provides access to the internal DOMDocument representation of the HTML in its current state.
     *
     * @return \DOMDocument
     *
     * @throws \UnexpectedValueException if no DOM document has been set yet
     */
    public function getDomDocument(): \DOMDocument
    {
        if (!$this->domDocument instanceof \DOMDocument) {
            $message = self::class . '::setDomDocument() has not yet been called on ' . static::class;
            throw new \UnexpectedValueException($message, 1570472239);
        }

        return $this->domDocument;
    }

    /**
     * Stores the given DOM document and creates the matching XPath helper for it.
     *
     * @param \DOMDocument $domDocument
     */
    private function setDomDocument(\DOMDocument $domDocument): void
    {
        $this->domDocument = $domDocument;
        $this->xPath = new \DOMXPath($this->domDocument);
    }

    /**
     * Provides the XPath helper for the current DOM document.
     *
     * @return \DOMXPath
     *
     * @throws \UnexpectedValueException if no DOM document (and hence no XPath helper) has been set yet
     */
    protected function getXPath(): \DOMXPath
    {
        if (!$this->xPath instanceof \DOMXPath) {
            $message = self::class . '::setDomDocument() has not yet been called on ' . static::class;
            throw new \UnexpectedValueException($message, 1617819086);
        }

        return $this->xPath;
    }

    /**
     * Renders the normalized and processed HTML.
     *
     * @return string
     */
    public function render(): string
    {
        $htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML();

        return $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);
    }

    /**
     * Renders the content of the BODY element of the normalized and processed HTML.
     *
     * @return string
     */
    public function renderBodyContent(): string
    {
        $htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML($this->getBodyElement());
        $bodyNodeHtml = $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags);

        // Strip the `<body ...>` start tag and the `</body>` end tag so that only the content remains.
        return \preg_replace('%</?+body(?:\\s[^>]*+)?+>%', '', $bodyNodeHtml);
    }

    /**
     * Eliminates any invalid closing tags for void elements from the given HTML.
     *
     * Only the tag names listed in `PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER` are affected.
     *
     * @param string $html
     *
     * @return string
     */
    private function removeSelfClosingTagsClosingTags(string $html): string
    {
        return \preg_replace('%</' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%', '', $html);
    }

    /**
     * Returns the BODY element.
     *
     * This method assumes that there always is a BODY element.
     *
     * @return \DOMElement
     *
     * @throws \RuntimeException if the document has no BODY element
     */
    private function getBodyElement(): \DOMElement
    {
        $node = $this->getDomDocument()->getElementsByTagName('body')->item(0);
        if (!$node instanceof \DOMElement) {
            throw new \RuntimeException('There is no body element.', 1617922607);
        }

        return $node;
    }

    /**
     * Creates a DOM document from the given HTML and stores it in $this->domDocument.
     *
     * The DOM document will always have a BODY element and a document type.
     *
     * @param string $html
     */
    private function createUnifiedDomDocument(string $html): void
    {
        $this->createRawDomDocument($html);
        $this->ensureExistenceOfBodyElement();
    }

    /**
     * Creates a DOMDocument instance from the given HTML and stores it in $this->domDocument.
     *
     * @param string $html
     */
    private function createRawDomDocument(string $html): void
    {
        $domDocument = new \DOMDocument();
        $domDocument->strictErrorChecking = false;
        $domDocument->formatOutput = true;

        // Collect libxml parse errors internally while loading, discard them afterwards, and then restore
        // whatever error handling mode was active before.
        $libXmlState = \libxml_use_internal_errors(true);
        $domDocument->loadHTML($this->prepareHtmlForDomConversion($html));
        \libxml_clear_errors();
        \libxml_use_internal_errors($libXmlState);

        $this->setDomDocument($domDocument);
    }

    /**
     * Returns the HTML with added document type, Content-Type meta tag, and self-closing slashes, if needed,
     * ensuring that the HTML will be good for creating a DOM document from it.
     *
     * @param string $html
     *
     * @return string the unified HTML
     */
    private function prepareHtmlForDomConversion(string $html): string
    {
        $htmlWithSelfClosingSlashes = $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html);
        $htmlWithDocumentType = $this->ensureDocumentType($htmlWithSelfClosingSlashes);

        return $this->addContentTypeMetaTag($htmlWithDocumentType);
    }

    /**
     * Makes sure that the passed HTML has a document type, with lowercase "html".
     *
     * If the HTML contains `<!DOCTYPE` anywhere (case-insensitively), the existing document type is normalized;
     * otherwise the default document type is prepended.
     *
     * @param string $html
     *
     * @return string HTML with document type
     */
    private function ensureDocumentType(string $html): string
    {
        $hasDocumentType = \stripos($html, '<!DOCTYPE') !== false;
        if ($hasDocumentType) {
            return $this->normalizeDocumentType($html);
        }

        return self::DEFAULT_DOCUMENT_TYPE . $html;
    }

    /**
     * Makes sure the document type in the passed HTML has lowercase "html".
     *
     * @param string $html
     *
     * @return string HTML with normalized document type
     */
    private function normalizeDocumentType(string $html): string
    {
        // Limit to replacing the first occurrence: as an optimization; and in case an example exists as unescaped text.
        return \preg_replace(
            '/<!DOCTYPE\\s++html(?=[\\s>])/i',
            '<!DOCTYPE html',
            $html,
            1
        );
    }

    /**
     * Adds a Content-Type meta tag for the charset.
     *
     * This method also ensures that there is a HEAD element if there is an HTML element. If there is neither a
     * HEAD nor an HTML start tag, the meta tag is simply prepended to the HTML.
     *
     * @param string $html
     *
     * @return string the HTML with the meta tag added
     */
    private function addContentTypeMetaTag(string $html): string
    {
        if ($this->hasContentTypeMetaTagInHead($html)) {
            return $html;
        }

        // We are trying to insert the meta tag to the right spot in the DOM.
        // If we just prepended it to the HTML, we would lose attributes set to the HTML tag.
        $hasHeadTag = \preg_match('/<head[\\s>]/i', $html);
        $hasHtmlTag = \stripos($html, '<html') !== false;

        if ($hasHeadTag) {
            // Insert directly after the `<head ...>` start tag, keeping its attributes.
            $reworkedHtml = \preg_replace(
                '/<head(?=[\\s>])([^>]*+)>/i',
                '<head$1>' . self::CONTENT_TYPE_META_TAG,
                $html
            );
        } elseif ($hasHtmlTag) {
            // No HEAD element: create one directly after the `<html ...>` start tag, keeping its attributes.
            $reworkedHtml = \preg_replace(
                '/<html(.*?)>/is',
                '<html$1><head>' . self::CONTENT_TYPE_META_TAG . '</head>',
                $html
            );
        } else {
            $reworkedHtml = self::CONTENT_TYPE_META_TAG . $html;
        }

        return $reworkedHtml;
    }

    /**
     * Tests whether the given HTML has a valid `Content-Type` metadata element within the `<head>` element. Due to tag
     * omission rules, HTML parsers are expected to end the `<head>` element and start the `<body>` element upon
     * encountering a start tag for any element which is permitted only within the `<body>`.
     *
     * @param string $html
     *
     * @return bool
     */
    private function hasContentTypeMetaTagInHead(string $html): bool
    {
        // The match itself is everything that precedes the first `Content-Type` meta tag (the tag is only looked
        // ahead at), so that this preceding part can be checked for the end of the `<head>` element.
        \preg_match('%^.*?(?=<meta(?=\\s)[^>]*\\shttp-equiv=(["\']?+)Content-Type\\g{-1}[\\s/>])%is', $html, $matches);
        if (isset($matches[0])) {
            $htmlBefore = $matches[0];
            try {
                $hasContentTypeMetaTagInHead = !$this->hasEndOfHeadElement($htmlBefore);
            } catch (\RuntimeException $exception) {
                // If something unexpected occurs, assume the `Content-Type` that was found is valid.
                \trigger_error($exception->getMessage());
                $hasContentTypeMetaTagInHead = true;
            }
        } else {
            $hasContentTypeMetaTagInHead = false;
        }

        return $hasContentTypeMetaTagInHead;
    }

    /**
     * Tests whether the `<head>` element ends within the given HTML. Due to tag omission rules, HTML parsers are
     * expected to end the `<head>` element and start the `<body>` element upon encountering a start tag for any element
     * which is permitted only within the `<body>`.
     *
     * @param string $html
     *
     * @return bool
     *
     * @throws \RuntimeException if removing comments or `<template>` elements fails
     */
    private function hasEndOfHeadElement(string $html): bool
    {
        // Matches either a start tag of an element not allowed before the `<body>`, or an explicit `</head>`.
        $headEndTagMatchCount
            = \preg_match('%<(?!' . self::TAGNAME_ALLOWED_BEFORE_BODY_MATCHER . '[\\s/>])\\w|</head>%i', $html);
        if (\is_int($headEndTagMatchCount) && $headEndTagMatchCount > 0) {
            // An exception to the implicit end of the `<head>` is any content within a `<template>` element, as well in
            // comments. As an optimization, this is only checked for if a potential `<head>` end tag is found.
            $htmlWithoutCommentsOrTemplates = $this->removeHtmlTemplateElements($this->removeHtmlComments($html));
            // If nothing was removed, the match found above is real. Otherwise, check again on the stripped HTML.
            $hasEndOfHeadElement = $htmlWithoutCommentsOrTemplates === $html
                || $this->hasEndOfHeadElement($htmlWithoutCommentsOrTemplates);
        } else {
            $hasEndOfHeadElement = false;
        }

        return $hasEndOfHeadElement;
    }

    /**
     * Removes comments from the given HTML, including any which are unterminated, for which the remainder of the string
     * is removed.
     *
     * @param string $html
     *
     * @return string
     *
     * @throws \RuntimeException if the regular expression replacement fails
     */
    private function removeHtmlComments(string $html): string
    {
        $result = \preg_replace(self::HTML_COMMENT_PATTERN, '', $html);
        if (!\is_string($result)) {
            throw new \RuntimeException('Internal PCRE error', 1616521475);
        }

        return $result;
    }

    /**
     * Removes `<template>` elements from the given HTML, including any without an end tag, for which the remainder of
     * the string is removed.
     *
     * @param string $html
     *
     * @return string
     *
     * @throws \RuntimeException if the regular expression replacement fails
     */
    private function removeHtmlTemplateElements(string $html): string
    {
        $result = \preg_replace(self::HTML_TEMPLATE_ELEMENT_PATTERN, '', $html);
        if (!\is_string($result)) {
            throw new \RuntimeException('Internal PCRE error', 1616519652);
        }

        return $result;
    }

    /**
     * Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
     * self-closing slash.
     *
     * Tags that already end with a slash are left unchanged.
     *
     * @param string $html
     *
     * @return string HTML with problematic tags converted.
     */
    private function ensurePhpUnrecognizedSelfClosingTagsAreXml(string $html): string
    {
        // The match ends right before the closing `>` (which is only looked ahead at), so appending `/` to the
        // match places the slash directly in front of the `>`.
        return \preg_replace(
            '%<' . self::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '\\b[^>]*+(?<!/)(?=>)%',
            '$0/',
            $html
        );
    }

    /**
     * Checks that $this->domDocument has a BODY element and adds it if it is missing.
     *
     * @throws \UnexpectedValueException if the document has no HTML element to which a BODY could be appended
     */
    private function ensureExistenceOfBodyElement(): void
    {
        if ($this->getDomDocument()->getElementsByTagName('body')->item(0) instanceof \DOMElement) {
            return;
        }

        $htmlElement = $this->getDomDocument()->getElementsByTagName('html')->item(0);
        if (!$htmlElement instanceof \DOMElement) {
            throw new \UnexpectedValueException('There is no HTML element although there should be one.', 1569930853);
        }
        $htmlElement->appendChild($this->getDomDocument()->createElement('body'));
    }
}
473