-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclass-wp-html-tag-processor.php
3672 lines (3322 loc) · 119 KB
/
class-wp-html-tag-processor.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<?php
/**
* HTML API: WP_HTML_Tag_Processor class
*
* Scans through an HTML document to find specific tags, then
* transforms those tags by adding, removing, or updating the
* values of the HTML attributes within that tag (opener).
*
* Does not fully parse HTML or _recurse_ into the HTML structure.
* Instead this scans linearly through a document and only parses
* the HTML tag openers.
*
* ### Possible future direction for this module
*
* - Prune the whitespace when removing classes/attributes: e.g. "a b c" -> "c" not " c".
* This would increase the size of the changes for some operations but leave more
* natural-looking output HTML.
*
* @package WordPress
* @subpackage HTML-API
* @since 6.2.0
*/
/**
* Core class used to modify attributes in an HTML document for tags matching a query.
*
* ## Usage
*
* Use of this class requires three steps:
*
* 1. Create a new class instance with your input HTML document.
* 2. Find the tag(s) you are looking for.
* 3. Request changes to the attributes in those tag(s).
*
* Example:
*
* $tags = new WP_HTML_Tag_Processor( $html );
* if ( $tags->next_tag( 'option' ) ) {
* $tags->set_attribute( 'selected', true );
* }
*
* ### Finding tags
*
* The `next_tag()` function moves the internal cursor through
* your input HTML document until it finds a tag meeting any of
* the supplied restrictions in the optional query argument. If
* no argument is provided then it will find the next HTML tag,
* regardless of what kind it is.
*
* If you want to _find whatever the next tag is_:
*
* $tags->next_tag();
*
* | Goal | Query |
* |-----------------------------------------------------------|---------------------------------------------------------------------------------|
* | Find any tag. | `$tags->next_tag();` |
* | Find next image tag. | `$tags->next_tag( array( 'tag_name' => 'img' ) );` |
* | Find next image tag (without passing the array). | `$tags->next_tag( 'img' );` |
* | Find next tag containing the `fullwidth` CSS class. | `$tags->next_tag( array( 'class_name' => 'fullwidth' ) );` |
* | Find next image tag containing the `fullwidth` CSS class. | `$tags->next_tag( array( 'tag_name' => 'img', 'class_name' => 'fullwidth' ) );` |
*
* If a tag was found meeting your criteria then `next_tag()`
* will return `true` and you can proceed to modify it. If it
* returns `false`, however, it failed to find the tag and
* moved the cursor to the end of the file.
*
* Once the cursor reaches the end of the file the processor
* is done and if you want to reach an earlier tag you will
* need to recreate the processor and start over, as it's
* unable to back up or move in reverse.
*
* See the section on bookmarks for an exception to this
* no-backing-up rule.
*
* #### Custom queries
*
* Sometimes it's necessary to inspect an HTML tag more closely than
* the query syntax here permits. In these cases one may further
* inspect the search results using the read-only functions
* provided by the processor or external state or variables.
*
* Example:
*
* // Paint up to the first five DIV or SPAN tags marked with the "jazzy" style.
* $remaining_count = 5;
* while ( $remaining_count > 0 && $tags->next_tag() ) {
* if (
* ( 'DIV' === $tags->get_tag() || 'SPAN' === $tags->get_tag() ) &&
* 'jazzy' === $tags->get_attribute( 'data-style' )
* ) {
* $tags->add_class( 'theme-style-everest-jazz' );
* $remaining_count--;
* }
* }
*
* `get_attribute()` will return `null` if the attribute wasn't present
* on the tag when it was called. It may return `""` (the empty string)
* in cases where the attribute was present but its value was empty.
* For boolean attributes, those whose name is present but no value is
* given, it will return `true` (the only way to set `false` for an
* attribute is to remove it).
*
* #### When matching fails
*
* When `next_tag()` returns `false` it could mean different things:
*
* - The requested tag wasn't found in the input document.
* - The input document ended in the middle of an HTML syntax element.
*
* When a document ends in the middle of a syntax element it will pause
* the processor. This is to make it possible in the future to extend the
* input document and proceed - an important requirement for chunked
* streaming parsing of a document.
*
* Example:
*
* $processor = new WP_HTML_Tag_Processor( 'This <div is="a" partial="token' );
* false === $processor->next_tag();
*
* If a special element (see next section) is encountered but no closing tag
* is found it will count as an incomplete tag. The parser will pause as if
* the opening tag were incomplete.
*
* Example:
*
* $processor = new WP_HTML_Tag_Processor( '<style>// there could be more styling to come' );
* false === $processor->next_tag();
*
* $processor = new WP_HTML_Tag_Processor( '<style>// this is everything</style><div>' );
* true === $processor->next_tag( 'DIV' );
*
* #### Special elements
*
* Some HTML elements are handled in a special way; their start and end tags
* act like a void tag. These are special because their contents can't contain
* HTML markup. Everything inside these elements is handled in a special way
* and content that _appears_ like HTML tags inside of them isn't. There can
* be no nesting in these elements.
*
* In the following list, "raw text" means that all of the content in the HTML
* until the matching closing tag is treated verbatim without any replacements
* and without any parsing.
*
* - IFRAME allows no content but requires a closing tag.
* - NOEMBED (deprecated) content is raw text.
* - NOFRAMES (deprecated) content is raw text.
* - SCRIPT content is plaintext apart from legacy rules allowing `</script>` inside an HTML comment.
* - STYLE content is raw text.
* - TITLE content is plain text but character references are decoded.
* - TEXTAREA content is plain text but character references are decoded.
* - XMP (deprecated) content is raw text.
*
* ### Modifying HTML attributes for a found tag
*
* Once you've found the start of an opening tag you can modify
* any number of the attributes on that tag. You can set a new
* value for an attribute, remove the entire attribute, or do
* nothing and move on to the next opening tag.
*
* Example:
*
* if ( $tags->next_tag( array( 'class_name' => 'wp-group-block' ) ) ) {
* $tags->set_attribute( 'title', 'This groups the contained content.' );
* $tags->remove_attribute( 'data-test-id' );
* }
*
* If `set_attribute()` is called for an existing attribute it will
* overwrite the existing value. Similarly, calling `remove_attribute()`
* for a non-existing attribute has no effect on the document. Both
* of these methods are safe to call without knowing if a given attribute
* exists beforehand.
*
* ### Modifying CSS classes for a found tag
*
* The tag processor treats the `class` attribute as a special case.
* Because it's a common operation to add or remove CSS classes, this
* interface adds helper methods to make that easier.
*
* As with attribute values, adding or removing CSS classes is a safe
* operation that doesn't require checking if the attribute or class
* exists before making changes. If removing the only class then the
* entire `class` attribute will be removed.
*
* Example:
*
* // from `<span>Yippee!</span>`
* // to `<span class="is-active">Yippee!</span>`
* $tags->add_class( 'is-active' );
*
* // from `<span class="excited">Yippee!</span>`
* // to `<span class="excited is-active">Yippee!</span>`
* $tags->add_class( 'is-active' );
*
* // from `<span class="is-active heavy-accent">Yippee!</span>`
* // to `<span class="is-active heavy-accent">Yippee!</span>`
* $tags->add_class( 'is-active' );
*
* // from `<input type="text" class="is-active rugby not-disabled" length="24">`
* // to `<input type="text" class="is-active not-disabled" length="24">`
* $tags->remove_class( 'rugby' );
*
* // from `<input type="text" class="rugby" length="24">`
* // to `<input type="text" length="24">`
* $tags->remove_class( 'rugby' );
*
* // from `<input type="text" length="24">`
* // to `<input type="text" length="24">`
* $tags->remove_class( 'rugby' );
*
* When class changes are enqueued but a direct change to `class` is made via
* `set_attribute` then the changes to `set_attribute` (or `remove_attribute`)
* will take precedence over those made through `add_class` and `remove_class`.
*
* ### Bookmarks
*
* While scanning through the input HTML document it's possible to set
* a named bookmark when a particular tag is found. Later on, after
* continuing to scan other tags, it's possible to `seek` to one of
* the set bookmarks and then proceed again from that point forward.
*
* Because bookmarks create processing overhead one should avoid
* creating too many of them. As a rule, create only bookmarks
* of known string literal names; avoid creating "mark_{$index}"
* and so on. It's fine from a performance standpoint to create a
* bookmark and update it frequently, such as within a loop.
*
* $total_todos = 0;
* while ( $p->next_tag( array( 'tag_name' => 'UL', 'class_name' => 'todo' ) ) ) {
* $p->set_bookmark( 'list-start' );
* while ( $p->next_tag( array( 'tag_closers' => 'visit' ) ) ) {
* if ( 'UL' === $p->get_tag() && $p->is_tag_closer() ) {
* $p->set_bookmark( 'list-end' );
* $p->seek( 'list-start' );
* $p->set_attribute( 'data-contained-todos', (string) $total_todos );
* $total_todos = 0;
* $p->seek( 'list-end' );
* break;
* }
*
* if ( 'LI' === $p->get_tag() && ! $p->is_tag_closer() ) {
* $total_todos++;
* }
* }
* }
*
* ## Tokens and finer-grained processing.
*
* It's possible to scan through every lexical token in the
* HTML document using the `next_token()` function. This
* alternative form takes no argument and provides no built-in
* query syntax.
*
* Example:
*
* $title = '(untitled)';
* $text = '';
* while ( $processor->next_token() ) {
* switch ( $processor->get_token_name() ) {
* case '#text':
* $text .= $processor->get_modifiable_text();
* break;
*
* case 'BR':
* $text .= "\n";
* break;
*
* case 'TITLE':
* $title = $processor->get_modifiable_text();
* break;
* }
* }
* return trim( "# {$title}\n\n{$text}" );
*
* ### Tokens and _modifiable text_.
*
* #### Special "atomic" HTML elements.
*
* Not all HTML elements are able to contain other elements inside of them.
* For instance, the contents inside a TITLE element are plaintext (except
* that character references like &amp; will be decoded). This means that
* if the string `<img>` appears inside a TITLE element, then it's not an
* image tag, but rather it's text describing an image tag. Likewise, the
* contents of a SCRIPT or STYLE element are handled entirely separately in
* a browser than the contents of other elements because they represent a
* different language than HTML.
*
* For these elements the Tag Processor treats the entire sequence as one,
* from the opening tag, including its contents, through its closing tag.
* This means that it's not possible to match the closing tag for a
* SCRIPT element unless it's unexpected; the Tag Processor already matched
* it when it found the opening tag.
*
* The inner contents of these elements are that element's _modifiable text_.
*
* The special elements are:
* - `SCRIPT` whose contents are treated as raw plaintext but supports a legacy
* style of including JavaScript inside of HTML comments to avoid accidentally
* closing the SCRIPT from inside a JavaScript string. E.g. `console.log( '</script>' )`.
* - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any
* character references are decoded. E.g. `1 &lt; 2 &lt; 3` becomes `1 < 2 < 3`.
* - `IFRAME`, `NOSCRIPT`, `NOEMBED`, `NOFRAMES`, `STYLE` whose contents are treated as
* raw plaintext and left as-is. E.g. `1 &lt; 2 &lt; 3` remains `1 &lt; 2 &lt; 3`.
*
* #### Other tokens with modifiable text.
*
* There are also non-elements which are void/self-closing in nature and contain
* modifiable text that is part of that individual syntax token itself.
*
* - `#text` nodes, whose entire token _is_ the modifiable text.
* - HTML comments and tokens that become comments due to some syntax error. The
* text for these tokens is the portion of the comment inside of the syntax.
* E.g. for `<!-- comment -->` the text is `" comment "` (note the spaces are included).
* - `CDATA` sections, whose text is the content inside of the section itself. E.g. for
* `<![CDATA[some content]]>` the text is `"some content"` (with restrictions [1]).
* - "Funky comments," which are a special case of invalid closing tags whose name is
* invalid. The text for these nodes is the text that a browser would transform into
* an HTML comment when parsing. E.g. for `</%post_author>` the text is `%post_author`.
* - `DOCTYPE` declarations like `<!DOCTYPE html>` which have no closing tag.
* - XML Processing instruction nodes like `<?wp __( "Like" ); ?>` (with restrictions [2]).
* - The empty end tag `</>` which is ignored in the browser and DOM.
*
* [1]: There are no CDATA sections in HTML. When encountering `<![CDATA[`, everything
* until the next `>` becomes a bogus HTML comment, meaning there can be no CDATA
* section in an HTML document containing `>`. The Tag Processor will first find
* all valid and bogus HTML comments, and then if the comment _would_ have been a
* CDATA section _were they to exist_, it will indicate this as the type of comment.
*
* [2]: XML allows a broader range of characters in a processing instruction's target name
* and disallows "xml" as a name, since it's special. The Tag Processor only recognizes
* target names with an ASCII-representable subset of characters. It also exhibits the
* same constraint as with CDATA sections, in that `>` cannot exist within the token
* since Processing Instructions do not exist within HTML and their syntax transforms
* into a bogus comment in the DOM.
*
* ## Design and limitations
*
* The Tag Processor is designed to linearly scan HTML documents and tokenize
* HTML tags and their attributes. It's designed to do this as efficiently as
* possible without compromising parsing integrity. Therefore it will be
* slower than some methods of modifying HTML, such as those incorporating
* over-simplified PCRE patterns, but will not introduce the defects and
* failures that those methods bring in, which lead to broken page renders
* and often to security vulnerabilities. On the other hand, it will be faster
* than full-blown HTML parsers such as DOMDocument and use considerably
* less memory. It requires a negligible memory overhead, enough to consider
* it a zero-overhead system.
*
* The performance characteristics are maintained by avoiding tree construction
* and semantic cleanups which are specified in HTML5. Because of this, for
* example, it's not possible for the Tag Processor to associate any given
* opening tag with its corresponding closing tag, or to return the inner markup
* inside an element. Systems may be built on top of the Tag Processor to do
* this, but the Tag Processor is and should be constrained so it can remain an
* efficient, low-level, and reliable HTML scanner.
*
* The Tag Processor's design incorporates a "garbage-in-garbage-out" philosophy.
* HTML5 specifies that certain invalid content be transformed into different forms
* for display, such as removing null bytes from an input document and replacing
* invalid characters with the Unicode replacement character `U+FFFD` (visually "�").
* Where errors or transformations exist within the HTML5 specification, the Tag Processor
* leaves those invalid inputs untouched, passing them through to the final browser
* to handle. While this implies that certain operations will be non-spec-compliant,
* such as reading the value of an attribute with invalid content, it also preserves a
* simplicity and efficiency for handling those error cases.
*
* Most operations within the Tag Processor are designed to minimize the difference
* between an input and output document for any given change. For example, the
* `add_class` and `remove_class` methods preserve whitespace and the class ordering
* within the `class` attribute; and when encountering tags with duplicated attributes,
* the Tag Processor will leave those invalid duplicate attributes where they are but
* update the proper attribute which the browser will read for parsing its value. An
* exception to this rule is that all attribute updates store their values as
* double-quoted strings, meaning that attributes on input with single-quoted or
* unquoted values will appear in the output with double-quotes.
*
* ### Scripting Flag
*
* The Tag Processor parses HTML with the "scripting flag" disabled. This means
* that it doesn't run any scripts while parsing the page. In a browser with
* JavaScript enabled, for example, the script can change the parse of the
* document as it loads. On the server, however, evaluating JavaScript is not
* only impractical, but also unwanted.
*
* Practically this means that the Tag Processor will descend into NOSCRIPT
* elements and process its child tags. Were the scripting flag enabled, such
* as in a typical browser, the contents of NOSCRIPT are skipped entirely.
*
* This allows the HTML API to process the content that will be presented in
* a browser when scripting is disabled, but it offers a different view of a
* page than most browser sessions will experience. E.g. the tags inside the
* NOSCRIPT disappear.
*
* ### Text Encoding
*
* The Tag Processor assumes that the input HTML document is encoded with a
* text encoding compatible with 7-bit ASCII's '<', '>', '&', ';', '/', '=',
* "'", '"', 'a' - 'z', 'A' - 'Z', and the whitespace characters ' ', tab,
* carriage-return, newline, and form-feed.
*
* In practice, this includes almost every single-byte encoding as well as
* UTF-8. Notably, however, it does not include UTF-16. If providing input
* that's incompatible, then convert the encoding beforehand.
*
* @since 6.2.0
* @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive.
* @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE.
* @since 6.5.0 Pauses processor when input ends in an incomplete syntax token.
* Introduces "special" elements which act like void elements, e.g. TITLE, STYLE.
* Allows scanning through all tokens and processing modifiable text, where applicable.
*/
class WP_HTML_Tag_Processor {
/**
* The maximum number of bookmarks allowed to exist at
* any given time.
*
* @since 6.2.0
* @var int
*
* @see WP_HTML_Tag_Processor::set_bookmark()
*/
const MAX_BOOKMARKS = 10;
/**
* Maximum number of times seek() can be called.
* Prevents accidental infinite loops.
*
* @since 6.2.0
* @var int
*
* @see WP_HTML_Tag_Processor::seek()
*/
const MAX_SEEK_OPS = 1000;
/**
* The HTML document to parse.
*
* @since 6.2.0
* @var string
*/
protected $html;
/**
* The last query passed to next_tag().
*
* @since 6.2.0
* @var array|null
*/
private $last_query;
/**
* The tag name this processor currently scans for.
*
* @since 6.2.0
* @var string|null
*/
private $sought_tag_name;
/**
* The CSS class name this processor currently scans for.
*
* @since 6.2.0
* @var string|null
*/
private $sought_class_name;
/**
* The match offset this processor currently scans for.
*
* @since 6.2.0
* @var int|null
*/
private $sought_match_offset;
/**
* Whether to visit tag closers, e.g. </div>, when walking an input document.
*
* @since 6.2.0
* @var bool
*/
private $stop_on_tag_closers;
/**
* Specifies mode of operation of the parser at any given time.
*
* | State | Meaning |
* | ----------------|----------------------------------------------------------------------|
* | *Ready* | The parser is ready to run. |
* | *Complete* | There is nothing left to parse. |
* | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. |
* | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. |
* | *Text node* | Found a #text node; this is plaintext and modifiable. |
* | *CDATA node* | Found a CDATA section; this is modifiable. |
* | *Comment* | Found a comment or bogus comment; this is modifiable. |
* | *Presumptuous* | Found an empty tag closer: `</>`. |
* | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable. |
*
* @since 6.5.0
*
* @see WP_HTML_Tag_Processor::STATE_READY
* @see WP_HTML_Tag_Processor::STATE_COMPLETE
* @see WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT
* @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG
* @see WP_HTML_Tag_Processor::STATE_TEXT_NODE
* @see WP_HTML_Tag_Processor::STATE_CDATA_NODE
* @see WP_HTML_Tag_Processor::STATE_COMMENT
* @see WP_HTML_Tag_Processor::STATE_DOCTYPE
* @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG
* @see WP_HTML_Tag_Processor::STATE_FUNKY_COMMENT
*
* @var string
*/
protected $parser_state = self::STATE_READY;
/**
* What kind of syntax token became an HTML comment.
*
* Since there are many ways in which HTML syntax can create an HTML comment,
* this indicates which of those caused it. This allows the Tag Processor to
* represent more from the original input document than would appear in the DOM.
*
* @since 6.5.0
*
* @var string|null
*/
protected $comment_type = null;
/**
* How many bytes from the original HTML document have been read and parsed.
*
* This value points to the latest byte offset in the input document which
* has been already parsed. It is the internal cursor for the Tag Processor
* and updates while scanning through the HTML tokens.
*
* @since 6.2.0
* @var int
*/
private $bytes_already_parsed = 0;
/**
* Byte offset in input document where current token starts.
*
* Example:
*
* <div id="test">...
* 01234
* - token starts at 0
*
* @since 6.5.0
*
* @var int|null
*/
private $token_starts_at;
/**
* Byte length of current token.
*
* Example:
*
* <div id="test">...
* 012345678901234
* - token length is 14 - 0 = 14
*
* a <!-- comment --> is a token.
* 0123456789 123456789 123456789
* - token length is 17 - 2 = 15
*
* @since 6.5.0
*
* @var int|null
*/
private $token_length;
/**
* Byte offset in input document where current tag name starts.
*
* Example:
*
* <div id="test">...
* 01234
* - tag name starts at 1
*
* @since 6.2.0
*
* @var int|null
*/
private $tag_name_starts_at;
/**
* Byte length of current tag name.
*
* Example:
*
* <div id="test">...
* 01234
* --- tag name length is 3
*
* @since 6.2.0
*
* @var int|null
*/
private $tag_name_length;
/**
* Byte offset into input document where current modifiable text starts.
*
* @since 6.5.0
*
* @var int
*/
private $text_starts_at;
/**
* Byte length of modifiable text.
*
* @since 6.5.0
*
* @var int
*/
private $text_length;
/**
* Whether the current tag is an opening tag, e.g. <div>, or a closing tag, e.g. </div>.
*
* @var bool
*/
private $is_closing_tag;
/**
* Lazily-built index of attributes found within an HTML tag, keyed by the attribute name.
*
* Example:
*
* // Supposing the parser is working through this content
* // and stops after recognizing the `id` attribute.
* // <div id="test-4" class=outline title="data:text/plain;base64=asdk3nk1j3fo8">
* // ^ parsing will continue from this point.
* $this->attributes = array(
* 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false )
* );
*
* // When picking up parsing again, or when asking to find the
* // `class` attribute we will continue and add to this array.
* $this->attributes = array(
* 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ),
* 'class' => new WP_HTML_Attribute_Token( 'class', 23, 7, 17, 13, false )
* );
*
* // Note that only the `class` attribute value is stored in the index.
* // That's because it is the only value used by this class at the moment.
*
* @since 6.2.0
* @var WP_HTML_Attribute_Token[]
*/
private $attributes = array();
/**
* Tracks spans of duplicate attributes on a given tag, used for removing
* all copies of an attribute when calling `remove_attribute()`.
*
* @since 6.3.2
*
* @var (WP_HTML_Span[])[]|null
*/
private $duplicate_attributes = null;
/**
* Which class names to add or remove from a tag.
*
* These are tracked separately from attribute updates because they are
* semantically distinct, whereas this interface exists for the common
* case of adding and removing class names while other attributes are
* generally modified as with DOM `setAttribute` calls.
*
* When modifying an HTML document these will eventually be collapsed
* into a single `set_attribute( 'class', $changes )` call.
*
* Example:
*
* // Add the `wp-block-group` class, remove the `wp-group` class.
* $classname_updates = array(
* // Indexed by a comparable class name.
* 'wp-block-group' => WP_HTML_Tag_Processor::ADD_CLASS,
* 'wp-group' => WP_HTML_Tag_Processor::REMOVE_CLASS
* );
*
* @since 6.2.0
* @var bool[]
*/
private $classname_updates = array();
/**
* Tracks a semantic location in the original HTML which
* shifts with updates as they are applied to the document.
*
* @since 6.2.0
* @var WP_HTML_Span[]
*/
protected $bookmarks = array();
// Marker values stored in `$classname_updates`, which is keyed by class name.
// ADD_CLASS flags a class name that should be added to the tag.
const ADD_CLASS = true;
// REMOVE_CLASS flags a class name that should be removed from the tag.
const REMOVE_CLASS = false;
// NOTE(review): no use of SKIP_CLASS is visible in this part of the file;
// presumably it marks a class name to leave untouched. Confirm against its callers.
const SKIP_CLASS = null;
/**
* Lexical replacements to apply to input HTML document.
*
* "Lexical" in this class refers to the part of this class which
* operates on pure text _as text_ and not as HTML. There's a line
* between the public interface, with HTML-semantic methods like
* `set_attribute` and `add_class`, and an internal state that tracks
* text offsets in the input document.
*
* When higher-level HTML methods are called, those have to transform their
* operations (such as setting an attribute's value) into text diffing
* operations (such as replacing the sub-string from indices A to B with
* some given new string). These text-diffing operations are the lexical
* updates.
*
* As new higher-level methods are added they need to collapse their
* operations into these lower-level lexical updates since that's the
* Tag Processor's internal language of change. Any code which creates
* these lexical updates must ensure that they do not cross HTML syntax
* boundaries, however, so these should never be exposed outside of this
* class or any classes which intentionally expand its functionality.
*
* These are enqueued while editing the document instead of being immediately
* applied to avoid processing overhead, string allocations, and string
* copies when applying many updates to a single document.
*
* Example:
*
* // Replace an attribute stored with a new value, indices
* // sourced from the lazily-parsed HTML recognizer.
* $start = $attributes['src']->start;
* $length = $attributes['src']->length;
* $modifications[] = new WP_HTML_Text_Replacement( $start, $length, $new_value );
*
* // Correspondingly, something like this will appear in this array.
* $lexical_updates = array(
* WP_HTML_Text_Replacement( 14, 28, 'https://my-site.my-domain/wp-content/uploads/2014/08/kittens.jpg' )
* );
*
* @since 6.2.0
* @var WP_HTML_Text_Replacement[]
*/
protected $lexical_updates = array();
/**
* Tracks and limits `seek()` calls to prevent accidental infinite loops.
*
* @since 6.2.0
* @var int
*
* @see WP_HTML_Tag_Processor::seek()
*/
protected $seek_count = 0;
/**
* Constructor.
*
* @since 6.2.0
*
* @param string $html HTML to process.
*/
public function __construct( $html ) {
// Only stores the document; nothing is scanned or validated at construction time.
$this->html = $html;
}
/**
 * Advances the processor to the next tag satisfying the $query.
 *
 * Tokens are consumed one at a time. Only tokens which were matched as
 * tags are compared against the query; scanning stops as soon as the
 * requested number of matching tags has been seen, or when no further
 * token can be parsed.
 *
 * @since 6.2.0
 * @since 6.5.0 No longer processes incomplete tokens at end of document; pauses the processor at start of token.
 *
 * @param array|string|null $query {
 *     Optional. Which tag name to find, having which class, etc. Default is to find any tag.
 *
 *     @type string|null $tag_name     Which tag to find, or `null` for "any tag."
 *     @type int|null    $match_offset Find the Nth tag matching all search criteria.
 *                                     1 for "first" tag, 3 for "third," etc.
 *                                     Defaults to first tag.
 *     @type string|null $class_name   Tag must contain this whole class name to match.
 *     @type string|null $tag_closers  "visit" or "skip": whether to stop on tag closers, e.g. </div>.
 * }
 * @return bool Whether a tag was matched.
 */
public function next_tag( $query = null ) {
	$this->parse_query( $query );

	$match_count = 0;

	while ( true ) {
		// Running out of complete tokens means the sought tag doesn't exist.
		if ( false === $this->next_token() ) {
			return false;
		}

		// Non-tag tokens are never tested against the query.
		if ( self::STATE_MATCHED_TAG === $this->parser_state && $this->matches() ) {
			++$match_count;
		}

		// Stop once enough matching tags have been passed.
		if ( $match_count >= $this->sought_match_offset ) {
			return true;
		}
	}
}
/**
 * Advances to the next token in the HTML document.
 *
 * An HTML document is a stream of tokens: HTML tags, HTML comments,
 * text nodes, and so on. This method looks for the token following
 * the current position and reports whether one was found.
 *
 * When the end of the document is reached partway through a token,
 * the processor moves back to where that token started and pauses,
 * returning `false` to signal that no complete token was found.
 *
 * Possible token types, based on the HTML specification:
 *
 *  - an HTML tag, whether opening, closing, or void.
 *  - a text node - the plaintext inside tags.
 *  - an HTML comment.
 *  - a DOCTYPE declaration.
 *  - a processing instruction, e.g. `<?xml version="1.0" ?>`.
 *
 * The Tag Processor currently only supports the tag token.
 *
 * @since 6.5.0
 *
 * @return bool Whether a token was parsed.
 */
public function next_token() {
	// All of the work is delegated to the internal implementation.
	$found_token = $this->base_class_next_token();

	return $found_token;
}
/**
* Internal method which finds the next token in the HTML document.
*
* This method is a protected internal function which implements the logic for
* finding the next token in a document. It exists so that the parser can update
* its state without affecting the location of the cursor in the document and
* without triggering subclass methods for things like `next_token()`, e.g. when
* applying patches before searching for the next token.
*
* @since 6.5.0
*
* @access private
*
* @return bool Whether a token was parsed.
*/
private function base_class_next_token() {
$was_at = $this->bytes_already_parsed;
$this->after_tag();
// Don't proceed if there's nothing more to scan.
if (
self::STATE_COMPLETE === $this->parser_state ||
self::STATE_INCOMPLETE_INPUT === $this->parser_state
) {
return false;
}
/*
* The next step in the parsing loop determines the parsing state;
* clear it so that state doesn't linger from the previous step.
*/
$this->parser_state = self::STATE_READY;
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
$this->parser_state = self::STATE_COMPLETE;
return false;
}
// Find the next tag if it exists.
if ( false === $this->parse_next_tag() ) {
if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) {
$this->bytes_already_parsed = $was_at;
}
return false;
}
/*
* For legacy reasons the rest of this function handles tags and their
* attributes. If the processor has reached the end of the document
* or if it matched any other token then it should return here to avoid
* attempting to process tag-specific syntax.
*/
if (
self::STATE_INCOMPLETE_INPUT !== $this->parser_state &&
self::STATE_COMPLETE !== $this->parser_state &&
self::STATE_MATCHED_TAG !== $this->parser_state
) {
return true;
}
// Parse all of its attributes.
while ( $this->parse_next_attribute() ) {
continue;
}
// Ensure that the tag closes before the end of the document.
if (
self::STATE_INCOMPLETE_INPUT === $this->parser_state ||
$this->bytes_already_parsed >= strlen( $this->html )
) {
// Does this appropriately clear state (parsed attributes)?
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
$this->bytes_already_parsed = $was_at;
return false;
}
$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
if ( false === $tag_ends_at ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
$this->bytes_already_parsed = $was_at;
return false;
}
$this->parser_state = self::STATE_MATCHED_TAG;
$this->bytes_already_parsed = $tag_ends_at + 1;
$this->token_length = $this->bytes_already_parsed - $this->token_starts_at;
/*
* For non-DATA sections which might contain text that looks like HTML tags but
* isn't, scan with the appropriate alternative mode. Looking at the first letter
* of the tag name as a pre-check avoids a string allocation when it's not needed.
*/
$t = $this->html[ $this->tag_name_starts_at ];
if (
$this->is_closing_tag ||
! (
'i' === $t || 'I' === $t ||
'n' === $t || 'N' === $t ||
's' === $t || 'S' === $t ||
't' === $t || 'T' === $t ||
'x' === $t || 'X' === $t
)
) {
return true;
}
$tag_name = $this->get_tag();
/*
* Preserve the opening tag pointers, as these will be overwritten
* when finding the closing tag. They will be reset after finding
* the closing tag to point to the opening of the special atomic
* tag sequence.
*/
$tag_name_starts_at = $this->tag_name_starts_at;
$tag_name_length = $this->tag_name_length;
$tag_ends_at = $this->token_starts_at + $this->token_length;
$attributes = $this->attributes;
$duplicate_attributes = $this->duplicate_attributes;
// Find the closing tag if necessary.
$found_closer = false;
switch ( $tag_name ) {
case 'SCRIPT':
$found_closer = $this->skip_script_data();
break;
case 'TEXTAREA':
case 'TITLE':
$found_closer = $this->skip_rcdata( $tag_name );
break;
/*
* In the browser this list would include the NOSCRIPT element,
* but the Tag Processor is an environment with the scripting
* flag disabled, meaning that it needs to descend into the
* NOSCRIPT element to be able to properly process what will be
* sent to a browser.
*
* Note that this rule makes HTML5 syntax incompatible with XML,
* because the parsing of this token depends on client application.
* The NOSCRIPT element cannot be represented in the XHTML syntax.
*/
case 'IFRAME':
case 'NOEMBED':
case 'NOFRAMES':
case 'STYLE':
case 'XMP':
$found_closer = $this->skip_rawtext( $tag_name );
break;
// No other tags should be treated in their entirety here.
default:
return true;
}
if ( ! $found_closer ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
$this->bytes_already_parsed = $was_at;
return false;