-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMakeHTML5Entities.pl
executable file
·74 lines (65 loc) · 1.96 KB
/
MakeHTML5Entities.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/perl
#
# Create a name2codepoint dict for python using the *complete* list of
# HTML5 entities downloaded from the w3 org website.
#
use strict;
use warnings;
use LWP::Simple qw( get );
# Get the web page...
#
# LWP::Simple's get() returns undef when the download fails.  Treat that
# as a hard error: a silent exit with status 0 would leave a caller that
# redirects STDOUT with an empty output file and no hint of what went wrong.
#
my $resp = get('https://dev.w3.org/html5/html-author/charref');
die "Failed to download https://dev.w3.org/html5/html-author/charref\n"
    unless defined $resp;
# We get lines like this (wrapped here):
# <tr title="U+00009 CHARACTER TABULATION" data-block="C0 Controls and Basic Latin"
# data-category="Cc" data-set="mmlextra"><td class="character"> 	
# <td class="named"><code>&amp;Tab;</code><td class="hex"><code>&amp;#x00009;
# </code><td class="dec"><code>&amp;#9;</code><td class="desc">CHARACTER TABULATION
#
# The class="named" section may contain multiple, space-separated entity names.
# Creating a mapping of all entities for each hexcode
#
my %names4;
foreach my $ln (split "\n", $resp) {
    next unless $ln =~ /^<tr/;    # Skip if not a table row

    # First we have to translate &amp; to &, so that the entity columns
    # read "&Tab;" / "&#x00009;" and the pattern below can match them.
    #
    $ln =~ s/&amp;/&/g;

    # Now extract the name(s) and hex code.  The hex capture keeps its
    # leading "x" (e.g. "x00009"); only the first "&#...;" of the hex
    # column is captured.
    #
    my ($nstr, $hexc) =
        ($ln =~ m{"named"><code>(.*?)</code>.*?"hex"><code>&#(.*?);});

    # A table row without the expected named/hex cells yields no captures;
    # skip it rather than storing an undef key/value.
    #
    next unless defined $nstr && defined $hexc;

    $names4{$hexc} = $nstr;
}
# Emit the fixed preamble of the generated Python module: a short
# provenance comment followed by the opening of the name2codepoint dict.
# The heredoc is single-quoted, so nothing inside it is interpolated.
#
my $py_header = <<'PY_HEADER';
# This is the full HTML5 Entity list from:
# https://dev.w3.org/html5/html-author/charref
# This IS case sensitive!!
# The name2codepoint in htmlentitydefs is incomplete.
#
name2codepoint = {
PY_HEADER
print $py_header;
# Now print out the name for the hex-code in ascending order
# We order them by hex code and handle any multiple entity names
# by writing multiple lines, one for each name.
#
# The keys look like "x00009".  They are compared by numeric value (hex()
# accepts the leading "x") so the order stays ascending even if the codes
# are not all zero-padded to the same width or differ in letter case; the
# string comparison is only a tie-breaker to keep the output deterministic.
#
my $nents = 0;     # number of entity names written
my $nchars = 0;    # number of distinct hex codes seen
foreach my $hx (sort { hex($a) <=> hex($b) or $a cmp $b } keys %names4) {
    $nchars++;
    foreach my $nm (split " ", $names4{$hx}) {
        $nents++;

        # Strip the leading "&" and the trailing ";" from "&name;"
        my $ent = substr($nm, 1, -1);

        # "0" . "x00009" gives the Python hex literal 0x00009
        printf " %-34s : 0%s,\n", "'$ent'", $hx;
    }
}
# Close the Python dict literal that the file header opened.
#
print "}\n";
# Report the totals on STDERR so they never end up in the generated
# Python written to STDOUT.
#
print {*STDERR} "Found $nents entries for $nchars chars\n";