From f02e86ff3f0b7ac006a04f9ab506608c4a7b0af4 Mon Sep 17 00:00:00 2001 From: RayPlante Date: Sun, 7 Jul 2024 16:01:04 -0400 Subject: [PATCH] jq/urlencode.jq: update for full UTF-8 support --- jq/urldecode.jq | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/jq/urldecode.jq b/jq/urldecode.jq index 6165248..f6c8e97 100644 --- a/jq/urldecode.jq +++ b/jq/urldecode.jq @@ -15,34 +15,51 @@ def until(condition; next): def u: if condition then . else (next|u) end; u; +# interpret the input string as a number written in the given base; digits are 0 to 9 followed by letters (either case) counting up from 10 +# +# Input: string +# Output: number +# +def to_i(base): + explode + | reverse + | map(if 65 <= . and . <= 90 then . + 32 else . end) # downcase + | map(if . > 96 then . - 87 else . - 48 end) # "a" ~ 97 => 10 ~ 87 + | reduce .[] as $c + # base: [power, ans] + ([1,0]; (.[0] * base) as $b | [$b, .[1] + (.[0] * $c)]) | .[1]; + +def hex2utf8(shift; off): # Input: hex digit string; Output: one character string whose code point is the input value minus off, plus 64 times the value of the hex digit string shift + [to_i(16)-off+((shift|to_i(16))*64)] | implode; + +def hex2utf8(shift): + hex2utf8(shift; 128); + +def hex2utf8: + hex2utf8("0"; 0); + # replace all url-encodings (%XX) in an input string with their unencoded -# characters. +# characters. Besides single %XX escapes, this decoder recognizes only the two-byte UTF-8 pattern %CX%XX, and only when the "C" is upper case. +# NOTE(review): every other multi-byte sequence is decoded one %XX at a time, so this is not yet full UTF-8 support. # # Input: string # Output: string # def url_decode: - # The helper function converts the input string written in the given - # "base" to an integer - def to_i(base): - explode - | reverse - | map(if 65 <= . and . <= 90 then . + 32 else . end) # downcase - | map(if . > 96 then . - 87 else . - 48 end) # "a" ~ 97 => 10 ~ 87 - | reduce .[] as $c - # base: [power, ans] - ([1,0]; (.[0] * base) as $b | [$b, .[1] + (.[0] * $c)]) | .[1]; - . 
as $in | length as $length | [0, ""] # i, answer | until ( .[0] >= $length; .[0] as $i | if $in[$i:$i+1] == "%" - then [ $i + 3, .[1] + ([$in[$i+1:$i+3] | to_i(16)] | implode) ] + then # a "%C" escape directly followed by another "%" is combined with that second escape into one character and the index advances by 6; otherwise a single %XX is decoded and the index advances by 3 + if $in[$i+1:$i+2] == "C" and $in[$i+3:$i+4] == "%" + then [ $i + 6, .[1] + ($in[$i+4:$i+6] | hex2utf8($in[$i+2:$i+3])) ] + else [ $i + 3, .[1] + ($in[$i+1:$i+3] | hex2utf8) ] + end else [ $i + 1, .[1] + $in[$i:$i+1] ] end) - | .[1]; # answer + | .[1]; # answer # replace url-encodings, including pluses (+), with their corresponding # characters. This is like url_encode, except that it also replaces each