Skip to content

Commit

Permalink
init: ported To_Unicode_From_UTF32 primitive function
Browse files Browse the repository at this point in the history
Since a number of level 1 Hestia libraries use string functions,
we have to port the primitive string functions into the HestiaKERNEL
library package. Hence, let's do this.

This patch ports To_Unicode_From_UTF32 primitive function into
HestiaKERNEL library in init/ directory.

Co-authored-by: Shuralyov, Jean <[email protected]>
Co-authored-by: Galyna, Cory <[email protected]>
Co-authored-by: (Holloway) Chew, Kean Ho <[email protected]>
Signed-off-by: (Holloway) Chew, Kean Ho <[email protected]>
  • Loading branch information
4 people committed Nov 5, 2024
1 parent d6cd126 commit 00a374e
Show file tree
Hide file tree
Showing 7 changed files with 338 additions and 3 deletions.
9 changes: 8 additions & 1 deletion init/services/HestiaKERNEL/Is_UTF.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ function HestiaKERNEL-Is-UTF {
$___content = $___byte_array
$___count = 8
$___utf8_expect = 0
$___utf32_expect = 0
$___byte_0 = $null
$___byte_1 = $null
$___byte_2 = $null
Expand Down Expand Up @@ -105,6 +106,12 @@ function HestiaKERNEL-Is-UTF {
}


# detect UTF-32 for later guessing
if ($___count -le 4) {
$___utf32_expect = 1
}


# prepare for next scan
$___count -= 1
}
Expand Down Expand Up @@ -170,7 +177,7 @@ ${___output}
"@
}

if ($___byte_array.Length % 4) {
if ($___utf32_expect -gt 0) {
$___output = @"
${___output}
${env:HestiaKERNEL_UTF32BE}
Expand Down
10 changes: 8 additions & 2 deletions init/services/HestiaKERNEL/Is_UTF.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#
# You MUST ensure any interaction with the content STRICTLY COMPLIES with
# the permissions and limitations set forth in the license.
# Copyright 2024 (Holloway) Chew, Kean Ho <[email protected]>
. "${LIBS_HESTIA}/HestiaKERNEL/Error_Codes.sh"
. "${LIBS_HESTIA}/HestiaKERNEL/Unicode.sh"

Expand All @@ -31,6 +30,7 @@ HestiaKERNEL_Is_UTF() {
___content="$1"
___count=8
___utf8_expect=0
___utf32_expect=0
___byte_0=""
___byte_1=""
___byte_2=""
Expand Down Expand Up @@ -112,6 +112,12 @@ HestiaKERNEL_Is_UTF() {
fi


# detect UTF-32 for later guessing
if [ $___count -le 4 ]; then
___utf32_expect=1
fi


# prepare for next scan
___count=$(($___count - 1))
done
Expand Down Expand Up @@ -168,7 +174,7 @@ ${HestiaKERNEL_UTF8}
${___output}"
fi

if [ $((${#1} % 4)) -eq 0 ]; then
if [ $___utf32_expect -gt 0 ]; then
___output="\
${___output}
${HestiaKERNEL_UTF32BE}
Expand Down
141 changes: 141 additions & 0 deletions init/services/HestiaKERNEL/To_Unicode_From_UTF32.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# Copyright 2024 (Holloway) Chew, Kean Ho <[email protected]>
#
#
# Licensed under (Holloway) Chew, Kean Ho’s Liberal License (the "License").
# You must comply with the license to use the content. Get the License at:
#
# https://doi.org/10.5281/zenodo.13770769
#
# You MUST ensure any interaction with the content STRICTLY COMPLIES with
# the permissions and limitations set forth in the license.
. "${env:LIBS_HESTIA}\HestiaKERNEL\Endian.ps1"
. "${env:LIBS_HESTIA}\HestiaKERNEL\Is_Array_Byte.ps1"
. "${env:LIBS_HESTIA}\HestiaKERNEL\Is_UTF.ps1"
. "${env:LIBS_HESTIA}\HestiaKERNEL\Unicode.ps1"




# Decodes a UTF-32 byte array into an array of Unicode code points.
#
# Parameters:
#   $___input_content - the UTF-32 encoded bytes, with or without a leading
#                       4-byte byte-order mark (BOM).
#   $___input_endian  - optional endian hint. It is only consulted when the
#                       data carries no BOM, and only honoured when it equals
#                       ${env:HestiaKERNEL_ENDIAN_LITTLE} or
#                       ${env:HestiaKERNEL_ENDIAN_BIG}.
#
# Returns:
#   [uint32[]] holding one value per complete 4-byte group. An empty array is
#   returned when the input is empty, when HestiaKERNEL-Is-Array-Byte rejects
#   it, or when HestiaKERNEL-Is-UTF does not report it as UTF-32.
#
# NOTE(review): trailing bytes that do not fill a complete 4-byte group are
# silently discarded, and decoded values are not range-checked - confirm that
# callers expect this.
function HestiaKERNEL-To-Unicode-From-UTF32 {
    param (
        [byte[]]$___input_content,
        [int]$___input_endian
    )


    # validate input
    if ($___input_content.Length -eq 0) {
        return [uint32[]]@()
    }

    if ($(HestiaKERNEL-Is-Array-Byte $___input_content) -ne ${env:HestiaKERNEL_ERROR_OK}) {
        return [uint32[]]@()
    }


    # execute
    ## IMPORTANT NOTICE
    ## PowerShell does not handle a UTF-32 byte stream in an isolated manner
    ## without messing up the current terminal's environment variables
    ## (e.g. $OutputEncoding). To avoid that, a manual implementation is
    ## required.
    ##
    ## From the Unicode engineering specification, the default endian is
    ## big-endian.


    # check for data encoder
    $___endian = ${env:HestiaKERNEL_ENDIAN_BIG}
    $___ignore = 0
    $___output = HestiaKERNEL-Is-UTF $___input_content
    if ($($___output -replace "${env:HestiaKERNEL_UTF32LE_BOM}", '') -ne $___output) {
        # it's UTF32LE with BOM marker: the first 4 bytes are not content
        $___endian = ${env:HestiaKERNEL_ENDIAN_LITTLE}
        $___ignore = 4
    } elseif ($($___output -replace "${env:HestiaKERNEL_UTF32BE_BOM}", '') -ne $___output) {
        # it's UTF32BE with BOM marker: the first 4 bytes are not content
        $___endian = ${env:HestiaKERNEL_ENDIAN_BIG}
        $___ignore = 4
    } elseif (
        ($($___output -replace "${env:HestiaKERNEL_UTF32LE}", '') -ne $___output) -and
        ($($___output -replace "${env:HestiaKERNEL_UTF32BE}", '') -ne $___output)
    ) {
        # no BOM and both UTF32LE and UTF32BE are reported as candidates:
        # take the caller's hint when it is valid, otherwise keep the
        # big-endian default.
        if (
            ($___input_endian -eq ${env:HestiaKERNEL_ENDIAN_LITTLE}) -or
            ($___input_endian -eq ${env:HestiaKERNEL_ENDIAN_BIG})
        ) {
            $___endian = $___input_endian
        }
    } else {
        # not a UTF-32 byte array
        return [uint32[]]@()
    }


    # resolve the byte order once, instead of once per byte: entry N is the
    # left-shift applied to the Nth byte (0..3) of every 4-byte group.
    switch ($___endian) {
    ${env:HestiaKERNEL_ENDIAN_LITTLE} {
        $___shifts = @(0, 8, 16, 24)
    } default {
        $___shifts = @(24, 16, 8, 0)
    }}


    # process to unicode
    [System.Collections.Generic.List[uint32]]$___converted = @()
    $___char = [uint32]0
    $___state = 0
    for ($___index = $___ignore; $___index -lt $___input_content.Length; $___index++) {
        # widen to uint32 first so the shift cannot wrap into a negative int
        $___byte = [uint32]$___input_content[$___index]
        $___byte = $___byte -shl $___shifts[$___state]

        if ($___state -eq 0) {
            # first byte of a group starts a fresh code point
            $___char = $___byte
        } else {
            $___char = $___char -bor $___byte
        }

        if ($___state -eq 3) {
            # fourth byte completes the code point
            $null = $___converted.Add($___char)
            $___state = 0
        } else {
            $___state += 1
        }
    }


    # report status
    return [uint32[]]$___converted
}
160 changes: 160 additions & 0 deletions init/services/HestiaKERNEL/To_Unicode_From_UTF32.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
#!/bin/sh
# Copyright 2024 (Holloway) Chew, Kean Ho <[email protected]>
#
#
# Licensed under (Holloway) Chew, Kean Ho’s Liberal License (the "License").
# You must comply with the license to use the content. Get the License at:
#
# https://doi.org/10.5281/zenodo.13770769
#
# You MUST ensure any interaction with the content STRICTLY COMPLIES with
# the permissions and limitations set forth in the license.
. "${LIBS_HESTIA}/HestiaKERNEL/Endian.sh"
. "${LIBS_HESTIA}/HestiaKERNEL/Error_Codes.sh"
. "${LIBS_HESTIA}/HestiaKERNEL/Is_Array_Byte.sh"
. "${LIBS_HESTIA}/HestiaKERNEL/Is_UTF.sh"
. "${LIBS_HESTIA}/HestiaKERNEL/Unicode.sh"




# Decode a UTF-32 byte list into a list of Unicode code points.
#
# Arguments:
#   $1 - the UTF-32 encoded bytes as a ", "-separated list of numbers
#        (e.g. "0, 0, 0, 65"), with or without a leading 4-byte BOM.
#   $2 - optional endian hint. It is only consulted when the data carries no
#        BOM, and only honoured when it equals $HestiaKERNEL_ENDIAN_LITTLE or
#        $HestiaKERNEL_ENDIAN_BIG.
#
# Output (stdout):
#   the decoded code points as a ", "-separated list of decimal numbers, one
#   per complete 4-byte group; nothing on failure.
#
# Returns:
#   $HestiaKERNEL_ERROR_OK           - conversion completed.
#   $HestiaKERNEL_ERROR_DATA_EMPTY   - $1 is empty.
#   $HestiaKERNEL_ERROR_DATA_INVALID - $1 is rejected by
#                                      HestiaKERNEL_Is_Array_Byte or is not
#                                      reported as UTF-32 by HestiaKERNEL_Is_UTF.
#
# NOTE(review): trailing bytes that do not fill a complete 4-byte group are
# silently discarded, and decoded values are not range-checked - confirm that
# callers expect this.
HestiaKERNEL_To_Unicode_From_UTF32() {
        #___input_content="$1"
        #___input_endian="$2"


        # validate input
        if [ "$1" = "" ]; then
                printf -- ""
                return $HestiaKERNEL_ERROR_DATA_EMPTY
        fi

        if [ $(HestiaKERNEL_Is_Array_Byte "$1") -ne $HestiaKERNEL_ERROR_OK ]; then
                printf -- ""
                return $HestiaKERNEL_ERROR_DATA_INVALID
        fi


        # execute
        ## IMPORTANT NOTICE
        ## POSIX Shell does not handle a UTF-32 byte stream in an isolated
        ## manner without messing up the current terminal's $LANG settings.
        ## To avoid that, a manual implementation is required.
        ##
        ## From the Unicode engineering specification, the default endian is
        ## big-endian.


        # check for data encoder
        ___endian=$HestiaKERNEL_ENDIAN_BIG
        ___ignore=0
        ___output="$(HestiaKERNEL_Is_UTF "$1")"
        if [ ! "${___output#*"$HestiaKERNEL_UTF32LE_BOM"}" = "$___output" ]; then
                # it's UTF32LE with BOM marker: the first 4 bytes are not content
                ___endian=$HestiaKERNEL_ENDIAN_LITTLE
                ___ignore=4
        elif [ ! "${___output#*"$HestiaKERNEL_UTF32BE_BOM"}" = "$___output" ]; then
                # it's UTF32BE with BOM marker: the first 4 bytes are not content
                ___endian=$HestiaKERNEL_ENDIAN_BIG
                ___ignore=4
        elif [ ! "${___output#*"$HestiaKERNEL_UTF32LE"}" = "$___output" ] &&
                [ ! "${___output#*"$HestiaKERNEL_UTF32BE"}" = "$___output" ]; then
                # no BOM and both UTF32LE and UTF32BE are reported as
                # candidates: take the caller's hint when it is valid,
                # otherwise keep the big-endian default.
                if [ "$2" = "$HestiaKERNEL_ENDIAN_LITTLE" ] ||
                        [ "$2" = "$HestiaKERNEL_ENDIAN_BIG" ]; then
                        ___endian="$2"
                fi
        else
                # not a UTF-32 byte array
                printf -- ""
                return $HestiaKERNEL_ERROR_DATA_INVALID
        fi


        # process to unicode
        ___content="$1"
        ___converted=""
        ___char=0
        ___state=0
        while [ ! "$___content" = "" ]; do
                # get current byte: take everything before the first ", " and
                # drop it (plus its separator) from the remaining content
                ___byte="${___content%%, *}"
                ___content="${___content#"$___byte"}"
                if [ "${___content%"${___content#?}"}" = "," ]; then
                        ___content="${___content#, }"
                fi


                # ignore BOM markers
                if [ $___ignore -gt 0 ]; then
                        ___ignore=$(($___ignore - 1))
                        continue
                fi


                # move the byte into its position inside the 32-bit value;
                # ___state is the byte's index (0..3) within the current group
                case "$___endian" in
                $HestiaKERNEL_ENDIAN_LITTLE)
                        # least significant byte arrives first
                        ___byte=$(($___byte << ($___state * 8)))
                        ;;
                *)
                        # most significant byte arrives first
                        ___byte=$(($___byte << ((3 - $___state) * 8)))
                        ;;
                esac

                if [ $___state -eq 0 ]; then
                        # first byte of a group starts a fresh code point
                        ___char=$___byte
                else
                        ___char=$(($___char | $___byte))
                fi


                # the fourth byte completes the code point
                if [ $___state -eq 3 ]; then
                        # ___char is already a decimal string produced by
                        # arithmetic expansion, so append it directly rather
                        # than forking a printf subshell per code point
                        ___converted="${___converted}${___char}, "
                        ___state=0
                else
                        ___state=$(($___state + 1))
                fi
        done


        # report status
        printf -- "%s" "${___converted%, }"
        return $HestiaKERNEL_ERROR_OK
}
2 changes: 2 additions & 0 deletions init/services/HestiaKERNEL/Vanilla.sh.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ echo \" <<'RUN_AS_POWERSHELL' >/dev/null # " | Out-Null
. "${env:LIBS_HESTIA}\HestiaKERNEL\To_Unicode_From_String.ps1"
. "${env:LIBS_HESTIA}\HestiaKERNEL\To_Unicode_From_UTF8.ps1"
. "${env:LIBS_HESTIA}\HestiaKERNEL\To_Unicode_From_UTF16.ps1"
. "${env:LIBS_HESTIA}\HestiaKERNEL\To_Unicode_From_UTF32.ps1"
. "${env:LIBS_HESTIA}\HestiaKERNEL\To_Uppercase_String.ps1"
. "${env:LIBS_HESTIA}\HestiaKERNEL\To_Uppercase_Unicode.ps1"
. "${env:LIBS_HESTIA}\HestiaKERNEL\To_UTF8_From_Unicode.ps1"
Expand Down Expand Up @@ -83,6 +84,7 @@ RUN_AS_POWERSHELL
. "${LIBS_HESTIA}/HestiaKERNEL/To_Unicode_From_String.sh"
. "${LIBS_HESTIA}/HestiaKERNEL/To_Unicode_From_UTF8.sh"
. "${LIBS_HESTIA}/HestiaKERNEL/To_Unicode_From_UTF16.sh"
. "${LIBS_HESTIA}/HestiaKERNEL/To_Unicode_From_UTF32.sh"
. "${LIBS_HESTIA}/HestiaKERNEL/To_Uppercase_String.sh"
. "${LIBS_HESTIA}/HestiaKERNEL/To_Uppercase_Unicode.sh"
. "${LIBS_HESTIA}/HestiaKERNEL/To_UTF8_From_Unicode.sh"
Expand Down
Loading

0 comments on commit 00a374e

Please sign in to comment.