Files
tcl/tests/unicodeNormalize.test
2025-12-07 21:36:46 +00:00

444 lines
17 KiB
Plaintext

# See the file LICENSE for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
if {[lsearch [namespace children] ::tcltest] == -1} {
package require tcltest
namespace import ::tcltest::*
}
tcltest::loadTestedCommands
package require tcl::test
source [file join [file dirname [info script]] ucdUtils.tcl]
namespace eval unicode::normalization::test {
namespace path ::tcltests::ucd
variable singleFormChar
variable testCase
variable normForm
variable normEnums; # Matches Tcl_UnicodeNormalizationForm enums
array set normEnums {
nfc 0
nfd 1
nfkc 2
nfkd 3
}
variable profileFlags; # Match TCL_ENCODING_PROFILE_* C flags
array set profileFlags {
strict 0x00000000
tcl8 0x01000000
replace 0x02000000
}
variable bytes
proc hexListToChars {s} {
# 0044 030c -> \u0044\u030c
subst -novariables -nocommands \\U[join $s \\U]
}
# Standard arg number tests
test unicode-badargs-0 {unicode no args} -returnCodes error -body {
unicode
} -result {wrong # args: should be "unicode subcommand ?arg ...?"}
test unicode-badargs-1 {unicode bad command} -returnCodes error -body {
unicode foo
} -result {unknown or ambiguous subcommand "foo": must be tonfc, tonfd, tonfkc, or tonfkd}
variable cmd
foreach cmd {tonfc tonfd tonfkc tonfkd} {
test $cmd-badargs-0 "$cmd 0 args" -returnCodes error -body {
unicode $cmd
} -result "wrong # args: should be \"unicode $cmd ?-profile PROFILE? STRING\""
test $cmd-badargs-1 "$cmd 2 args" -returnCodes error -body {
unicode $cmd -profile strict
} -result "wrong # args: should be \"unicode $cmd ?-profile PROFILE? STRING\""
test $cmd-badargs-2 "$cmd extra args" -returnCodes error -body {
unicode $cmd -profile strict foo extra
} -result "wrong # args: should be \"unicode $cmd ?-profile PROFILE? STRING\""
}
# Test generation for nfc, nfd, nfkc, nfkd
variable allChars
variable allNfc
variable allNfd
variable allNfkc
variable allNfkd
foreach testCase [getNormalizationData] {
lassign $testCase lineno chars nfc nfd nfkc nfkd
lappend allChars $chars
lappend allNfc $nfc
lappend allNfd $nfd
lappend allNfkc $nfkc
lappend allNfkd $nfkd
test tonfc-line-$lineno \
"Test case for NFC at line $lineno of $::tcltests::ucd::normalizationDataFile" \
-constraints ucdnormalization \
-body {
# See Comments in NormalizationTest.txt for expected behaviours
list \
[string equal $nfc [unicode tonfc $chars]] \
[string equal $nfc [unicode tonfc $nfc]] \
[string equal $nfc [unicode tonfc $nfd]] \
[string equal $nfkc [unicode tonfc $nfkc]] \
[string equal $nfkc [unicode tonfc $nfkd]]
} -result {1 1 1 1 1}
test tonfd-line-$lineno \
"Test case for NFD at line $lineno of $::tcltests::ucd::normalizationDataFile" \
-constraints ucdnormalization \
-setup {
readNormalizationData
} -body {
# See Comments in NormalizationTest.txt for expected behaviours
list \
[string equal $nfd [unicode tonfd $chars]] \
[string equal $nfd [unicode tonfd $nfc]] \
[string equal $nfd [unicode tonfd $nfd]] \
[string equal $nfkd [unicode tonfd $nfkc]] \
[string equal $nfkd [unicode tonfd $nfkd]]
} -result {1 1 1 1 1}
test tonfkc-line-$lineno \
"Test case for NFKC at line $lineno of $::tcltests::ucd::normalizationDataFile" \
-constraints ucdnormalization \
-setup {
readNormalizationData
} -body {
# See Comments in NormalizationTest.txt for expected behaviours
list \
[string equal $nfkc [unicode tonfkc $chars]] \
[string equal $nfkc [unicode tonfkc $nfc]] \
[string equal $nfkc [unicode tonfkc $nfd]] \
[string equal $nfkc [unicode tonfkc $nfkc]] \
[string equal $nfkc [unicode tonfkc $nfkd]]
} -result {1 1 1 1 1}
test tonfkd-line-$lineno \
"Test case for NFKD at line $lineno of $::tcltests::ucd::normalizationDataFile" \
-constraints ucdnormalization \
-setup {
readNormalizationData
} -body {
# See Comments in NormalizationTest.txt for expected behaviours
list \
[string equal $nfkd [unicode tonfkd $chars]] \
[string equal $nfkd [unicode tonfkd $nfc]] \
[string equal $nfkd [unicode tonfkd $nfd]] \
[string equal $nfkd [unicode tonfkd $nfkc]] \
[string equal $nfkd [unicode tonfkd $nfkd]]
} -result {1 1 1 1 1}
}
# Test the entire string. Note normalization is not a closed operation
# so normalize(concatenation) != concatenate(normalization) so we insert
# \uFFFD (replacement char) as separator to prevent adjacent cases being
# combined. This is not a whole lot different from the above individual
# tests but more of a "long string" test.
test unicode-normalization-concat "Normalize concatenation of test vectors" -body {
list \
[string equal [unicode tonfc [join $allChars \uFFFD]] [join $allNfc \uFFFD]] \
[string equal [unicode tonfd [join $allChars \uFFFD]] [join $allNfd \uFFFD]] \
[string equal [unicode tonfkc [join $allChars \uFFFD]] [join $allNfkc \uFFFD]] \
[string equal [unicode tonfkd [join $allChars \uFFFD]] [join $allNfkd \uFFFD]]
} -result {1 1 1 1}
# Each single form character should map to itself for all forms
test normalize-singleform-0 "Normalize single form characters" \
-constraints ucdnormalization \
-body {
lmap singleFormChar [getSingleFormChars] {
if {[tcl::mathop::eq \
$singleFormChar \
[unicode tonfc $singleFormChar] \
[unicode tonfd $singleFormChar] \
[unicode tonfkc $singleFormChar] \
[unicode tonfkd $singleFormChar] \
]} {
continue
}
set singleFormChar
}
} -result {}
# Test generation for casefolding
# NOTE: casefolding is not in TIP 726 so these tests are not in use
# at the moment.
if {[tcltest::testConstraint ucdcasefolding]} {
foreach testCase [getCaseFoldData] {
lassign $testCase lineno chars casefoldedchars
set id [format %.6X [scan $chars %c]]
test normalize-line-$lineno-$id-nfccasefold \
"Test case for NFC_CaseFold at line $lineno of $::tcltests::ucd::caseFoldDataFile" \
-constraints ucdcasefolding \
-body {
# puts [codepoints $chars]->[codepoints $casefoldedchars]
# See Comments in DerivedNormalizationProps.txt for expected behaviours
toNFKC_Casefold $chars
} -result $casefoldedchars
}
# Characters that should case fold to themselves
proc codepoints {s} {join [lmap c [split $s ""] {
string cat U+ [format %.6X [scan $c %c]]}]
}
test normalize-casefold-identities-0 \
"NFKC Case fold chars mapping to themselves" \
-constraints ucdcasefolding \
-body {
lmap char [caseFoldIdentities] {
if {$char eq [toNFKC_Casefold $char]} {
continue
}
set char
}
} -result {}
}
# Profiles
test tonfc-profile-default-0 "tonfc -profile default success" -body {
unicode tonfc X\u1e0a\u031b\u0323Y
} -result X\u1e0c\u031b\u0307Y
test tonfc-profile-default-1 "tonfc -profile default fail" -body {
unicode tonfc X\ud800Y
} -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
test tonfc-profile-strict-0 "tonfc -profile strict success" -body {
unicode tonfc -profile strict X\u1e0a\u031b\u0323Y
} -result X\u1e0c\u031b\u0307Y
test tonfc-profile-strict-1 "tonfc -profile strict fail" -body {
unicode tonfc -profile strict \ud800
} -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
test tonfc-profile-replace-0 "tonfc -profile replace success" -body {
unicode tonfc -profile replace X\u1e0a\u031b\u0323Y
} -result X\u1e0c\u031b\u0307Y
test tonfc-profile-replace-1 "tonfc -profile replace fail" -body {
unicode tonfc -profile replace X\ud800Y
} -result X\uFFFDY
test tonfc-profile-tcl8-0 "tonfc -profile tcl8" -returnCodes error -body {
unicode tonfc -profile tcl8 x
} -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}
test tonfd-profile-default-0 "tonfd -profile default success" -body {
unicode tonfd X\u1E0A\u031B\u0323Y
} -result X\u0044\u031B\u0323\u0307Y
test tonfd-profile-default-1 "tonfd -profile default fail" -body {
unicode tonfd \ud800
} -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
test tonfd-profile-strict-0 "tonfd -profile strict success" -body {
unicode tonfd -profile strict X\u1E0A\u031B\u0323Y
} -result X\u0044\u031B\u0323\u0307Y
test tonfd-profile-strict-1 "tonfd -profile strict fail" -body {
unicode tonfd -profile strict X\ud800Y
} -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
test tonfd-profile-replace-0 "tonfd -profile replace success" -body {
unicode tonfd -profile replace X\u1E0A\u031B\u0323Y
} -result X\u0044\u031B\u0323\u0307Y
test tonfd-profile-replace-1 "tonfd -profile replace fail" -body {
unicode tonfd -profile replace \ud800
} -result \uFFFD
test tonfd-profile-tcl8-0 "tonfd -profile tcl8" -returnCodes error -body {
unicode tonfd -profile tcl8 x
} -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}
test tonfkc-profile-default-0 "tonfkc -profile default success" -body {
unicode tonfkc X\u01C4\u0323Y
} -result X\u0044\u1E92\u030CY
test tonfkc-profile-default-1 "tonfkc -profile default fail" -body {
unicode tonfkc X\ud800Y
} -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
test tonfkc-profile-strict-0 "tonfkc -profile strict success" -body {
unicode tonfkc -profile strict X\u01C4\u0323Y
} -result X\u0044\u1E92\u030CY
test tonfkc-profile-strict-1 "tonfkc -profile strict fail" -body {
unicode tonfkc -profile strict \ud800
} -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
test tonfkc-profile-replace-0 "tonfkc -profile replace success" -body {
unicode tonfkc -profile replace X\u01C4\u0323Y
} -result X\u0044\u1E92\u030CY
test tonfkc-profile-replace-1 "tonfkc -profile replace fail" -body {
unicode tonfkc -profile replace X\ud800Y
} -result X\uFFFDY
test tonfkc-profile-tcl8-0 "tonfkc -profile tcl8" -returnCodes error -body {
unicode tonfkc -profile tcl8 x
} -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}
test tonfkd-profile-default-0 "tonfkd -profile default success" -body {
unicode tonfkd X\u01C4\u0323Y
} -result X\u0044\u005A\u0323\u030CY
test tonfkd-profile-default-1 "tonfkd -profile default fail" -body {
unicode tonfkd X\ud800Y
} -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
test tonfkd-profile-strict-0 "tonfkd -profile strict success" -body {
unicode tonfkd -profile strict X\u01C4\u0323Y
} -result X\u0044\u005A\u0323\u030CY
test tonfkd-profile-strict-1 "tonfkd -profile strict fail" -body {
unicode tonfkd -profile strict \ud800
} -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
test tonfkd-profile-replace-0 "tonfkd -profile replace success" -body {
unicode tonfkd -profile replace X\u01C4\u0323Y
} -result X\u0044\u005A\u0323\u030CY
test tonfkd-profile-replace-1 "tonfkd -profile replace fail" -body {
unicode tonfkd -profile replace X\ud800Y
} -result X\uFFFDY
test tonfkd-profile-tcl8-0 "tonfkd -profile tcl8" -returnCodes error -body {
unicode tonfkd -profile tcl8 x
} -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}
# Tcl_UtfToNormalizedDString C API
foreach testCase [getNormalizationData] {
lassign $testCase lineno chars nfc nfd nfkc nfkd
set bytes [teststringbytes $chars]
foreach profile {strict replace} {
foreach normForm {nfc nfd nfkc nfkd} {
test Tcl_UtfToNormalizedDString-$normForm-line-$lineno-$profile \
"Tcl_UtfToNormalizedDString for $normForm at line $lineno of $::tcltests::ucd::normalizationDataFile" \
-body {
testutftonormalizeddstring $bytes $normEnums($normForm) \
$profileFlags($profile)
} -result [teststringbytes [set $normForm]]
}
}
}
foreach normForm {nfc nfd nfkc nfkd} {
test Tcl_UtfToNormalizedDString-$normForm-nulchar-$profile \
"Tcl_UtfToNormalizedDString for $normForm passed nul character" \
-body {
testutftonormalizeddstring [teststringbytes \0] \
$normEnums($normForm) $profileFlags(strict)
} -result \xC0\x80
}
# Test the entire string. Note normalization is not a closed operation
# so normalize(concatenation) != concatenate(normalization) so we insert
# \uFFFD (replacement char) as separator to prevent adjacent cases being
# combined. This is not a whole lot different from the above individual
# tests but more of a "long string" test.
test Tcl_UtfToNormalizedDString-concat "Normalize concatenation of test vectors" -setup {
set bytes [teststringbytes [join $allChars \uFFFD]]
} -body {
list \
[string equal \
[testutftonormalizeddstring $bytes $normEnums(nfc) $profileFlags(strict)] \
[teststringbytes [join $allNfc \uFFFD]]] \
[string equal \
[testutftonormalizeddstring $bytes $normEnums(nfd) $profileFlags(strict)] \
[teststringbytes [join $allNfd \uFFFD]]] \
[string equal \
[testutftonormalizeddstring $bytes $normEnums(nfkc) $profileFlags(strict)] \
[teststringbytes [join $allNfkc \uFFFD]]] \
[string equal \
[testutftonormalizeddstring $bytes $normEnums(nfkd) $profileFlags(strict)] \
[teststringbytes [join $allNfkd \uFFFD]]]
} -result {1 1 1 1}
# Tcl_UtfToNormalizedDString error cases
foreach normForm {nfc nfd nfkc nfkd} {
test Tcl_UtfToNormalizedDString-$normForm-tcl8 \
"Tcl_UtfToNormalizedDString for $normForm profile tcl8" \
-body {
testutftonormalizeddstring abc $normEnums($normForm) $profileFlags(tcl8)
} -result {Invalid value 16777216 passed for encoding profile.} -returnCodes error
if {0} {
# TODO - currently, Tcl "fixes up" any internal invalid UTF-8 so
# no way to test normalization of invalid UTF-8. Enable this test
# once this "fixing up" by Tcl is corrected (see Bug [b69e00ecf6])
test Tcl_UtfToNormalizedDString-$normForm-invalid-utf8 \
"Tcl_UtfToNormalizedDString for $normForm invalid utf8 profile strict" \
-body {
testutftonormalizeddstring [testbytestring [binary decode hex EFBF7F]] $normEnums($normForm) $profileFlags(strict)
} -result {} -returnCodes error
}
}
test Tcl_UtfToNormalizedDString-invalid-normalization-form \
"Tcl_UtfToNormalizedDString invalid value for normalization form" \
-body {
testutftonormalizeddstring abc 4 $profileFlags(strict)
} -result {Invalid value 4 passed for normalization form.} -returnCodes error
# Tcl_UtfToNormalized C API
variable normBytes
foreach testCase [getNormalizationData] {
lassign $testCase lineno chars nfc nfd nfkc nfkd
set bytes [teststringbytes $chars]
foreach profile {strict replace} {
foreach normForm {nfc nfd nfkc nfkd} {
set normBytes [teststringbytes [set $normForm]]
test Tcl_UtfToNormalized-$normForm-line-$lineno-$profile \
"Tcl_UtfToNormalized $normForm line $lineno of $::tcltests::ucd::normalizationDataFile" \
-body {
# Tests:
# No length specified (implicit length of bytes)
# Length of -1
# Buffer too small
set result [testutftonormalized $bytes \
$normEnums($normForm) \
$profileFlags($profile) 100]
set result_minus1 [testutftonormalized $bytes\0 \
$normEnums($normForm) \
$profileFlags($profile) -1 100]
list $result \
$result_minus1 \
[catch {
testutftonormalized $bytes $normEnums($normForm) \
$profileFlags($profile) \
[expr {[string length $result]-1}]
} message] \
$message
} -result [list $normBytes $normBytes -4 {Output buffer too small.}]
}
}
}
foreach normForm {nfc nfd nfkc nfkd} {
test Tcl_UtfToNormalized-$normForm-nulchar \
"Tcl_UtfToNormalized $normForm passed nul character" \
-body {
list \
[testutftonormalized [teststringbytes \0] \
$normEnums($normForm) $profileFlags(strict) 3] \
[catch {
[testutftonormalized [teststringbytes \0] \
$normEnums($normForm) $profileFlags(strict) 2]
} message] \
$message
} -result [list \xC0\x80 -4 {Output buffer too small.}]
}
# Tcl_UtfToNormalized error cases
foreach normForm {nfc nfd nfkc nfkd} {
test Tcl_UtfToNormalized-$normForm-tcl8 \
"Tcl_UtfToNormalized for $normForm profile tcl8" \
-body {
testutftonormalized abc $normEnums($normForm) $profileFlags(tcl8) 20
} -result {Invalid value 16777216 passed for encoding profile.} -returnCodes error
if {0} {
# TODO - currently, Tcl "fixes up" any internal invalid UTF-8 so
# no way to test normalization of invalid UTF-8. Enable this test
# once this "fixing up" by Tcl is corrected (see Bug [b69e00ecf6])
test Tcl_UtfToNormalized-$normForm-invalid-utf8 \
"Tcl_UtfToNormalized for $normForm invalid utf8 profile strict" \
-body {
testutftonormalized [testbytestring [binary decode hex EFBF7F]] $normEnums($normForm) $profileFlags(strict) 20
} -result {} -returnCodes error
}
}
test Tcl_UtfToNormalized-invalid-normalization-form \
"Tcl_UtfToNormalized invalid value for normalization form" \
-body {
testutftonormalized abc 4 $profileFlags(strict) 20
} -result {Invalid value 4 passed for normalization form.} -returnCodes error
}
::tcltest::cleanupTests
namespace delete unicode::normalization::test
return