mirror of
https://github.com/tcltk/tcl.git
synced 2026-05-29 00:27:49 +08:00
444 lines
17 KiB
Plaintext
444 lines
17 KiB
Plaintext
# See the file LICENSE for information on usage and redistribution
|
|
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
|
|
|
|
if {[lsearch [namespace children] ::tcltest] == -1} {
|
|
package require tcltest
|
|
namespace import ::tcltest::*
|
|
}
|
|
tcltest::loadTestedCommands
|
|
package require tcl::test
|
|
|
|
source [file join [file dirname [info script]] ucdUtils.tcl]
|
|
|
|
namespace eval unicode::normalization::test {
|
|
namespace path ::tcltests::ucd
|
|
|
|
variable singleFormChar
|
|
variable testCase
|
|
|
|
variable normForm
|
|
variable normEnums; # Matches Tcl_UnicodeNormalizationForm enums
|
|
array set normEnums {
|
|
nfc 0
|
|
nfd 1
|
|
nfkc 2
|
|
nfkd 3
|
|
}
|
|
variable profileFlags; # Match TCL_ENCODING_PROFILE_* C flags
|
|
array set profileFlags {
|
|
strict 0x00000000
|
|
tcl8 0x01000000
|
|
replace 0x02000000
|
|
}
|
|
variable bytes
|
|
|
|
proc hexListToChars {s} {
|
|
# 0044 030c -> \u0044\u030c
|
|
subst -novariables -nocommands \\U[join $s \\U]
|
|
}
|
|
|
|
# Standard arg number tests
|
|
test unicode-badargs-0 {unicode no args} -returnCodes error -body {
|
|
unicode
|
|
} -result {wrong # args: should be "unicode subcommand ?arg ...?"}
|
|
test unicode-badargs-1 {unicode bad command} -returnCodes error -body {
|
|
unicode foo
|
|
} -result {unknown or ambiguous subcommand "foo": must be tonfc, tonfd, tonfkc, or tonfkd}
|
|
|
|
variable cmd
|
|
foreach cmd {tonfc tonfd tonfkc tonfkd} {
|
|
test $cmd-badargs-0 "$cmd 0 args" -returnCodes error -body {
|
|
unicode $cmd
|
|
} -result "wrong # args: should be \"unicode $cmd ?-profile PROFILE? STRING\""
|
|
test $cmd-badargs-1 "$cmd 2 args" -returnCodes error -body {
|
|
unicode $cmd -profile strict
|
|
} -result "wrong # args: should be \"unicode $cmd ?-profile PROFILE? STRING\""
|
|
test $cmd-badargs-2 "$cmd extra args" -returnCodes error -body {
|
|
unicode $cmd -profile strict foo extra
|
|
} -result "wrong # args: should be \"unicode $cmd ?-profile PROFILE? STRING\""
|
|
}
|
|
|
|
# Test generation for nfc, nfd, nfkc, nfkd
|
|
variable allChars
|
|
variable allNfc
|
|
variable allNfd
|
|
variable allNfkc
|
|
variable allNfkd
|
|
foreach testCase [getNormalizationData] {
|
|
lassign $testCase lineno chars nfc nfd nfkc nfkd
|
|
lappend allChars $chars
|
|
lappend allNfc $nfc
|
|
lappend allNfd $nfd
|
|
lappend allNfkc $nfkc
|
|
lappend allNfkd $nfkd
|
|
test tonfc-line-$lineno \
|
|
"Test case for NFC at line $lineno of $::tcltests::ucd::normalizationDataFile" \
|
|
-constraints ucdnormalization \
|
|
-body {
|
|
# See Comments in NormalizationTest.txt for expected behaviours
|
|
list \
|
|
[string equal $nfc [unicode tonfc $chars]] \
|
|
[string equal $nfc [unicode tonfc $nfc]] \
|
|
[string equal $nfc [unicode tonfc $nfd]] \
|
|
[string equal $nfkc [unicode tonfc $nfkc]] \
|
|
[string equal $nfkc [unicode tonfc $nfkd]]
|
|
} -result {1 1 1 1 1}
|
|
|
|
test tonfd-line-$lineno \
|
|
"Test case for NFD at line $lineno of $::tcltests::ucd::normalizationDataFile" \
|
|
-constraints ucdnormalization \
|
|
-setup {
|
|
readNormalizationData
|
|
} -body {
|
|
# See Comments in NormalizationTest.txt for expected behaviours
|
|
list \
|
|
[string equal $nfd [unicode tonfd $chars]] \
|
|
[string equal $nfd [unicode tonfd $nfc]] \
|
|
[string equal $nfd [unicode tonfd $nfd]] \
|
|
[string equal $nfkd [unicode tonfd $nfkc]] \
|
|
[string equal $nfkd [unicode tonfd $nfkd]]
|
|
} -result {1 1 1 1 1}
|
|
|
|
test tonfkc-line-$lineno \
|
|
"Test case for NFKC at line $lineno of $::tcltests::ucd::normalizationDataFile" \
|
|
-constraints ucdnormalization \
|
|
-setup {
|
|
readNormalizationData
|
|
} -body {
|
|
# See Comments in NormalizationTest.txt for expected behaviours
|
|
list \
|
|
[string equal $nfkc [unicode tonfkc $chars]] \
|
|
[string equal $nfkc [unicode tonfkc $nfc]] \
|
|
[string equal $nfkc [unicode tonfkc $nfd]] \
|
|
[string equal $nfkc [unicode tonfkc $nfkc]] \
|
|
[string equal $nfkc [unicode tonfkc $nfkd]]
|
|
} -result {1 1 1 1 1}
|
|
|
|
test tonfkd-line-$lineno \
|
|
"Test case for NFKD at line $lineno of $::tcltests::ucd::normalizationDataFile" \
|
|
-constraints ucdnormalization \
|
|
-setup {
|
|
readNormalizationData
|
|
} -body {
|
|
# See Comments in NormalizationTest.txt for expected behaviours
|
|
list \
|
|
[string equal $nfkd [unicode tonfkd $chars]] \
|
|
[string equal $nfkd [unicode tonfkd $nfc]] \
|
|
[string equal $nfkd [unicode tonfkd $nfd]] \
|
|
[string equal $nfkd [unicode tonfkd $nfkc]] \
|
|
[string equal $nfkd [unicode tonfkd $nfkd]]
|
|
} -result {1 1 1 1 1}
|
|
}
|
|
|
|
# Test the entire string. Note normalization is not a closed operation
|
|
# so normalize(concatenation) != concatenate(normalization) so we insert
|
|
# \uFFFD (replacement char) as separator to prevent adjacent cases being
|
|
# combined. This is not a whole lot different from the above individual
|
|
# tests but more of a "long string" test.
|
|
test unicode-normalization-concat "Normalize concatenation of test vectors" -body {
|
|
list \
|
|
[string equal [unicode tonfc [join $allChars \uFFFD]] [join $allNfc \uFFFD]] \
|
|
[string equal [unicode tonfd [join $allChars \uFFFD]] [join $allNfd \uFFFD]] \
|
|
[string equal [unicode tonfkc [join $allChars \uFFFD]] [join $allNfkc \uFFFD]] \
|
|
[string equal [unicode tonfkd [join $allChars \uFFFD]] [join $allNfkd \uFFFD]]
|
|
} -result {1 1 1 1}
|
|
|
|
# Each single form character should map to itself for all forms
|
|
test normalize-singleform-0 "Normalize single form characters" \
|
|
-constraints ucdnormalization \
|
|
-body {
|
|
lmap singleFormChar [getSingleFormChars] {
|
|
if {[tcl::mathop::eq \
|
|
$singleFormChar \
|
|
[unicode tonfc $singleFormChar] \
|
|
[unicode tonfd $singleFormChar] \
|
|
[unicode tonfkc $singleFormChar] \
|
|
[unicode tonfkd $singleFormChar] \
|
|
]} {
|
|
continue
|
|
}
|
|
set singleFormChar
|
|
}
|
|
} -result {}
|
|
|
|
# Test generation for casefolding
|
|
# NOTE: casefolding is not in TIP 726 so these tests are not in use
|
|
# at the moment.
|
|
if {[tcltest::testConstraint ucdcasefolding]} {
|
|
foreach testCase [getCaseFoldData] {
|
|
lassign $testCase lineno chars casefoldedchars
|
|
set id [format %.6X [scan $chars %c]]
|
|
test normalize-line-$lineno-$id-nfccasefold \
|
|
"Test case for NFC_CaseFold at line $lineno of $::tcltests::ucd::caseFoldDataFile" \
|
|
-constraints ucdcasefolding \
|
|
-body {
|
|
# puts [codepoints $chars]->[codepoints $casefoldedchars]
|
|
# See Comments in DerivedNormalizationProps.txt for expected behaviours
|
|
toNFKC_Casefold $chars
|
|
} -result $casefoldedchars
|
|
}
|
|
# Characters that should case fold to themselves
|
|
proc codepoints {s} {join [lmap c [split $s ""] {
|
|
string cat U+ [format %.6X [scan $c %c]]}]
|
|
}
|
|
test normalize-casefold-identities-0 \
|
|
"NFKC Case fold chars mapping to themselves" \
|
|
-constraints ucdcasefolding \
|
|
-body {
|
|
lmap char [caseFoldIdentities] {
|
|
if {$char eq [toNFKC_Casefold $char]} {
|
|
continue
|
|
}
|
|
set char
|
|
}
|
|
} -result {}
|
|
}
|
|
|
|
# Profiles
|
|
test tonfc-profile-default-0 "tonfc -profile default success" -body {
|
|
unicode tonfc X\u1e0a\u031b\u0323Y
|
|
} -result X\u1e0c\u031b\u0307Y
|
|
test tonfc-profile-default-1 "tonfc -profile default fail" -body {
|
|
unicode tonfc X\ud800Y
|
|
} -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
|
|
test tonfc-profile-strict-0 "tonfc -profile strict success" -body {
|
|
unicode tonfc -profile strict X\u1e0a\u031b\u0323Y
|
|
} -result X\u1e0c\u031b\u0307Y
|
|
test tonfc-profile-strict-1 "tonfc -profile strict fail" -body {
|
|
unicode tonfc -profile strict \ud800
|
|
} -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
|
|
test tonfc-profile-replace-0 "tonfc -profile replace success" -body {
|
|
unicode tonfc -profile replace X\u1e0a\u031b\u0323Y
|
|
} -result X\u1e0c\u031b\u0307Y
|
|
test tonfc-profile-replace-1 "tonfc -profile replace fail" -body {
|
|
unicode tonfc -profile replace X\ud800Y
|
|
} -result X\uFFFDY
|
|
test tonfc-profile-tcl8-0 "tonfc -profile tcl8" -returnCodes error -body {
|
|
unicode tonfc -profile tcl8 x
|
|
} -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}
|
|
|
|
test tonfd-profile-default-0 "tonfd -profile default success" -body {
|
|
unicode tonfd X\u1E0A\u031B\u0323Y
|
|
} -result X\u0044\u031B\u0323\u0307Y
|
|
test tonfd-profile-default-1 "tonfd -profile default fail" -body {
|
|
unicode tonfd \ud800
|
|
} -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
|
|
test tonfd-profile-strict-0 "tonfd -profile strict success" -body {
|
|
unicode tonfd -profile strict X\u1E0A\u031B\u0323Y
|
|
} -result X\u0044\u031B\u0323\u0307Y
|
|
test tonfd-profile-strict-1 "tonfd -profile strict fail" -body {
|
|
unicode tonfd -profile strict X\ud800Y
|
|
} -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
|
|
test tonfd-profile-replace-0 "tonfd -profile replace success" -body {
|
|
unicode tonfd -profile replace X\u1E0A\u031B\u0323Y
|
|
} -result X\u0044\u031B\u0323\u0307Y
|
|
test tonfd-profile-replace-1 "tonfd -profile replace fail" -body {
|
|
unicode tonfd -profile replace \ud800
|
|
} -result \uFFFD
|
|
test tonfd-profile-tcl8-0 "tonfd -profile tcl8" -returnCodes error -body {
|
|
unicode tonfd -profile tcl8 x
|
|
} -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}
|
|
|
|
test tonfkc-profile-default-0 "tonfkc -profile default success" -body {
|
|
unicode tonfkc X\u01C4\u0323Y
|
|
} -result X\u0044\u1E92\u030CY
|
|
test tonfkc-profile-default-1 "tonfkc -profile default fail" -body {
|
|
unicode tonfkc X\ud800Y
|
|
} -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
|
|
test tonfkc-profile-strict-0 "tonfkc -profile strict success" -body {
|
|
unicode tonfkc -profile strict X\u01C4\u0323Y
|
|
} -result X\u0044\u1E92\u030CY
|
|
test tonfkc-profile-strict-1 "tonfkc -profile strict fail" -body {
|
|
unicode tonfkc -profile strict \ud800
|
|
} -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
|
|
test tonfkc-profile-replace-0 "tonfkc -profile replace success" -body {
|
|
unicode tonfkc -profile replace X\u01C4\u0323Y
|
|
} -result X\u0044\u1E92\u030CY
|
|
test tonfkc-profile-replace-1 "tonfkc -profile replace fail" -body {
|
|
unicode tonfkc -profile replace X\ud800Y
|
|
} -result X\uFFFDY
|
|
test tonfkc-profile-tcl8-0 "tonfkc -profile tcl8" -returnCodes error -body {
|
|
unicode tonfkc -profile tcl8 x
|
|
} -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}
|
|
|
|
test tonfkd-profile-default-0 "tonfkd -profile default success" -body {
|
|
unicode tonfkd X\u01C4\u0323Y
|
|
} -result X\u0044\u005A\u0323\u030CY
|
|
test tonfkd-profile-default-1 "tonfkd -profile default fail" -body {
|
|
unicode tonfkd X\ud800Y
|
|
} -result {unexpected character at index 1: 'U+00D800'} -returnCodes error
|
|
test tonfkd-profile-strict-0 "tonfkd -profile strict success" -body {
|
|
unicode tonfkd -profile strict X\u01C4\u0323Y
|
|
} -result X\u0044\u005A\u0323\u030CY
|
|
test tonfkd-profile-strict-1 "tonfkd -profile strict fail" -body {
|
|
unicode tonfkd -profile strict \ud800
|
|
} -result {unexpected character at index 0: 'U+00D800'} -returnCodes error
|
|
test tonfkd-profile-replace-0 "tonfkd -profile replace success" -body {
|
|
unicode tonfkd -profile replace X\u01C4\u0323Y
|
|
} -result X\u0044\u005A\u0323\u030CY
|
|
test tonfkd-profile-replace-1 "tonfkd -profile replace fail" -body {
|
|
unicode tonfkd -profile replace X\ud800Y
|
|
} -result X\uFFFDY
|
|
test tonfkd-profile-tcl8-0 "tonfkd -profile tcl8" -returnCodes error -body {
|
|
unicode tonfkd -profile tcl8 x
|
|
} -result {Invalid value "tcl8" supplied for option "-profile". Must be "strict" or "replace".}
|
|
|
|
# Tcl_UtfToNormalizedDString C API
|
|
|
|
foreach testCase [getNormalizationData] {
|
|
lassign $testCase lineno chars nfc nfd nfkc nfkd
|
|
set bytes [teststringbytes $chars]
|
|
foreach profile {strict replace} {
|
|
foreach normForm {nfc nfd nfkc nfkd} {
|
|
test Tcl_UtfToNormalizedDString-$normForm-line-$lineno-$profile \
|
|
"Tcl_UtfToNormalizedDString for $normForm at line $lineno of $::tcltests::ucd::normalizationDataFile" \
|
|
-body {
|
|
testutftonormalizeddstring $bytes $normEnums($normForm) \
|
|
$profileFlags($profile)
|
|
} -result [teststringbytes [set $normForm]]
|
|
}
|
|
}
|
|
}
|
|
|
|
foreach normForm {nfc nfd nfkc nfkd} {
|
|
test Tcl_UtfToNormalizedDString-$normForm-nulchar-$profile \
|
|
"Tcl_UtfToNormalizedDString for $normForm passed nul character" \
|
|
-body {
|
|
testutftonormalizeddstring [teststringbytes \0] \
|
|
$normEnums($normForm) $profileFlags(strict)
|
|
} -result \xC0\x80
|
|
}
|
|
|
|
# Test the entire string. Note normalization is not a closed operation
|
|
# so normalize(concatenation) != concatenate(normalization) so we insert
|
|
# \uFFFD (replacement char) as separator to prevent adjacent cases being
|
|
# combined. This is not a whole lot different from the above individual
|
|
# tests but more of a "long string" test.
|
|
test Tcl_UtfToNormalizedDString-concat "Normalize concatenation of test vectors" -setup {
|
|
set bytes [teststringbytes [join $allChars \uFFFD]]
|
|
} -body {
|
|
list \
|
|
[string equal \
|
|
[testutftonormalizeddstring $bytes $normEnums(nfc) $profileFlags(strict)] \
|
|
[teststringbytes [join $allNfc \uFFFD]]] \
|
|
[string equal \
|
|
[testutftonormalizeddstring $bytes $normEnums(nfd) $profileFlags(strict)] \
|
|
[teststringbytes [join $allNfd \uFFFD]]] \
|
|
[string equal \
|
|
[testutftonormalizeddstring $bytes $normEnums(nfkc) $profileFlags(strict)] \
|
|
[teststringbytes [join $allNfkc \uFFFD]]] \
|
|
[string equal \
|
|
[testutftonormalizeddstring $bytes $normEnums(nfkd) $profileFlags(strict)] \
|
|
[teststringbytes [join $allNfkd \uFFFD]]]
|
|
} -result {1 1 1 1}
|
|
|
|
# Tcl_UtfToNormalizedDString error cases
|
|
|
|
foreach normForm {nfc nfd nfkc nfkd} {
|
|
test Tcl_UtfToNormalizedDString-$normForm-tcl8 \
|
|
"Tcl_UtfToNormalizedDString for $normForm profile tcl8" \
|
|
-body {
|
|
testutftonormalizeddstring abc $normEnums($normForm) $profileFlags(tcl8)
|
|
} -result {Invalid value 16777216 passed for encoding profile.} -returnCodes error
|
|
|
|
if {0} {
|
|
# TODO - currently, Tcl "fixes up" any internal invalid UTF-8 so
|
|
# no way to test normalization of invalid UTF-8. Enable this test
|
|
# once this "fixing up" by Tcl is corrected (see Bug [b69e00ecf6])
|
|
test Tcl_UtfToNormalizedDString-$normForm-invalid-utf8 \
|
|
"Tcl_UtfToNormalizedDString for $normForm invalid utf8 profile strict" \
|
|
-body {
|
|
testutftonormalizeddstring [testbytestring [binary decode hex EFBF7F]] $normEnums($normForm) $profileFlags(strict)
|
|
} -result {} -returnCodes error
|
|
}
|
|
}
|
|
|
|
test Tcl_UtfToNormalizedDString-invalid-normalization-form \
|
|
"Tcl_UtfToNormalizedDString invalid value for normalization form" \
|
|
-body {
|
|
testutftonormalizeddstring abc 4 $profileFlags(strict)
|
|
} -result {Invalid value 4 passed for normalization form.} -returnCodes error
|
|
|
|
|
|
# Tcl_UtfToNormalized C API
|
|
|
|
variable normBytes
|
|
foreach testCase [getNormalizationData] {
|
|
lassign $testCase lineno chars nfc nfd nfkc nfkd
|
|
set bytes [teststringbytes $chars]
|
|
foreach profile {strict replace} {
|
|
foreach normForm {nfc nfd nfkc nfkd} {
|
|
set normBytes [teststringbytes [set $normForm]]
|
|
test Tcl_UtfToNormalized-$normForm-line-$lineno-$profile \
|
|
"Tcl_UtfToNormalized $normForm line $lineno of $::tcltests::ucd::normalizationDataFile" \
|
|
-body {
|
|
# Tests:
|
|
# No length specified (implicit length of bytes)
|
|
# Length of -1
|
|
# Buffer too small
|
|
set result [testutftonormalized $bytes \
|
|
$normEnums($normForm) \
|
|
$profileFlags($profile) 100]
|
|
set result_minus1 [testutftonormalized $bytes\0 \
|
|
$normEnums($normForm) \
|
|
$profileFlags($profile) -1 100]
|
|
list $result \
|
|
$result_minus1 \
|
|
[catch {
|
|
testutftonormalized $bytes $normEnums($normForm) \
|
|
$profileFlags($profile) \
|
|
[expr {[string length $result]-1}]
|
|
} message] \
|
|
$message
|
|
} -result [list $normBytes $normBytes -4 {Output buffer too small.}]
|
|
}
|
|
}
|
|
}
|
|
|
|
foreach normForm {nfc nfd nfkc nfkd} {
|
|
test Tcl_UtfToNormalized-$normForm-nulchar \
|
|
"Tcl_UtfToNormalized $normForm passed nul character" \
|
|
-body {
|
|
list \
|
|
[testutftonormalized [teststringbytes \0] \
|
|
$normEnums($normForm) $profileFlags(strict) 3] \
|
|
[catch {
|
|
[testutftonormalized [teststringbytes \0] \
|
|
$normEnums($normForm) $profileFlags(strict) 2]
|
|
} message] \
|
|
$message
|
|
} -result [list \xC0\x80 -4 {Output buffer too small.}]
|
|
}
|
|
|
|
# Tcl_UtfToNormalized error cases
|
|
|
|
foreach normForm {nfc nfd nfkc nfkd} {
|
|
test Tcl_UtfToNormalized-$normForm-tcl8 \
|
|
"Tcl_UtfToNormalized for $normForm profile tcl8" \
|
|
-body {
|
|
testutftonormalized abc $normEnums($normForm) $profileFlags(tcl8) 20
|
|
} -result {Invalid value 16777216 passed for encoding profile.} -returnCodes error
|
|
|
|
if {0} {
|
|
# TODO - currently, Tcl "fixes up" any internal invalid UTF-8 so
|
|
# no way to test normalization of invalid UTF-8. Enable this test
|
|
# once this "fixing up" by Tcl is corrected (see Bug [b69e00ecf6])
|
|
test Tcl_UtfToNormalized-$normForm-invalid-utf8 \
|
|
"Tcl_UtfToNormalized for $normForm invalid utf8 profile strict" \
|
|
-body {
|
|
testutftonormalized [testbytestring [binary decode hex EFBF7F]] $normEnums($normForm) $profileFlags(strict) 20
|
|
} -result {} -returnCodes error
|
|
}
|
|
}
|
|
|
|
test Tcl_UtfToNormalized-invalid-normalization-form \
|
|
"Tcl_UtfToNormalized invalid value for normalization form" \
|
|
-body {
|
|
testutftonormalized abc 4 $profileFlags(strict) 20
|
|
} -result {Invalid value 4 passed for normalization form.} -returnCodes error
|
|
}
|
|
|
|
::tcltest::cleanupTests
|
|
namespace delete unicode::normalization::test
|
|
return
|