mirror of
https://github.com/tcltk/tcl.git
synced 2026-05-29 00:27:49 +08:00
Some checks failed
Linux / plan (push) Has been cancelled
macOS / plan (push) Has been cancelled
macOS / xcode (push) Has been cancelled
Build Binaries / Linux (push) Has been cancelled
Build Binaries / macOS (push) Has been cancelled
Build Binaries / Windows (push) Has been cancelled
Windows / plan (push) Has been cancelled
Linux / gcc (push) Has been cancelled
macOS / clang (push) Has been cancelled
Build Binaries / Combine Artifacts (prototype) (push) Has been cancelled
Windows / msvc (push) Has been cancelled
Windows / gcc (push) Has been cancelled
317 lines
14 KiB
Groff
317 lines
14 KiB
Groff
'\"
|
|
'\" Copyright (c) 1997 Sun Microsystems, Inc.
|
|
'\"
|
|
'\" See the file "license.terms" for information on usage and redistribution
|
|
'\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
|
|
'\"
|
|
.TH Utf 3 "8.1" Tcl "Tcl Library Procedures"
|
|
.so man.macros
|
|
.BS
|
|
.SH NAME
|
|
Tcl_UniChar, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UtfToChar16, Tcl_UtfToWChar, Tcl_UniCharToUtfDString, Tcl_UtfToUniCharDString, Tcl_Char16ToUtfDString, Tcl_UtfToWCharDString, Tcl_UtfToChar16DString, Tcl_WCharToUtfDString, Tcl_WCharLen, Tcl_Char16Len, Tcl_UniCharLen, Tcl_UtfNcmp, Tcl_UtfNcasecmp, Tcl_UtfCharComplete, Tcl_NumUtfChars, Tcl_UtfFindFirst, Tcl_UtfFindLast, Tcl_UtfNext, Tcl_UtfPrev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash \- routines for manipulating TUTF-8 encoded byte sequences
|
|
.SH SYNOPSIS
|
|
.nf
|
|
\fB#include <tcl.h>\fR
|
|
.sp
|
|
typedef ... \fBTcl_UniChar\fR;
|
|
.sp
|
|
Tcl_Size
|
|
\fBTcl_UniCharToUtf\fR(\fIch, buf\fR)
|
|
.sp
|
|
Tcl_Size
|
|
\fBTcl_UtfToUniChar\fR(\fIsrc, chPtr\fR)
|
|
.sp
|
|
Tcl_Size
|
|
\fBTcl_UtfToChar16\fR(\fIsrc, uPtr\fR)
|
|
.sp
|
|
Tcl_Size
|
|
\fBTcl_UtfToWChar\fR(\fIsrc, wPtr\fR)
|
|
.sp
|
|
char *
|
|
\fBTcl_UniCharToUtfDString\fR(\fIuniStr, numUniChars, dsPtr\fR)
|
|
.sp
|
|
char *
|
|
\fBTcl_Char16ToUtfDString\fR(\fIutf16, numUtf16, dsPtr\fR)
|
|
.sp
|
|
char *
|
|
\fBTcl_WCharToUtfDString\fR(\fIwcharStr, numWChars, dsPtr\fR)
|
|
.sp
|
|
Tcl_UniChar *
|
|
\fBTcl_UtfToUniCharDString\fR(\fIsrc, numBytes, dsPtr\fR)
|
|
.sp
|
|
unsigned short *
|
|
\fBTcl_UtfToChar16DString\fR(\fIsrc, numBytes, dsPtr\fR)
|
|
.sp
|
|
wchar_t *
|
|
\fBTcl_UtfToWCharDString\fR(\fIsrc, numBytes, dsPtr\fR)
|
|
.sp
|
|
Tcl_Size
|
|
\fBTcl_Char16Len\fR(\fIutf16\fR)
|
|
.sp
|
|
Tcl_Size
|
|
\fBTcl_WCharLen\fR(\fIwcharStr\fR)
|
|
.sp
|
|
Tcl_Size
|
|
\fBTcl_UniCharLen\fR(\fIuniStr\fR)
|
|
.sp
|
|
int
|
|
\fBTcl_UtfNcmp\fR(\fIcs, ct, length\fR)
|
|
.sp
|
|
int
|
|
\fBTcl_UtfNcasecmp\fR(\fIcs, ct, length\fR)
|
|
.sp
|
|
int
|
|
\fBTcl_UtfCharComplete\fR(\fIsrc, numBytes\fR)
|
|
.sp
|
|
Tcl_Size
|
|
\fBTcl_NumUtfChars\fR(\fIsrc, numBytes\fR)
|
|
.sp
|
|
const char *
|
|
\fBTcl_UtfFindFirst\fR(\fIsrc, ch\fR)
|
|
.sp
|
|
const char *
|
|
\fBTcl_UtfFindLast\fR(\fIsrc, ch\fR)
|
|
.sp
|
|
const char *
|
|
\fBTcl_UtfNext\fR(\fIsrc\fR)
|
|
.sp
|
|
const char *
|
|
\fBTcl_UtfPrev\fR(\fIsrc, start\fR)
|
|
.sp
|
|
int
|
|
\fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR)
|
|
.sp
|
|
const char *
|
|
\fBTcl_UtfAtIndex\fR(\fIsrc, index\fR)
|
|
.sp
|
|
Tcl_Size
|
|
\fBTcl_UtfBackslash\fR(\fIsrc, readPtr, dst\fR)
|
|
.fi
|
|
.SH ARGUMENTS
|
|
.AS "const Tcl_UniChar" *uniPattern in/out
|
|
.AP char *buf out
|
|
Buffer in which the TUTF-8 representation of the Tcl_UniChar is stored. At most
|
|
\fBTCL_UTF_MAX\fR bytes are stored in the buffer.
|
|
.AP int ch in
|
|
The Unicode character to be converted or examined.
|
|
.AP Tcl_UniChar *chPtr out
|
|
Filled with the Tcl_UniChar represented by the head of the TUTF-8 byte sequence.
|
|
.AP unsigned short *uPtr out
|
|
Filled with the UTF-16 code unit represented by the head of the TUTF-8 byte sequence.
|
|
.AP wchar_t *wPtr out
|
|
Filled with the wchar_t represented by the head of the TUTF-8 byte sequence.
|
|
.AP "const char" *src in
|
|
Pointer to a TUTF-8 byte sequence.
|
|
.AP "const char" *cs in
|
|
Pointer to a TUTF-8 byte sequence.
|
|
.AP "const char" *ct in
|
|
Pointer to a TUTF-8 byte sequence.
|
|
.AP "const Tcl_UniChar" *uniStr in
|
|
A sequence of \fBTcl_UniChar\fR units with null-termination optional
|
|
depending on function.
|
|
.AP "const Tcl_UniChar" *ucs in
|
|
A null-terminated sequence of \fBTcl_UniChar\fR.
|
|
.AP "const Tcl_UniChar" *uct in
|
|
A null-terminated sequence of \fBTcl_UniChar\fR.
|
|
.AP "const Tcl_UniChar" *uniPattern in
|
|
A null-terminated sequence of \fBTcl_UniChar\fR.
|
|
.AP "const unsigned short" *utf16 in
|
|
A sequence of UTF-16 units with null-termination optional
|
|
depending on function.
|
|
.AP "const wchar_t" *wcharStr in
|
|
A sequence of \fBwchar_t\fR units with null-termination optional
|
|
depending on function.
|
|
.AP Tcl_Size numBytes in
|
|
The length of the TUTF-8 input in bytes. If
|
|
negative, the length includes all bytes until the first null byte.
|
|
.AP Tcl_Size numUtf16 in
|
|
The length of the input in UTF-16 units.
|
|
If negative, the length includes all UTF-16 units until the first null.
|
|
.AP Tcl_Size numUniChars in
|
|
The length of the input in Tcl_UniChar units.
|
|
If negative, the length includes all Tcl_UniChar units until the first null.
|
|
.AP Tcl_Size numWChars in
|
|
The length of the input in wchar_t units.
|
|
If negative, the length includes all wchar_t units until the first null.
|
|
.AP "Tcl_DString" *dsPtr in/out
|
|
A pointer to a previously initialized \fBTcl_DString\fR.
|
|
.AP "const char" *start in
|
|
Pointer to the beginning of a TUTF-8 byte sequence.
|
|
.AP Tcl_Size index in
|
|
The index of a character (not byte) in the TUTF-8 byte sequence.
|
|
.AP int *readPtr out
|
|
If non-NULL, filled with the number of bytes in the backslash sequence,
|
|
including the backslash character.
|
|
.AP char *dst out
|
|
Buffer in which the bytes represented by the backslash sequence are stored.
|
|
At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer.
|
|
.AP int nocase in
|
|
Specifies whether the match should be done case-sensitive (0) or
|
|
case-insensitive (1).
|
|
.BE
|
|
|
|
.SH DESCRIPTION
|
|
.PP
|
|
N.B. The use of the term \fITUTF-8\fR in this documentation refers to the
|
|
modified UTF-8 encoding used internally by Tcl. This differs from the UTF-8
|
|
encoding defined in the Unicode standard with respect to the encoding of the NUL
|
|
character U+0000 which Tcl encodes internally as the byte sequence 0xC0 0x80 and
|
|
not a single 0x00 byte as per the standard. The term \fITUTF-8 byte sequence\fR
|
|
refers to a byte sequence representing one or more Unicode code points encoded
|
|
in TUTF-8. The term \fIcharacter\fR refers to a Unicode code
|
|
point and is used interchangeably with it.
|
|
.PP
|
|
The routines described here convert between TUTF-8 encoded byte sequences and
|
|
other representation forms.
|
|
.PP
|
|
The \fBTcl_UniChar\fR type is a C integer type wide enough to hold a single
|
|
Unicode code point value. A TUTF-8 byte sequence encoding a single code point may
|
|
have a maximum length of 4 bytes, defined as the C preprocessor symbol
|
|
\fBTCL_UTF_MAX\fR. This is also the maximum number of bytes that
|
|
\fBTcl_UtfToUniChar\fR can consume in a single call.
|
|
.PP
|
|
\fBTcl_UniCharToUtf\fR encodes the character \fIch\fR as a TUTF-8 byte sequence,
|
|
storing it starting at \fIbuf\fR. The return value is the number of bytes stored
|
|
in \fIbuf\fR. The character \fIch\fR can be or'ed with the value TCL_COMBINE
|
|
to enable special behavior, compatible with Tcl 8.x. Then, if \fIch\fR is a high
|
|
surrogate (range U+D800 - U+DBFF), the return value will be 1 and a single
|
|
byte in the range 0xF0 - 0xF4 will be stored. If \fIch\fR is a low surrogate
|
|
(range U+DC00 - U+DFFF), an attempt is made to combine the result with
|
|
the earlier produced bytes, resulting in a 4-byte TUTF-8 byte sequence.
|
|
.PP
|
|
\fBTcl_UtfToUniChar\fR reads a TUTF-8 byte sequence
|
|
starting at \fIsrc\fR and encoding a single code point,
|
|
and stores it as a Tcl_UniChar in \fI*chPtr\fR. The return value is the
|
|
number of bytes read from \fIsrc\fR. The caller must ensure that the
|
|
source buffer is long enough such that this routine does not run off the
|
|
end and dereference non-existent or random memory; if the source buffer
|
|
is known to be null-terminated, this will not happen. If the input starts
|
|
with a byte in the range 0x80 - 0x9F, \fBTcl_UtfToUniChar\fR assumes the
|
|
cp1252 encoding, stores the corresponding Tcl_UniChar in \fI*chPtr\fR
|
|
and returns 1. If the input is otherwise
|
|
not in proper TUTF-8 format, \fBTcl_UtfToUniChar\fR will store the first
|
|
byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x00A0 and
|
|
0x00FF and return 1.
|
|
.PP
|
|
\fBTcl_UniCharToUtfDString\fR converts the input in the form of a
|
|
sequence of \fBTcl_UniChar\fR code points to TUTF-8, appending the result to the
|
|
previously initialized output \fBTcl_DString\fR. The return value is a pointer
|
|
to the TUTF-8 encoded representation of the \fBappended\fR string.
|
|
.PP
|
|
\fBTcl_UtfToUniCharDString\fR converts the input in the form of
|
|
a TUTF-8 byte sequence to a \fBTcl_UniChar\fR sequence
|
|
appending the result to the previously initialized \fBTcl_DString\fR.
|
|
The return value is a pointer to the appended result which is also
|
|
terminated with a \fBTcl_UniChar\fR NUL character.
|
|
.PP
|
|
\fBTcl_WCharToUtfDString\fR and \fBTcl_UtfToWCharDString\fR are similar to
|
|
\fBTcl_UniCharToUtfDString\fR and \fBTcl_UtfToUniCharDString\fR except they
|
|
operate on sequences of \fBwchar_t\fR instead of \fBTcl_UniChar\fR.
|
|
.PP
|
|
\fBTcl_Char16ToUtfDString\fR and \fBTcl_UtfToChar16DString\fR are similar to
|
|
\fBTcl_UniCharToUtfDString\fR and \fBTcl_UtfToUniCharDString\fR except they
|
|
operate on sequences of \fBUTF-16\fR units instead of \fBTcl_UniChar\fR.
|
|
.PP
|
|
\fBTcl_Char16Len\fR corresponds to \fBstrlen\fR for UTF-16
|
|
characters. It accepts a null-terminated UTF-16 sequence and returns
|
|
the number of UTF-16 units until the null.
|
|
.PP
|
|
\fBTcl_WCharLen\fR corresponds to \fBstrlen\fR for \fBwchar_t\fR
|
|
characters. It accepts a null-terminated \fBwchar_t\fR sequence and returns
|
|
the number of \fBwchar_t\fR units until the null.
|
|
.PP
|
|
\fBTcl_UniCharLen\fR corresponds to \fBstrlen\fR for Tcl_UniChar
|
|
characters. It accepts a null-terminated Tcl_UniChar string and returns
|
|
the number of Tcl_UniChars (not bytes) in that string.
|
|
.PP
|
|
\fBTcl_UtfNcmp\fR corresponds to \fBstrncmp\fR and accepts two null-terminated
|
|
TUTF-8 encoded strings each of which should represent a sequence of at least
|
|
\fIlength\fR code points. \fBTcl_UtfNcmp\fR compares the code points represented
|
|
by each of the encoded strings in order.
|
|
It returns an integer greater than, equal to, or less than 0 if the
|
|
first string is greater than, equal to, or less than the second string
|
|
respectively.
|
|
.PP
|
|
\fBTcl_UtfNcasecmp\fR corresponds to \fBstrncasecmp\fR for TUTF-8 encoded
|
|
strings. It is similar to \fBTcl_UtfNcmp\fR except comparisons ignore
|
|
differences in case when comparing upper, lower or title case
|
|
characters.
|
|
.PP
|
|
\fBTcl_UtfCharComplete\fR returns 1 if the source TUTF-8 byte sequence \fIsrc\fR
|
|
of \fInumBytes\fR bytes is long enough to be decoded by
|
|
\fBTcl_UtfToUniChar\fR/\fBTcl_UtfNext\fR, or 0 otherwise. This function
|
|
does not guarantee that the TUTF-8 byte sequence is properly formed. This routine
|
|
is used by procedures that are operating on a byte at a time and need to
|
|
know if a full Unicode character has been seen.
|
|
.PP
|
|
\fBTcl_NumUtfChars\fR corresponds to \fBstrlen\fR for TUTF-8 byte sequences. It
|
|
returns the number of Tcl_UniChars that are represented by the TUTF-8 byte
|
|
sequence \fIsrc\fR. The length of the source string is \fInumBytes\fR bytes. If
|
|
the length is negative, all bytes up to the first null byte are used.
|
|
.PP
|
|
\fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for TUTF-8 byte sequences.
|
|
It returns a pointer to the first occurrence of the Unicode character \fIch\fR
|
|
in the null-terminated TUTF-8 byte sequence \fIsrc\fR. The null terminator is
|
|
considered part of the byte sequence.
|
|
.PP
|
|
\fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for TUTF-8 byte sequences.
|
|
It returns a pointer to the last occurrence of the Unicode character \fIch\fR
|
|
in the null-terminated TUTF-8 byte sequence \fIsrc\fR. The null terminator is
|
|
considered part of the byte sequence.
|
|
.PP
|
|
Given \fIsrc\fR, a pointer to some location in a TUTF-8 byte sequence,
|
|
\fBTcl_UtfNext\fR returns a pointer to the start of the TUTF-8 byte sequence
|
|
corresponding to the next character.
|
|
The caller must not ask for the next character after the last
|
|
character in the string if the string is not terminated by a null
|
|
character. \fBTcl_UtfCharComplete\fR can be used in that case to
|
|
make sure enough bytes are available before calling \fBTcl_UtfNext\fR.
|
|
.PP
|
|
\fBTcl_UtfPrev\fR is used to step backward through but not beyond the TUTF-8
|
|
byte sequence that begins at \fIstart\fR. If the byte sequence is made
|
|
up entirely of complete and well-formed characters, and \fIsrc\fR points to
|
|
the lead byte of one of those characters (or to the location one byte past the
|
|
end of the string), then repeated calls of \fBTcl_UtfPrev\fR will return
|
|
pointers to the lead bytes of each character in the string, one character at a
|
|
time, terminating when it returns \fIstart\fR.
|
|
.PP
|
|
When the conditions of completeness and well-formedness may not be satisfied,
|
|
a more precise description of the function of \fBTcl_UtfPrev\fR is necessary.
|
|
It always returns a pointer greater than or equal to \fIstart\fR; that is,
|
|
always a pointer to a location in the string. It always returns a pointer to
|
|
a byte that begins a character when scanning for characters beginning
|
|
from \fIstart\fR. When \fIsrc\fR is greater than \fIstart\fR, it
|
|
always returns a pointer less than \fIsrc\fR and greater than or
|
|
equal to (\fIsrc\fR - 4). The character that begins
|
|
at the returned pointer is the first one that either includes the
|
|
byte \fIsrc[-1]\fR, or might include it if the right trail bytes are
|
|
present at \fIsrc\fR and greater. \fBTcl_UtfPrev\fR never reads the
|
|
byte \fIsrc[0]\fR nor the byte \fIstart[-1]\fR nor the byte
|
|
\fIsrc[-5]\fR.
|
|
.PP
|
|
\fBTcl_UniCharAtIndex\fR corresponds to a C string array dereference or the
|
|
Pascal Ord() function. It returns the Unicode code point represented at the
|
|
specified character (not byte) \fIindex\fR in the TUTF-8 byte sequence
|
|
\fIsrc\fR. The source string must contain at least \fIindex\fR
|
|
characters. If \fIindex\fR is negative it returns -1.
|
|
.PP
|
|
\fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not
|
|
byte) \fIindex\fR in the TUTF-8 byte sequence \fIsrc\fR. The source must
|
|
contain at least \fIindex\fR characters. This is equivalent to calling
|
|
\fBTcl_UtfToUniChar\fR \fIindex\fR times. If \fIindex\fR is negative,
|
|
the return pointer points to the first character in the source.
|
|
.PP
|
|
\fBTcl_UtfBackslash\fR is a utility procedure used by several of the Tcl
|
|
commands. It parses a backslash sequence and stores the properly formed
|
|
TUTF-8 encoding of the character represented by the backslash sequence in the output
|
|
buffer \fIdst\fR. At most \fBTCL_UTF_MAX\fR bytes are stored in the buffer.
|
|
\fBTcl_UtfBackslash\fR modifies \fI*readPtr\fR to contain the number
|
|
of bytes in the backslash sequence, including the backslash character.
|
|
The return value is the number of bytes stored in the output buffer.
|
|
See the \fBTcl\fR manual entry for information on the valid backslash
|
|
sequences. All of the sequences described in the Tcl manual entry are
|
|
supported by \fBTcl_UtfBackslash\fR.
|
|
|
|
.SH KEYWORDS
|
|
utf, unicode, backslash
|